1Patchset: 2.6.26-ext4-7
2
3This patch was created by combining the ext4-pushed-post-2.6.27-rc1.gz
4patches with the stable patches in the 2.6.27-rc3-ext4-1 series.
5
6 Documentation/filesystems/ext4.txt | 131 ++-
7 fs/buffer.c | 19 +-
8 fs/ext4/acl.c | 188 ++--
9 fs/ext4/balloc.c | 221 +++--
10 fs/ext4/dir.c | 37 +-
11 fs/ext4/ext4.h | 64 +-
12 fs/ext4/ext4_extents.h | 5 +-
13 fs/ext4/ext4_i.h | 10 +-
14 fs/ext4/ext4_jbd2.h | 29 +-
15 fs/ext4/ext4_sb.h | 5 +-
16 fs/ext4/extents.c | 277 +++---
17 fs/ext4/file.c | 20 +-
18 fs/ext4/fsync.c | 4 +
19 fs/ext4/group.h | 2 +-
20 fs/ext4/ialloc.c | 169 +++-
21 fs/ext4/inode.c | 1931 ++++++++++++++++++++++++++++++------
22 fs/ext4/mballoc.c | 744 +++++++++++----
23 fs/ext4/mballoc.h | 10 +-
24 fs/ext4/migrate.c | 3 +-
25 fs/ext4/namei.c | 45 +-
26 fs/ext4/resize.c | 134 ++-
27 fs/ext4/super.c | 451 ++++++---
28 fs/ext4/xattr.c | 4 +-
29 fs/ext4/xattr_trusted.c | 4 +-
30 fs/ext4/xattr_user.c | 4 +-
31 fs/jbd2/checkpoint.c | 1 -
32 fs/jbd2/commit.c | 308 +++----
33 fs/jbd2/journal.c | 54 +-
34 fs/jbd2/transaction.c | 365 +++----
35 fs/mpage.c | 14 +-
36 include/linux/fs.h | 2 +
37 include/linux/jbd2.h | 73 +-
38 include/linux/mpage.h | 10 +
39 include/linux/percpu_counter.h | 12 +-
40 include/linux/writeback.h | 1 +
41 lib/percpu_counter.c | 7 +-
42 mm/filemap.c | 3 +-
43 mm/page-writeback.c | 3 +
44 38 files changed, 3822 insertions(+), 1542 deletions(-)
45
46diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt
47index 0c5086d..0d53949 100644
48--- a/Documentation/filesystems/ext4.txt
49+++ b/Documentation/filesystems/ext4.txt
50@@ -13,72 +13,99 @@ Mailing list: linux-ext4@vger.kernel.org
51 1. Quick usage instructions:
52 ===========================
53
54- - Grab updated e2fsprogs from
55- ftp://ftp.kernel.org/pub/linux/kernel/people/tytso/e2fsprogs-interim/
56- This is a patchset on top of e2fsprogs-1.39, which can be found at
57+ - Compile and install the latest version of e2fsprogs (as of this
58+ writing version 1.41) from:
59+
60+ http://sourceforge.net/project/showfiles.php?group_id=2406
61+
62+ or
63+
64 ftp://ftp.kernel.org/pub/linux/kernel/people/tytso/e2fsprogs/
65
66- - It's still mke2fs -j /dev/hda1
67+ or grab the latest git repository from:
68+
69+ git://git.kernel.org/pub/scm/fs/ext2/e2fsprogs.git
70+
71+ - Note that it is highly important to install the mke2fs.conf file
72+ that comes with the e2fsprogs 1.41.x sources in /etc/mke2fs.conf. If
73+ you have edited the /etc/mke2fs.conf file installed on your system,
74+ you will need to merge your changes with the version from e2fsprogs
75+ 1.41.x.
76+
77+ - Create a new filesystem using the ext4dev filesystem type:
78+
79+ # mke2fs -t ext4dev /dev/hda1
80+
81+ Or configure an existing ext3 filesystem to support extents and set
82+ the test_fs flag to indicate that it's ok for an in-development
83+ filesystem to touch this filesystem:
84
85- - mount /dev/hda1 /wherever -t ext4dev
86+ # tune2fs -O extents -E test_fs /dev/hda1
87
88- - To enable extents,
89+ If the filesystem was created with 128 byte inodes, it can be
 90+ converted to use 256 byte inodes for greater efficiency via:
91
92- mount /dev/hda1 /wherever -t ext4dev -o extents
93+ # tune2fs -I 256 /dev/hda1
94
95- - The filesystem is compatible with the ext3 driver until you add a file
96- which has extents (ie: `mount -o extents', then create a file).
97+ (Note: we currently do not have tools to convert an ext4dev
 98+ filesystem back to ext3; so please do not try this on production
99+ filesystems.)
100
101- NOTE: The "extents" mount flag is temporary. It will soon go away and
102- extents will be enabled by the "-o extents" flag to mke2fs or tune2fs
103+ - Mounting:
104+
105+ # mount -t ext4dev /dev/hda1 /wherever
106
107 - When comparing performance with other filesystems, remember that
108- ext3/4 by default offers higher data integrity guarantees than most. So
109- when comparing with a metadata-only journalling filesystem, use `mount -o
110- data=writeback'. And you might as well use `mount -o nobh' too along
111- with it. Making the journal larger than the mke2fs default often helps
112- performance with metadata-intensive workloads.
113+ ext3/4 by default offers higher data integrity guarantees than most.
114+ So when comparing with a metadata-only journalling filesystem, such
115+ as ext3, use `mount -o data=writeback'. And you might as well use
116+ `mount -o nobh' too along with it. Making the journal larger than
117+ the mke2fs default often helps performance with metadata-intensive
118+ workloads.
119
120 2. Features
121 ===========
122
123 2.1 Currently available
124
125-* ability to use filesystems > 16TB
126+* ability to use filesystems > 16TB (e2fsprogs support not available yet)
127 * extent format reduces metadata overhead (RAM, IO for access, transactions)
128 * extent format more robust in face of on-disk corruption due to magics,
129 * internal redunancy in tree
130-
131-2.1 Previously available, soon to be enabled by default by "mkefs.ext4":
132-
133-* dir_index and resize inode will be on by default
134-* large inodes will be used by default for fast EAs, nsec timestamps, etc
135+* improved file allocation (multi-block alloc)
136+* fix 32000 subdirectory limit
137+* nsec timestamps for mtime, atime, ctime, create time
138+* inode version field on disk (NFSv4, Lustre)
139+* reduced e2fsck time via uninit_bg feature
140+* journal checksumming for robustness, performance
141+* persistent file preallocation (e.g for streaming media, databases)
142+* ability to pack bitmaps and inode tables into larger virtual groups via the
143+ flex_bg feature
144+* large file support
145+* Inode allocation using large virtual block groups via flex_bg
146+* delayed allocation
147+* large block (up to pagesize) support
148+* efficient new ordered mode in JBD2 and ext4 (avoid using buffer head to force
149+ the ordering)
150
151 2.2 Candidate features for future inclusion
152
153-There are several under discussion, whether they all make it in is
154-partly a function of how much time everyone has to work on them:
155+* Online defrag (patches available but not well tested)
156+* reduced mke2fs time via lazy itable initialization in conjunction with
157+ the uninit_bg feature (capability to do this is available in e2fsprogs
158+ but a kernel thread to do lazy zeroing of unused inode table blocks
159+ after filesystem is first mounted is required for safety)
160
161-* improved file allocation (multi-block alloc, delayed alloc; basically done)
162-* fix 32000 subdirectory limit (patch exists, needs some e2fsck work)
163-* nsec timestamps for mtime, atime, ctime, create time (patch exists,
164- needs some e2fsck work)
165-* inode version field on disk (NFSv4, Lustre; prototype exists)
166-* reduced mke2fs/e2fsck time via uninitialized groups (prototype exists)
167-* journal checksumming for robustness, performance (prototype exists)
168-* persistent file preallocation (e.g for streaming media, databases)
169+There are several others under discussion; whether they all make it in is
170+partly a function of how much time everyone has to work on them. Features like
171+metadata checksumming have been discussed and planned for a bit but no patches
172+exist yet so I'm not sure they're in the near-term roadmap.
173
174-Features like metadata checksumming have been discussed and planned for
175-a bit but no patches exist yet so I'm not sure they're in the near-term
176-roadmap.
177+The big performance win will come with mballoc, delalloc and flex_bg
178+grouping of bitmaps and inode tables. Some test results available here:
179
180-The big performance win will come with mballoc and delalloc. CFS has
181-been using mballoc for a few years already with Lustre, and IBM + Bull
182-did a lot of benchmarking on it. The reason it isn't in the first set of
183-patches is partly a manageability issue, and partly because it doesn't
184-directly affect the on-disk format (outside of much better allocation)
185-so it isn't critical to get into the first round of changes. I believe
186-Alex is working on a new set of patches right now.
187+ - http://www.bullopensource.org/ext4/20080530/ffsb-write-2.6.26-rc2.html
188+ - http://www.bullopensource.org/ext4/20080530/ffsb-readwrite-2.6.26-rc2.html
189
190 3. Options
191 ==========
192@@ -222,9 +249,11 @@ stripe=n Number of filesystem blocks that mballoc will try
193 to use for allocation size and alignment. For RAID5/6
194 systems this should be the number of data
195 disks * RAID chunk size in file system blocks.
196-
197+delalloc (*) Deferring block allocation until write-out time.
198+nodelalloc Disable delayed allocation. Blocks are allocated
199+ when data is copied from user to page cache.
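As a worked example for the stripe=n option above (numbers purely illustrative): a RAID5 set with 4 data disks, a 64 KiB chunk size and 4 KiB filesystem blocks gives 65536 / 4096 = 16 blocks per chunk, hence stripe = 4 * 16 = 64.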
200 Data Mode
201----------
202+=========
203 There are 3 different data modes:
204
205 * writeback mode
206@@ -236,10 +265,10 @@ typically provide the best ext4 performance.
207
208 * ordered mode
209 In data=ordered mode, ext4 only officially journals metadata, but it logically
210-groups metadata and data blocks into a single unit called a transaction. When
211-it's time to write the new metadata out to disk, the associated data blocks
212-are written first. In general, this mode performs slightly slower than
213-writeback but significantly faster than journal mode.
214+groups metadata information related to data changes with the data blocks into a
215+single unit called a transaction. When it's time to write the new metadata
216+out to disk, the associated data blocks are written first. In general,
217+this mode performs slightly slower than writeback but significantly faster than journal mode.
218
219 * journal mode
220 data=journal mode provides full data and metadata journaling. All new data is
221@@ -247,7 +276,8 @@ written to the journal first, and then to its final location.
222 In the event of a crash, the journal can be replayed, bringing both data and
223 metadata into a consistent state. This mode is the slowest except when data
224 needs to be read from and written to disk at the same time where it
225-outperforms all others modes.
226+outperforms all other modes. Currently ext4 does not have delayed
227+allocation support if this data journalling mode is selected.
228
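The mode is chosen with the data= mount option, following the form of the earlier mount examples (device and mount point are placeholders):

  # mount -t ext4dev -o data=journal /dev/hda1 /wherever

Per the note above, delayed allocation is not in effect in this mode.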
229 References
230 ==========
231@@ -256,7 +286,8 @@ kernel source: <file:fs/ext4/>
232 <file:fs/jbd2/>
233
234 programs: http://e2fsprogs.sourceforge.net/
235- http://ext2resize.sourceforge.net
236
237 useful links: http://fedoraproject.org/wiki/ext3-devel
238 http://www.bullopensource.org/ext4/
239+ http://ext4.wiki.kernel.org/index.php/Main_Page
240+ http://fedoraproject.org/wiki/Features/Ext4
241diff --git a/fs/buffer.c b/fs/buffer.c
242index 0f51c0f..5fa1512 100644
243--- a/fs/buffer.c
244+++ b/fs/buffer.c
245@@ -1691,11 +1691,13 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
246 */
247 clear_buffer_dirty(bh);
248 set_buffer_uptodate(bh);
249- } else if (!buffer_mapped(bh) && buffer_dirty(bh)) {
250+ } else if ((!buffer_mapped(bh) || buffer_delay(bh)) &&
251+ buffer_dirty(bh)) {
252 WARN_ON(bh->b_size != blocksize);
253 err = get_block(inode, block, bh, 1);
254 if (err)
255 goto recover;
256+ clear_buffer_delay(bh);
257 if (buffer_new(bh)) {
258 /* blockdev mappings never come here */
259 clear_buffer_new(bh);
260@@ -1774,7 +1776,8 @@ recover:
261 bh = head;
262 /* Recovery: lock and submit the mapped buffers */
263 do {
264- if (buffer_mapped(bh) && buffer_dirty(bh)) {
265+ if (buffer_mapped(bh) && buffer_dirty(bh) &&
266+ !buffer_delay(bh)) {
267 lock_buffer(bh);
268 mark_buffer_async_write(bh);
269 } else {
270@@ -2061,6 +2064,7 @@ int generic_write_end(struct file *file, struct address_space *mapping,
271 struct page *page, void *fsdata)
272 {
273 struct inode *inode = mapping->host;
274+ int i_size_changed = 0;
275
276 copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
277
278@@ -2073,12 +2077,21 @@ int generic_write_end(struct file *file, struct address_space *mapping,
279 */
280 if (pos+copied > inode->i_size) {
281 i_size_write(inode, pos+copied);
282- mark_inode_dirty(inode);
283+ i_size_changed = 1;
284 }
285
286 unlock_page(page);
287 page_cache_release(page);
288
289+ /*
290+ * Don't mark the inode dirty under page lock. First, it unnecessarily
291+ * makes the holding time of page lock longer. Second, it forces lock
292+ * ordering of page lock and transaction start for journaling
293+ * filesystems.
294+ */
295+ if (i_size_changed)
296+ mark_inode_dirty(inode);
297+
298 return copied;
299 }
300 EXPORT_SYMBOL(generic_write_end);
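The hunk above moves mark_inode_dirty() out from under the page lock, recording only an i_size_changed flag while locked. A minimal self-contained userspace sketch of the same pattern, with a pthread mutex standing in for the page lock (hypothetical names, illustrative only, not kernel code):

  #include <pthread.h>
  #include <stdio.h>

  static pthread_mutex_t page_lock = PTHREAD_MUTEX_INITIALIZER;
  static long i_size;

  /* may take further locks of its own, so must not run under page_lock */
  static void mark_inode_dirty_stub(void)
  {
          printf("inode marked dirty, size=%ld\n", i_size);
  }

  static void write_end(long pos, long copied)
  {
          int size_changed = 0;

          pthread_mutex_lock(&page_lock);
          if (pos + copied > i_size) {
                  i_size = pos + copied;
                  size_changed = 1;   /* only record the fact under the lock */
          }
          pthread_mutex_unlock(&page_lock);

          if (size_changed)           /* heavier work after the lock is dropped */
                  mark_inode_dirty_stub();
  }

  int main(void)
  {
          write_end(0, 4096);
          return 0;
  }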
301diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c
302index 3c8dab8..a234b54 100644
303--- a/fs/ext4/acl.c
304+++ b/fs/ext4/acl.c
305@@ -40,34 +40,35 @@ ext4_acl_from_disk(const void *value, size_t size)
306 acl = posix_acl_alloc(count, GFP_NOFS);
307 if (!acl)
308 return ERR_PTR(-ENOMEM);
309- for (n=0; n < count; n++) {
310+ for (n = 0; n < count; n++) {
311 ext4_acl_entry *entry =
312 (ext4_acl_entry *)value;
313 if ((char *)value + sizeof(ext4_acl_entry_short) > end)
314 goto fail;
315 acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag);
316 acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm);
317- switch(acl->a_entries[n].e_tag) {
318- case ACL_USER_OBJ:
319- case ACL_GROUP_OBJ:
320- case ACL_MASK:
321- case ACL_OTHER:
322- value = (char *)value +
323- sizeof(ext4_acl_entry_short);
324- acl->a_entries[n].e_id = ACL_UNDEFINED_ID;
325- break;
326-
327- case ACL_USER:
328- case ACL_GROUP:
329- value = (char *)value + sizeof(ext4_acl_entry);
330- if ((char *)value > end)
331- goto fail;
332- acl->a_entries[n].e_id =
333- le32_to_cpu(entry->e_id);
334- break;
335-
336- default:
337+
338+ switch (acl->a_entries[n].e_tag) {
339+ case ACL_USER_OBJ:
340+ case ACL_GROUP_OBJ:
341+ case ACL_MASK:
342+ case ACL_OTHER:
343+ value = (char *)value +
344+ sizeof(ext4_acl_entry_short);
345+ acl->a_entries[n].e_id = ACL_UNDEFINED_ID;
346+ break;
347+
348+ case ACL_USER:
349+ case ACL_GROUP:
350+ value = (char *)value + sizeof(ext4_acl_entry);
351+ if ((char *)value > end)
352 goto fail;
353+ acl->a_entries[n].e_id =
354+ le32_to_cpu(entry->e_id);
355+ break;
356+
357+ default:
358+ goto fail;
359 }
360 }
361 if (value != end)
362@@ -96,27 +97,26 @@ ext4_acl_to_disk(const struct posix_acl *acl, size_t *size)
363 return ERR_PTR(-ENOMEM);
364 ext_acl->a_version = cpu_to_le32(EXT4_ACL_VERSION);
365 e = (char *)ext_acl + sizeof(ext4_acl_header);
366- for (n=0; n < acl->a_count; n++) {
367+ for (n = 0; n < acl->a_count; n++) {
368 ext4_acl_entry *entry = (ext4_acl_entry *)e;
369 entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag);
370 entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm);
371- switch(acl->a_entries[n].e_tag) {
372- case ACL_USER:
373- case ACL_GROUP:
374- entry->e_id =
375- cpu_to_le32(acl->a_entries[n].e_id);
376- e += sizeof(ext4_acl_entry);
377- break;
378-
379- case ACL_USER_OBJ:
380- case ACL_GROUP_OBJ:
381- case ACL_MASK:
382- case ACL_OTHER:
383- e += sizeof(ext4_acl_entry_short);
384- break;
385-
386- default:
387- goto fail;
388+ switch (acl->a_entries[n].e_tag) {
389+ case ACL_USER:
390+ case ACL_GROUP:
391+ entry->e_id = cpu_to_le32(acl->a_entries[n].e_id);
392+ e += sizeof(ext4_acl_entry);
393+ break;
394+
395+ case ACL_USER_OBJ:
396+ case ACL_GROUP_OBJ:
397+ case ACL_MASK:
398+ case ACL_OTHER:
399+ e += sizeof(ext4_acl_entry_short);
400+ break;
401+
402+ default:
403+ goto fail;
404 }
405 }
406 return (char *)ext_acl;
407@@ -167,23 +167,23 @@ ext4_get_acl(struct inode *inode, int type)
408 if (!test_opt(inode->i_sb, POSIX_ACL))
409 return NULL;
410
411- switch(type) {
412- case ACL_TYPE_ACCESS:
413- acl = ext4_iget_acl(inode, &ei->i_acl);
414- if (acl != EXT4_ACL_NOT_CACHED)
415- return acl;
416- name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS;
417- break;
418-
419- case ACL_TYPE_DEFAULT:
420- acl = ext4_iget_acl(inode, &ei->i_default_acl);
421- if (acl != EXT4_ACL_NOT_CACHED)
422- return acl;
423- name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT;
424- break;
425-
426- default:
427- return ERR_PTR(-EINVAL);
428+ switch (type) {
429+ case ACL_TYPE_ACCESS:
430+ acl = ext4_iget_acl(inode, &ei->i_acl);
431+ if (acl != EXT4_ACL_NOT_CACHED)
432+ return acl;
433+ name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS;
434+ break;
435+
436+ case ACL_TYPE_DEFAULT:
437+ acl = ext4_iget_acl(inode, &ei->i_default_acl);
438+ if (acl != EXT4_ACL_NOT_CACHED)
439+ return acl;
440+ name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT;
441+ break;
442+
443+ default:
444+ return ERR_PTR(-EINVAL);
445 }
446 retval = ext4_xattr_get(inode, name_index, "", NULL, 0);
447 if (retval > 0) {
448@@ -201,14 +201,14 @@ ext4_get_acl(struct inode *inode, int type)
449 kfree(value);
450
451 if (!IS_ERR(acl)) {
452- switch(type) {
453- case ACL_TYPE_ACCESS:
454- ext4_iset_acl(inode, &ei->i_acl, acl);
455- break;
456-
457- case ACL_TYPE_DEFAULT:
458- ext4_iset_acl(inode, &ei->i_default_acl, acl);
459- break;
460+ switch (type) {
461+ case ACL_TYPE_ACCESS:
462+ ext4_iset_acl(inode, &ei->i_acl, acl);
463+ break;
464+
465+ case ACL_TYPE_DEFAULT:
466+ ext4_iset_acl(inode, &ei->i_default_acl, acl);
467+ break;
468 }
469 }
470 return acl;
471@@ -232,31 +232,31 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type,
472 if (S_ISLNK(inode->i_mode))
473 return -EOPNOTSUPP;
474
475- switch(type) {
476- case ACL_TYPE_ACCESS:
477- name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS;
478- if (acl) {
479- mode_t mode = inode->i_mode;
480- error = posix_acl_equiv_mode(acl, &mode);
481- if (error < 0)
482- return error;
483- else {
484- inode->i_mode = mode;
485- ext4_mark_inode_dirty(handle, inode);
486- if (error == 0)
487- acl = NULL;
488- }
489+ switch (type) {
490+ case ACL_TYPE_ACCESS:
491+ name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS;
492+ if (acl) {
493+ mode_t mode = inode->i_mode;
494+ error = posix_acl_equiv_mode(acl, &mode);
495+ if (error < 0)
496+ return error;
497+ else {
498+ inode->i_mode = mode;
499+ ext4_mark_inode_dirty(handle, inode);
500+ if (error == 0)
501+ acl = NULL;
502 }
503- break;
504+ }
505+ break;
506
507- case ACL_TYPE_DEFAULT:
508- name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT;
509- if (!S_ISDIR(inode->i_mode))
510- return acl ? -EACCES : 0;
511- break;
512+ case ACL_TYPE_DEFAULT:
513+ name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT;
514+ if (!S_ISDIR(inode->i_mode))
515+ return acl ? -EACCES : 0;
516+ break;
517
518- default:
519- return -EINVAL;
520+ default:
521+ return -EINVAL;
522 }
523 if (acl) {
524 value = ext4_acl_to_disk(acl, &size);
525@@ -269,14 +269,14 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type,
526
527 kfree(value);
528 if (!error) {
529- switch(type) {
530- case ACL_TYPE_ACCESS:
531- ext4_iset_acl(inode, &ei->i_acl, acl);
532- break;
533-
534- case ACL_TYPE_DEFAULT:
535- ext4_iset_acl(inode, &ei->i_default_acl, acl);
536- break;
537+ switch (type) {
538+ case ACL_TYPE_ACCESS:
539+ ext4_iset_acl(inode, &ei->i_acl, acl);
540+ break;
541+
542+ case ACL_TYPE_DEFAULT:
543+ ext4_iset_acl(inode, &ei->i_default_acl, acl);
544+ break;
545 }
546 }
547 return error;
548diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
549index 9cc80b9..e9fa960 100644
550--- a/fs/ext4/balloc.c
551+++ b/fs/ext4/balloc.c
552@@ -47,7 +47,7 @@ static int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block,
553 ext4_group_t block_group)
554 {
555 ext4_group_t actual_group;
556- ext4_get_group_no_and_offset(sb, block, &actual_group, 0);
557+ ext4_get_group_no_and_offset(sb, block, &actual_group, NULL);
558 if (actual_group == block_group)
559 return 1;
560 return 0;
561@@ -121,12 +121,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
562 le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks);
563 }
564 } else { /* For META_BG_BLOCK_GROUPS */
565- int group_rel = (block_group -
566- le32_to_cpu(sbi->s_es->s_first_meta_bg)) %
567- EXT4_DESC_PER_BLOCK(sb);
568- if (group_rel == 0 || group_rel == 1 ||
569- (group_rel == EXT4_DESC_PER_BLOCK(sb) - 1))
570- bit_max += 1;
571+ bit_max += ext4_bg_num_gdb(sb, block_group);
572 }
573
574 if (block_group == sbi->s_groups_count - 1) {
575@@ -295,7 +290,7 @@ err_out:
576 return 0;
577 }
578 /**
579- * read_block_bitmap()
580+ * ext4_read_block_bitmap()
581 * @sb: super block
582 * @block_group: given block group
583 *
584@@ -305,7 +300,7 @@ err_out:
585 * Return buffer_head on success or NULL in case of failure.
586 */
587 struct buffer_head *
588-read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
589+ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
590 {
591 struct ext4_group_desc * desc;
592 struct buffer_head * bh = NULL;
593@@ -319,25 +314,28 @@ read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
594 if (unlikely(!bh)) {
595 ext4_error(sb, __func__,
596 "Cannot read block bitmap - "
597- "block_group = %d, block_bitmap = %llu",
598- (int)block_group, (unsigned long long)bitmap_blk);
599+ "block_group = %lu, block_bitmap = %llu",
600+ block_group, bitmap_blk);
601 return NULL;
602 }
603 if (bh_uptodate_or_lock(bh))
604 return bh;
605
606+ spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
607 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
608 ext4_init_block_bitmap(sb, bh, block_group, desc);
609 set_buffer_uptodate(bh);
610 unlock_buffer(bh);
611+ spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
612 return bh;
613 }
614+ spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
615 if (bh_submit_read(bh) < 0) {
616 put_bh(bh);
617 ext4_error(sb, __func__,
618 "Cannot read block bitmap - "
619- "block_group = %d, block_bitmap = %llu",
620- (int)block_group, (unsigned long long)bitmap_blk);
621+ "block_group = %lu, block_bitmap = %llu",
622+ block_group, bitmap_blk);
623 return NULL;
624 }
625 ext4_valid_block_bitmap(sb, desc, block_group, bh);
626@@ -409,8 +407,7 @@ restart:
627 prev = rsv;
628 }
629 printk("Window map complete.\n");
630- if (bad)
631- BUG();
632+ BUG_ON(bad);
633 }
634 #define rsv_window_dump(root, verbose) \
635 __rsv_window_dump((root), (verbose), __func__)
636@@ -694,7 +691,7 @@ do_more:
637 count -= overflow;
638 }
639 brelse(bitmap_bh);
640- bitmap_bh = read_block_bitmap(sb, block_group);
641+ bitmap_bh = ext4_read_block_bitmap(sb, block_group);
642 if (!bitmap_bh)
643 goto error_return;
644 desc = ext4_get_group_desc (sb, block_group, &gd_bh);
645@@ -810,6 +807,13 @@ do_more:
646 spin_unlock(sb_bgl_lock(sbi, block_group));
647 percpu_counter_add(&sbi->s_freeblocks_counter, count);
648
649+ if (sbi->s_log_groups_per_flex) {
650+ ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
651+ spin_lock(sb_bgl_lock(sbi, flex_group));
652+ sbi->s_flex_groups[flex_group].free_blocks += count;
653+ spin_unlock(sb_bgl_lock(sbi, flex_group));
654+ }
655+
656 /* We dirtied the bitmap block */
657 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
658 err = ext4_journal_dirty_metadata(handle, bitmap_bh);
659@@ -1598,23 +1602,38 @@ out:
660
661 /**
662 * ext4_has_free_blocks()
663- * @sbi: in-core super block structure.
664+ * @sbi: in-core super block structure.
665+ * @nblocks: number of needed blocks
666 *
667- * Check if filesystem has at least 1 free block available for allocation.
668+ * Check if filesystem has free blocks available for allocation.
669+ * Return the number of blocks available for allocation for this request
670+ * On success, return nblocks
671 */
672-static int ext4_has_free_blocks(struct ext4_sb_info *sbi)
673+ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
674+ ext4_fsblk_t nblocks)
675 {
676- ext4_fsblk_t free_blocks, root_blocks;
677+ ext4_fsblk_t free_blocks;
678+ ext4_fsblk_t root_blocks = 0;
679
680 free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
681- root_blocks = ext4_r_blocks_count(sbi->s_es);
682- if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) &&
683+
684+ if (!capable(CAP_SYS_RESOURCE) &&
685 sbi->s_resuid != current->fsuid &&
686- (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) {
687+ (sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid)))
688+ root_blocks = ext4_r_blocks_count(sbi->s_es);
689+#ifdef CONFIG_SMP
690+ if (free_blocks - root_blocks < FBC_BATCH)
691+ free_blocks =
692+ percpu_counter_sum_and_set(&sbi->s_freeblocks_counter);
693+#endif
694+ if (free_blocks <= root_blocks)
695+ /* we don't have free space */
696 return 0;
697- }
698- return 1;
699-}
700+ if (free_blocks - root_blocks < nblocks)
701+ return free_blocks - root_blocks;
702+ return nblocks;
703+ }
704+
705
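The rewritten ext4_has_free_blocks() above reads the per-CPU counter cheaply and takes an exact (expensive) sum only when the estimate lands within the counter's slop of the reservation threshold. A self-contained userspace sketch of that check, with hypothetical names; BATCH stands in for the kernel's FBC_BATCH and the slop bound here is deliberately loose:

  #include <stdio.h>

  #define NCPU  4
  #define BATCH 32                      /* stand-in for FBC_BATCH */

  static long cpu_delta[NCPU];          /* per-CPU deltas not yet folded in */
  static long global_count;             /* folded, approximate total */

  static long read_approx(void) { return global_count; }

  static long sum_exact(void)
  {
          long sum = global_count;
          for (int i = 0; i < NCPU; i++)
                  sum += cpu_delta[i];
          return sum;
  }

  /* How many of 'want' blocks may be granted; 0 means ENOSPC. */
  static long has_free(long want, long reserved)
  {
          long free_blocks = read_approx();

          if (free_blocks - reserved < NCPU * BATCH) /* too close to call */
                  free_blocks = sum_exact();
          if (free_blocks <= reserved)
                  return 0;
          if (free_blocks - reserved < want)
                  return free_blocks - reserved;
          return want;
  }

  int main(void)
  {
          global_count = 100;
          cpu_delta[1] = -90;           /* unfolded activity on one CPU */
          printf("grant=%ld\n", has_free(20, 5)); /* exact sum: 10 - 5 = 5 */
          return 0;
  }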
706 /**
707 * ext4_should_retry_alloc()
708@@ -1630,7 +1649,7 @@ static int ext4_has_free_blocks(struct ext4_sb_info *sbi)
709 */
710 int ext4_should_retry_alloc(struct super_block *sb, int *retries)
711 {
712- if (!ext4_has_free_blocks(EXT4_SB(sb)) || (*retries)++ > 3)
713+ if (!ext4_has_free_blocks(EXT4_SB(sb), 1) || (*retries)++ > 3)
714 return 0;
715
716 jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id);
717@@ -1639,20 +1658,24 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
718 }
719
720 /**
721- * ext4_new_blocks_old() -- core block(s) allocation function
722+ * ext4_old_new_blocks() -- core block bitmap based block allocation function
723+ *
724 * @handle: handle to this transaction
725 * @inode: file inode
726 * @goal: given target block(filesystem wide)
727 * @count: target number of blocks to allocate
728 * @errp: error code
729 *
730- * ext4_new_blocks uses a goal block to assist allocation. It tries to
731- * allocate block(s) from the block group contains the goal block first. If that
732- * fails, it will try to allocate block(s) from other block groups without
733- * any specific goal block.
734+ * ext4_old_new_blocks uses a goal block to assist allocation and looks up
735+ * the block bitmap directly to do block allocation. It tries to
736+ * allocate block(s) from the block group containing the goal block first. If
737+ * that fails, it will try to allocate block(s) from other block groups
738+ * without any specific goal block.
739+ *
740+ * This function is called when the -o nomballoc mount option is enabled
741 *
742 */
743-ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
744+ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
745 ext4_fsblk_t goal, unsigned long *count, int *errp)
746 {
747 struct buffer_head *bitmap_bh = NULL;
748@@ -1676,13 +1699,26 @@ ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
749 ext4_group_t ngroups;
750 unsigned long num = *count;
751
752- *errp = -ENOSPC;
753 sb = inode->i_sb;
754 if (!sb) {
755+ *errp = -ENODEV;
756 printk("ext4_new_block: nonexistent device");
757 return 0;
758 }
759
760+ sbi = EXT4_SB(sb);
761+ if (!EXT4_I(inode)->i_delalloc_reserved_flag) {
762+ /*
763+ * With delalloc we already reserved the blocks
764+ */
765+ *count = ext4_has_free_blocks(sbi, *count);
766+ }
767+ if (*count == 0) {
768+ *errp = -ENOSPC;
769+ return 0; /* return with ENOSPC error */
770+ }
771+ num = *count;
772+
773 /*
774 * Check quota for allocation of this block.
775 */
776@@ -1706,11 +1742,6 @@ ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
777 if (block_i && ((windowsz = block_i->rsv_window_node.rsv_goal_size) > 0))
778 my_rsv = &block_i->rsv_window_node;
779
780- if (!ext4_has_free_blocks(sbi)) {
781- *errp = -ENOSPC;
782- goto out;
783- }
784-
785 /*
786 * First, test whether the goal block is free.
787 */
788@@ -1734,7 +1765,7 @@ retry_alloc:
789 my_rsv = NULL;
790
791 if (free_blocks > 0) {
792- bitmap_bh = read_block_bitmap(sb, group_no);
793+ bitmap_bh = ext4_read_block_bitmap(sb, group_no);
794 if (!bitmap_bh)
795 goto io_error;
796 grp_alloc_blk = ext4_try_to_allocate_with_rsv(sb, handle,
797@@ -1770,7 +1801,7 @@ retry_alloc:
798 continue;
799
800 brelse(bitmap_bh);
801- bitmap_bh = read_block_bitmap(sb, group_no);
802+ bitmap_bh = ext4_read_block_bitmap(sb, group_no);
803 if (!bitmap_bh)
804 goto io_error;
805 /*
806@@ -1882,7 +1913,15 @@ allocated:
807 le16_add_cpu(&gdp->bg_free_blocks_count, -num);
808 gdp->bg_checksum = ext4_group_desc_csum(sbi, group_no, gdp);
809 spin_unlock(sb_bgl_lock(sbi, group_no));
810- percpu_counter_sub(&sbi->s_freeblocks_counter, num);
811+ if (!EXT4_I(inode)->i_delalloc_reserved_flag)
812+ percpu_counter_sub(&sbi->s_freeblocks_counter, num);
813+
814+ if (sbi->s_log_groups_per_flex) {
815+ ext4_group_t flex_group = ext4_flex_group(sbi, group_no);
816+ spin_lock(sb_bgl_lock(sbi, flex_group));
817+ sbi->s_flex_groups[flex_group].free_blocks -= num;
818+ spin_unlock(sb_bgl_lock(sbi, flex_group));
819+ }
820
821 BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor");
822 err = ext4_journal_dirty_metadata(handle, gdp_bh);
823@@ -1915,46 +1954,104 @@ out:
824 return 0;
825 }
826
827-ext4_fsblk_t ext4_new_block(handle_t *handle, struct inode *inode,
828- ext4_fsblk_t goal, int *errp)
829+#define EXT4_META_BLOCK 0x1
830+
831+static ext4_fsblk_t do_blk_alloc(handle_t *handle, struct inode *inode,
832+ ext4_lblk_t iblock, ext4_fsblk_t goal,
833+ unsigned long *count, int *errp, int flags)
834 {
835 struct ext4_allocation_request ar;
836 ext4_fsblk_t ret;
837
838 if (!test_opt(inode->i_sb, MBALLOC)) {
839- unsigned long count = 1;
840- ret = ext4_new_blocks_old(handle, inode, goal, &count, errp);
841- return ret;
842+ return ext4_old_new_blocks(handle, inode, goal, count, errp);
843 }
844
845 memset(&ar, 0, sizeof(ar));
846+ /* Fill with neighbour allocated blocks */
847+
848 ar.inode = inode;
849 ar.goal = goal;
850- ar.len = 1;
851+ ar.len = *count;
852+ ar.logical = iblock;
853+
854+ if (S_ISREG(inode->i_mode) && !(flags & EXT4_META_BLOCK))
855+ /* enable in-core preallocation for data block allocation */
856+ ar.flags = EXT4_MB_HINT_DATA;
857+ else
858+ /* disable in-core preallocation for non-regular files */
859+ ar.flags = 0;
860+
861 ret = ext4_mb_new_blocks(handle, &ar, errp);
862+ *count = ar.len;
863 return ret;
864 }
865
866-ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
867+/*
868+ * ext4_new_meta_blocks() -- allocate block for meta data (indexing) blocks
869+ *
870+ * @handle: handle to this transaction
871+ * @inode: file inode
872+ * @goal: given target block(filesystem wide)
873+ * @count: total number of blocks needed
874+ * @errp: error code
875+ *
876+ * Return 1st allocated block number on success; *count stores the total
877+ * number of blocks allocated, and errors are returned via the errp pointer
878+ */
879+ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
880 ext4_fsblk_t goal, unsigned long *count, int *errp)
881 {
882- struct ext4_allocation_request ar;
883 ext4_fsblk_t ret;
884-
885- if (!test_opt(inode->i_sb, MBALLOC)) {
886- ret = ext4_new_blocks_old(handle, inode, goal, count, errp);
887- return ret;
888+ ret = do_blk_alloc(handle, inode, 0, goal,
889+ count, errp, EXT4_META_BLOCK);
890+ /*
891+ * Account for the allocated meta blocks
892+ */
893+ if (!(*errp)) {
894+ spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
895+ EXT4_I(inode)->i_allocated_meta_blocks += *count;
896+ spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
897 }
898-
899- memset(&ar, 0, sizeof(ar));
900- ar.inode = inode;
901- ar.goal = goal;
902- ar.len = *count;
903- ret = ext4_mb_new_blocks(handle, &ar, errp);
904- *count = ar.len;
905 return ret;
906 }
907
908+/*
909+ * ext4_new_meta_block() -- allocate block for meta data (indexing) blocks
910+ *
911+ * @handle: handle to this transaction
912+ * @inode: file inode
913+ * @goal: given target block(filesystem wide)
914+ * @errp: error code
915+ *
916+ * Return allocated block number on success
917+ */
918+ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode,
919+ ext4_fsblk_t goal, int *errp)
920+{
921+ unsigned long count = 1;
922+ return ext4_new_meta_blocks(handle, inode, goal, &count, errp);
923+}
924+
925+/*
926+ * ext4_new_blocks() -- allocate data blocks
927+ *
928+ * @handle: handle to this transaction
929+ * @inode: file inode
930+ * @goal: given target block(filesystem wide)
931+ * @count: total number of blocks needed
932+ * @errp: error code
933+ *
934+ * Return 1st allocated block number on success; *count stores the total
935+ * number of blocks allocated, and errors are returned via the errp pointer
936+ */
937+
938+ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
939+ ext4_lblk_t iblock, ext4_fsblk_t goal,
940+ unsigned long *count, int *errp)
941+{
942+ return do_blk_alloc(handle, inode, iblock, goal, count, errp, 0);
943+}
944
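do_blk_alloc() above funnels metadata and data allocation through one worker, with the EXT4_META_BLOCK flag deciding the preallocation hint in a single place and thin wrappers encoding the caller's intent. A userspace sketch of that wrapper-over-flagged-worker shape (hypothetical names, illustrative only):

  #include <stdio.h>

  #define META_BLOCK 0x1                /* stand-in for EXT4_META_BLOCK */

  static long do_alloc(long goal, unsigned long *count, int flags)
  {
          /* a real allocator would consult bitmaps; this just echoes */
          printf("alloc %lu block(s) near %ld (%s)\n", *count, goal,
                 (flags & META_BLOCK) ? "metadata" : "data");
          return goal;
  }

  static long alloc_meta_block(long goal)
  {
          unsigned long count = 1;      /* metadata callers want one block */
          return do_alloc(goal, &count, META_BLOCK);
  }

  static long alloc_data_blocks(long goal, unsigned long *count)
  {
          return do_alloc(goal, count, 0);
  }

  int main(void)
  {
          unsigned long n = 8;
          alloc_meta_block(1000);
          alloc_data_blocks(2000, &n);
          return 0;
  }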
945 /**
946 * ext4_count_free_blocks() -- count filesystem free blocks
947@@ -1986,7 +2083,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
948 continue;
949 desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
950 brelse(bitmap_bh);
951- bitmap_bh = read_block_bitmap(sb, i);
952+ bitmap_bh = ext4_read_block_bitmap(sb, i);
953 if (bitmap_bh == NULL)
954 continue;
955
956diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
957index 2bf0331..ec8e33b 100644
958--- a/fs/ext4/dir.c
959+++ b/fs/ext4/dir.c
960@@ -129,7 +129,8 @@ static int ext4_readdir(struct file * filp,
961 struct buffer_head *bh = NULL;
962
963 map_bh.b_state = 0;
964- err = ext4_get_blocks_wrap(NULL, inode, blk, 1, &map_bh, 0, 0);
965+ err = ext4_get_blocks_wrap(NULL, inode, blk, 1, &map_bh,
966+ 0, 0, 0);
967 if (err > 0) {
968 pgoff_t index = map_bh.b_blocknr >>
969 (PAGE_CACHE_SHIFT - inode->i_blkbits);
970@@ -272,7 +273,7 @@ static void free_rb_tree_fname(struct rb_root *root)
971
972 while (n) {
973 /* Do the node's children first */
974- if ((n)->rb_left) {
975+ if (n->rb_left) {
976 n = n->rb_left;
977 continue;
978 }
979@@ -301,24 +302,18 @@ static void free_rb_tree_fname(struct rb_root *root)
980 parent->rb_right = NULL;
981 n = parent;
982 }
983- root->rb_node = NULL;
984 }
985
986
987-static struct dir_private_info *create_dir_info(loff_t pos)
988+static struct dir_private_info *ext4_htree_create_dir_info(loff_t pos)
989 {
990 struct dir_private_info *p;
991
992- p = kmalloc(sizeof(struct dir_private_info), GFP_KERNEL);
993+ p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL);
994 if (!p)
995 return NULL;
996- p->root.rb_node = NULL;
997- p->curr_node = NULL;
998- p->extra_fname = NULL;
999- p->last_pos = 0;
1000 p->curr_hash = pos2maj_hash(pos);
1001 p->curr_minor_hash = pos2min_hash(pos);
1002- p->next_hash = 0;
1003 return p;
1004 }
1005
1006@@ -416,7 +411,7 @@ static int call_filldir(struct file * filp, void * dirent,
1007 get_dtype(sb, fname->file_type));
1008 if (error) {
1009 filp->f_pos = curr_pos;
1010- info->extra_fname = fname->next;
1011+ info->extra_fname = fname;
1012 return error;
1013 }
1014 fname = fname->next;
1015@@ -433,7 +428,7 @@ static int ext4_dx_readdir(struct file * filp,
1016 int ret;
1017
1018 if (!info) {
1019- info = create_dir_info(filp->f_pos);
1020+ info = ext4_htree_create_dir_info(filp->f_pos);
1021 if (!info)
1022 return -ENOMEM;
1023 filp->private_data = info;
1024@@ -455,11 +450,21 @@ static int ext4_dx_readdir(struct file * filp,
1025 * If there are any leftover names on the hash collision
1026 * chain, return them first.
1027 */
1028- if (info->extra_fname &&
1029- call_filldir(filp, dirent, filldir, info->extra_fname))
1030- goto finished;
1031+ if (info->extra_fname) {
1032+ if (call_filldir(filp, dirent, filldir, info->extra_fname))
1033+ goto finished;
1034
1035- if (!info->curr_node)
1036+ info->extra_fname = NULL;
1037+ info->curr_node = rb_next(info->curr_node);
1038+ if (!info->curr_node) {
1039+ if (info->next_hash == ~0) {
1040+ filp->f_pos = EXT4_HTREE_EOF;
1041+ goto finished;
1042+ }
1043+ info->curr_hash = info->next_hash;
1044+ info->curr_minor_hash = 0;
1045+ }
1046+ } else if (!info->curr_node)
1047 info->curr_node = rb_first(&info->root);
1048
1049 while (1) {
1050diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
1051index 8158083..2950032 100644
1052--- a/fs/ext4/ext4.h
1053+++ b/fs/ext4/ext4.h
1054@@ -22,7 +22,7 @@
1055 #include "ext4_i.h"
1056
1057 /*
1058- * The second extended filesystem constants/structures
1059+ * The fourth extended filesystem constants/structures
1060 */
1061
1062 /*
1063@@ -45,7 +45,7 @@
1064 #define ext4_debug(f, a...) \
1065 do { \
1066 printk (KERN_DEBUG "EXT4-fs DEBUG (%s, %d): %s:", \
1067- __FILE__, __LINE__, __FUNCTION__); \
1068+ __FILE__, __LINE__, __func__); \
1069 printk (KERN_DEBUG f, ## a); \
1070 } while (0)
1071 #else
1072@@ -74,6 +74,9 @@
1073 #define EXT4_MB_HINT_GOAL_ONLY 256
1074 /* goal is meaningful */
1075 #define EXT4_MB_HINT_TRY_GOAL 512
1076+/* blocks already pre-reserved by delayed allocation */
1077+#define EXT4_MB_DELALLOC_RESERVED 1024
1078+
1079
1080 struct ext4_allocation_request {
1081 /* target inode for block we're allocating */
1082@@ -170,6 +173,15 @@ struct ext4_group_desc
1083 __u32 bg_reserved2[3];
1084 };
1085
1086+/*
1087+ * Structure of a flex block group info
1088+ */
1089+
1090+struct flex_groups {
1091+ __u32 free_inodes;
1092+ __u32 free_blocks;
1093+};
1094+
1095 #define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */
1096 #define EXT4_BG_BLOCK_UNINIT 0x0002 /* Block bitmap not in use */
1097 #define EXT4_BG_INODE_ZEROED 0x0004 /* On-disk itable initialized to zero */
1098@@ -527,6 +539,7 @@ do { \
1099 #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */
1100 #define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */
1101 #define EXT4_MOUNT_MBALLOC 0x4000000 /* Buddy allocation support */
1102+#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */
1103 /* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */
1104 #ifndef _LINUX_EXT2_FS_H
1105 #define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt
1106@@ -647,7 +660,10 @@ struct ext4_super_block {
1107 __le16 s_mmp_interval; /* # seconds to wait in MMP checking */
1108 __le64 s_mmp_block; /* Block for multi-mount protection */
1109 __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/
1110- __u32 s_reserved[163]; /* Padding to the end of the block */
1111+ __u8 s_log_groups_per_flex; /* FLEX_BG group size */
1112+ __u8 s_reserved_char_pad2;
1113+ __le16 s_reserved_pad;
1114+ __u32 s_reserved[162]; /* Padding to the end of the block */
1115 };
1116
1117 #ifdef __KERNEL__
1118@@ -958,12 +974,17 @@ extern ext4_grpblk_t ext4_block_group_offset(struct super_block *sb,
1119 extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group);
1120 extern unsigned long ext4_bg_num_gdb(struct super_block *sb,
1121 ext4_group_t group);
1122-extern ext4_fsblk_t ext4_new_block (handle_t *handle, struct inode *inode,
1123+extern ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode,
1124 ext4_fsblk_t goal, int *errp);
1125-extern ext4_fsblk_t ext4_new_blocks (handle_t *handle, struct inode *inode,
1126+extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
1127 ext4_fsblk_t goal, unsigned long *count, int *errp);
1128-extern ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
1129+extern ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
1130+ ext4_lblk_t iblock, ext4_fsblk_t goal,
1131+ unsigned long *count, int *errp);
1132+extern ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
1133 ext4_fsblk_t goal, unsigned long *count, int *errp);
1134+extern ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
1135+ ext4_fsblk_t nblocks);
1136 extern void ext4_free_blocks (handle_t *handle, struct inode *inode,
1137 ext4_fsblk_t block, unsigned long count, int metadata);
1138 extern void ext4_free_blocks_sb (handle_t *handle, struct super_block *sb,
1139@@ -1016,6 +1037,10 @@ extern int __init init_ext4_mballoc(void);
1140 extern void exit_ext4_mballoc(void);
1141 extern void ext4_mb_free_blocks(handle_t *, struct inode *,
1142 unsigned long, unsigned long, int, unsigned long *);
1143+extern int ext4_mb_add_more_groupinfo(struct super_block *sb,
1144+ ext4_group_t i, struct ext4_group_desc *desc);
1145+extern void ext4_mb_update_group_info(struct ext4_group_info *grp,
1146+ ext4_grpblk_t add);
1147
1148
1149 /* inode.c */
1150@@ -1033,19 +1058,25 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
1151 extern struct inode *ext4_iget(struct super_block *, unsigned long);
1152 extern int ext4_write_inode (struct inode *, int);
1153 extern int ext4_setattr (struct dentry *, struct iattr *);
1154+extern int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
1155+ struct kstat *stat);
1156 extern void ext4_delete_inode (struct inode *);
1157 extern int ext4_sync_inode (handle_t *, struct inode *);
1158 extern void ext4_discard_reservation (struct inode *);
1159 extern void ext4_dirty_inode(struct inode *);
1160 extern int ext4_change_inode_journal_flag(struct inode *, int);
1161 extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
1162+extern int ext4_can_truncate(struct inode *inode);
1163 extern void ext4_truncate (struct inode *);
1164 extern void ext4_set_inode_flags(struct inode *);
1165 extern void ext4_get_inode_flags(struct ext4_inode_info *);
1166 extern void ext4_set_aops(struct inode *inode);
1167 extern int ext4_writepage_trans_blocks(struct inode *);
1168-extern int ext4_block_truncate_page(handle_t *handle, struct page *page,
1169+extern int ext4_meta_trans_blocks(struct inode *, int nrblocks, int idxblocks);
1170+extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
1171+extern int ext4_block_truncate_page(handle_t *handle,
1172 struct address_space *mapping, loff_t from);
1173+extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page);
1174
1175 /* ioctl.c */
1176 extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
1177@@ -1159,10 +1190,21 @@ struct ext4_group_info *ext4_get_group_info(struct super_block *sb,
1178 }
1179
1180
1181+static inline ext4_group_t ext4_flex_group(struct ext4_sb_info *sbi,
1182+ ext4_group_t block_group)
1183+{
1184+ return block_group >> sbi->s_log_groups_per_flex;
1185+}
1186+
1187+static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi)
1188+{
1189+ return 1 << sbi->s_log_groups_per_flex;
1190+}
1191+
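Worked example for the two helpers above: with s_log_groups_per_flex = 4, ext4_flex_bg_size() is 1 << 4 = 16 block groups per flex group, and block group 37 belongs to flex group 37 >> 4 = 2.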
1192 #define ext4_std_error(sb, errno) \
1193 do { \
1194 if ((errno)) \
1195- __ext4_std_error((sb), __FUNCTION__, (errno)); \
1196+ __ext4_std_error((sb), __func__, (errno)); \
1197 } while (0)
1198
1199 /*
1200@@ -1187,11 +1229,13 @@ extern const struct inode_operations ext4_fast_symlink_inode_operations;
1201 /* extents.c */
1202 extern int ext4_ext_tree_init(handle_t *handle, struct inode *);
1203 extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
1204+extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks,
1205+ int chunk);
1206 extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
1207 ext4_lblk_t iblock,
1208 unsigned long max_blocks, struct buffer_head *bh_result,
1209 int create, int extend_disksize);
1210-extern void ext4_ext_truncate(struct inode *, struct page *);
1211+extern void ext4_ext_truncate(struct inode *);
1212 extern void ext4_ext_init(struct super_block *);
1213 extern void ext4_ext_release(struct super_block *);
1214 extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
1215@@ -1199,7 +1243,7 @@ extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
1216 extern int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode,
1217 sector_t block, unsigned long max_blocks,
1218 struct buffer_head *bh, int create,
1219- int extend_disksize);
1220+ int extend_disksize, int flag);
1221 #endif /* __KERNEL__ */
1222
1223 #endif /* _EXT4_H */
1224diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
1225index 75333b5..d33dc56 100644
1226--- a/fs/ext4/ext4_extents.h
1227+++ b/fs/ext4/ext4_extents.h
1228@@ -212,10 +212,13 @@ static inline int ext4_ext_get_actual_len(struct ext4_extent *ext)
1229 (le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN));
1230 }
1231
1232+extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks);
1233 extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *);
1234 extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t);
1235 extern int ext4_extent_tree_init(handle_t *, struct inode *);
1236-extern int ext4_ext_calc_credits_for_insert(struct inode *, struct ext4_ext_path *);
1237+extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode,
1238+ int num,
1239+ struct ext4_ext_path *path);
1240 extern int ext4_ext_try_to_merge(struct inode *inode,
1241 struct ext4_ext_path *path,
1242 struct ext4_extent *);
1243diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h
1244index 26a4ae2..ef7409f 100644
1245--- a/fs/ext4/ext4_i.h
1246+++ b/fs/ext4/ext4_i.h
1247@@ -79,7 +79,7 @@ struct ext4_ext_cache {
1248 };
1249
1250 /*
1251- * third extended file system inode data in memory
1252+ * fourth extended file system inode data in memory
1253 */
1254 struct ext4_inode_info {
1255 __le32 i_data[15]; /* unconverted */
1256@@ -150,6 +150,7 @@ struct ext4_inode_info {
1257 */
1258 struct rw_semaphore i_data_sem;
1259 struct inode vfs_inode;
1260+ struct jbd2_inode jinode;
1261
1262 unsigned long i_ext_generation;
1263 struct ext4_ext_cache i_cached_extent;
1264@@ -162,6 +163,13 @@ struct ext4_inode_info {
1265 /* mballoc */
1266 struct list_head i_prealloc_list;
1267 spinlock_t i_prealloc_lock;
1268+
1269+ /* allocation reservation info for delalloc */
1270+ unsigned long i_reserved_data_blocks;
1271+ unsigned long i_reserved_meta_blocks;
1272+ unsigned long i_allocated_meta_blocks;
1273+ unsigned short i_delalloc_reserved_flag;
1274+ spinlock_t i_block_reservation_lock;
1275 };
1276
1277 #endif /* _EXT4_I */
1278diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
1279index 9255a7d..b455c68 100644
1280--- a/fs/ext4/ext4_jbd2.h
1281+++ b/fs/ext4/ext4_jbd2.h
1282@@ -51,6 +51,14 @@
1283 EXT4_XATTR_TRANS_BLOCKS - 2 + \
1284 2*EXT4_QUOTA_TRANS_BLOCKS(sb))
1285
1286+/*
1287+ * Define the number of metadata blocks we need to account for when modifying data.
1288+ *
1289+ * This includes the super block, inode block, quota blocks and xattr blocks.
1290+ */
1291+#define EXT4_META_TRANS_BLOCKS(sb) (EXT4_XATTR_TRANS_BLOCKS + \
1292+ 2*EXT4_QUOTA_TRANS_BLOCKS(sb))
1293+
1294 /* Delete operations potentially hit one directory's namespace plus an
1295 * entire inode, plus arbitrary amounts of bitmap/indirection data. Be
1296 * generous. We can grow the delete transaction later if necessary. */
1297@@ -142,19 +150,17 @@ int __ext4_journal_dirty_metadata(const char *where,
1298 handle_t *handle, struct buffer_head *bh);
1299
1300 #define ext4_journal_get_undo_access(handle, bh) \
1301- __ext4_journal_get_undo_access(__FUNCTION__, (handle), (bh))
1302+ __ext4_journal_get_undo_access(__func__, (handle), (bh))
1303 #define ext4_journal_get_write_access(handle, bh) \
1304- __ext4_journal_get_write_access(__FUNCTION__, (handle), (bh))
1305+ __ext4_journal_get_write_access(__func__, (handle), (bh))
1306 #define ext4_journal_revoke(handle, blocknr, bh) \
1307- __ext4_journal_revoke(__FUNCTION__, (handle), (blocknr), (bh))
1308+ __ext4_journal_revoke(__func__, (handle), (blocknr), (bh))
1309 #define ext4_journal_get_create_access(handle, bh) \
1310- __ext4_journal_get_create_access(__FUNCTION__, (handle), (bh))
1311+ __ext4_journal_get_create_access(__func__, (handle), (bh))
1312 #define ext4_journal_dirty_metadata(handle, bh) \
1313- __ext4_journal_dirty_metadata(__FUNCTION__, (handle), (bh))
1314+ __ext4_journal_dirty_metadata(__func__, (handle), (bh))
1315 #define ext4_journal_forget(handle, bh) \
1316- __ext4_journal_forget(__FUNCTION__, (handle), (bh))
1317-
1318-int ext4_journal_dirty_data(handle_t *handle, struct buffer_head *bh);
1319+ __ext4_journal_forget(__func__, (handle), (bh))
1320
1321 handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks);
1322 int __ext4_journal_stop(const char *where, handle_t *handle);
1323@@ -165,7 +171,7 @@ static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks)
1324 }
1325
1326 #define ext4_journal_stop(handle) \
1327- __ext4_journal_stop(__FUNCTION__, (handle))
1328+ __ext4_journal_stop(__func__, (handle))
1329
1330 static inline handle_t *ext4_journal_current_handle(void)
1331 {
1332@@ -192,6 +198,11 @@ static inline int ext4_journal_force_commit(journal_t *journal)
1333 return jbd2_journal_force_commit(journal);
1334 }
1335
1336+static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode)
1337+{
1338+ return jbd2_journal_file_inode(handle, &EXT4_I(inode)->jinode);
1339+}
1340+
1341 /* super.c */
1342 int ext4_force_commit(struct super_block *sb);
1343
1344diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
1345index 5802e69..6300226 100644
1346--- a/fs/ext4/ext4_sb.h
1347+++ b/fs/ext4/ext4_sb.h
1348@@ -25,7 +25,7 @@
1349 #include <linux/rbtree.h>
1350
1351 /*
1352- * third extended-fs super-block data in memory
1353+ * fourth extended-fs super-block data in memory
1354 */
1355 struct ext4_sb_info {
1356 unsigned long s_desc_size; /* Size of a group descriptor in bytes */
1357@@ -143,6 +143,9 @@ struct ext4_sb_info {
1358
1359 /* locality groups */
1360 struct ext4_locality_group *s_locality_groups;
1361+
1362+ unsigned int s_log_groups_per_flex;
1363+ struct flex_groups *s_flex_groups;
1364 };
1365
1366 #endif /* _EXT4_SB */
1367diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
1368index 47929c4..b24d3c5 100644
1369--- a/fs/ext4/extents.c
1370+++ b/fs/ext4/extents.c
1371@@ -92,17 +92,16 @@ static void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb)
1372 ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff);
1373 }
1374
1375-static handle_t *ext4_ext_journal_restart(handle_t *handle, int needed)
1376+static int ext4_ext_journal_restart(handle_t *handle, int needed)
1377 {
1378 int err;
1379
1380 if (handle->h_buffer_credits > needed)
1381- return handle;
1382- if (!ext4_journal_extend(handle, needed))
1383- return handle;
1384- err = ext4_journal_restart(handle, needed);
1385-
1386- return handle;
1387+ return 0;
1388+ err = ext4_journal_extend(handle, needed);
1389+ if (err <= 0)
1390+ return err;
1391+ return ext4_journal_restart(handle, needed);
1392 }
1393
1394 /*
1395@@ -180,15 +179,18 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
1396 return bg_start + colour + block;
1397 }
1398
1399+/*
1400+ * Allocation for a meta data block
1401+ */
1402 static ext4_fsblk_t
1403-ext4_ext_new_block(handle_t *handle, struct inode *inode,
1404+ext4_ext_new_meta_block(handle_t *handle, struct inode *inode,
1405 struct ext4_ext_path *path,
1406 struct ext4_extent *ex, int *err)
1407 {
1408 ext4_fsblk_t goal, newblock;
1409
1410 goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block));
1411- newblock = ext4_new_block(handle, inode, goal, err);
1412+ newblock = ext4_new_meta_block(handle, inode, goal, err);
1413 return newblock;
1414 }
1415
1416@@ -246,6 +248,36 @@ static int ext4_ext_space_root_idx(struct inode *inode)
1417 return size;
1418 }
1419
1420+/*
1421+ * Calculate the number of metadata blocks needed
1422+ * to allocate @blocks
1423+ * Worst case is one block per extent
1424+ */
1425+int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks)
1426+{
1427+ int lcap, icap, rcap, leafs, idxs, num;
1428+ int newextents = blocks;
1429+
1430+ rcap = ext4_ext_space_root_idx(inode);
1431+ lcap = ext4_ext_space_block(inode);
1432+ icap = ext4_ext_space_block_idx(inode);
1433+
1434+ /* number of new leaf blocks needed */
1435+ num = leafs = (newextents + lcap - 1) / lcap;
1436+
1437+ /*
1438+ * Worst case, we need separate index block(s)
1439+ * to link all new leaf blocks
1440+ */
1441+ idxs = (leafs + icap - 1) / icap;
1442+ do {
1443+ num += idxs;
1444+ idxs = (idxs + icap - 1) / icap;
1445+ } while (idxs > rcap);
1446+
1447+ return num;
1448+}
1449+
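Worked example for ext4_ext_calc_metadata_amount() above (entry counts are approximate, assuming 4 KiB blocks and 12-byte extent entries): a leaf or index block holds about (4096 - 12) / 12 = 340 entries, so allocating 1000 blocks needs at worst leafs = ceil(1000/340) = 3 leaf blocks plus ceil(3/340) = 1 index block, i.e. num = 4 metadata blocks.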
1450 static int
1451 ext4_ext_max_entries(struct inode *inode, int depth)
1452 {
1453@@ -524,6 +556,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
1454 alloc = 1;
1455 }
1456 path[0].p_hdr = eh;
1457+ path[0].p_bh = NULL;
1458
1459 i = depth;
1460 /* walk through the tree */
1461@@ -552,12 +585,14 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
1462 }
1463
1464 path[ppos].p_depth = i;
1465- path[ppos].p_hdr = eh;
1466 path[ppos].p_ext = NULL;
1467 path[ppos].p_idx = NULL;
1468
1469 /* find extent */
1470 ext4_ext_binsearch(inode, path + ppos, block);
1471+ /* if not an empty leaf */
1472+ if (path[ppos].p_ext)
1473+ path[ppos].p_block = ext_pblock(path[ppos].p_ext);
1474
1475 ext4_ext_show_path(inode, path);
1476
1477@@ -688,7 +723,8 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
1478 /* allocate all needed blocks */
1479 ext_debug("allocate %d blocks for indexes/leaf\n", depth - at);
1480 for (a = 0; a < depth - at; a++) {
1481- newblock = ext4_ext_new_block(handle, inode, path, newext, &err);
1482+ newblock = ext4_ext_new_meta_block(handle, inode, path,
1483+ newext, &err);
1484 if (newblock == 0)
1485 goto cleanup;
1486 ablocks[a] = newblock;
1487@@ -884,7 +920,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
1488 ext4_fsblk_t newblock;
1489 int err = 0;
1490
1491- newblock = ext4_ext_new_block(handle, inode, path, newext, &err);
1492+ newblock = ext4_ext_new_meta_block(handle, inode, path, newext, &err);
1493 if (newblock == 0)
1494 return err;
1495
1496@@ -981,6 +1017,8 @@ repeat:
1497 /* if we found index with free entry, then use that
1498 * entry: create all needed subtree and add new leaf */
1499 err = ext4_ext_split(handle, inode, path, newext, i);
1500+ if (err)
1501+ goto out;
1502
1503 /* refill path */
1504 ext4_ext_drop_refs(path);
1505@@ -1403,7 +1441,7 @@ unsigned int ext4_ext_check_overlap(struct inode *inode,
1506
1507 /*
1508 * get the next allocated block if the extent in the path
1509- * is before the requested block(s)
1510+ * is before the requested block(s)
1511 */
1512 if (b2 < b1) {
1513 b2 = ext4_ext_next_allocated_block(path);
1514@@ -1709,54 +1747,61 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
1515 }
1516
1517 /*
1518- * ext4_ext_calc_credits_for_insert:
1519- * This routine returns max. credits that the extent tree can consume.
1520- * It should be OK for low-performance paths like ->writepage()
1521- * To allow many writing processes to fit into a single transaction,
1522- * the caller should calculate credits under i_data_sem and
1523- * pass the actual path.
1524+ * ext4_ext_calc_credits_for_single_extent:
1525+ * This routine returns the max. credits needed to insert an extent
1526+ * into the extent tree.
1527+ * When passing the actual path, the caller should calculate credits
1528+ * under i_data_sem.
1529 */
1530-int ext4_ext_calc_credits_for_insert(struct inode *inode,
1531+int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
1532 struct ext4_ext_path *path)
1533 {
1534- int depth, needed;
1535-
1536 if (path) {
1537+ int depth = ext_depth(inode);
1538+ int ret = 0;
1539+
1540 /* probably there is space in leaf? */
1541- depth = ext_depth(inode);
1542 if (le16_to_cpu(path[depth].p_hdr->eh_entries)
1543- < le16_to_cpu(path[depth].p_hdr->eh_max))
1544- return 1;
1545- }
1546+ < le16_to_cpu(path[depth].p_hdr->eh_max)) {
1547
1548- /*
1549- * given 32-bit logical block (4294967296 blocks), max. tree
1550- * can be 4 levels in depth -- 4 * 340^4 == 53453440000.
1551- * Let's also add one more level for imbalance.
1552- */
1553- depth = 5;
1554-
1555- /* allocation of new data block(s) */
1556- needed = 2;
1557+ /*
1558+ * There is some space in the leaf tree, no
1559+ * need to account for leaf block credit
1560+ *
1561+ * bitmaps and block group descriptor blocks
1562+ * and other metadata blocks still need to be
1563+ * accounted.
1564+ */
1565+ /* 1 bitmap, 1 block group descriptor */
1566+ ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb);
1567+ }
1568+ }
1569
1570- /*
1571- * tree can be full, so it would need to grow in depth:
1572- * we need one credit to modify old root, credits for
1573- * new root will be added in split accounting
1574- */
1575- needed += 1;
1576+ return ext4_chunk_trans_blocks(inode, nrblocks);
1577+}
1578
1579- /*
1580- * Index split can happen, we would need:
1581- * allocate intermediate indexes (bitmap + group)
1582- * + change two blocks at each level, but root (already included)
1583- */
1584- needed += (depth * 2) + (depth * 2);
1585+/*
1586+ * How many index/leaf blocks need to be changed/allocated to modify nrblocks?
1587+ *
1588+ * If nrblocks fit in a single extent (chunk flag is 1), then in the
1589+ * worst case each tree level index/leaf needs to be changed; if the
1590+ * tree splits due to inserting a new extent, then the old tree
1591+ * index/leaf blocks need to be updated too.
1592+ * If the nrblocks are discontiguous, they could cause
1593+ * the whole tree to split more than once, but this is really rare.
1594+ * the whole tree split more than once, but this is really rare.
1595+ */
1596+int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
1597+{
1598+ int index;
1599+ int depth = ext_depth(inode);
1600
1601- /* any allocation modifies superblock */
1602- needed += 1;
1603+ if (chunk)
1604+ index = depth * 2;
1605+ else
1606+ index = depth * 3;
1607
1608- return needed;
1609+ return index;
1610 }
1611
1612 static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
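A minimal user-space sketch of the credit arithmetic introduced above (illustrative only; the depth and example values are assumptions, not taken from a real filesystem):

    #include <stdio.h>

    /* mirrors ext4_ext_index_trans_blocks(): worst-case number of
     * index/leaf blocks touched when modifying nrblocks */
    static int index_trans_blocks(int depth, int chunk)
    {
            /* one contiguous chunk: each level may change twice
             * (the new path plus the old index/leaf on a split) */
            if (chunk)
                    return depth * 2;
            /* discontiguous blocks may split the tree more than
             * once, so allow one more change per level */
            return depth * 3;
    }

    int main(void)
    {
            printf("depth 4, one chunk:  %d\n", index_trans_blocks(4, 1)); /* 8 */
            printf("depth 4, scattered: %d\n", index_trans_blocks(4, 0)); /* 12 */
            return 0;
    }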
1613@@ -1872,22 +1917,22 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
1614 BUG_ON(b != ex_ee_block + ex_ee_len - 1);
1615 }
1616
1617- /* at present, extent can't cross block group: */
1618- /* leaf + bitmap + group desc + sb + inode */
1619- credits = 5;
1620+ /*
1621+ * 3 for leaf, sb, and inode plus 2 (bmap and group
1622+ * descriptor) for each block group; assume two block
1623+ * groups plus ex_ee_len/blocks_per_block_group for
1624+ * the worst case
1625+ */
1626+ credits = 7 + 2*(ex_ee_len/EXT4_BLOCKS_PER_GROUP(inode->i_sb));
1627 if (ex == EXT_FIRST_EXTENT(eh)) {
1628 correct_index = 1;
1629 credits += (ext_depth(inode)) + 1;
1630 }
1631-#ifdef CONFIG_QUOTA
1632 credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
1633-#endif
1634
1635- handle = ext4_ext_journal_restart(handle, credits);
1636- if (IS_ERR(handle)) {
1637- err = PTR_ERR(handle);
1638+ err = ext4_ext_journal_restart(handle, credits);
1639+ if (err)
1640 goto out;
1641- }
1642
1643 err = ext4_ext_get_access(handle, inode, path + depth);
1644 if (err)
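To make the new estimate above concrete (an illustrative calculation, assuming the default 4 KiB block size, i.e. 32768 blocks per group): removing a 65536-block extent reserves credits = 7 + 2 * (65536 / 32768) = 11, before the index-correction and quota credits added just below.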
1645@@ -2287,7 +2332,10 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
1646 unsigned int newdepth;
1647 /* If extent has less than EXT4_EXT_ZERO_LEN zerout directly */
1648 if (allocated <= EXT4_EXT_ZERO_LEN) {
1649- /* Mark first half uninitialized.
1650+ /*
1651+	 * iblock == ee_block is handled by the zeroout
1652+ * at the beginning.
1653+ * Mark first half uninitialized.
1654 * Mark second half initialized and zero out the
1655 * initialized extent
1656 */
1657@@ -2310,7 +2358,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
1658 ex->ee_len = orig_ex.ee_len;
1659 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
1660 ext4_ext_dirty(handle, inode, path + depth);
1661- /* zeroed the full extent */
1662+ /* blocks available from iblock */
1663 return allocated;
1664
1665 } else if (err)
1666@@ -2338,6 +2386,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
1667 err = PTR_ERR(path);
1668 return err;
1669 }
1670+ /* get the second half extent details */
1671 ex = path[depth].p_ext;
1672 err = ext4_ext_get_access(handle, inode,
1673 path + depth);
1674@@ -2367,6 +2416,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
1675 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
1676 ext4_ext_dirty(handle, inode, path + depth);
1677 /* zeroed the full extent */
1678+ /* blocks available from iblock */
1679 return allocated;
1680
1681 } else if (err)
1682@@ -2382,23 +2432,22 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
1683 */
1684 orig_ex.ee_len = cpu_to_le16(ee_len -
1685 ext4_ext_get_actual_len(ex3));
1686- if (newdepth != depth) {
1687- depth = newdepth;
1688- ext4_ext_drop_refs(path);
1689- path = ext4_ext_find_extent(inode, iblock, path);
1690- if (IS_ERR(path)) {
1691- err = PTR_ERR(path);
1692- goto out;
1693- }
1694- eh = path[depth].p_hdr;
1695- ex = path[depth].p_ext;
1696- if (ex2 != &newex)
1697- ex2 = ex;
1698-
1699- err = ext4_ext_get_access(handle, inode, path + depth);
1700- if (err)
1701- goto out;
1702+ depth = newdepth;
1703+ ext4_ext_drop_refs(path);
1704+ path = ext4_ext_find_extent(inode, iblock, path);
1705+ if (IS_ERR(path)) {
1706+ err = PTR_ERR(path);
1707+ goto out;
1708 }
1709+ eh = path[depth].p_hdr;
1710+ ex = path[depth].p_ext;
1711+ if (ex2 != &newex)
1712+ ex2 = ex;
1713+
1714+ err = ext4_ext_get_access(handle, inode, path + depth);
1715+ if (err)
1716+ goto out;
1717+
1718 allocated = max_blocks;
1719
1720 /* If extent has less than EXT4_EXT_ZERO_LEN and we are trying
1721@@ -2416,6 +2465,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
1722 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
1723 ext4_ext_dirty(handle, inode, path + depth);
1724 /* zero out the first half */
1725+ /* blocks available from iblock */
1726 return allocated;
1727 }
1728 }
1729@@ -2529,6 +2579,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
1730 int err = 0, depth, ret;
1731 unsigned long allocated = 0;
1732 struct ext4_allocation_request ar;
1733+ loff_t disksize;
1734
1735 __clear_bit(BH_New, &bh_result->b_state);
1736 ext_debug("blocks %u/%lu requested for inode %u\n",
1737@@ -2616,8 +2667,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
1738 */
1739 if (allocated > max_blocks)
1740 allocated = max_blocks;
1741- /* mark the buffer unwritten */
1742- __set_bit(BH_Unwritten, &bh_result->b_state);
1743+ set_buffer_unwritten(bh_result);
1744 goto out2;
1745 }
1746
1747@@ -2716,14 +2766,19 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
1748 goto out2;
1749 }
1750
1751- if (extend_disksize && inode->i_size > EXT4_I(inode)->i_disksize)
1752- EXT4_I(inode)->i_disksize = inode->i_size;
1753-
1754 /* previous routine could use block we allocated */
1755 newblock = ext_pblock(&newex);
1756 allocated = ext4_ext_get_actual_len(&newex);
1757 outnew:
1758- __set_bit(BH_New, &bh_result->b_state);
1759+ if (extend_disksize) {
1760+ disksize = ((loff_t) iblock + ar.len) << inode->i_blkbits;
1761+ if (disksize > i_size_read(inode))
1762+ disksize = i_size_read(inode);
1763+ if (disksize > EXT4_I(inode)->i_disksize)
1764+ EXT4_I(inode)->i_disksize = disksize;
1765+ }
1766+
1767+ set_buffer_new(bh_result);
1768
1769 /* Cache only when it is _not_ an uninitialized extent */
1770 if (create != EXT4_CREATE_UNINITIALIZED_EXT)
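As a worked example of the i_disksize update above (illustrative numbers): with 4 KiB blocks (i_blkbits = 12), an allocation of ar.len = 6 blocks starting at iblock = 10 gives disksize = (10 + 6) << 12 = 65536 bytes; the two comparisons then clamp the result so that i_disksize never exceeds i_size and never moves backwards.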
1771@@ -2733,7 +2788,7 @@ out:
1772 if (allocated > max_blocks)
1773 allocated = max_blocks;
1774 ext4_ext_show_leaf(inode, path);
1775- __set_bit(BH_Mapped, &bh_result->b_state);
1776+ set_buffer_mapped(bh_result);
1777 bh_result->b_bdev = inode->i_sb->s_bdev;
1778 bh_result->b_blocknr = newblock;
1779 out2:
1780@@ -2744,7 +2799,7 @@ out2:
1781 return err ? err : allocated;
1782 }
1783
1784-void ext4_ext_truncate(struct inode * inode, struct page *page)
1785+void ext4_ext_truncate(struct inode *inode)
1786 {
1787 struct address_space *mapping = inode->i_mapping;
1788 struct super_block *sb = inode->i_sb;
1789@@ -2755,33 +2810,27 @@ void ext4_ext_truncate(struct inode * inode, struct page *page)
1790 /*
1791 * probably first extent we're gonna free will be last in block
1792 */
1793- err = ext4_writepage_trans_blocks(inode) + 3;
1794+ err = ext4_writepage_trans_blocks(inode);
1795 handle = ext4_journal_start(inode, err);
1796- if (IS_ERR(handle)) {
1797- if (page) {
1798- clear_highpage(page);
1799- flush_dcache_page(page);
1800- unlock_page(page);
1801- page_cache_release(page);
1802- }
1803+ if (IS_ERR(handle))
1804 return;
1805- }
1806
1807- if (page)
1808- ext4_block_truncate_page(handle, page, mapping, inode->i_size);
1809+ if (inode->i_size & (sb->s_blocksize - 1))
1810+ ext4_block_truncate_page(handle, mapping, inode->i_size);
1811+
1812+ if (ext4_orphan_add(handle, inode))
1813+ goto out_stop;
1814
1815 down_write(&EXT4_I(inode)->i_data_sem);
1816 ext4_ext_invalidate_cache(inode);
1817
1818- ext4_mb_discard_inode_preallocations(inode);
1819+ ext4_discard_reservation(inode);
1820
1821 /*
1822 * TODO: optimization is possible here.
1823 * Probably we need not scan at all,
1824 * because page truncation is enough.
1825 */
1826- if (ext4_orphan_add(handle, inode))
1827- goto out_stop;
1828
1829 /* we have to know where to truncate from in crash case */
1830 EXT4_I(inode)->i_disksize = inode->i_size;
1831@@ -2798,6 +2847,7 @@ void ext4_ext_truncate(struct inode * inode, struct page *page)
1832 handle->h_sync = 1;
1833
1834 out_stop:
1835+ up_write(&EXT4_I(inode)->i_data_sem);
1836 /*
1837 * If this was a simple ftruncate() and the file will remain alive,
1838 * then we need to clear up the orphan record which we created above.
1839@@ -2808,33 +2858,11 @@ out_stop:
1840 if (inode->i_nlink)
1841 ext4_orphan_del(handle, inode);
1842
1843- up_write(&EXT4_I(inode)->i_data_sem);
1844 inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
1845 ext4_mark_inode_dirty(handle, inode);
1846 ext4_journal_stop(handle);
1847 }
1848
1849-/*
1850- * ext4_ext_writepage_trans_blocks:
1851- * calculate max number of blocks we could modify
1852- * in order to allocate new block for an inode
1853- */
1854-int ext4_ext_writepage_trans_blocks(struct inode *inode, int num)
1855-{
1856- int needed;
1857-
1858- needed = ext4_ext_calc_credits_for_insert(inode, NULL);
1859-
1860- /* caller wants to allocate num blocks, but note it includes sb */
1861- needed = needed * num - (num - 1);
1862-
1863-#ifdef CONFIG_QUOTA
1864- needed += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
1865-#endif
1866-
1867- return needed;
1868-}
1869-
1870 static void ext4_falloc_update_inode(struct inode *inode,
1871 int mode, loff_t new_size, int update_ctime)
1872 {
1873@@ -2895,10 +2923,9 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
1874 max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
1875 - block;
1876 /*
1877- * credits to insert 1 extent into extent tree + buffers to be able to
1878- * modify 1 super block, 1 block bitmap and 1 group descriptor.
1879+ * credits to insert 1 extent into extent tree
1880 */
1881- credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + 3;
1882+ credits = ext4_chunk_trans_blocks(inode, max_blocks);
1883 mutex_lock(&inode->i_mutex);
1884 retry:
1885 while (ret >= 0 && ret < max_blocks) {
1886@@ -2911,7 +2938,7 @@ retry:
1887 }
1888 ret = ext4_get_blocks_wrap(handle, inode, block,
1889 max_blocks, &map_bh,
1890- EXT4_CREATE_UNINITIALIZED_EXT, 0);
1891+ EXT4_CREATE_UNINITIALIZED_EXT, 0, 0);
1892 if (ret <= 0) {
1893 #ifdef EXT4FS_DEBUG
1894 WARN_ON(ret <= 0);
1895diff --git a/fs/ext4/file.c b/fs/ext4/file.c
1896index 4159be6..430eb79 100644
1897--- a/fs/ext4/file.c
1898+++ b/fs/ext4/file.c
1899@@ -123,6 +123,23 @@ force_commit:
1900 return ret;
1901 }
1902
1903+static struct vm_operations_struct ext4_file_vm_ops = {
1904+ .fault = filemap_fault,
1905+ .page_mkwrite = ext4_page_mkwrite,
1906+};
1907+
1908+static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
1909+{
1910+ struct address_space *mapping = file->f_mapping;
1911+
1912+ if (!mapping->a_ops->readpage)
1913+ return -ENOEXEC;
1914+ file_accessed(file);
1915+ vma->vm_ops = &ext4_file_vm_ops;
1916+ vma->vm_flags |= VM_CAN_NONLINEAR;
1917+ return 0;
1918+}
1919+
1920 const struct file_operations ext4_file_operations = {
1921 .llseek = generic_file_llseek,
1922 .read = do_sync_read,
1923@@ -133,7 +150,7 @@ const struct file_operations ext4_file_operations = {
1924 #ifdef CONFIG_COMPAT
1925 .compat_ioctl = ext4_compat_ioctl,
1926 #endif
1927- .mmap = generic_file_mmap,
1928+ .mmap = ext4_file_mmap,
1929 .open = generic_file_open,
1930 .release = ext4_release_file,
1931 .fsync = ext4_sync_file,
1932@@ -144,6 +161,7 @@ const struct file_operations ext4_file_operations = {
1933 const struct inode_operations ext4_file_inode_operations = {
1934 .truncate = ext4_truncate,
1935 .setattr = ext4_setattr,
1936+ .getattr = ext4_getattr,
1937 #ifdef CONFIG_EXT4DEV_FS_XATTR
1938 .setxattr = generic_setxattr,
1939 .getxattr = generic_getxattr,
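The new ext4_file_mmap() appears to mirror generic_file_mmap() (the same ->readpage check returning -ENOEXEC and the file_accessed() call), but installs ext4_file_vm_ops so that the first write fault on a mapped page goes through ext4_page_mkwrite(), giving ext4 a chance to allocate or reserve the backing blocks before the page is dirtied.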
1940diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
1941index 1c8ba48..a45c373 100644
1942--- a/fs/ext4/fsync.c
1943+++ b/fs/ext4/fsync.c
1944@@ -27,6 +27,7 @@
1945 #include <linux/sched.h>
1946 #include <linux/writeback.h>
1947 #include <linux/jbd2.h>
1948+#include <linux/blkdev.h>
1949 #include "ext4.h"
1950 #include "ext4_jbd2.h"
1951
1952@@ -45,6 +46,7 @@
1953 int ext4_sync_file(struct file * file, struct dentry *dentry, int datasync)
1954 {
1955 struct inode *inode = dentry->d_inode;
1956+ journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
1957 int ret = 0;
1958
1959 J_ASSERT(ext4_journal_current_handle() == NULL);
1960@@ -85,6 +87,8 @@ int ext4_sync_file(struct file * file, struct dentry *dentry, int datasync)
1961 .nr_to_write = 0, /* sys_fsync did this */
1962 };
1963 ret = sync_inode(inode, &wbc);
1964+ if (journal && (journal->j_flags & JBD2_BARRIER))
1965+ blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
1966 }
1967 out:
1968 return ret;
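The intent of the added flush appears to be closing a durability gap: sync_inode() can complete with the data still in the drive's volatile write cache when no journal commit (and hence no barrier) was triggered; when the journal runs with JBD2_BARRIER, the explicit blkdev_issue_flush() pushes the cache to stable storage so fsync() keeps its guarantee.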
1969diff --git a/fs/ext4/group.h b/fs/ext4/group.h
1970index 7eb0604..c2c0a8d 100644
1971--- a/fs/ext4/group.h
1972+++ b/fs/ext4/group.h
1973@@ -13,7 +13,7 @@ extern __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 group,
1974 struct ext4_group_desc *gdp);
1975 extern int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 group,
1976 struct ext4_group_desc *gdp);
1977-struct buffer_head *read_block_bitmap(struct super_block *sb,
1978+struct buffer_head *ext4_read_block_bitmap(struct super_block *sb,
1979 ext4_group_t block_group);
1980 extern unsigned ext4_init_block_bitmap(struct super_block *sb,
1981 struct buffer_head *bh,
1982diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
1983index c6efbab..f344834 100644
1984--- a/fs/ext4/ialloc.c
1985+++ b/fs/ext4/ialloc.c
1986@@ -97,34 +97,44 @@ unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh,
1987 * Return buffer_head of bitmap on success or NULL.
1988 */
1989 static struct buffer_head *
1990-read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
1991+ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
1992 {
1993 struct ext4_group_desc *desc;
1994 struct buffer_head *bh = NULL;
1995+ ext4_fsblk_t bitmap_blk;
1996
1997 desc = ext4_get_group_desc(sb, block_group, NULL);
1998 if (!desc)
1999- goto error_out;
2000+ return NULL;
2001+ bitmap_blk = ext4_inode_bitmap(sb, desc);
2002+ bh = sb_getblk(sb, bitmap_blk);
2003+ if (unlikely(!bh)) {
2004+ ext4_error(sb, __func__,
2005+ "Cannot read inode bitmap - "
2006+ "block_group = %lu, inode_bitmap = %llu",
2007+ block_group, bitmap_blk);
2008+ return NULL;
2009+ }
2010+ if (bh_uptodate_or_lock(bh))
2011+ return bh;
2012+
2013+ spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
2014 if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
2015- bh = sb_getblk(sb, ext4_inode_bitmap(sb, desc));
2016- if (!buffer_uptodate(bh)) {
2017- lock_buffer(bh);
2018- if (!buffer_uptodate(bh)) {
2019- ext4_init_inode_bitmap(sb, bh, block_group,
2020- desc);
2021- set_buffer_uptodate(bh);
2022- }
2023- unlock_buffer(bh);
2024- }
2025- } else {
2026- bh = sb_bread(sb, ext4_inode_bitmap(sb, desc));
2027+ ext4_init_inode_bitmap(sb, bh, block_group, desc);
2028+ set_buffer_uptodate(bh);
2029+ unlock_buffer(bh);
2030+ spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
2031+ return bh;
2032 }
2033- if (!bh)
2034- ext4_error(sb, "read_inode_bitmap",
2035+ spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
2036+ if (bh_submit_read(bh) < 0) {
2037+ put_bh(bh);
2038+ ext4_error(sb, __func__,
2039 "Cannot read inode bitmap - "
2040 "block_group = %lu, inode_bitmap = %llu",
2041- block_group, ext4_inode_bitmap(sb, desc));
2042-error_out:
2043+ block_group, bitmap_blk);
2044+ return NULL;
2045+ }
2046 return bh;
2047 }
2048
2049@@ -157,6 +167,7 @@ void ext4_free_inode (handle_t *handle, struct inode * inode)
2050 struct ext4_super_block * es;
2051 struct ext4_sb_info *sbi;
2052 int fatal = 0, err;
2053+ ext4_group_t flex_group;
2054
2055 if (atomic_read(&inode->i_count) > 1) {
2056 printk ("ext4_free_inode: inode has count=%d\n",
2057@@ -199,7 +210,7 @@ void ext4_free_inode (handle_t *handle, struct inode * inode)
2058 }
2059 block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
2060 bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
2061- bitmap_bh = read_inode_bitmap(sb, block_group);
2062+ bitmap_bh = ext4_read_inode_bitmap(sb, block_group);
2063 if (!bitmap_bh)
2064 goto error_return;
2065
2066@@ -232,6 +243,12 @@ void ext4_free_inode (handle_t *handle, struct inode * inode)
2067 if (is_directory)
2068 percpu_counter_dec(&sbi->s_dirs_counter);
2069
2070+ if (sbi->s_log_groups_per_flex) {
2071+ flex_group = ext4_flex_group(sbi, block_group);
2072+ spin_lock(sb_bgl_lock(sbi, flex_group));
2073+ sbi->s_flex_groups[flex_group].free_inodes++;
2074+ spin_unlock(sb_bgl_lock(sbi, flex_group));
2075+ }
2076 }
2077 BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata");
2078 err = ext4_journal_dirty_metadata(handle, bh2);
2079@@ -286,6 +303,80 @@ static int find_group_dir(struct super_block *sb, struct inode *parent,
2080 return ret;
2081 }
2082
2083+#define free_block_ratio 10
2084+
2085+static int find_group_flex(struct super_block *sb, struct inode *parent,
2086+ ext4_group_t *best_group)
2087+{
2088+ struct ext4_sb_info *sbi = EXT4_SB(sb);
2089+ struct ext4_group_desc *desc;
2090+ struct buffer_head *bh;
2091+ struct flex_groups *flex_group = sbi->s_flex_groups;
2092+ ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
2093+ ext4_group_t parent_fbg_group = ext4_flex_group(sbi, parent_group);
2094+ ext4_group_t ngroups = sbi->s_groups_count;
2095+ int flex_size = ext4_flex_bg_size(sbi);
2096+ ext4_group_t best_flex = parent_fbg_group;
2097+ int blocks_per_flex = sbi->s_blocks_per_group * flex_size;
2098+ int flexbg_free_blocks;
2099+ int flex_freeb_ratio;
2100+ ext4_group_t n_fbg_groups;
2101+ ext4_group_t i;
2102+
2103+ n_fbg_groups = (sbi->s_groups_count + flex_size - 1) >>
2104+ sbi->s_log_groups_per_flex;
2105+
2106+find_close_to_parent:
2107+ flexbg_free_blocks = flex_group[best_flex].free_blocks;
2108+ flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex;
2109+ if (flex_group[best_flex].free_inodes &&
2110+ flex_freeb_ratio > free_block_ratio)
2111+ goto found_flexbg;
2112+
2113+ if (best_flex && best_flex == parent_fbg_group) {
2114+ best_flex--;
2115+ goto find_close_to_parent;
2116+ }
2117+
2118+ for (i = 0; i < n_fbg_groups; i++) {
2119+ if (i == parent_fbg_group || i == parent_fbg_group - 1)
2120+ continue;
2121+
2122+ flexbg_free_blocks = flex_group[i].free_blocks;
2123+ flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex;
2124+
2125+ if (flex_freeb_ratio > free_block_ratio &&
2126+ flex_group[i].free_inodes) {
2127+ best_flex = i;
2128+ goto found_flexbg;
2129+ }
2130+
2131+ if (flex_group[best_flex].free_inodes == 0 ||
2132+ (flex_group[i].free_blocks >
2133+ flex_group[best_flex].free_blocks &&
2134+ flex_group[i].free_inodes))
2135+ best_flex = i;
2136+ }
2137+
2138+ if (!flex_group[best_flex].free_inodes ||
2139+ !flex_group[best_flex].free_blocks)
2140+ return -1;
2141+
2142+found_flexbg:
2143+ for (i = best_flex * flex_size; i < ngroups &&
2144+ i < (best_flex + 1) * flex_size; i++) {
2145+ desc = ext4_get_group_desc(sb, i, &bh);
2146+ if (le16_to_cpu(desc->bg_free_inodes_count)) {
2147+ *best_group = i;
2148+ goto out;
2149+ }
2150+ }
2151+
2152+ return -1;
2153+out:
2154+ return 0;
2155+}
2156+
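A worked example of the ratio test above (illustrative, assuming 4 KiB blocks, i.e. 32768 blocks per group, and the common flex_bg factor of 16): blocks_per_flex = 32768 * 16 = 524288, so with free_block_ratio = 10 a flex group passes the fast path only when flex_freeb_ratio = free_blocks * 100 / 524288 exceeds 10, i.e. when it has more than about 52428 free blocks and still has at least one free inode.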
2157 /*
2158 * Orlov's allocator for directories.
2159 *
2160@@ -501,6 +592,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
2161 struct inode *ret;
2162 ext4_group_t i;
2163 int free = 0;
2164+ ext4_group_t flex_group;
2165
2166 /* Cannot create files in a deleted directory */
2167 if (!dir || !dir->i_nlink)
2168@@ -514,6 +606,12 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
2169
2170 sbi = EXT4_SB(sb);
2171 es = sbi->s_es;
2172+
2173+ if (sbi->s_log_groups_per_flex) {
2174+ ret2 = find_group_flex(sb, dir, &group);
2175+ goto got_group;
2176+ }
2177+
2178 if (S_ISDIR(mode)) {
2179 if (test_opt (sb, OLDALLOC))
2180 ret2 = find_group_dir(sb, dir, &group);
2181@@ -522,6 +620,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
2182 } else
2183 ret2 = find_group_other(sb, dir, &group);
2184
2185+got_group:
2186 err = -ENOSPC;
2187 if (ret2 == -1)
2188 goto out;
2189@@ -534,7 +633,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode)
2190 goto fail;
2191
2192 brelse(bitmap_bh);
2193- bitmap_bh = read_inode_bitmap(sb, group);
2194+ bitmap_bh = ext4_read_inode_bitmap(sb, group);
2195 if (!bitmap_bh)
2196 goto fail;
2197
2198@@ -600,7 +699,7 @@ got:
2199 /* We may have to initialize the block bitmap if it isn't already */
2200 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) &&
2201 gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
2202- struct buffer_head *block_bh = read_block_bitmap(sb, group);
2203+ struct buffer_head *block_bh = ext4_read_block_bitmap(sb, group);
2204
2205 BUFFER_TRACE(block_bh, "get block bitmap access");
2206 err = ext4_journal_get_write_access(handle, block_bh);
2207@@ -639,7 +738,7 @@ got:
2208
2209 /* When marking the block group with
2210 * ~EXT4_BG_INODE_UNINIT we don't want to depend
2211- * on the value of bg_itable_unsed even though
2212+ * on the value of bg_itable_unused even though
2213 * mke2fs could have initialized the same for us.
2214 * Instead we calculated the value below
2215 */
2216@@ -676,6 +775,13 @@ got:
2217 percpu_counter_inc(&sbi->s_dirs_counter);
2218 sb->s_dirt = 1;
2219
2220+ if (sbi->s_log_groups_per_flex) {
2221+ flex_group = ext4_flex_group(sbi, group);
2222+ spin_lock(sb_bgl_lock(sbi, flex_group));
2223+ sbi->s_flex_groups[flex_group].free_inodes--;
2224+ spin_unlock(sb_bgl_lock(sbi, flex_group));
2225+ }
2226+
2227 inode->i_uid = current->fsuid;
2228 if (test_opt (sb, GRPID))
2229 inode->i_gid = dir->i_gid;
2230@@ -740,14 +846,10 @@ got:
2231 goto fail_free_drop;
2232
2233 if (test_opt(sb, EXTENTS)) {
2234- /* set extent flag only for diretory, file and normal symlink*/
2235+ /* set extent flag only for directory, file and normal symlink*/
2236 if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) {
2237 EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL;
2238 ext4_ext_tree_init(handle, inode);
2239- err = ext4_update_incompat_feature(handle, sb,
2240- EXT4_FEATURE_INCOMPAT_EXTENTS);
2241- if (err)
2242- goto fail_free_drop;
2243 }
2244 }
2245
2246@@ -799,7 +901,7 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
2247
2248 block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
2249 bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
2250- bitmap_bh = read_inode_bitmap(sb, block_group);
2251+ bitmap_bh = ext4_read_inode_bitmap(sb, block_group);
2252 if (!bitmap_bh) {
2253 ext4_warning(sb, __func__,
2254 "inode bitmap error for orphan %lu", ino);
2255@@ -817,6 +919,14 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
2256 if (IS_ERR(inode))
2257 goto iget_failed;
2258
2259+ /*
2260+	 * If the orphan has i_nlink > 0 then it should be able to be
2261+	 * truncated; otherwise it won't be removed from the orphan list
2262+ * during processing and an infinite loop will result.
2263+ */
2264+ if (inode->i_nlink && !ext4_can_truncate(inode))
2265+ goto bad_orphan;
2266+
2267 if (NEXT_ORPHAN(inode) > max_ino)
2268 goto bad_orphan;
2269 brelse(bitmap_bh);
2270@@ -838,6 +948,7 @@ bad_orphan:
2271 printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%u\n",
2272 NEXT_ORPHAN(inode));
2273 printk(KERN_NOTICE "max_ino=%lu\n", max_ino);
2274+ printk(KERN_NOTICE "i_nlink=%u\n", inode->i_nlink);
2275 /* Avoid freeing blocks if we got a bad deleted inode */
2276 if (inode->i_nlink == 0)
2277 inode->i_blocks = 0;
2278@@ -868,7 +979,7 @@ unsigned long ext4_count_free_inodes (struct super_block * sb)
2279 continue;
2280 desc_count += le16_to_cpu(gdp->bg_free_inodes_count);
2281 brelse(bitmap_bh);
2282- bitmap_bh = read_inode_bitmap(sb, i);
2283+ bitmap_bh = ext4_read_inode_bitmap(sb, i);
2284 if (!bitmap_bh)
2285 continue;
2286
2287diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
2288index 8d97077..3c0195a 100644
2289--- a/fs/ext4/inode.c
2290+++ b/fs/ext4/inode.c
2291@@ -32,12 +32,25 @@
2292 #include <linux/string.h>
2293 #include <linux/buffer_head.h>
2294 #include <linux/writeback.h>
2295+#include <linux/pagevec.h>
2296 #include <linux/mpage.h>
2297 #include <linux/uio.h>
2298 #include <linux/bio.h>
2299 #include "ext4_jbd2.h"
2300 #include "xattr.h"
2301 #include "acl.h"
2302+#include "ext4_extents.h"
2303+
2304+#define MPAGE_DA_EXTENT_TAIL 0x01
2305+
2306+static inline int ext4_begin_ordered_truncate(struct inode *inode,
2307+ loff_t new_size)
2308+{
2309+ return jbd2_journal_begin_ordered_truncate(&EXT4_I(inode)->jinode,
2310+ new_size);
2311+}
2312+
2313+static void ext4_invalidatepage(struct page *page, unsigned long offset);
2314
2315 /*
2316 * Test whether an inode is a fast symlink.
2317@@ -180,14 +193,18 @@ static int ext4_journal_test_restart(handle_t *handle, struct inode *inode)
2318 void ext4_delete_inode (struct inode * inode)
2319 {
2320 handle_t *handle;
2321+ int err;
2322
2323+ if (ext4_should_order_data(inode))
2324+ ext4_begin_ordered_truncate(inode, 0);
2325 truncate_inode_pages(&inode->i_data, 0);
2326
2327 if (is_bad_inode(inode))
2328 goto no_delete;
2329
2330- handle = start_transaction(inode);
2331+ handle = ext4_journal_start(inode, blocks_for_truncate(inode)+3);
2332 if (IS_ERR(handle)) {
2333+ ext4_std_error(inode->i_sb, PTR_ERR(handle));
2334 /*
2335 * If we're going to skip the normal cleanup, we still need to
2336 * make sure that the in-core orphan linked list is properly
2337@@ -200,8 +217,34 @@ void ext4_delete_inode (struct inode * inode)
2338 if (IS_SYNC(inode))
2339 handle->h_sync = 1;
2340 inode->i_size = 0;
2341+ err = ext4_mark_inode_dirty(handle, inode);
2342+ if (err) {
2343+ ext4_warning(inode->i_sb, __func__,
2344+ "couldn't mark inode dirty (err %d)", err);
2345+ goto stop_handle;
2346+ }
2347 if (inode->i_blocks)
2348 ext4_truncate(inode);
2349+
2350+ /*
2351+ * ext4_ext_truncate() doesn't reserve any slop when it
2352+ * restarts journal transactions; therefore there may not be
2353+ * enough credits left in the handle to remove the inode from
2354+ * the orphan list and set the dtime field.
2355+ */
2356+ if (handle->h_buffer_credits < 3) {
2357+ err = ext4_journal_extend(handle, 3);
2358+ if (err > 0)
2359+ err = ext4_journal_restart(handle, 3);
2360+ if (err != 0) {
2361+ ext4_warning(inode->i_sb, __func__,
2362+ "couldn't extend journal (err %d)", err);
2363+ stop_handle:
2364+ ext4_journal_stop(handle);
2365+ goto no_delete;
2366+ }
2367+ }
2368+
2369 /*
2370 * Kill off the orphan record which ext4_truncate created.
2371 * AKPM: I think this can be inside the above `if'.
2372@@ -508,11 +551,12 @@ static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned long blks,
2373 * direct blocks
2374 */
2375 static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
2376- ext4_fsblk_t goal, int indirect_blks, int blks,
2377- ext4_fsblk_t new_blocks[4], int *err)
2378+ ext4_lblk_t iblock, ext4_fsblk_t goal,
2379+ int indirect_blks, int blks,
2380+ ext4_fsblk_t new_blocks[4], int *err)
2381 {
2382 int target, i;
2383- unsigned long count = 0;
2384+ unsigned long count = 0, blk_allocated = 0;
2385 int index = 0;
2386 ext4_fsblk_t current_block = 0;
2387 int ret = 0;
2388@@ -525,12 +569,13 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
2389 * the first direct block of this branch. That's the
2390 * minimum number of blocks need to allocate(required)
2391 */
2392- target = blks + indirect_blks;
2393-
2394- while (1) {
2395+ /* first we try to allocate the indirect blocks */
2396+ target = indirect_blks;
2397+ while (target > 0) {
2398 count = target;
2399 /* allocating blocks for indirect blocks and direct blocks */
2400- current_block = ext4_new_blocks(handle,inode,goal,&count,err);
2401+ current_block = ext4_new_meta_blocks(handle, inode,
2402+ goal, &count, err);
2403 if (*err)
2404 goto failed_out;
2405
2406@@ -540,16 +585,48 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode,
2407 new_blocks[index++] = current_block++;
2408 count--;
2409 }
2410-
2411- if (count > 0)
2412+ if (count > 0) {
2413+ /*
2414+ * save the new block number
2415+ * for the first direct block
2416+ */
2417+ new_blocks[index] = current_block;
2418+ printk(KERN_INFO "%s returned more blocks than "
2419+ "requested\n", __func__);
2420+ WARN_ON(1);
2421 break;
2422+ }
2423 }
2424
2425- /* save the new block number for the first direct block */
2426- new_blocks[index] = current_block;
2427-
2428+ target = blks - count ;
2429+ blk_allocated = count;
2430+ if (!target)
2431+ goto allocated;
2432+ /* Now allocate data blocks */
2433+ count = target;
2434+ /* allocating blocks for data blocks */
2435+ current_block = ext4_new_blocks(handle, inode, iblock,
2436+ goal, &count, err);
2437+ if (*err && (target == blks)) {
2438+ /*
2439+ * if the allocation failed and we didn't allocate
2440+ * any blocks before
2441+ */
2442+ goto failed_out;
2443+ }
2444+ if (!*err) {
2445+ if (target == blks) {
2446+ /*
2447+ * save the new block number
2448+ * for the first direct block
2449+ */
2450+ new_blocks[index] = current_block;
2451+ }
2452+ blk_allocated += count;
2453+ }
2454+allocated:
2455 /* total number of blocks allocated for direct blocks */
2456- ret = count;
2457+ ret = blk_allocated;
2458 *err = 0;
2459 return ret;
2460 failed_out:
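A short walk-through of the two-phase scheme above (illustrative numbers): for a request of blks = 8 data blocks needing indirect_blks = 2 index blocks, the first loop calls ext4_new_meta_blocks() until both index blocks are stored in new_blocks[]; should that allocator hand back more blocks than asked for, the surplus is recorded as the first direct block (with a WARN_ON, since that is unexpected). The remaining target = blks - count data blocks are then requested with a single ext4_new_blocks() call, and ret = blk_allocated reports only the direct blocks obtained.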
2461@@ -584,8 +661,9 @@ failed_out:
2462 * as described above and return 0.
2463 */
2464 static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
2465- int indirect_blks, int *blks, ext4_fsblk_t goal,
2466- ext4_lblk_t *offsets, Indirect *branch)
2467+ ext4_lblk_t iblock, int indirect_blks,
2468+ int *blks, ext4_fsblk_t goal,
2469+ ext4_lblk_t *offsets, Indirect *branch)
2470 {
2471 int blocksize = inode->i_sb->s_blocksize;
2472 int i, n = 0;
2473@@ -595,7 +673,7 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
2474 ext4_fsblk_t new_blocks[4];
2475 ext4_fsblk_t current_block;
2476
2477- num = ext4_alloc_blocks(handle, inode, goal, indirect_blks,
2478+ num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks,
2479 *blks, new_blocks, &err);
2480 if (err)
2481 return err;
2482@@ -799,6 +877,7 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
2483 struct ext4_inode_info *ei = EXT4_I(inode);
2484 int count = 0;
2485 ext4_fsblk_t first_block = 0;
2486+ loff_t disksize;
2487
2488
2489 J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL));
2490@@ -855,8 +934,9 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
2491 /*
2492 * Block out ext4_truncate while we alter the tree
2493 */
2494- err = ext4_alloc_branch(handle, inode, indirect_blks, &count, goal,
2495- offsets + (partial - chain), partial);
2496+ err = ext4_alloc_branch(handle, inode, iblock, indirect_blks,
2497+ &count, goal,
2498+ offsets + (partial - chain), partial);
2499
2500 /*
2501 * The ext4_splice_branch call will free and forget any buffers
2502@@ -873,8 +953,13 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
2503 * protect it if you're about to implement concurrent
2504 * ext4_get_block() -bzzz
2505 */
2506- if (!err && extend_disksize && inode->i_size > ei->i_disksize)
2507- ei->i_disksize = inode->i_size;
2508+ if (!err && extend_disksize) {
2509+ disksize = ((loff_t) iblock + count) << inode->i_blkbits;
2510+ if (disksize > i_size_read(inode))
2511+ disksize = i_size_read(inode);
2512+ if (disksize > ei->i_disksize)
2513+ ei->i_disksize = disksize;
2514+ }
2515 if (err)
2516 goto cleanup;
2517
2518@@ -897,23 +982,74 @@ out:
2519 return err;
2520 }
2521
2522-/* Maximum number of blocks we map for direct IO at once. */
2523-#define DIO_MAX_BLOCKS 4096
2524 /*
2525- * Number of credits we need for writing DIO_MAX_BLOCKS:
2526- * We need sb + group descriptor + bitmap + inode -> 4
2527- * For B blocks with A block pointers per block we need:
2528- * 1 (triple ind.) + (B/A/A + 2) (doubly ind.) + (B/A + 2) (indirect).
2529- * If we plug in 4096 for B and 256 for A (for 1KB block size), we get 25.
2530+ * Calculate the number of metadata blocks that need to be reserved
2531+ * to allocate @blocks for a non-extent-based file
2532 */
2533-#define DIO_CREDITS 25
2534+static int ext4_indirect_calc_metadata_amount(struct inode *inode, int blocks)
2535+{
2536+ int icap = EXT4_ADDR_PER_BLOCK(inode->i_sb);
2537+ int ind_blks, dind_blks, tind_blks;
2538+
2539+ /* number of new indirect blocks needed */
2540+ ind_blks = (blocks + icap - 1) / icap;
2541+
2542+ dind_blks = (ind_blks + icap - 1) / icap;
2543
2544+ tind_blks = 1;
2545+
2546+ return ind_blks + dind_blks + tind_blks;
2547+}
2548
2549 /*
2550+ * Calculate the number of metadata blocks that need to be reserved
2551+ * to allocate the given number of blocks
2552+ */
2553+static int ext4_calc_metadata_amount(struct inode *inode, int blocks)
2554+{
2555+ if (!blocks)
2556+ return 0;
2557+
2558+ if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
2559+ return ext4_ext_calc_metadata_amount(inode, blocks);
2560+
2561+ return ext4_indirect_calc_metadata_amount(inode, blocks);
2562+}
2563+
2564+static void ext4_da_update_reserve_space(struct inode *inode, int used)
2565+{
2566+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2567+ int total, mdb, mdb_free;
2568+
2569+ spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
2570+	/* recalculate the number of metadata blocks that still need to be reserved */
2571+ total = EXT4_I(inode)->i_reserved_data_blocks - used;
2572+ mdb = ext4_calc_metadata_amount(inode, total);
2573+
2574+ /* figure out how many metablocks to release */
2575+ BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
2576+ mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb;
2577+
2578+ /* Account for allocated meta_blocks */
2579+ mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks;
2580+
2581+ /* update fs free blocks counter for truncate case */
2582+ percpu_counter_add(&sbi->s_freeblocks_counter, mdb_free);
2583+
2584+ /* update per-inode reservations */
2585+ BUG_ON(used > EXT4_I(inode)->i_reserved_data_blocks);
2586+ EXT4_I(inode)->i_reserved_data_blocks -= used;
2587+
2588+ BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
2589+ EXT4_I(inode)->i_reserved_meta_blocks = mdb;
2590+ EXT4_I(inode)->i_allocated_meta_blocks = 0;
2591+ spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
2592+}
2593+
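A self-contained sketch of the worst-case estimate in ext4_indirect_calc_metadata_amount() above (user-space and illustrative only; icap stands for EXT4_ADDR_PER_BLOCK(), i.e. block size / 4, and the values in main() are assumptions):

    #include <stdio.h>

    static int indirect_metadata_amount(int icap, int blocks)
    {
            int ind = (blocks + icap - 1) / icap;  /* new indirect blocks */
            int dind = (ind + icap - 1) / icap;    /* new double-indirect */
            int tind = 1;                          /* one triple-indirect */
            return ind + dind + tind;
    }

    int main(void)
    {
            /* 4 KiB blocks: icap = 4096 / 4 = 1024; reserving 2048
             * data blocks needs 2 + 1 + 1 = 4 metadata blocks */
            printf("%d\n", indirect_metadata_amount(1024, 2048));
            return 0;
    }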
2594+/*
2595+ * The ext4_get_blocks_wrap() function tries to look up the requested blocks,
2596+ * and returns if the blocks are already mapped.
2597 *
2598- *
2599- * ext4_ext4 get_block() wrapper function
2600- * It will do a look up first, and returns if the blocks already mapped.
2601 * Otherwise it takes the write lock of the i_data_sem and allocate blocks
2602 * and store the allocated blocks in the result buffer head and mark it
2603 * mapped.
2604@@ -934,7 +1070,7 @@ out:
2605 */
2606 int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
2607 unsigned long max_blocks, struct buffer_head *bh,
2608- int create, int extend_disksize)
2609+ int create, int extend_disksize, int flag)
2610 {
2611 int retval;
2612
2613@@ -975,6 +1111,15 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
2614 * with create == 1 flag.
2615 */
2616 down_write((&EXT4_I(inode)->i_data_sem));
2617+
2618+ /*
2619+	 * If the caller is from the delayed allocation writeout path,
2620+	 * we have already reserved fs blocks for the allocation;
2621+	 * let the underlying get_block() function know, to
2622+	 * avoid double accounting.
2623+ */
2624+ if (flag)
2625+ EXT4_I(inode)->i_delalloc_reserved_flag = 1;
2626 /*
2627 * We need to check for EXT4 here because migrate
2628 * could have changed the inode type in between
2629@@ -996,23 +1141,39 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
2630 ~EXT4_EXT_MIGRATE;
2631 }
2632 }
2633+
2634+ if (flag) {
2635+ EXT4_I(inode)->i_delalloc_reserved_flag = 0;
2636+ /*
2637+ * Update reserved blocks/metadata blocks
2638+ * after successful block allocation
2639+ * which were deferred till now
2640+ */
2641+ if ((retval > 0) && buffer_delay(bh))
2642+ ext4_da_update_reserve_space(inode, retval);
2643+ }
2644+
2645 up_write((&EXT4_I(inode)->i_data_sem));
2646 return retval;
2647 }
2648
2649+/* Maximum number of blocks we map for direct IO at once. */
2650+#define DIO_MAX_BLOCKS 4096
2651+
2652 static int ext4_get_block(struct inode *inode, sector_t iblock,
2653 struct buffer_head *bh_result, int create)
2654 {
2655 handle_t *handle = ext4_journal_current_handle();
2656 int ret = 0, started = 0;
2657 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
2658+ int dio_credits;
2659
2660 if (create && !handle) {
2661 /* Direct IO write... */
2662 if (max_blocks > DIO_MAX_BLOCKS)
2663 max_blocks = DIO_MAX_BLOCKS;
2664- handle = ext4_journal_start(inode, DIO_CREDITS +
2665- 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb));
2666+ dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
2667+ handle = ext4_journal_start(inode, dio_credits);
2668 if (IS_ERR(handle)) {
2669 ret = PTR_ERR(handle);
2670 goto out;
2671@@ -1021,7 +1182,7 @@ static int ext4_get_block(struct inode *inode, sector_t iblock,
2672 }
2673
2674 ret = ext4_get_blocks_wrap(handle, inode, iblock,
2675- max_blocks, bh_result, create, 0);
2676+ max_blocks, bh_result, create, 0, 0);
2677 if (ret > 0) {
2678 bh_result->b_size = (ret << inode->i_blkbits);
2679 ret = 0;
2680@@ -1047,7 +1208,7 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
2681 dummy.b_blocknr = -1000;
2682 buffer_trace_init(&dummy.b_history);
2683 err = ext4_get_blocks_wrap(handle, inode, block, 1,
2684- &dummy, create, 1);
2685+ &dummy, create, 1, 0);
2686 /*
2687 * ext4_get_blocks_handle() returns number of blocks
2688 * mapped. 0 in case of a HOLE.
2689@@ -1203,19 +1364,20 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
2690 to = from + len;
2691
2692 retry:
2693- page = __grab_cache_page(mapping, index);
2694- if (!page)
2695- return -ENOMEM;
2696- *pagep = page;
2697-
2698 handle = ext4_journal_start(inode, needed_blocks);
2699 if (IS_ERR(handle)) {
2700- unlock_page(page);
2701- page_cache_release(page);
2702 ret = PTR_ERR(handle);
2703 goto out;
2704 }
2705
2706+ page = __grab_cache_page(mapping, index);
2707+ if (!page) {
2708+ ext4_journal_stop(handle);
2709+ ret = -ENOMEM;
2710+ goto out;
2711+ }
2712+ *pagep = page;
2713+
2714 ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
2715 ext4_get_block);
2716
2717@@ -1225,8 +1387,8 @@ retry:
2718 }
2719
2720 if (ret) {
2721- ext4_journal_stop(handle);
2722 unlock_page(page);
2723+ ext4_journal_stop(handle);
2724 page_cache_release(page);
2725 }
2726
2727@@ -1236,15 +1398,6 @@ out:
2728 return ret;
2729 }
2730
2731-int ext4_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
2732-{
2733- int err = jbd2_journal_dirty_data(handle, bh);
2734- if (err)
2735- ext4_journal_abort_handle(__func__, __func__,
2736- bh, handle, err);
2737- return err;
2738-}
2739-
2740 /* For write_end() in data=journal mode */
2741 static int write_end_fn(handle_t *handle, struct buffer_head *bh)
2742 {
2743@@ -1255,29 +1408,6 @@ static int write_end_fn(handle_t *handle, struct buffer_head *bh)
2744 }
2745
2746 /*
2747- * Generic write_end handler for ordered and writeback ext4 journal modes.
2748- * We can't use generic_write_end, because that unlocks the page and we need to
2749- * unlock the page after ext4_journal_stop, but ext4_journal_stop must run
2750- * after block_write_end.
2751- */
2752-static int ext4_generic_write_end(struct file *file,
2753- struct address_space *mapping,
2754- loff_t pos, unsigned len, unsigned copied,
2755- struct page *page, void *fsdata)
2756-{
2757- struct inode *inode = file->f_mapping->host;
2758-
2759- copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
2760-
2761- if (pos+copied > inode->i_size) {
2762- i_size_write(inode, pos+copied);
2763- mark_inode_dirty(inode);
2764- }
2765-
2766- return copied;
2767-}
2768-
2769-/*
2770 * We need to pick up the new inode size which generic_commit_write gave us
2771 * `file' can be NULL - eg, when called from page_symlink().
2772 *
2773@@ -1290,15 +1420,10 @@ static int ext4_ordered_write_end(struct file *file,
2774 struct page *page, void *fsdata)
2775 {
2776 handle_t *handle = ext4_journal_current_handle();
2777- struct inode *inode = file->f_mapping->host;
2778- unsigned from, to;
2779+ struct inode *inode = mapping->host;
2780 int ret = 0, ret2;
2781
2782- from = pos & (PAGE_CACHE_SIZE - 1);
2783- to = from + len;
2784-
2785- ret = walk_page_buffers(handle, page_buffers(page),
2786- from, to, NULL, ext4_journal_dirty_data);
2787+ ret = ext4_jbd2_file_inode(handle, inode);
2788
2789 if (ret == 0) {
2790 /*
2791@@ -1311,7 +1436,7 @@ static int ext4_ordered_write_end(struct file *file,
2792 new_i_size = pos + copied;
2793 if (new_i_size > EXT4_I(inode)->i_disksize)
2794 EXT4_I(inode)->i_disksize = new_i_size;
2795- ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
2796+ ret2 = generic_write_end(file, mapping, pos, len, copied,
2797 page, fsdata);
2798 copied = ret2;
2799 if (ret2 < 0)
2800@@ -1320,8 +1445,6 @@ static int ext4_ordered_write_end(struct file *file,
2801 ret2 = ext4_journal_stop(handle);
2802 if (!ret)
2803 ret = ret2;
2804- unlock_page(page);
2805- page_cache_release(page);
2806
2807 return ret ? ret : copied;
2808 }
2809@@ -1332,7 +1455,7 @@ static int ext4_writeback_write_end(struct file *file,
2810 struct page *page, void *fsdata)
2811 {
2812 handle_t *handle = ext4_journal_current_handle();
2813- struct inode *inode = file->f_mapping->host;
2814+ struct inode *inode = mapping->host;
2815 int ret = 0, ret2;
2816 loff_t new_i_size;
2817
2818@@ -1340,7 +1463,7 @@ static int ext4_writeback_write_end(struct file *file,
2819 if (new_i_size > EXT4_I(inode)->i_disksize)
2820 EXT4_I(inode)->i_disksize = new_i_size;
2821
2822- ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
2823+ ret2 = generic_write_end(file, mapping, pos, len, copied,
2824 page, fsdata);
2825 copied = ret2;
2826 if (ret2 < 0)
2827@@ -1349,8 +1472,6 @@ static int ext4_writeback_write_end(struct file *file,
2828 ret2 = ext4_journal_stop(handle);
2829 if (!ret)
2830 ret = ret2;
2831- unlock_page(page);
2832- page_cache_release(page);
2833
2834 return ret ? ret : copied;
2835 }
2836@@ -1389,15 +1510,1028 @@ static int ext4_journalled_write_end(struct file *file,
2837 ret = ret2;
2838 }
2839
2840+ unlock_page(page);
2841 ret2 = ext4_journal_stop(handle);
2842 if (!ret)
2843 ret = ret2;
2844- unlock_page(page);
2845 page_cache_release(page);
2846
2847 return ret ? ret : copied;
2848 }
2849
2850+static int ext4_da_reserve_space(struct inode *inode, int nrblocks)
2851+{
2852+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2853+ unsigned long md_needed, mdblocks, total = 0;
2854+
2855+ /*
2856+	 * recalculate the number of metadata blocks to reserve
2857+	 * in order to allocate nrblocks;
2858+	 * the worst case is one extent per block
2859+ */
2860+ spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
2861+ total = EXT4_I(inode)->i_reserved_data_blocks + nrblocks;
2862+ mdblocks = ext4_calc_metadata_amount(inode, total);
2863+ BUG_ON(mdblocks < EXT4_I(inode)->i_reserved_meta_blocks);
2864+
2865+ md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks;
2866+ total = md_needed + nrblocks;
2867+
2868+ if (ext4_has_free_blocks(sbi, total) < total) {
2869+ spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
2870+ return -ENOSPC;
2871+ }
2872+ /* reduce fs free blocks counter */
2873+ percpu_counter_sub(&sbi->s_freeblocks_counter, total);
2874+
2875+ EXT4_I(inode)->i_reserved_data_blocks += nrblocks;
2876+ EXT4_I(inode)->i_reserved_meta_blocks = mdblocks;
2877+
2878+ spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
2879+ return 0; /* success */
2880+}
2881+
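To trace the arithmetic above once (illustrative): the first delayed write of a single block to an indirect-mapped file sees i_reserved_data_blocks == 0, so total = 1; ext4_calc_metadata_amount() returns 1 + 1 + 1 = 3 worst-case metadata blocks, md_needed = 3 - 0 = 3, and 1 + 3 = 4 blocks are subtracted from s_freeblocks_counter (they are given back later by ext4_da_release_space() or ext4_da_update_reserve_space()).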
2882+static void ext4_da_release_space(struct inode *inode, int to_free)
2883+{
2884+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
2885+ int total, mdb, mdb_free, release;
2886+
2887+ if (!to_free)
2888+ return; /* Nothing to release, exit */
2889+
2890+ spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
2891+
2892+ if (!EXT4_I(inode)->i_reserved_data_blocks) {
2893+ /*
2894+		 * if there are no reserved blocks but we try to free some,
2895+		 * then the counter is messed up somewhere.
2896+		 * But since this function is called from the
2897+		 * invalidatepage path, it's harmless to return without action.
2898+ */
2899+ printk(KERN_INFO "ext4 delalloc try to release %d reserved "
2900+ "blocks for inode %lu, but there is no reserved "
2901+ "data blocks\n", to_free, inode->i_ino);
2902+ spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
2903+ return;
2904+ }
2905+
2906+	/* recalculate the number of metadata blocks that still need to be reserved */
2907+ total = EXT4_I(inode)->i_reserved_data_blocks - to_free;
2908+ mdb = ext4_calc_metadata_amount(inode, total);
2909+
2910+ /* figure out how many metablocks to release */
2911+ BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
2912+ mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb;
2913+
2914+ release = to_free + mdb_free;
2915+
2916+ /* update fs free blocks counter for truncate case */
2917+ percpu_counter_add(&sbi->s_freeblocks_counter, release);
2918+
2919+ /* update per-inode reservations */
2920+ BUG_ON(to_free > EXT4_I(inode)->i_reserved_data_blocks);
2921+ EXT4_I(inode)->i_reserved_data_blocks -= to_free;
2922+
2923+ BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
2924+ EXT4_I(inode)->i_reserved_meta_blocks = mdb;
2925+ spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
2926+}
2927+
2928+static void ext4_da_page_release_reservation(struct page *page,
2929+ unsigned long offset)
2930+{
2931+ int to_release = 0;
2932+ struct buffer_head *head, *bh;
2933+ unsigned int curr_off = 0;
2934+
2935+ head = page_buffers(page);
2936+ bh = head;
2937+ do {
2938+ unsigned int next_off = curr_off + bh->b_size;
2939+
2940+ if ((offset <= curr_off) && (buffer_delay(bh))) {
2941+ to_release++;
2942+ clear_buffer_delay(bh);
2943+ }
2944+ curr_off = next_off;
2945+ } while ((bh = bh->b_this_page) != head);
2946+ ext4_da_release_space(page->mapping->host, to_release);
2947+}
2948+
2949+/*
2950+ * Delayed allocation stuff
2951+ */
2952+
2953+struct mpage_da_data {
2954+ struct inode *inode;
2955+ struct buffer_head lbh; /* extent of blocks */
2956+ unsigned long first_page, next_page; /* extent of pages */
2957+ get_block_t *get_block;
2958+ struct writeback_control *wbc;
2959+ int io_done;
2960+ long pages_written;
2961+};
2962+
2963+/*
2964+ * mpage_da_submit_io - walks through the extent of pages and tries to
2965+ * write them with the writepage() callback
2966+ *
2967+ * @mpd->inode: inode
2968+ * @mpd->first_page: first page of the extent
2969+ * @mpd->next_page: page after the last page of the extent
2970+ * @mpd->get_block: the filesystem's block mapper function
2971+ *
2972+ * By the time mpage_da_submit_io() is called we expect all blocks
2973+ * to be allocated; this may be wrong if the allocation failed.
2974+ *
2975+ * As pages are already locked by write_cache_pages(), we can't use it
2976+ */
2977+static int mpage_da_submit_io(struct mpage_da_data *mpd)
2978+{
2979+ struct address_space *mapping = mpd->inode->i_mapping;
2980+ int ret = 0, err, nr_pages, i;
2981+ unsigned long index, end;
2982+ struct pagevec pvec;
2983+
2984+ BUG_ON(mpd->next_page <= mpd->first_page);
2985+ pagevec_init(&pvec, 0);
2986+ index = mpd->first_page;
2987+ end = mpd->next_page - 1;
2988+
2989+ while (index <= end) {
2990+ /* XXX: optimize tail */
2991+ nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
2992+ if (nr_pages == 0)
2993+ break;
2994+ for (i = 0; i < nr_pages; i++) {
2995+ struct page *page = pvec.pages[i];
2996+
2997+ index = page->index;
2998+ if (index > end)
2999+ break;
3000+ index++;
3001+
3002+ err = mapping->a_ops->writepage(page, mpd->wbc);
3003+ if (!err)
3004+ mpd->pages_written++;
3005+ /*
3006+ * In error case, we have to continue because
3007+ * remaining pages are still locked
3008+ * XXX: unlock and re-dirty them?
3009+ */
3010+ if (ret == 0)
3011+ ret = err;
3012+ }
3013+ pagevec_release(&pvec);
3014+ }
3015+ return ret;
3016+}
3017+
3018+/*
3019+ * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers
3020+ *
3021+ * @mpd->inode - inode to walk through
3022+ * @exbh->b_blocknr - first block on a disk
3023+ * @exbh->b_size - amount of space in bytes
3024+ * @logical - first logical block to start assignment with
3025+ *
3026+ * The function goes through all the passed space and puts actual disk
3027+ * block numbers into the buffer heads, dropping BH_Delay
3028+ */
3029+static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
3030+ struct buffer_head *exbh)
3031+{
3032+ struct inode *inode = mpd->inode;
3033+ struct address_space *mapping = inode->i_mapping;
3034+ int blocks = exbh->b_size >> inode->i_blkbits;
3035+ sector_t pblock = exbh->b_blocknr, cur_logical;
3036+ struct buffer_head *head, *bh;
3037+ pgoff_t index, end;
3038+ struct pagevec pvec;
3039+ int nr_pages, i;
3040+
3041+ index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
3042+ end = (logical + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
3043+ cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
3044+
3045+ pagevec_init(&pvec, 0);
3046+
3047+ while (index <= end) {
3048+ /* XXX: optimize tail */
3049+ nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
3050+ if (nr_pages == 0)
3051+ break;
3052+ for (i = 0; i < nr_pages; i++) {
3053+ struct page *page = pvec.pages[i];
3054+
3055+ index = page->index;
3056+ if (index > end)
3057+ break;
3058+ index++;
3059+
3060+ BUG_ON(!PageLocked(page));
3061+ BUG_ON(PageWriteback(page));
3062+ BUG_ON(!page_has_buffers(page));
3063+
3064+ bh = page_buffers(page);
3065+ head = bh;
3066+
3067+ /* skip blocks out of the range */
3068+ do {
3069+ if (cur_logical >= logical)
3070+ break;
3071+ cur_logical++;
3072+ } while ((bh = bh->b_this_page) != head);
3073+
3074+ do {
3075+ if (cur_logical >= logical + blocks)
3076+ break;
3077+ if (buffer_delay(bh)) {
3078+ bh->b_blocknr = pblock;
3079+ clear_buffer_delay(bh);
3080+ bh->b_bdev = inode->i_sb->s_bdev;
3081+ } else if (buffer_unwritten(bh)) {
3082+ bh->b_blocknr = pblock;
3083+ clear_buffer_unwritten(bh);
3084+ set_buffer_mapped(bh);
3085+ set_buffer_new(bh);
3086+ bh->b_bdev = inode->i_sb->s_bdev;
3087+ } else if (buffer_mapped(bh))
3088+ BUG_ON(bh->b_blocknr != pblock);
3089+
3090+ cur_logical++;
3091+ pblock++;
3092+ } while ((bh = bh->b_this_page) != head);
3093+ }
3094+ pagevec_release(&pvec);
3095+ }
3096+}
3097+
3098+
3099+/*
3100+ * __unmap_underlying_blocks - just a helper function to unmap
3101+ * set of blocks described by @bh
3102+ */
3103+static inline void __unmap_underlying_blocks(struct inode *inode,
3104+ struct buffer_head *bh)
3105+{
3106+ struct block_device *bdev = inode->i_sb->s_bdev;
3107+ int blocks, i;
3108+
3109+ blocks = bh->b_size >> inode->i_blkbits;
3110+ for (i = 0; i < blocks; i++)
3111+ unmap_underlying_metadata(bdev, bh->b_blocknr + i);
3112+}
3113+
3114+/*
3115+ * mpage_da_map_blocks - go through given space
3116+ *
3117+ * @mpd->lbh - bh describing space
3118+ * @mpd->get_block - the filesystem's block mapper function
3119+ *
3120+ * The function skips space we know is already mapped to disk blocks.
3121+ *
3122+ */
3123+static void mpage_da_map_blocks(struct mpage_da_data *mpd)
3124+{
3125+ int err = 0;
3126+ struct buffer_head *lbh = &mpd->lbh;
3127+ sector_t next = lbh->b_blocknr;
3128+ struct buffer_head new;
3129+
3130+ /*
3131+ * We consider only non-mapped and non-allocated blocks
3132+ */
3133+ if (buffer_mapped(lbh) && !buffer_delay(lbh))
3134+ return;
3135+
3136+ new.b_state = lbh->b_state;
3137+ new.b_blocknr = 0;
3138+ new.b_size = lbh->b_size;
3139+
3140+ /*
3141+ * If we didn't accumulate anything
3142+	 * to write, simply return
3143+ */
3144+ if (!new.b_size)
3145+ return;
3146+ err = mpd->get_block(mpd->inode, next, &new, 1);
3147+ if (err)
3148+ return;
3149+ BUG_ON(new.b_size == 0);
3150+
3151+ if (buffer_new(&new))
3152+ __unmap_underlying_blocks(mpd->inode, &new);
3153+
3154+ /*
3155+	 * If the blocks are marked delayed, we need to put the
3156+	 * actual block numbers in and drop the delayed bit
3157+ */
3158+ if (buffer_delay(lbh) || buffer_unwritten(lbh))
3159+ mpage_put_bnr_to_bhs(mpd, next, &new);
3160+
3161+ return;
3162+}
3163+
3164+#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \
3165+ (1 << BH_Delay) | (1 << BH_Unwritten))
3166+
3167+/*
3168+ * mpage_add_bh_to_extent - try to add one more block to extent of blocks
3169+ *
3170+ * @mpd->lbh - extent of blocks
3171+ * @logical - logical number of the block in the file
3172+ * @bh - bh of the block (used to access block's state)
3173+ *
3174+ * The function is used to collect contiguous blocks in the same state
3175+ */
3176+static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
3177+ sector_t logical, struct buffer_head *bh)
3178+{
3179+ sector_t next;
3180+ size_t b_size = bh->b_size;
3181+ struct buffer_head *lbh = &mpd->lbh;
3182+ int nrblocks = lbh->b_size >> mpd->inode->i_blkbits;
3183+
3184+	/* check if the reserved journal credits might overflow */
3185+ if (!(EXT4_I(mpd->inode)->i_flags & EXT4_EXTENTS_FL)) {
3186+ if (nrblocks >= EXT4_MAX_TRANS_DATA) {
3187+ /*
3188+ * With non-extent format we are limited by the journal
3189+ * credit available. Total credit needed to insert
3190+ * nrblocks contiguous blocks is dependent on the
3191+ * nrblocks. So limit nrblocks.
3192+ */
3193+ goto flush_it;
3194+ } else if ((nrblocks + (b_size >> mpd->inode->i_blkbits)) >
3195+ EXT4_MAX_TRANS_DATA) {
3196+ /*
3197+ * Adding the new buffer_head would make it cross the
3198+ * allowed limit for which we have journal credit
3199+ * reserved. So limit the new bh->b_size
3200+ */
3201+ b_size = (EXT4_MAX_TRANS_DATA - nrblocks) <<
3202+ mpd->inode->i_blkbits;
3203+ /* we will do mpage_da_submit_io in the next loop */
3204+ }
3205+ }
3206+ /*
3207+ * First block in the extent
3208+ */
3209+ if (lbh->b_size == 0) {
3210+ lbh->b_blocknr = logical;
3211+ lbh->b_size = b_size;
3212+ lbh->b_state = bh->b_state & BH_FLAGS;
3213+ return;
3214+ }
3215+
3216+ next = lbh->b_blocknr + nrblocks;
3217+ /*
3218+ * Can we merge the block to our big extent?
3219+ */
3220+ if (logical == next && (bh->b_state & BH_FLAGS) == lbh->b_state) {
3221+ lbh->b_size += b_size;
3222+ return;
3223+ }
3224+
3225+flush_it:
3226+ /*
3227+	 * We couldn't merge the block into our extent, so we
3228+	 * need to flush the current extent and start a new one
3229+ */
3230+ mpage_da_map_blocks(mpd);
3231+ mpage_da_submit_io(mpd);
3232+ mpd->io_done = 1;
3233+ return;
3234+}
3235+
3236+/*
3237+ * __mpage_da_writepage - finds extent of pages and blocks
3238+ *
3239+ * @page: page to consider
3240+ * @wbc: not used, we just follow rules
3241+ * @data: context
3242+ *
3243+ * The function finds extents of pages and scans them for all blocks.
3244+ */
3245+static int __mpage_da_writepage(struct page *page,
3246+ struct writeback_control *wbc, void *data)
3247+{
3248+ struct mpage_da_data *mpd = data;
3249+ struct inode *inode = mpd->inode;
3250+ struct buffer_head *bh, *head, fake;
3251+ sector_t logical;
3252+
3253+ if (mpd->io_done) {
3254+ /*
3255+		 * Redirty the rest of the pages in the
3256+		 * page_vec and skip them.  We will try
3257+		 * to write them again after
3258+		 * starting a new transaction.
3259+ */
3260+ redirty_page_for_writepage(wbc, page);
3261+ unlock_page(page);
3262+ return MPAGE_DA_EXTENT_TAIL;
3263+ }
3264+ /*
3265+ * Can we merge this page to current extent?
3266+ */
3267+ if (mpd->next_page != page->index) {
3268+ /*
3269+ * Nope, we can't. So, we map non-allocated blocks
3270+ * and start IO on them using writepage()
3271+ */
3272+ if (mpd->next_page != mpd->first_page) {
3273+ mpage_da_map_blocks(mpd);
3274+ mpage_da_submit_io(mpd);
3275+ /*
3276+ * skip rest of the page in the page_vec
3277+ */
3278+ mpd->io_done = 1;
3279+ redirty_page_for_writepage(wbc, page);
3280+ unlock_page(page);
3281+ return MPAGE_DA_EXTENT_TAIL;
3282+ }
3283+
3284+ /*
3285+ * Start next extent of pages ...
3286+ */
3287+ mpd->first_page = page->index;
3288+
3289+ /*
3290+ * ... and blocks
3291+ */
3292+ mpd->lbh.b_size = 0;
3293+ mpd->lbh.b_state = 0;
3294+ mpd->lbh.b_blocknr = 0;
3295+ }
3296+
3297+ mpd->next_page = page->index + 1;
3298+ logical = (sector_t) page->index <<
3299+ (PAGE_CACHE_SHIFT - inode->i_blkbits);
3300+
3301+ if (!page_has_buffers(page)) {
3302+ /*
3303+		 * There are no attached buffer heads yet (mmap?),
3304+		 * so we treat the page as full of dirty blocks
3305+ */
3306+ bh = &fake;
3307+ bh->b_size = PAGE_CACHE_SIZE;
3308+ bh->b_state = 0;
3309+ set_buffer_dirty(bh);
3310+ set_buffer_uptodate(bh);
3311+ mpage_add_bh_to_extent(mpd, logical, bh);
3312+ if (mpd->io_done)
3313+ return MPAGE_DA_EXTENT_TAIL;
3314+ } else {
3315+ /*
3316+ * Page with regular buffer heads, just add all dirty ones
3317+ */
3318+ head = page_buffers(page);
3319+ bh = head;
3320+ do {
3321+ BUG_ON(buffer_locked(bh));
3322+ if (buffer_dirty(bh) &&
3323+ (!buffer_mapped(bh) || buffer_delay(bh))) {
3324+ mpage_add_bh_to_extent(mpd, logical, bh);
3325+ if (mpd->io_done)
3326+ return MPAGE_DA_EXTENT_TAIL;
3327+ }
3328+ logical++;
3329+ } while ((bh = bh->b_this_page) != head);
3330+ }
3331+
3332+ return 0;
3333+}
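
The `logical` value above converts a page index into the first file-relative block number on that page; each subsequent buffer head on the page advances it by one. A standalone sketch of the shift arithmetic, assuming 4K pages (PAGE_CACHE_SHIFT == 12) and 1K filesystem blocks (i_blkbits == 10):

#include <stdio.h>

int main(void)
{
	unsigned int page_shift = 12, blkbits = 10;	/* assumed sizes */
	unsigned long index = 3;			/* fourth page */
	unsigned long logical = index << (page_shift - blkbits);

	printf("page %lu starts at logical block %lu\n", index, logical);
	return 0;					/* prints block 12 */
}
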
3334+
3335+/*
 3336+ * mpage_da_writepages - walks the list of dirty pages of the given
 3337+ * address space, allocates non-allocated blocks, maps newly-allocated
 3338+ * blocks to existing bhs and issues IO on them
3339+ *
3340+ * @mapping: address space structure to write
3341+ * @wbc: subtract the number of written pages from *@wbc->nr_to_write
3342+ * @get_block: the filesystem's block mapper function.
3343+ *
3344+ * This is a library function, which implements the writepages()
3345+ * address_space_operation.
3346+ */
3347+static int mpage_da_writepages(struct address_space *mapping,
3348+ struct writeback_control *wbc,
3349+ get_block_t get_block)
3350+{
3351+ struct mpage_da_data mpd;
3352+ long to_write;
3353+ int ret;
3354+
3355+ if (!get_block)
3356+ return generic_writepages(mapping, wbc);
3357+
3358+ mpd.wbc = wbc;
3359+ mpd.inode = mapping->host;
3360+ mpd.lbh.b_size = 0;
3361+ mpd.lbh.b_state = 0;
3362+ mpd.lbh.b_blocknr = 0;
3363+ mpd.first_page = 0;
3364+ mpd.next_page = 0;
3365+ mpd.get_block = get_block;
3366+ mpd.io_done = 0;
3367+ mpd.pages_written = 0;
3368+
3369+ to_write = wbc->nr_to_write;
3370+
3371+ ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, &mpd);
3372+
3373+ /*
3374+ * Handle last extent of pages
3375+ */
3376+ if (!mpd.io_done && mpd.next_page != mpd.first_page) {
3377+ mpage_da_map_blocks(&mpd);
3378+ mpage_da_submit_io(&mpd);
3379+ }
3380+
3381+ wbc->nr_to_write = to_write - mpd.pages_written;
3382+ return ret;
3383+}
3384+
3385+/*
3386+ * this is a special callback for ->write_begin() only
 3387+ * its intention is to return a mapped block or reserve space
3388+ */
3389+static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
3390+ struct buffer_head *bh_result, int create)
3391+{
3392+ int ret = 0;
3393+
3394+ BUG_ON(create == 0);
3395+ BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
3396+
3397+ /*
 3398+	 * first, we need to know whether the block is allocated already;
 3399+	 * preallocated blocks are unmapped but should be treated
3400+ * the same as allocated blocks.
3401+ */
3402+ ret = ext4_get_blocks_wrap(NULL, inode, iblock, 1, bh_result, 0, 0, 0);
3403+ if ((ret == 0) && !buffer_delay(bh_result)) {
3404+ /* the block isn't (pre)allocated yet, let's reserve space */
3405+ /*
3406+ * XXX: __block_prepare_write() unmaps passed block,
3407+ * is it OK?
3408+ */
3409+ ret = ext4_da_reserve_space(inode, 1);
3410+ if (ret)
3411+ /* not enough space to reserve */
3412+ return ret;
3413+
3414+ map_bh(bh_result, inode->i_sb, 0);
3415+ set_buffer_new(bh_result);
3416+ set_buffer_delay(bh_result);
3417+ } else if (ret > 0) {
3418+ bh_result->b_size = (ret << inode->i_blkbits);
3419+ ret = 0;
3420+ }
3421+
3422+ return ret;
3423+}
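
A hedged model of the decision ext4_da_get_block_prep() makes: an unmapped, not-yet-delayed block gets a one-block reservation and is marked delayed; anything already (pre)allocated is reported back unchanged. The helper name and state flags below are stand-ins for the buffer-head machinery, not kernel interfaces:

#include <stdio.h>

enum { B_MAPPED = 1, B_DELAY = 2 };

/* returns 0 on success, -1 when no space is left to reserve */
static int prep_block(unsigned int *state, long *reserved, long free_blocks)
{
	if (!(*state & (B_MAPPED | B_DELAY))) {
		if (free_blocks - *reserved < 1)
			return -1;	/* would overcommit: ENOSPC */
		(*reserved)++;		/* account one delayed block */
		*state |= B_DELAY;	/* allocation deferred to writeout */
	}
	return 0;
}

int main(void)
{
	unsigned int state = 0;
	long reserved = 0;

	prep_block(&state, &reserved, 100);
	printf("reserved=%ld delayed=%d\n", reserved, !!(state & B_DELAY));
	return 0;
}
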
3424+#define EXT4_DELALLOC_RSVED 1
3425+static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
3426+ struct buffer_head *bh_result, int create)
3427+{
3428+ int ret;
3429+ unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
3430+ loff_t disksize = EXT4_I(inode)->i_disksize;
3431+ handle_t *handle = NULL;
3432+
3433+ handle = ext4_journal_current_handle();
3434+ if (!handle) {
3435+ ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
3436+ bh_result, 0, 0, 0);
3437+ BUG_ON(!ret);
3438+ } else {
3439+ ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
3440+ bh_result, create, 0, EXT4_DELALLOC_RSVED);
3441+ }
3442+
3443+ if (ret > 0) {
3444+ bh_result->b_size = (ret << inode->i_blkbits);
3445+
3446+ /*
 3447+		 * Update on-disk size along with block allocation.
 3448+		 * We don't use 'extend_disksize' as size may change
 3449+		 * within an already allocated block -bzzz
3450+ */
3451+ disksize = ((loff_t) iblock + ret) << inode->i_blkbits;
3452+ if (disksize > i_size_read(inode))
3453+ disksize = i_size_read(inode);
3454+ if (disksize > EXT4_I(inode)->i_disksize) {
3455+ /*
3456+ * XXX: replace with spinlock if seen contended -bzzz
3457+ */
3458+ down_write(&EXT4_I(inode)->i_data_sem);
3459+ if (disksize > EXT4_I(inode)->i_disksize)
3460+ EXT4_I(inode)->i_disksize = disksize;
3461+ up_write(&EXT4_I(inode)->i_data_sem);
3462+
3463+ if (EXT4_I(inode)->i_disksize == disksize) {
3464+ ret = ext4_mark_inode_dirty(handle, inode);
3465+ return ret;
3466+ }
3467+ }
3468+ ret = 0;
3469+ }
3470+ return ret;
3471+}
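
The i_disksize update above advances the on-disk size to the end of the freshly mapped range but clamps it to the in-core i_size, since the allocation may land inside a block the file already logically covers. A small sketch of just that clamping, with made-up sizes:

#include <stdio.h>

int main(void)
{
	unsigned int blkbits = 12;		/* assume 4K blocks */
	long long i_size = 20000;		/* in-core file size */
	long long i_disksize = 8192;		/* on-disk size so far */
	long long iblock = 2, mapped = 3;	/* blocks [2, 5) mapped */

	long long disksize = (iblock + mapped) << blkbits;	/* 20480 */
	if (disksize > i_size)
		disksize = i_size;		/* never past i_size */
	if (disksize > i_disksize)
		i_disksize = disksize;
	printf("i_disksize = %lld\n", i_disksize);	/* 20000 */
	return 0;
}
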
3472+
3473+static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
3474+{
3475+ /*
 3476+	 * an unmapped buffer is possible for holes.
 3477+	 * a delayed buffer is possible with delayed allocation
3478+ */
3479+ return ((!buffer_mapped(bh) || buffer_delay(bh)) && buffer_dirty(bh));
3480+}
3481+
3482+static int ext4_normal_get_block_write(struct inode *inode, sector_t iblock,
3483+ struct buffer_head *bh_result, int create)
3484+{
3485+ int ret = 0;
3486+ unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
3487+
3488+ /*
3489+ * we don't want to do block allocation in writepage
3490+ * so call get_block_wrap with create = 0
3491+ */
3492+ ret = ext4_get_blocks_wrap(NULL, inode, iblock, max_blocks,
3493+ bh_result, 0, 0, 0);
3494+ if (ret > 0) {
3495+ bh_result->b_size = (ret << inode->i_blkbits);
3496+ ret = 0;
3497+ }
3498+ return ret;
3499+}
3500+
3501+/*
 3502+ * get called via ext4_da_writepages after taking page lock (have journal handle)
3503+ * get called via journal_submit_inode_data_buffers (no journal handle)
3504+ * get called via shrink_page_list via pdflush (no journal handle)
 3505+ * or grab_cache_page when doing write_begin (have journal handle)
3506+ */
3507+static int ext4_da_writepage(struct page *page,
3508+ struct writeback_control *wbc)
3509+{
3510+ int ret = 0;
3511+ loff_t size;
3512+ unsigned long len;
3513+ struct buffer_head *page_bufs;
3514+ struct inode *inode = page->mapping->host;
3515+
3516+ size = i_size_read(inode);
3517+ if (page->index == size >> PAGE_CACHE_SHIFT)
3518+ len = size & ~PAGE_CACHE_MASK;
3519+ else
3520+ len = PAGE_CACHE_SIZE;
3521+
3522+ if (page_has_buffers(page)) {
3523+ page_bufs = page_buffers(page);
3524+ if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
3525+ ext4_bh_unmapped_or_delay)) {
3526+ /*
 3527+			 * We don't want to do block allocation,
 3528+			 * so redirty the page and return.
 3529+			 * We may reach here when we do a journal commit
 3530+			 * via journal_submit_inode_data_buffers.
 3531+			 * If we don't have a mapping block we just ignore
 3532+			 * them. We can also reach here via shrink_page_list.
3533+ */
3534+ redirty_page_for_writepage(wbc, page);
3535+ unlock_page(page);
3536+ return 0;
3537+ }
3538+ } else {
3539+ /*
3540+ * The test for page_has_buffers() is subtle:
3541+ * We know the page is dirty but it lost buffers. That means
3542+ * that at some moment in time after write_begin()/write_end()
3543+ * has been called all buffers have been clean and thus they
3544+ * must have been written at least once. So they are all
3545+ * mapped and we can happily proceed with mapping them
3546+ * and writing the page.
3547+ *
3548+ * Try to initialize the buffer_heads and check whether
 3549+		 * all are mapped and not delayed. We don't want to
3550+ * do block allocation here.
3551+ */
3552+ ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
3553+ ext4_normal_get_block_write);
3554+ if (!ret) {
3555+ page_bufs = page_buffers(page);
 3556+			/* check whether all are mapped and not delayed */
3557+ if (walk_page_buffers(NULL, page_bufs, 0, len, NULL,
3558+ ext4_bh_unmapped_or_delay)) {
3559+ redirty_page_for_writepage(wbc, page);
3560+ unlock_page(page);
3561+ return 0;
3562+ }
3563+ } else {
3564+ /*
3565+ * We can't do block allocation here
 3566+			 * so just redirty the page, unlock it
 3567+			 * and return
3568+ */
3569+ redirty_page_for_writepage(wbc, page);
3570+ unlock_page(page);
3571+ return 0;
3572+ }
3573+ }
3574+
3575+ if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
3576+ ret = nobh_writepage(page, ext4_normal_get_block_write, wbc);
3577+ else
3578+ ret = block_write_full_page(page,
3579+ ext4_normal_get_block_write,
3580+ wbc);
3581+
3582+ return ret;
3583+}
3584+
3585+/*
3586+ * This is called via ext4_da_writepages() to
 3587+ * calculate the total number of credits to reserve to fit
 3588+ * a single extent allocation into a single transaction;
 3589+ * ext4_da_writepages() will loop calling this before
3590+ * the block allocation.
3591+ */
3592+
3593+static int ext4_da_writepages_trans_blocks(struct inode *inode)
3594+{
3595+ int max_blocks = EXT4_I(inode)->i_reserved_data_blocks;
3596+
3597+ /*
 3598+	 * With non-extent format the journal credits needed to
 3599+	 * insert nrblocks contiguous blocks depend on the
 3600+	 * number of contiguous blocks, so we will limit the
 3601+	 * number of contiguous blocks to a sane value
3602+ */
3603+ if (!(inode->i_flags & EXT4_EXTENTS_FL) &&
3604+ (max_blocks > EXT4_MAX_TRANS_DATA))
3605+ max_blocks = EXT4_MAX_TRANS_DATA;
3606+
3607+ return ext4_chunk_trans_blocks(inode, max_blocks);
3608+}
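
In other words the function is a clamped pass-through into the credit calculator. A trivial sketch of the cap, using an illustrative constant in place of EXT4_MAX_TRANS_DATA:

#include <stdio.h>

#define MAX_TRANS_DATA 64	/* illustrative, not the real value */

static int writepages_trans_blocks(int reserved_blocks, int extent_based)
{
	int max_blocks = reserved_blocks;

	/* indirect-mapped files: credit cost grows with contiguity */
	if (!extent_based && max_blocks > MAX_TRANS_DATA)
		max_blocks = MAX_TRANS_DATA;
	return max_blocks;	/* then fed to the credit calculation */
}

int main(void)
{
	printf("%d\n", writepages_trans_blocks(1000, 0));	/* 64 */
	return 0;
}
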
3609+
3610+static int ext4_da_writepages(struct address_space *mapping,
3611+ struct writeback_control *wbc)
3612+{
3613+ handle_t *handle = NULL;
3614+ loff_t range_start = 0;
3615+ struct inode *inode = mapping->host;
3616+ int needed_blocks, ret = 0, nr_to_writebump = 0;
3617+ long to_write, pages_skipped = 0;
3618+ struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
3619+
3620+ /*
3621+ * No pages to write? This is mainly a kludge to avoid starting
 3622+	 * a transaction for special inodes like the journal inode on last iput()
3623+ * because that could violate lock ordering on umount
3624+ */
3625+ if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
3626+ return 0;
3627+ /*
3628+ * Make sure nr_to_write is >= sbi->s_mb_stream_request
 3629+	 * This makes sure a small file's blocks are allocated in a
 3630+	 * single attempt, which ensures that small files
 3631+	 * get less fragmented.
3632+ */
3633+ if (wbc->nr_to_write < sbi->s_mb_stream_request) {
3634+ nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write;
3635+ wbc->nr_to_write = sbi->s_mb_stream_request;
3636+ }
3637+
3638+ if (!wbc->range_cyclic)
3639+ /*
 3640+		 * If range_cyclic is not set, force range_cont
3641+ * and save the old writeback_index
3642+ */
3643+ wbc->range_cont = 1;
3644+
3645+ range_start = wbc->range_start;
3646+ pages_skipped = wbc->pages_skipped;
3647+
3648+restart_loop:
3649+ to_write = wbc->nr_to_write;
3650+ while (!ret && to_write > 0) {
3651+
3652+ /*
 3653+		 * We insert one extent at a time, so we need the
 3654+		 * credits for a single extent allocation.
 3655+		 * Journalled mode is currently not supported
 3656+		 * by delalloc.
3657+ */
3658+ BUG_ON(ext4_should_journal_data(inode));
3659+ needed_blocks = ext4_da_writepages_trans_blocks(inode);
3660+
3661+ /* start a new transaction*/
3662+ handle = ext4_journal_start(inode, needed_blocks);
3663+ if (IS_ERR(handle)) {
3664+ ret = PTR_ERR(handle);
3665+ printk(KERN_EMERG "%s: jbd2_start: "
3666+ "%ld pages, ino %lu; err %d\n", __func__,
3667+ wbc->nr_to_write, inode->i_ino, ret);
3668+ dump_stack();
3669+ goto out_writepages;
3670+ }
3671+ if (ext4_should_order_data(inode)) {
3672+ /*
3673+ * With ordered mode we need to add
 3674+			 * the inode to the journal handle
3675+ * when we do block allocation.
3676+ */
3677+ ret = ext4_jbd2_file_inode(handle, inode);
3678+ if (ret) {
3679+ ext4_journal_stop(handle);
3680+ goto out_writepages;
3681+ }
3682+ }
3683+
3684+ to_write -= wbc->nr_to_write;
3685+ ret = mpage_da_writepages(mapping, wbc,
3686+ ext4_da_get_block_write);
3687+ ext4_journal_stop(handle);
3688+ if (ret == MPAGE_DA_EXTENT_TAIL) {
3689+ /*
3690+ * got one extent now try with
3691+ * rest of the pages
3692+ */
3693+ to_write += wbc->nr_to_write;
3694+ ret = 0;
3695+ } else if (wbc->nr_to_write) {
3696+ /*
3697+ * There is no more writeout needed
 3698+			 * or we requested a non-blocking writeout
3699+ * and we found the device congested
3700+ */
3701+ to_write += wbc->nr_to_write;
3702+ break;
3703+ }
3704+ wbc->nr_to_write = to_write;
3705+ }
3706+
3707+ if (wbc->range_cont && (pages_skipped != wbc->pages_skipped)) {
3708+ /* We skipped pages in this loop */
3709+ wbc->range_start = range_start;
3710+ wbc->nr_to_write = to_write +
3711+ wbc->pages_skipped - pages_skipped;
3712+ wbc->pages_skipped = pages_skipped;
3713+ goto restart_loop;
3714+ }
3715+
3716+out_writepages:
3717+ wbc->nr_to_write = to_write - nr_to_writebump;
3718+ wbc->range_start = range_start;
3719+ return ret;
3720+}
3721+
3722+static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
3723+ loff_t pos, unsigned len, unsigned flags,
3724+ struct page **pagep, void **fsdata)
3725+{
3726+ int ret, retries = 0;
3727+ struct page *page;
3728+ pgoff_t index;
3729+ unsigned from, to;
3730+ struct inode *inode = mapping->host;
3731+ handle_t *handle;
3732+
3733+ index = pos >> PAGE_CACHE_SHIFT;
3734+ from = pos & (PAGE_CACHE_SIZE - 1);
3735+ to = from + len;
3736+
3737+retry:
3738+ /*
3739+ * With delayed allocation, we don't log the i_disksize update
3740+ * if there is delayed block allocation. But we still need
 3741+	 * to journal the i_disksize update if we write to the end
 3742+	 * of a file which has an already mapped buffer.
3743+ */
3744+ handle = ext4_journal_start(inode, 1);
3745+ if (IS_ERR(handle)) {
3746+ ret = PTR_ERR(handle);
3747+ goto out;
3748+ }
3749+
3750+ page = __grab_cache_page(mapping, index);
3751+ if (!page) {
3752+ ext4_journal_stop(handle);
3753+ ret = -ENOMEM;
3754+ goto out;
3755+ }
3756+ *pagep = page;
3757+
3758+ ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
3759+ ext4_da_get_block_prep);
3760+ if (ret < 0) {
3761+ unlock_page(page);
3762+ ext4_journal_stop(handle);
3763+ page_cache_release(page);
3764+ }
3765+
3766+ if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
3767+ goto retry;
3768+out:
3769+ return ret;
3770+}
3771+
3772+/*
3773+ * Check if we should update i_disksize
 3774+ * when writing to the end of a file without requiring block allocation
3775+ */
3776+static int ext4_da_should_update_i_disksize(struct page *page,
3777+ unsigned long offset)
3778+{
3779+ struct buffer_head *bh;
3780+ struct inode *inode = page->mapping->host;
3781+ unsigned int idx;
3782+ int i;
3783+
3784+ bh = page_buffers(page);
3785+ idx = offset >> inode->i_blkbits;
3786+
 3787+	for (i = 0; i < idx; i++)
3788+ bh = bh->b_this_page;
3789+
3790+ if (!buffer_mapped(bh) || (buffer_delay(bh)))
3791+ return 0;
3792+ return 1;
3793+}
3794+
3795+static int ext4_da_write_end(struct file *file,
3796+ struct address_space *mapping,
3797+ loff_t pos, unsigned len, unsigned copied,
3798+ struct page *page, void *fsdata)
3799+{
3800+ struct inode *inode = mapping->host;
3801+ int ret = 0, ret2;
3802+ handle_t *handle = ext4_journal_current_handle();
3803+ loff_t new_i_size;
3804+ unsigned long start, end;
3805+
3806+ start = pos & (PAGE_CACHE_SIZE - 1);
 3807+	end = start + copied - 1;
3808+
3809+ /*
3810+ * generic_write_end() will run mark_inode_dirty() if i_size
3811+ * changes. So let's piggyback the i_disksize mark_inode_dirty
3812+ * into that.
3813+ */
3814+
3815+ new_i_size = pos + copied;
3816+ if (new_i_size > EXT4_I(inode)->i_disksize) {
3817+ if (ext4_da_should_update_i_disksize(page, end)) {
3818+ down_write(&EXT4_I(inode)->i_data_sem);
3819+ if (new_i_size > EXT4_I(inode)->i_disksize) {
3820+ /*
3821+ * Updating i_disksize when extending file
3822+ * without needing block allocation
3823+ */
3824+ if (ext4_should_order_data(inode))
3825+ ret = ext4_jbd2_file_inode(handle,
3826+ inode);
3827+
3828+ EXT4_I(inode)->i_disksize = new_i_size;
3829+ }
3830+ up_write(&EXT4_I(inode)->i_data_sem);
3831+ }
3832+ }
3833+ ret2 = generic_write_end(file, mapping, pos, len, copied,
3834+ page, fsdata);
3835+ copied = ret2;
3836+ if (ret2 < 0)
3837+ ret = ret2;
3838+ ret2 = ext4_journal_stop(handle);
3839+ if (!ret)
3840+ ret = ret2;
3841+
3842+ return ret ? ret : copied;
3843+}
3844+
3845+static void ext4_da_invalidatepage(struct page *page, unsigned long offset)
3846+{
3847+ /*
3848+ * Drop reserved blocks
3849+ */
3850+ BUG_ON(!PageLocked(page));
3851+ if (!page_has_buffers(page))
3852+ goto out;
3853+
3854+ ext4_da_page_release_reservation(page, offset);
3855+
3856+out:
3857+ ext4_invalidatepage(page, offset);
3858+
3859+ return;
3860+}
3861+
3862+
3863 /*
3864 * bmap() is special. It gets used by applications such as lilo and by
3865 * the swapper to find the on-disk block of a specific piece of data.
3866@@ -1418,6 +2552,16 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
3867 journal_t *journal;
3868 int err;
3869
3870+ if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) &&
3871+ test_opt(inode->i_sb, DELALLOC)) {
3872+ /*
3873+ * With delalloc we want to sync the file
3874+ * so that we can make sure we allocate
 3875+		 * blocks for the file
3876+ */
3877+ filemap_write_and_wait(mapping);
3878+ }
3879+
3880 if (EXT4_I(inode)->i_state & EXT4_STATE_JDATA) {
3881 /*
3882 * This is a REALLY heavyweight approach, but the use of
3883@@ -1462,21 +2606,17 @@ static int bput_one(handle_t *handle, struct buffer_head *bh)
3884 return 0;
3885 }
3886
3887-static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
3888-{
3889- if (buffer_mapped(bh))
3890- return ext4_journal_dirty_data(handle, bh);
3891- return 0;
3892-}
3893-
3894 /*
3895- * Note that we always start a transaction even if we're not journalling
3896- * data. This is to preserve ordering: any hole instantiation within
3897- * __block_write_full_page -> ext4_get_block() should be journalled
3898- * along with the data so we don't crash and then get metadata which
3899- * refers to old data.
3900+ * Note that we don't need to start a transaction unless we're journaling data
3901+ * because we should have holes filled from ext4_page_mkwrite(). We even don't
3902+ * need to file the inode to the transaction's list in ordered mode because if
3903+ * we are writing back data added by write(), the inode is already there and if
 3904+ * we are writing back data modified via mmap(), no one guarantees in which
3905+ * transaction the data will hit the disk. In case we are journaling data, we
3906+ * cannot start transaction directly because transaction start ranks above page
3907+ * lock so we have to do some magic.
3908 *
3909- * In all journalling modes block_write_full_page() will start the I/O.
3910+ * In all journaling modes block_write_full_page() will start the I/O.
3911 *
3912 * Problem:
3913 *
3914@@ -1518,105 +2658,103 @@ static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
3915 * disastrous. Any write() or metadata operation will sync the fs for
3916 * us.
3917 *
3918- * AKPM2: if all the page's buffers are mapped to disk and !data=journal,
3919- * we don't need to open a transaction here.
3920 */
3921-static int ext4_ordered_writepage(struct page *page,
3922+static int __ext4_normal_writepage(struct page *page,
3923 struct writeback_control *wbc)
3924 {
3925 struct inode *inode = page->mapping->host;
3926- struct buffer_head *page_bufs;
3927- handle_t *handle = NULL;
3928- int ret = 0;
3929- int err;
3930
3931- J_ASSERT(PageLocked(page));
3932-
3933- /*
3934- * We give up here if we're reentered, because it might be for a
3935- * different filesystem.
3936- */
3937- if (ext4_journal_current_handle())
3938- goto out_fail;
3939-
3940- handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
3941+ if (test_opt(inode->i_sb, NOBH))
3942+ return nobh_writepage(page,
3943+ ext4_normal_get_block_write, wbc);
3944+ else
3945+ return block_write_full_page(page,
3946+ ext4_normal_get_block_write,
3947+ wbc);
3948+}
3949
3950- if (IS_ERR(handle)) {
3951- ret = PTR_ERR(handle);
3952- goto out_fail;
3953- }
3954+static int ext4_normal_writepage(struct page *page,
3955+ struct writeback_control *wbc)
3956+{
3957+ struct inode *inode = page->mapping->host;
3958+ loff_t size = i_size_read(inode);
3959+ loff_t len;
3960
3961- if (!page_has_buffers(page)) {
3962- create_empty_buffers(page, inode->i_sb->s_blocksize,
3963- (1 << BH_Dirty)|(1 << BH_Uptodate));
3964+ J_ASSERT(PageLocked(page));
3965+ if (page->index == size >> PAGE_CACHE_SHIFT)
3966+ len = size & ~PAGE_CACHE_MASK;
3967+ else
3968+ len = PAGE_CACHE_SIZE;
3969+
3970+ if (page_has_buffers(page)) {
 3971+		/* if the page has buffers they should all be mapped
 3972+		 * and allocated. If there are no buffers attached
3973+ * to the page we know the page is dirty but it lost
3974+ * buffers. That means that at some moment in time
3975+ * after write_begin() / write_end() has been called
3976+ * all buffers have been clean and thus they must have been
3977+ * written at least once. So they are all mapped and we can
3978+ * happily proceed with mapping them and writing the page.
3979+ */
3980+ BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
3981+ ext4_bh_unmapped_or_delay));
3982 }
3983- page_bufs = page_buffers(page);
3984- walk_page_buffers(handle, page_bufs, 0,
3985- PAGE_CACHE_SIZE, NULL, bget_one);
3986-
3987- ret = block_write_full_page(page, ext4_get_block, wbc);
3988-
3989- /*
3990- * The page can become unlocked at any point now, and
3991- * truncate can then come in and change things. So we
3992- * can't touch *page from now on. But *page_bufs is
3993- * safe due to elevated refcount.
3994- */
3995
3996- /*
3997- * And attach them to the current transaction. But only if
3998- * block_write_full_page() succeeded. Otherwise they are unmapped,
3999- * and generally junk.
4000- */
4001- if (ret == 0) {
4002- err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,
4003- NULL, jbd2_journal_dirty_data_fn);
4004- if (!ret)
4005- ret = err;
4006- }
4007- walk_page_buffers(handle, page_bufs, 0,
4008- PAGE_CACHE_SIZE, NULL, bput_one);
4009- err = ext4_journal_stop(handle);
4010- if (!ret)
4011- ret = err;
4012- return ret;
4013+ if (!ext4_journal_current_handle())
4014+ return __ext4_normal_writepage(page, wbc);
4015
4016-out_fail:
4017 redirty_page_for_writepage(wbc, page);
4018 unlock_page(page);
4019- return ret;
4020+ return 0;
4021 }
4022
4023-static int ext4_writeback_writepage(struct page *page,
4024+static int __ext4_journalled_writepage(struct page *page,
4025 struct writeback_control *wbc)
4026 {
4027- struct inode *inode = page->mapping->host;
4028+ struct address_space *mapping = page->mapping;
4029+ struct inode *inode = mapping->host;
4030+ struct buffer_head *page_bufs;
4031 handle_t *handle = NULL;
4032 int ret = 0;
4033 int err;
4034
4035- if (ext4_journal_current_handle())
4036- goto out_fail;
4037+ ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
4038+ ext4_normal_get_block_write);
4039+ if (ret != 0)
4040+ goto out_unlock;
4041+
4042+ page_bufs = page_buffers(page);
4043+ walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL,
4044+ bget_one);
4045+ /* As soon as we unlock the page, it can go away, but we have
4046+ * references to buffers so we are safe */
4047+ unlock_page(page);
4048
4049 handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
4050 if (IS_ERR(handle)) {
4051 ret = PTR_ERR(handle);
4052- goto out_fail;
4053+ goto out;
4054 }
4055
4056- if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
4057- ret = nobh_writepage(page, ext4_get_block, wbc);
4058- else
4059- ret = block_write_full_page(page, ext4_get_block, wbc);
4060+ ret = walk_page_buffers(handle, page_bufs, 0,
4061+ PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
4062
4063+ err = walk_page_buffers(handle, page_bufs, 0,
4064+ PAGE_CACHE_SIZE, NULL, write_end_fn);
4065+ if (ret == 0)
4066+ ret = err;
4067 err = ext4_journal_stop(handle);
4068 if (!ret)
4069 ret = err;
4070- return ret;
4071
4072-out_fail:
4073- redirty_page_for_writepage(wbc, page);
4074+ walk_page_buffers(handle, page_bufs, 0,
4075+ PAGE_CACHE_SIZE, NULL, bput_one);
4076+ EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
4077+ goto out;
4078+
4079+out_unlock:
4080 unlock_page(page);
4081+out:
4082 return ret;
4083 }
4084
4085@@ -1624,59 +2762,53 @@ static int ext4_journalled_writepage(struct page *page,
4086 struct writeback_control *wbc)
4087 {
4088 struct inode *inode = page->mapping->host;
4089- handle_t *handle = NULL;
4090- int ret = 0;
4091- int err;
4092+ loff_t size = i_size_read(inode);
4093+ loff_t len;
4094
4095- if (ext4_journal_current_handle())
4096- goto no_write;
4097+ J_ASSERT(PageLocked(page));
4098+ if (page->index == size >> PAGE_CACHE_SHIFT)
4099+ len = size & ~PAGE_CACHE_MASK;
4100+ else
4101+ len = PAGE_CACHE_SIZE;
4102+
4103+ if (page_has_buffers(page)) {
 4104+	if (page_has_buffers(page)) {
 4105+		/* if the page has buffers they should all be mapped
4106+ * to the page we know the page is dirty but it lost
4107+ * buffers. That means that at some moment in time
4108+ * after write_begin() / write_end() has been called
4109+ * all buffers have been clean and thus they must have been
4110+ * written at least once. So they are all mapped and we can
4111+ * happily proceed with mapping them and writing the page.
4112+ */
4113+ BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
4114+ ext4_bh_unmapped_or_delay));
4115+ }
4116
4117- handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode));
4118- if (IS_ERR(handle)) {
4119- ret = PTR_ERR(handle);
4120+ if (ext4_journal_current_handle())
4121 goto no_write;
4122- }
4123
4124- if (!page_has_buffers(page) || PageChecked(page)) {
4125+ if (PageChecked(page)) {
4126 /*
4127 * It's mmapped pagecache. Add buffers and journal it. There
4128 * doesn't seem much point in redirtying the page here.
4129 */
4130 ClearPageChecked(page);
4131- ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
4132- ext4_get_block);
4133- if (ret != 0) {
4134- ext4_journal_stop(handle);
4135- goto out_unlock;
4136- }
4137- ret = walk_page_buffers(handle, page_buffers(page), 0,
4138- PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
4139-
4140- err = walk_page_buffers(handle, page_buffers(page), 0,
4141- PAGE_CACHE_SIZE, NULL, write_end_fn);
4142- if (ret == 0)
4143- ret = err;
4144- EXT4_I(inode)->i_state |= EXT4_STATE_JDATA;
4145- unlock_page(page);
4146+ return __ext4_journalled_writepage(page, wbc);
4147 } else {
4148 /*
4149 * It may be a page full of checkpoint-mode buffers. We don't
4150 * really know unless we go poke around in the buffer_heads.
4151 * But block_write_full_page will do the right thing.
4152 */
4153- ret = block_write_full_page(page, ext4_get_block, wbc);
4154+ return block_write_full_page(page,
4155+ ext4_normal_get_block_write,
4156+ wbc);
4157 }
4158- err = ext4_journal_stop(handle);
4159- if (!ret)
4160- ret = err;
4161-out:
4162- return ret;
4163-
4164 no_write:
4165 redirty_page_for_writepage(wbc, page);
4166-out_unlock:
4167 unlock_page(page);
4168- goto out;
4169+ return 0;
4170 }
4171
4172 static int ext4_readpage(struct file *file, struct page *page)
4173@@ -1819,7 +2951,7 @@ static int ext4_journalled_set_page_dirty(struct page *page)
4174 static const struct address_space_operations ext4_ordered_aops = {
4175 .readpage = ext4_readpage,
4176 .readpages = ext4_readpages,
4177- .writepage = ext4_ordered_writepage,
4178+ .writepage = ext4_normal_writepage,
4179 .sync_page = block_sync_page,
4180 .write_begin = ext4_write_begin,
4181 .write_end = ext4_ordered_write_end,
4182@@ -1833,7 +2965,7 @@ static const struct address_space_operations ext4_ordered_aops = {
4183 static const struct address_space_operations ext4_writeback_aops = {
4184 .readpage = ext4_readpage,
4185 .readpages = ext4_readpages,
4186- .writepage = ext4_writeback_writepage,
4187+ .writepage = ext4_normal_writepage,
4188 .sync_page = block_sync_page,
4189 .write_begin = ext4_write_begin,
4190 .write_end = ext4_writeback_write_end,
4191@@ -1857,10 +2989,31 @@ static const struct address_space_operations ext4_journalled_aops = {
4192 .releasepage = ext4_releasepage,
4193 };
4194
4195+static const struct address_space_operations ext4_da_aops = {
4196+ .readpage = ext4_readpage,
4197+ .readpages = ext4_readpages,
4198+ .writepage = ext4_da_writepage,
4199+ .writepages = ext4_da_writepages,
4200+ .sync_page = block_sync_page,
4201+ .write_begin = ext4_da_write_begin,
4202+ .write_end = ext4_da_write_end,
4203+ .bmap = ext4_bmap,
4204+ .invalidatepage = ext4_da_invalidatepage,
4205+ .releasepage = ext4_releasepage,
4206+ .direct_IO = ext4_direct_IO,
4207+ .migratepage = buffer_migrate_page,
4208+};
4209+
4210 void ext4_set_aops(struct inode *inode)
4211 {
4212- if (ext4_should_order_data(inode))
4213+ if (ext4_should_order_data(inode) &&
4214+ test_opt(inode->i_sb, DELALLOC))
4215+ inode->i_mapping->a_ops = &ext4_da_aops;
4216+ else if (ext4_should_order_data(inode))
4217 inode->i_mapping->a_ops = &ext4_ordered_aops;
4218+ else if (ext4_should_writeback_data(inode) &&
4219+ test_opt(inode->i_sb, DELALLOC))
4220+ inode->i_mapping->a_ops = &ext4_da_aops;
4221 else if (ext4_should_writeback_data(inode))
4222 inode->i_mapping->a_ops = &ext4_writeback_aops;
4223 else
4224@@ -1873,7 +3026,7 @@ void ext4_set_aops(struct inode *inode)
4225 * This required during truncate. We need to physically zero the tail end
4226 * of that block so it doesn't yield old data if the file is later grown.
4227 */
4228-int ext4_block_truncate_page(handle_t *handle, struct page *page,
4229+int ext4_block_truncate_page(handle_t *handle,
4230 struct address_space *mapping, loff_t from)
4231 {
4232 ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
4233@@ -1882,8 +3035,13 @@ int ext4_block_truncate_page(handle_t *handle, struct page *page,
4234 ext4_lblk_t iblock;
4235 struct inode *inode = mapping->host;
4236 struct buffer_head *bh;
4237+ struct page *page;
4238 int err = 0;
4239
4240+ page = grab_cache_page(mapping, from >> PAGE_CACHE_SHIFT);
4241+ if (!page)
4242+ return -EINVAL;
4243+
4244 blocksize = inode->i_sb->s_blocksize;
4245 length = blocksize - (offset & (blocksize - 1));
4246 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
4247@@ -1956,7 +3114,7 @@ int ext4_block_truncate_page(handle_t *handle, struct page *page,
4248 err = ext4_journal_dirty_metadata(handle, bh);
4249 } else {
4250 if (ext4_should_order_data(inode))
4251- err = ext4_journal_dirty_data(handle, bh);
4252+ err = ext4_jbd2_file_inode(handle, inode);
4253 mark_buffer_dirty(bh);
4254 }
4255
4256@@ -2179,7 +3337,21 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
4257
4258 if (this_bh) {
4259 BUFFER_TRACE(this_bh, "call ext4_journal_dirty_metadata");
4260- ext4_journal_dirty_metadata(handle, this_bh);
4261+
4262+ /*
4263+ * The buffer head should have an attached journal head at this
4264+ * point. However, if the data is corrupted and an indirect
4265+ * block pointed to itself, it would have been detached when
4266+ * the block was cleared. Check for this instead of OOPSing.
4267+ */
4268+ if (bh2jh(this_bh))
4269+ ext4_journal_dirty_metadata(handle, this_bh);
4270+ else
4271+ ext4_error(inode->i_sb, __func__,
4272+ "circular indirect block detected, "
4273+ "inode=%lu, block=%llu",
4274+ inode->i_ino,
4275+ (unsigned long long) this_bh->b_blocknr);
4276 }
4277 }
4278
4279@@ -2305,6 +3477,19 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
4280 }
4281 }
4282
4283+int ext4_can_truncate(struct inode *inode)
4284+{
4285+ if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
4286+ return 0;
4287+ if (S_ISREG(inode->i_mode))
4288+ return 1;
4289+ if (S_ISDIR(inode->i_mode))
4290+ return 1;
4291+ if (S_ISLNK(inode->i_mode))
4292+ return !ext4_inode_is_fast_symlink(inode);
4293+ return 0;
4294+}
4295+
4296 /*
4297 * ext4_truncate()
4298 *
4299@@ -2347,51 +3532,25 @@ void ext4_truncate(struct inode *inode)
4300 int n;
4301 ext4_lblk_t last_block;
4302 unsigned blocksize = inode->i_sb->s_blocksize;
4303- struct page *page;
4304
4305- if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
4306- S_ISLNK(inode->i_mode)))
4307- return;
4308- if (ext4_inode_is_fast_symlink(inode))
4309- return;
4310- if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
4311+ if (!ext4_can_truncate(inode))
4312 return;
4313
4314- /*
4315- * We have to lock the EOF page here, because lock_page() nests
4316- * outside jbd2_journal_start().
4317- */
4318- if ((inode->i_size & (blocksize - 1)) == 0) {
4319- /* Block boundary? Nothing to do */
4320- page = NULL;
4321- } else {
4322- page = grab_cache_page(mapping,
4323- inode->i_size >> PAGE_CACHE_SHIFT);
4324- if (!page)
4325- return;
4326- }
4327-
4328 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
4329- ext4_ext_truncate(inode, page);
4330+ ext4_ext_truncate(inode);
4331 return;
4332 }
4333
4334 handle = start_transaction(inode);
4335- if (IS_ERR(handle)) {
4336- if (page) {
4337- clear_highpage(page);
4338- flush_dcache_page(page);
4339- unlock_page(page);
4340- page_cache_release(page);
4341- }
4342+ if (IS_ERR(handle))
4343 return; /* AKPM: return what? */
4344- }
4345
4346 last_block = (inode->i_size + blocksize-1)
4347 >> EXT4_BLOCK_SIZE_BITS(inode->i_sb);
4348
4349- if (page)
4350- ext4_block_truncate_page(handle, page, mapping, inode->i_size);
4351+ if (inode->i_size & (blocksize - 1))
4352+ if (ext4_block_truncate_page(handle, mapping, inode->i_size))
4353+ goto out_stop;
4354
4355 n = ext4_block_to_path(inode, last_block, offsets, NULL);
4356 if (n == 0)
4357@@ -2410,6 +3569,14 @@ void ext4_truncate(struct inode *inode)
4358 goto out_stop;
4359
4360 /*
4361+ * From here we block out all ext4_get_block() callers who want to
4362+ * modify the block allocation tree.
4363+ */
4364+ down_write(&ei->i_data_sem);
4365+
4366+ ext4_discard_reservation(inode);
4367+
4368+ /*
4369 * The orphan list entry will now protect us from any crash which
4370 * occurs before the truncate completes, so it is now safe to propagate
4371 * the new, shorter inode size (held for now in i_size) into the
4372@@ -2418,12 +3585,6 @@ void ext4_truncate(struct inode *inode)
4373 */
4374 ei->i_disksize = inode->i_size;
4375
4376- /*
4377- * From here we block out all ext4_get_block() callers who want to
4378- * modify the block allocation tree.
4379- */
4380- down_write(&ei->i_data_sem);
4381-
4382 if (n == 1) { /* direct blocks */
4383 ext4_free_data(handle, inode, NULL, i_data+offsets[0],
4384 i_data + EXT4_NDIR_BLOCKS);
4385@@ -2484,8 +3645,6 @@ do_indirects:
4386 ;
4387 }
4388
4389- ext4_discard_reservation(inode);
4390-
4391 up_write(&ei->i_data_sem);
4392 inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
4393 ext4_mark_inode_dirty(handle, inode);
4394@@ -2571,6 +3730,16 @@ static int __ext4_get_inode_loc(struct inode *inode,
4395 }
4396 if (!buffer_uptodate(bh)) {
4397 lock_buffer(bh);
4398+
4399+ /*
4400+ * If the buffer has the write error flag, we have failed
4401+ * to write out another inode in the same block. In this
4402+ * case, we don't have to read the block because we may
4403+ * read the old inode data successfully.
4404+ */
4405+ if (buffer_write_io_error(bh) && !buffer_uptodate(bh))
4406+ set_buffer_uptodate(bh);
4407+
4408 if (buffer_uptodate(bh)) {
4409 /* someone brought it uptodate while we waited */
4410 unlock_buffer(bh);
4411@@ -3107,7 +4276,14 @@ int ext4_write_inode(struct inode *inode, int wait)
4412 * be freed, so we have a strong guarantee that no future commit will
4413 * leave these blocks visible to the user.)
4414 *
4415- * Called with inode->sem down.
4416+ * Another thing we have to assure is that if we are in ordered mode
 4417+ * and the inode is still attached to the committing transaction, we must
 4418+ * start writeout of all the dirty pages which are being truncated.
4419+ * This way we are sure that all the data written in the previous
4420+ * transaction are already on disk (truncate waits for pages under
4421+ * writeback).
4422+ *
4423+ * Called with inode->i_mutex down.
4424 */
4425 int ext4_setattr(struct dentry *dentry, struct iattr *attr)
4426 {
4427@@ -3173,6 +4349,22 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
4428 if (!error)
4429 error = rc;
4430 ext4_journal_stop(handle);
4431+
4432+ if (ext4_should_order_data(inode)) {
4433+ error = ext4_begin_ordered_truncate(inode,
4434+ attr->ia_size);
4435+ if (error) {
4436+ /* Do as much error cleanup as possible */
4437+ handle = ext4_journal_start(inode, 3);
4438+ if (IS_ERR(handle)) {
4439+ ext4_orphan_del(NULL, inode);
4440+ goto err_out;
4441+ }
4442+ ext4_orphan_del(handle, inode);
4443+ ext4_journal_stop(handle);
4444+ goto err_out;
4445+ }
4446+ }
4447 }
4448
4449 rc = inode_setattr(inode, attr);
4450@@ -3193,58 +4385,156 @@ err_out:
4451 return error;
4452 }
4453
4454+int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
4455+ struct kstat *stat)
4456+{
4457+ struct inode *inode;
4458+ unsigned long delalloc_blocks;
4459+
4460+ inode = dentry->d_inode;
4461+ generic_fillattr(inode, stat);
4462
4463+ /*
4464+ * We can't update i_blocks if the block allocation is delayed
4465+ * otherwise in the case of system crash before the real block
4466+ * allocation is done, we will have i_blocks inconsistent with
4467+ * on-disk file blocks.
4468+ * We always keep i_blocks updated together with real
 4469+	 * allocation. But so as not to confuse userspace, stat
4470+ * will return the blocks that include the delayed allocation
4471+ * blocks for this file.
4472+ */
4473+ spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
4474+ delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks;
4475+ spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
4476+
4477+ stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9;
4478+ return 0;
4479+}
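
The final adjustment converts reserved filesystem blocks into the 512-byte units st_blocks is measured in. A sketch of the unit conversion, assuming 4K blocks:

#include <stdio.h>

int main(void)
{
	unsigned long delalloc_blocks = 5;	/* reserved, not yet allocated */
	unsigned int blocksize_bits = 12;	/* assume 4K blocks */
	unsigned long long sectors =
		((unsigned long long)delalloc_blocks << blocksize_bits) >> 9;

	printf("extra st_blocks: %llu\n", sectors);	/* 40 */
	return 0;
}
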
4480+
4481+static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks,
4482+ int chunk)
4483+{
4484+ int indirects;
4485+
4486+ /* if nrblocks are contiguous */
4487+ if (chunk) {
4488+ /*
 4489+		 * With N contiguous data blocks, we need at most
4490+ * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) indirect blocks
4491+ * 2 dindirect blocks
4492+ * 1 tindirect block
4493+ */
4494+ indirects = nrblocks / EXT4_ADDR_PER_BLOCK(inode->i_sb);
4495+ return indirects + 3;
4496+ }
4497+ /*
 4498+	 * if nrblocks are not contiguous, worst case each block touches
 4499+	 * an indirect block, and each indirect block touches a double indirect
 4500+	 * block, plus a triple indirect block
4501+ */
4502+ indirects = nrblocks * 2 + 1;
4503+ return indirects;
4504+}
4505+
4506+static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
4507+{
4508+ if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
4509+ return ext4_indirect_trans_blocks(inode, nrblocks, 0);
4510+ return ext4_ext_index_trans_blocks(inode, nrblocks, 0);
4511+}
4512 /*
4513- * How many blocks doth make a writepage()?
4514- *
4515- * With N blocks per page, it may be:
4516- * N data blocks
4517- * 2 indirect block
4518- * 2 dindirect
4519- * 1 tindirect
4520- * N+5 bitmap blocks (from the above)
4521- * N+5 group descriptor summary blocks
4522- * 1 inode block
4523- * 1 superblock.
4524- * 2 * EXT4_SINGLEDATA_TRANS_BLOCKS for the quote files
4525- *
4526- * 3 * (N + 5) + 2 + 2 * EXT4_SINGLEDATA_TRANS_BLOCKS
4527- *
4528- * With ordered or writeback data it's the same, less the N data blocks.
4529- *
4530- * If the inode's direct blocks can hold an integral number of pages then a
4531- * page cannot straddle two indirect blocks, and we can only touch one indirect
4532- * and dindirect block, and the "5" above becomes "3".
4533- *
4534- * This still overestimates under most circumstances. If we were to pass the
4535- * start and end offsets in here as well we could do block_to_path() on each
4536- * block and work out the exact number of indirects which are touched. Pah.
 4537+ * Account for index blocks, block group bitmaps and block group
 4538+ * descriptor blocks if we modify data blocks and index blocks.
 4539+ * Worst case, the index blocks spread over different block groups.
 4540+ *
 4541+ * If data blocks are discontiguous, they may spread over
 4542+ * different block groups too. If they are contiguous, with flexbg
 4543+ * they could still cross a block group boundary.
4544+ *
4545+ * Also account for superblock, inode, quota and xattr blocks
4546 */
4547+int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
4548+{
4549+ int groups, gdpblocks;
4550+ int idxblocks;
4551+ int ret = 0;
4552+
4553+ /*
 4554+	 * How many index blocks do we need to touch to modify nrblocks?
 4555+	 * The "chunk" flag indicates whether nrblocks is
 4556+	 * physically contiguous on disk.
 4557+	 *
 4558+	 * Direct IO and fallocate call get_block to allocate
 4559+	 * one single extent at a time, so they can set the "chunk" flag.
4560+ */
4561+ idxblocks = ext4_index_trans_blocks(inode, nrblocks, chunk);
4562
4563+ ret = idxblocks;
4564+
4565+ /*
4566+ * Now let's see how many group bitmaps and group descriptors need
 4567+	 * to be accounted for
4568+ */
4569+ groups = idxblocks;
4570+ if (chunk)
4571+ groups += 1;
4572+ else
4573+ groups += nrblocks;
4574+
4575+ gdpblocks = groups;
4576+ if (groups > EXT4_SB(inode->i_sb)->s_groups_count)
4577+ groups = EXT4_SB(inode->i_sb)->s_groups_count;
4578+ if (groups > EXT4_SB(inode->i_sb)->s_gdb_count)
4579+ gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count;
4580+
4581+ /* bitmaps and block group descriptor blocks */
4582+ ret += groups + gdpblocks;
4583+
4584+ /* Blocks for super block, inode, quota and xattr blocks */
4585+ ret += EXT4_META_TRANS_BLOCKS(inode->i_sb);
4586+
4587+ return ret;
4588+}
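
A userspace model of the estimate above may make the arithmetic easier to follow: contiguous chunks cost roughly nrblocks/addr_per_block + 3 index blocks, discontiguous ones 2*nrblocks + 1, and the group bitmap/descriptor counts are derived from that and capped by the real group and GDT counts. All inputs here are made-up parameters, not values read from a superblock:

#include <stdio.h>

static int meta_trans_blocks(int nrblocks, int chunk, int addr_per_block,
			     int groups_count, int gdb_count, int meta)
{
	int idxblocks = chunk ? nrblocks / addr_per_block + 3
			      : nrblocks * 2 + 1;
	int groups = idxblocks + (chunk ? 1 : nrblocks);
	int gdpblocks = groups;

	if (groups > groups_count)		/* can't touch more groups */
		groups = groups_count;		/* than actually exist */
	if (groups > gdb_count)			/* nor more GDT blocks */
		gdpblocks = gdb_count;

	return idxblocks + groups + gdpblocks + meta;
}

int main(void)
{
	/* one contiguous 64-block chunk, 256 addresses per index block */
	printf("credits: %d\n", meta_trans_blocks(64, 1, 256, 128, 16, 10));
	return 0;					/* prints 21 */
}
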
4589+
4590+/*
 4591+ * Calculate the total number of credits to reserve to fit
 4592+ * the modification of a single page into a single transaction,
4593+ * which may include multiple chunks of block allocations.
4594+ *
4595+ * This could be called via ext4_write_begin()
4596+ *
 4597+ * We need to consider the worst case, when
 4598+ * there is one new block per extent.
4599+ */
4600 int ext4_writepage_trans_blocks(struct inode *inode)
4601 {
4602 int bpp = ext4_journal_blocks_per_page(inode);
4603- int indirects = (EXT4_NDIR_BLOCKS % bpp) ? 5 : 3;
4604 int ret;
4605
4606- if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
4607- return ext4_ext_writepage_trans_blocks(inode, bpp);
4608+ ret = ext4_meta_trans_blocks(inode, bpp, 0);
4609
4610+ /* Account for data blocks for journalled mode */
4611 if (ext4_should_journal_data(inode))
4612- ret = 3 * (bpp + indirects) + 2;
4613- else
4614- ret = 2 * (bpp + indirects) + 2;
4615-
4616-#ifdef CONFIG_QUOTA
4617- /* We know that structure was already allocated during DQUOT_INIT so
4618- * we will be updating only the data blocks + inodes */
4619- ret += 2*EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb);
4620-#endif
4621-
4622+ ret += bpp;
4623 return ret;
4624 }
4625
4626 /*
4627+ * Calculate the journal credits for a chunk of data modification.
4628+ *
 4629+ * This is called from DIO, fallocate or whoever calls
 4630+ * ext4_get_blocks_wrap() to map/allocate a chunk of contiguous disk blocks.
4631+ *
 4632+ * Journal buffers for data blocks are not included here, as DIO
 4633+ * and fallocate do not need to journal data buffers.
4634+ */
4635+int ext4_chunk_trans_blocks(struct inode *inode, int nrblocks)
4636+{
4637+ return ext4_meta_trans_blocks(inode, nrblocks, 1);
4638+}
4639+
4640+/*
4641 * The caller must have previously called ext4_reserve_inode_write().
4642 * Give this, we know that the caller already has write access to iloc->bh.
4643 */
4644@@ -3506,3 +4796,64 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
4645
4646 return err;
4647 }
4648+
4649+static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh)
4650+{
4651+ return !buffer_mapped(bh);
4652+}
4653+
4654+int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page)
4655+{
4656+ loff_t size;
4657+ unsigned long len;
4658+ int ret = -EINVAL;
4659+ struct file *file = vma->vm_file;
4660+ struct inode *inode = file->f_path.dentry->d_inode;
4661+ struct address_space *mapping = inode->i_mapping;
4662+
4663+ /*
4664+ * Get i_alloc_sem to stop truncates messing with the inode. We cannot
4665+ * get i_mutex because we are already holding mmap_sem.
4666+ */
4667+ down_read(&inode->i_alloc_sem);
4668+ size = i_size_read(inode);
4669+ if (page->mapping != mapping || size <= page_offset(page)
4670+ || !PageUptodate(page)) {
4671+ /* page got truncated from under us? */
4672+ goto out_unlock;
4673+ }
4674+ ret = 0;
4675+ if (PageMappedToDisk(page))
4676+ goto out_unlock;
4677+
4678+ if (page->index == size >> PAGE_CACHE_SHIFT)
4679+ len = size & ~PAGE_CACHE_MASK;
4680+ else
4681+ len = PAGE_CACHE_SIZE;
4682+
4683+ if (page_has_buffers(page)) {
4684+ /* return if we have all the buffers mapped */
4685+ if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
4686+ ext4_bh_unmapped))
4687+ goto out_unlock;
4688+ }
4689+ /*
 4690+	 * OK, we need to fill the hole... Do write_begin/write_end
 4691+	 * to do block allocation/reservation. We are not holding
 4692+	 * inode->i_mutex here. That allows parallel write_begin,
 4693+	 * write_end calls. lock_page prevents this from happening
 4694+	 * on the same page though.
4695+ */
4696+ ret = mapping->a_ops->write_begin(file, mapping, page_offset(page),
4697+ len, AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
4698+ if (ret < 0)
4699+ goto out_unlock;
4700+ ret = mapping->a_ops->write_end(file, mapping, page_offset(page),
4701+ len, len, page, NULL);
4702+ if (ret < 0)
4703+ goto out_unlock;
4704+ ret = 0;
4705+out_unlock:
4706+ up_read(&inode->i_alloc_sem);
4707+ return ret;
4708+}
4709diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
4710index c9900aa..e0e3a5e 100644
4711--- a/fs/ext4/mballoc.c
4712+++ b/fs/ext4/mballoc.c
4713@@ -381,22 +381,28 @@ static inline void mb_clear_bit_atomic(spinlock_t *lock, int bit, void *addr)
4714
4715 static inline int mb_find_next_zero_bit(void *addr, int max, int start)
4716 {
4717- int fix = 0;
4718+ int fix = 0, ret, tmpmax;
4719 addr = mb_correct_addr_and_bit(&fix, addr);
4720- max += fix;
4721+ tmpmax = max + fix;
4722 start += fix;
4723
4724- return ext4_find_next_zero_bit(addr, max, start) - fix;
4725+ ret = ext4_find_next_zero_bit(addr, tmpmax, start) - fix;
4726+ if (ret > max)
4727+ return max;
4728+ return ret;
4729 }
4730
4731 static inline int mb_find_next_bit(void *addr, int max, int start)
4732 {
4733- int fix = 0;
4734+ int fix = 0, ret, tmpmax;
4735 addr = mb_correct_addr_and_bit(&fix, addr);
4736- max += fix;
4737+ tmpmax = max + fix;
4738 start += fix;
4739
4740- return ext4_find_next_bit(addr, max, start) - fix;
4741+ ret = ext4_find_next_bit(addr, tmpmax, start) - fix;
4742+ if (ret > max)
4743+ return max;
4744+ return ret;
4745 }
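
The point of the two helpers above is that the `fix` alignment correction widens the search window, so the raw result can land past the caller's `max`; the patched versions clamp it. A sketch of the clamp in isolation, with a stand-in for the raw bit search:

#include <stdio.h>

/* `found` is what the widened search returned, `fix` the alignment
 * offset that was added, `max` the caller's original limit */
static int clamp_found_bit(int found, int fix, int max)
{
	int ret = found - fix;		/* back to the caller's range */
	return ret > max ? max : ret;	/* never report past max */
}

int main(void)
{
	/* raw search found bit 70 with fix = 6, caller asked for < 60 */
	printf("%d\n", clamp_found_bit(70, 6, 60));	/* 60, not 64 */
	return 0;
}
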
4746
4747 static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
4748@@ -781,13 +787,16 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
4749 if (bh_uptodate_or_lock(bh[i]))
4750 continue;
4751
4752+ spin_lock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
4753 if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
4754 ext4_init_block_bitmap(sb, bh[i],
4755 first_group + i, desc);
4756 set_buffer_uptodate(bh[i]);
4757 unlock_buffer(bh[i]);
4758+ spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
4759 continue;
4760 }
4761+ spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
4762 get_bh(bh[i]);
4763 bh[i]->b_end_io = end_buffer_read_sync;
4764 submit_bh(READ, bh[i]);
4765@@ -803,6 +812,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
4766 if (!buffer_uptodate(bh[i]))
4767 goto out;
4768
4769+ err = 0;
4770 first_block = page->index * blocks_per_page;
4771 for (i = 0; i < blocks_per_page; i++) {
4772 int group;
4773@@ -883,6 +893,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
4774 int pnum;
4775 int poff;
4776 struct page *page;
4777+ int ret;
4778
4779 mb_debug("load group %lu\n", group);
4780
4781@@ -914,15 +925,21 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
4782 if (page) {
4783 BUG_ON(page->mapping != inode->i_mapping);
4784 if (!PageUptodate(page)) {
4785- ext4_mb_init_cache(page, NULL);
4786+ ret = ext4_mb_init_cache(page, NULL);
4787+ if (ret) {
4788+ unlock_page(page);
4789+ goto err;
4790+ }
4791 mb_cmp_bitmaps(e4b, page_address(page) +
4792 (poff * sb->s_blocksize));
4793 }
4794 unlock_page(page);
4795 }
4796 }
4797- if (page == NULL || !PageUptodate(page))
4798+ if (page == NULL || !PageUptodate(page)) {
4799+ ret = -EIO;
4800 goto err;
4801+ }
4802 e4b->bd_bitmap_page = page;
4803 e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
4804 mark_page_accessed(page);
4805@@ -938,14 +955,20 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
4806 page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
4807 if (page) {
4808 BUG_ON(page->mapping != inode->i_mapping);
4809- if (!PageUptodate(page))
4810- ext4_mb_init_cache(page, e4b->bd_bitmap);
4811-
4812+ if (!PageUptodate(page)) {
4813+ ret = ext4_mb_init_cache(page, e4b->bd_bitmap);
4814+ if (ret) {
4815+ unlock_page(page);
4816+ goto err;
4817+ }
4818+ }
4819 unlock_page(page);
4820 }
4821 }
4822- if (page == NULL || !PageUptodate(page))
4823+ if (page == NULL || !PageUptodate(page)) {
4824+ ret = -EIO;
4825 goto err;
4826+ }
4827 e4b->bd_buddy_page = page;
4828 e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize);
4829 mark_page_accessed(page);
4830@@ -962,7 +985,7 @@ err:
4831 page_cache_release(e4b->bd_buddy_page);
4832 e4b->bd_buddy = NULL;
4833 e4b->bd_bitmap = NULL;
4834- return -EIO;
4835+ return ret;
4836 }
4837
4838 static void ext4_mb_release_desc(struct ext4_buddy *e4b)
4839@@ -1031,7 +1054,7 @@ static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len)
4840 }
4841 }
4842
4843-static int mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
4844+static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
4845 int first, int count)
4846 {
4847 int block = 0;
4848@@ -1071,11 +1094,12 @@ static int mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
4849 blocknr += block;
4850 blocknr +=
4851 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
4852-
4853+ ext4_unlock_group(sb, e4b->bd_group);
4854 ext4_error(sb, __func__, "double-free of inode"
4855 " %lu's block %llu(bit %u in group %lu)\n",
4856 inode ? inode->i_ino : 0, blocknr, block,
4857 e4b->bd_group);
4858+ ext4_lock_group(sb, e4b->bd_group);
4859 }
4860 mb_clear_bit(block, EXT4_MB_BITMAP(e4b));
4861 e4b->bd_info->bb_counters[order]++;
4862@@ -1113,8 +1137,6 @@ static int mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
4863 } while (1);
4864 }
4865 mb_check_buddy(e4b);
4866-
4867- return 0;
4868 }
4869
4870 static int mb_find_extent(struct ext4_buddy *e4b, int order, int block,
4871@@ -1730,10 +1752,6 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
4872 ac->ac_g_ex.fe_start = sbi->s_mb_last_start;
4873 spin_unlock(&sbi->s_md_lock);
4874 }
4875-
4876- /* searching for the right group start from the goal value specified */
4877- group = ac->ac_g_ex.fe_group;
4878-
4879 /* Let's just scan groups to find more-less suitable blocks */
4880 cr = ac->ac_2order ? 0 : 1;
4881 /*
4882@@ -1743,6 +1761,12 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
4883 repeat:
4884 for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) {
4885 ac->ac_criteria = cr;
4886+ /*
4887+ * searching for the right group start
4888+ * from the goal value specified
4889+ */
4890+ group = ac->ac_g_ex.fe_group;
4891+
4892 for (i = 0; i < EXT4_SB(sb)->s_groups_count; group++, i++) {
4893 struct ext4_group_info *grp;
4894 struct ext4_group_desc *desc;
4895@@ -1963,6 +1987,8 @@ static int ext4_mb_seq_history_open(struct inode *inode, struct file *file)
4896 int rc;
4897 int size;
4898
4899+ if (unlikely(sbi->s_mb_history == NULL))
4900+ return -ENOMEM;
4901 s = kmalloc(sizeof(*s), GFP_KERNEL);
4902 if (s == NULL)
4903 return -ENOMEM;
4904@@ -2165,9 +2191,7 @@ static void ext4_mb_history_init(struct super_block *sb)
4905 sbi->s_mb_history_cur = 0;
4906 spin_lock_init(&sbi->s_mb_history_lock);
4907 i = sbi->s_mb_history_max * sizeof(struct ext4_mb_history);
4908- sbi->s_mb_history = kmalloc(i, GFP_KERNEL);
4909- if (likely(sbi->s_mb_history != NULL))
4910- memset(sbi->s_mb_history, 0, i);
4911+ sbi->s_mb_history = kzalloc(i, GFP_KERNEL);
4912 /* if we can't allocate history, then we simple won't use it */
4913 }
4914
4915@@ -2215,21 +2239,192 @@ ext4_mb_store_history(struct ext4_allocation_context *ac)
4916 #define ext4_mb_history_init(sb)
4917 #endif
4918
4919+
4920+/* Create and initialize ext4_group_info data for the given group. */
4921+int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
4922+ struct ext4_group_desc *desc)
4923+{
4924+ int i, len;
4925+ int metalen = 0;
4926+ struct ext4_sb_info *sbi = EXT4_SB(sb);
4927+ struct ext4_group_info **meta_group_info;
4928+
4929+ /*
4930+ * First check if this group is the first of a reserved block.
 4931+	 * If so, we have to allocate a new table of pointers
4932+ * to ext4_group_info structures
4933+ */
4934+ if (group % EXT4_DESC_PER_BLOCK(sb) == 0) {
4935+ metalen = sizeof(*meta_group_info) <<
4936+ EXT4_DESC_PER_BLOCK_BITS(sb);
4937+ meta_group_info = kmalloc(metalen, GFP_KERNEL);
4938+ if (meta_group_info == NULL) {
4939+ printk(KERN_ERR "EXT4-fs: can't allocate mem for a "
4940+ "buddy group\n");
4941+ goto exit_meta_group_info;
4942+ }
4943+ sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] =
4944+ meta_group_info;
4945+ }
4946+
4947+ /*
 4948+	 * calculate the needed size. If the bb_counters size changes,
4949+ * don't forget about ext4_mb_generate_buddy()
4950+ */
4951+ len = offsetof(typeof(**meta_group_info),
4952+ bb_counters[sb->s_blocksize_bits + 2]);
4953+
4954+ meta_group_info =
4955+ sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)];
4956+ i = group & (EXT4_DESC_PER_BLOCK(sb) - 1);
4957+
4958+ meta_group_info[i] = kzalloc(len, GFP_KERNEL);
4959+ if (meta_group_info[i] == NULL) {
4960+ printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n");
4961+ goto exit_group_info;
4962+ }
4963+ set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
4964+ &(meta_group_info[i]->bb_state));
4965+
4966+ /*
4967+ * initialize bb_free to be able to skip
4968+ * empty groups without initialization
4969+ */
4970+ if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
4971+ meta_group_info[i]->bb_free =
4972+ ext4_free_blocks_after_init(sb, group, desc);
4973+ } else {
4974+ meta_group_info[i]->bb_free =
4975+ le16_to_cpu(desc->bg_free_blocks_count);
4976+ }
4977+
4978+ INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
4979+
4980+#ifdef DOUBLE_CHECK
4981+ {
4982+ struct buffer_head *bh;
4983+ meta_group_info[i]->bb_bitmap =
4984+ kmalloc(sb->s_blocksize, GFP_KERNEL);
4985+ BUG_ON(meta_group_info[i]->bb_bitmap == NULL);
4986+ bh = ext4_read_block_bitmap(sb, group);
4987+ BUG_ON(bh == NULL);
4988+ memcpy(meta_group_info[i]->bb_bitmap, bh->b_data,
4989+ sb->s_blocksize);
4990+ put_bh(bh);
4991+ }
4992+#endif
4993+
4994+ return 0;
4995+
4996+exit_group_info:
4997+ /* If a meta_group_info table has been allocated, release it now */
4998+ if (group % EXT4_DESC_PER_BLOCK(sb) == 0)
4999+ kfree(sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]);
5000+exit_meta_group_info:
5001+ return -ENOMEM;
5002+} /* ext4_mb_add_groupinfo */
5003+
5004+/*
5005+ * Add a group to the existing groups.
5006+ * This function is used for online resize
5007+ */
5008+int ext4_mb_add_more_groupinfo(struct super_block *sb, ext4_group_t group,
5009+ struct ext4_group_desc *desc)
5010+{
5011+ struct ext4_sb_info *sbi = EXT4_SB(sb);
5012+ struct inode *inode = sbi->s_buddy_cache;
5013+ int blocks_per_page;
5014+ int block;
5015+ int pnum;
5016+ struct page *page;
5017+ int err;
5018+
 5019+	/* Add group based on group descriptor */
5020+ err = ext4_mb_add_groupinfo(sb, group, desc);
5021+ if (err)
5022+ return err;
5023+
5024+ /*
 5025+	 * Cache pages containing dynamic mb_alloc data (buddy and bitmap
 5026+	 * data) are set not up to date so that they will be re-initialized
 5027+	 * during the next call to ext4_mb_load_buddy
5028+ */
5029+
5030+ /* Set buddy page as not up to date */
5031+ blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
5032+ block = group * 2;
5033+ pnum = block / blocks_per_page;
5034+ page = find_get_page(inode->i_mapping, pnum);
5035+ if (page != NULL) {
5036+ ClearPageUptodate(page);
5037+ page_cache_release(page);
5038+ }
5039+
5040+ /* Set bitmap page as not up to date */
5041+ block++;
5042+ pnum = block / blocks_per_page;
5043+ page = find_get_page(inode->i_mapping, pnum);
5044+ if (page != NULL) {
5045+ ClearPageUptodate(page);
5046+ page_cache_release(page);
5047+ }
5048+
5049+ return 0;
5050+}
5051+
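
Both invalidations above use the same page arithmetic: the buddy-cache inode stores two logical blocks per group, block 2*g for the buddy data and block 2*g + 1 for the bitmap copy, and one page covers PAGE_CACHE_SIZE / blocksize of those blocks. A user-space sketch, assuming 4 KiB pages and a 1 KiB block size purely for illustration:

#include <stdio.h>

int main(void)
{
	unsigned long page_size = 4096, blocksize = 1024;	/* assumed sizes */
	unsigned long blocks_per_page = page_size / blocksize;	/* 4 */
	unsigned long group = 10;

	unsigned long buddy_block = group * 2;		/* even: buddy data   */
	unsigned long bitmap_block = group * 2 + 1;	/* odd:  block bitmap */

	/* here both land in page 5; they can also differ, hence two lookups */
	printf("buddy page %lu, bitmap page %lu\n",
	       buddy_block / blocks_per_page,
	       bitmap_block / blocks_per_page);
	return 0;
}
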
5052+/*
5053+ * Update an existing group.
5054+ * This function is used for online resize
5055+ */
5056+void ext4_mb_update_group_info(struct ext4_group_info *grp, ext4_grpblk_t add)
5057+{
5058+ grp->bb_free += add;
5059+}
5060+
5061 static int ext4_mb_init_backend(struct super_block *sb)
5062 {
5063 ext4_group_t i;
5064- int j, len, metalen;
5065+ int metalen;
5066 struct ext4_sb_info *sbi = EXT4_SB(sb);
5067- int num_meta_group_infos =
5068- (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) >>
5069- EXT4_DESC_PER_BLOCK_BITS(sb);
5070+ struct ext4_super_block *es = sbi->s_es;
5071+ int num_meta_group_infos;
5072+ int num_meta_group_infos_max;
5073+ int array_size;
5074 struct ext4_group_info **meta_group_info;
5075+ struct ext4_group_desc *desc;
5076
5077+ /* This is the number of blocks used by GDT */
5078+ num_meta_group_infos = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) -
5079+ 1) >> EXT4_DESC_PER_BLOCK_BITS(sb);
5080+
5081+ /*
5082+	 * This is the total number of blocks used by the GDT, including
5083+	 * the reserved GDT blocks.
5084+	 * The s_group_info array is allocated with this value
5085+	 * to allow a clean online resize without complex
5086+	 * pointer manipulation.
5087+	 * The drawback is some unused memory when no resize
5088+	 * occurs, but it is very small in terms of pages
5089+	 * (see the comments below).
5090+	 * This needs to be handled properly when META_BG resizing is allowed.
5091+ */
5092+ num_meta_group_infos_max = num_meta_group_infos +
5093+ le16_to_cpu(es->s_reserved_gdt_blocks);
5094+
5095+ /*
5096+	 * array_size is the size of the s_group_info array. We round it
5097+	 * up to the next power of two because kmalloc rounds allocations
5098+	 * the same way internally, so the extra memory comes for free
5099+	 * here (e.g. it may be used for a META_BG resize).
5100+ */
5101+ array_size = 1;
5102+ while (array_size < sizeof(*sbi->s_group_info) *
5103+ num_meta_group_infos_max)
5104+ array_size = array_size << 1;
5105 /* An 8TB filesystem with 64-bit pointers requires a 4096 byte
5106 * kmalloc. A 128kb malloc should suffice for a 256TB filesystem.
5107 * So a two level scheme suffices for now. */
5108- sbi->s_group_info = kmalloc(sizeof(*sbi->s_group_info) *
5109- num_meta_group_infos, GFP_KERNEL);
5110+ sbi->s_group_info = kmalloc(array_size, GFP_KERNEL);
5111 if (sbi->s_group_info == NULL) {
5112 printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n");
5113 return -ENOMEM;
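
The while loop a few lines above is a plain next-power-of-two rounding. A standalone sketch of the same computation; the kernel's roundup_pow_of_two() from <linux/log2.h> yields the same value:

#include <stdio.h>

/* Round n up to the next power of two, exactly like the while loop in
 * ext4_mb_init_backend(). */
static unsigned long next_pow2(unsigned long n)
{
	unsigned long size = 1;

	while (size < n)
		size <<= 1;
	return size;
}

int main(void)
{
	/* e.g. 100 meta-group pointers of 8 bytes each (64-bit build):
	 * 800 -> 1024 */
	printf("%lu\n", next_pow2(100 * sizeof(void *)));
	return 0;
}
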
5114@@ -2256,63 +2451,15 @@ static int ext4_mb_init_backend(struct super_block *sb)
5115 sbi->s_group_info[i] = meta_group_info;
5116 }
5117
5118- /*
5119- * calculate needed size. if change bb_counters size,
5120- * don't forget about ext4_mb_generate_buddy()
5121- */
5122- len = sizeof(struct ext4_group_info);
5123- len += sizeof(unsigned short) * (sb->s_blocksize_bits + 2);
5124 for (i = 0; i < sbi->s_groups_count; i++) {
5125- struct ext4_group_desc *desc;
5126-
5127- meta_group_info =
5128- sbi->s_group_info[i >> EXT4_DESC_PER_BLOCK_BITS(sb)];
5129- j = i & (EXT4_DESC_PER_BLOCK(sb) - 1);
5130-
5131- meta_group_info[j] = kzalloc(len, GFP_KERNEL);
5132- if (meta_group_info[j] == NULL) {
5133- printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n");
5134- goto err_freebuddy;
5135- }
5136 desc = ext4_get_group_desc(sb, i, NULL);
5137 if (desc == NULL) {
5138 printk(KERN_ERR
5139 "EXT4-fs: can't read descriptor %lu\n", i);
5140- i++;
5141 goto err_freebuddy;
5142 }
5143- memset(meta_group_info[j], 0, len);
5144- set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
5145- &(meta_group_info[j]->bb_state));
5146-
5147- /*
5148- * initialize bb_free to be able to skip
5149- * empty groups without initialization
5150- */
5151- if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
5152- meta_group_info[j]->bb_free =
5153- ext4_free_blocks_after_init(sb, i, desc);
5154- } else {
5155- meta_group_info[j]->bb_free =
5156- le16_to_cpu(desc->bg_free_blocks_count);
5157- }
5158-
5159- INIT_LIST_HEAD(&meta_group_info[j]->bb_prealloc_list);
5160-
5161-#ifdef DOUBLE_CHECK
5162- {
5163- struct buffer_head *bh;
5164- meta_group_info[j]->bb_bitmap =
5165- kmalloc(sb->s_blocksize, GFP_KERNEL);
5166- BUG_ON(meta_group_info[j]->bb_bitmap == NULL);
5167- bh = read_block_bitmap(sb, i);
5168- BUG_ON(bh == NULL);
5169- memcpy(meta_group_info[j]->bb_bitmap, bh->b_data,
5170- sb->s_blocksize);
5171- put_bh(bh);
5172- }
5173-#endif
5174-
5175+ if (ext4_mb_add_groupinfo(sb, i, desc) != 0)
5176+ goto err_freebuddy;
5177 }
5178
5179 return 0;
5180@@ -2333,9 +2480,10 @@ err_freesgi:
5181 int ext4_mb_init(struct super_block *sb, int needs_recovery)
5182 {
5183 struct ext4_sb_info *sbi = EXT4_SB(sb);
5184- unsigned i;
5185+ unsigned i, j;
5186 unsigned offset;
5187 unsigned max;
5188+ int ret;
5189
5190 if (!test_opt(sb, MBALLOC))
5191 return 0;
5192@@ -2370,12 +2518,12 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
5193 } while (i <= sb->s_blocksize_bits + 1);
5194
5195 /* init file for buddy data */
5196- i = ext4_mb_init_backend(sb);
5197- if (i) {
5198+ ret = ext4_mb_init_backend(sb);
5199+ if (ret != 0) {
5200 clear_opt(sbi->s_mount_opt, MBALLOC);
5201 kfree(sbi->s_mb_offsets);
5202 kfree(sbi->s_mb_maxs);
5203- return i;
5204+ return ret;
5205 }
5206
5207 spin_lock_init(&sbi->s_md_lock);
5208@@ -2392,7 +2540,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
5209 sbi->s_mb_history_filter = EXT4_MB_HISTORY_DEFAULT;
5210 sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC;
5211
5212- i = sizeof(struct ext4_locality_group) * NR_CPUS;
5213+ i = sizeof(struct ext4_locality_group) * nr_cpu_ids;
5214 sbi->s_locality_groups = kmalloc(i, GFP_KERNEL);
5215 if (sbi->s_locality_groups == NULL) {
5216 clear_opt(sbi->s_mount_opt, MBALLOC);
5217@@ -2400,11 +2548,12 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
5218 kfree(sbi->s_mb_maxs);
5219 return -ENOMEM;
5220 }
5221- for (i = 0; i < NR_CPUS; i++) {
5222+ for (i = 0; i < nr_cpu_ids; i++) {
5223 struct ext4_locality_group *lg;
5224 lg = &sbi->s_locality_groups[i];
5225 mutex_init(&lg->lg_mutex);
5226- INIT_LIST_HEAD(&lg->lg_prealloc_list);
5227+ for (j = 0; j < PREALLOC_TB_SIZE; j++)
5228+ INIT_LIST_HEAD(&lg->lg_prealloc_list[j]);
5229 spin_lock_init(&lg->lg_prealloc_lock);
5230 }
5231
5232@@ -2548,8 +2697,7 @@ ext4_mb_free_committed_blocks(struct super_block *sb)
5233 ext4_lock_group(sb, md->group);
5234 for (i = 0; i < md->num; i++) {
5235 mb_debug(" %u", md->blocks[i]);
5236- err = mb_free_blocks(NULL, &e4b, md->blocks[i], 1);
5237- BUG_ON(err != 0);
5238+ mb_free_blocks(NULL, &e4b, md->blocks[i], 1);
5239 }
5240 mb_debug("\n");
5241 ext4_unlock_group(sb, md->group);
5242@@ -2575,25 +2723,24 @@ ext4_mb_free_committed_blocks(struct super_block *sb)
5243
5244
5245
5246-#define MB_PROC_VALUE_READ(name) \
5247-static int ext4_mb_read_##name(char *page, char **start, \
5248- off_t off, int count, int *eof, void *data) \
5249+#define MB_PROC_FOPS(name) \
5250+static int ext4_mb_##name##_proc_show(struct seq_file *m, void *v) \
5251 { \
5252- struct ext4_sb_info *sbi = data; \
5253- int len; \
5254- *eof = 1; \
5255- if (off != 0) \
5256- return 0; \
5257- len = sprintf(page, "%ld\n", sbi->s_mb_##name); \
5258- *start = page; \
5259- return len; \
5260-}
5261-
5262-#define MB_PROC_VALUE_WRITE(name) \
5263-static int ext4_mb_write_##name(struct file *file, \
5264- const char __user *buf, unsigned long cnt, void *data) \
5265+ struct ext4_sb_info *sbi = m->private; \
5266+ \
5267+ seq_printf(m, "%ld\n", sbi->s_mb_##name); \
5268+ return 0; \
5269+} \
5270+ \
5271+static int ext4_mb_##name##_proc_open(struct inode *inode, struct file *file)\
5272 { \
5273- struct ext4_sb_info *sbi = data; \
5274+ return single_open(file, ext4_mb_##name##_proc_show, PDE(inode)->data);\
5275+} \
5276+ \
5277+static ssize_t ext4_mb_##name##_proc_write(struct file *file, \
5278+ const char __user *buf, size_t cnt, loff_t *ppos) \
5279+{ \
5280+ struct ext4_sb_info *sbi = PDE(file->f_path.dentry->d_inode)->data;\
5281 char str[32]; \
5282 long value; \
5283 if (cnt >= sizeof(str)) \
5284@@ -2605,31 +2752,32 @@ static int ext4_mb_write_##name(struct file *file, \
5285 return -ERANGE; \
5286 sbi->s_mb_##name = value; \
5287 return cnt; \
5288-}
5289+} \
5290+ \
5291+static const struct file_operations ext4_mb_##name##_proc_fops = { \
5292+ .owner = THIS_MODULE, \
5293+ .open = ext4_mb_##name##_proc_open, \
5294+ .read = seq_read, \
5295+ .llseek = seq_lseek, \
5296+ .release = single_release, \
5297+ .write = ext4_mb_##name##_proc_write, \
5298+};
5299
5300-MB_PROC_VALUE_READ(stats);
5301-MB_PROC_VALUE_WRITE(stats);
5302-MB_PROC_VALUE_READ(max_to_scan);
5303-MB_PROC_VALUE_WRITE(max_to_scan);
5304-MB_PROC_VALUE_READ(min_to_scan);
5305-MB_PROC_VALUE_WRITE(min_to_scan);
5306-MB_PROC_VALUE_READ(order2_reqs);
5307-MB_PROC_VALUE_WRITE(order2_reqs);
5308-MB_PROC_VALUE_READ(stream_request);
5309-MB_PROC_VALUE_WRITE(stream_request);
5310-MB_PROC_VALUE_READ(group_prealloc);
5311-MB_PROC_VALUE_WRITE(group_prealloc);
5312+MB_PROC_FOPS(stats);
5313+MB_PROC_FOPS(max_to_scan);
5314+MB_PROC_FOPS(min_to_scan);
5315+MB_PROC_FOPS(order2_reqs);
5316+MB_PROC_FOPS(stream_request);
5317+MB_PROC_FOPS(group_prealloc);
5318
5319 #define MB_PROC_HANDLER(name, var) \
5320 do { \
5321- proc = create_proc_entry(name, mode, sbi->s_mb_proc); \
5322+ proc = proc_create_data(name, mode, sbi->s_mb_proc, \
5323+ &ext4_mb_##var##_proc_fops, sbi); \
5324 if (proc == NULL) { \
5325 	if (proc == NULL) { \
5326 goto err_out; \
5327 } \
5328- proc->data = sbi; \
5329- proc->read_proc = ext4_mb_read_##var ; \
5330- proc->write_proc = ext4_mb_write_##var; \
5331 } while (0)
5332
5333 static int ext4_mb_init_per_dev_proc(struct super_block *sb)
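
For reference, this is roughly the shape MB_PROC_FOPS expands to for a single tunable under the 2.6.26-era procfs API (single_open/PDE); a skeleton sketch with an illustrative name, not a complete module:

#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

static long s_mb_stats;				/* illustrative tunable */

static int stats_proc_show(struct seq_file *m, void *v)
{
	seq_printf(m, "%ld\n", s_mb_stats);
	return 0;
}

static int stats_proc_open(struct inode *inode, struct file *file)
{
	/* PDE(inode)->data carries the per-sb private pointer */
	return single_open(file, stats_proc_show, PDE(inode)->data);
}

static const struct file_operations stats_proc_fops = {
	.owner   = THIS_MODULE,
	.open    = stats_proc_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = single_release,
};
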
5334@@ -2639,6 +2787,10 @@ static int ext4_mb_init_per_dev_proc(struct super_block *sb)
5335 struct proc_dir_entry *proc;
5336 char devname[64];
5337
5338+ if (proc_root_ext4 == NULL) {
5339+ sbi->s_mb_proc = NULL;
5340+ return -EINVAL;
5341+ }
5342 bdevname(sb->s_bdev, devname);
5343 sbi->s_mb_proc = proc_mkdir(devname, proc_root_ext4);
5344
5345@@ -2747,7 +2899,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
5346
5347
5348 err = -EIO;
5349- bitmap_bh = read_block_bitmap(sb, ac->ac_b_ex.fe_group);
5350+ bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group);
5351 if (!bitmap_bh)
5352 goto out_err;
5353
5354@@ -2816,7 +2968,23 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
5355 le16_add_cpu(&gdp->bg_free_blocks_count, -ac->ac_b_ex.fe_len);
5356 gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp);
5357 spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
5358- percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len);
5359+
5360+ /*
5361+	 * The free blocks count has already been reduced/reserved
5362+	 * at write_begin() time for delayed allocation,
5363+	 * so do not account for it twice
5364+ */
5365+ if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
5366+ percpu_counter_sub(&sbi->s_freeblocks_counter,
5367+ ac->ac_b_ex.fe_len);
5368+
5369+ if (sbi->s_log_groups_per_flex) {
5370+ ext4_group_t flex_group = ext4_flex_group(sbi,
5371+ ac->ac_b_ex.fe_group);
5372+ spin_lock(sb_bgl_lock(sbi, flex_group));
5373+ sbi->s_flex_groups[flex_group].free_blocks -= ac->ac_b_ex.fe_len;
5374+ spin_unlock(sb_bgl_lock(sbi, flex_group));
5375+ }
5376
5377 err = ext4_journal_dirty_metadata(handle, bitmap_bh);
5378 if (err)
5379@@ -3096,6 +3264,7 @@ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
5380 struct ext4_prealloc_space *pa)
5381 {
5382 unsigned int len = ac->ac_o_ex.fe_len;
5383+
5384 ext4_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart,
5385 &ac->ac_b_ex.fe_group,
5386 &ac->ac_b_ex.fe_start);
5387@@ -3113,14 +3282,45 @@ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
5388 }
5389
5390 /*
5391+ * Return the prealloc space with the minimal distance
5392+ * from the goal block. @cpa is the prealloc
5393+ * space with the currently known minimal distance
5394+ * from the goal block.
5395+ */
5396+static struct ext4_prealloc_space *
5397+ext4_mb_check_group_pa(ext4_fsblk_t goal_block,
5398+ struct ext4_prealloc_space *pa,
5399+ struct ext4_prealloc_space *cpa)
5400+{
5401+ ext4_fsblk_t cur_distance, new_distance;
5402+
5403+ if (cpa == NULL) {
5404+ atomic_inc(&pa->pa_count);
5405+ return pa;
5406+ }
5407+ cur_distance = abs(goal_block - cpa->pa_pstart);
5408+ new_distance = abs(goal_block - pa->pa_pstart);
5409+
5410+ if (cur_distance < new_distance)
5411+ return cpa;
5412+
5413+ /* drop the previous reference */
5414+ atomic_dec(&cpa->pa_count);
5415+ atomic_inc(&pa->pa_count);
5416+ return pa;
5417+}
5418+
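
In plain terms, ext4_mb_check_group_pa() keeps whichever candidate lies closer to the goal, moving the reference from the old holder to the new one. A toy user-space version of just the distance logic (refcounting omitted; a newer candidate wins ties, as above):

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	long long goal = 5000;
	long long starts[] = { 1200, 4800, 9000, 5100 };
	long long best = -1;
	int i;

	for (i = 0; i < 4; i++)
		if (best < 0 || llabs(goal - starts[i]) <= llabs(goal - best))
			best = starts[i];

	printf("closest pa_pstart: %lld\n", best);	/* 5100 */
	return 0;
}
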
5419+/*
5420 * search goal blocks in preallocated space
5421 */
5422 static noinline_for_stack int
5423 ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
5424 {
5425+ int order, i;
5426 struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
5427 struct ext4_locality_group *lg;
5428- struct ext4_prealloc_space *pa;
5429+ struct ext4_prealloc_space *pa, *cpa = NULL;
5430+ ext4_fsblk_t goal_block;
5431
5432 /* only data can be preallocated */
5433 if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
5434@@ -3158,22 +3358,38 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
5435 lg = ac->ac_lg;
5436 if (lg == NULL)
5437 return 0;
5438+ order = fls(ac->ac_o_ex.fe_len) - 1;
5439+ if (order > PREALLOC_TB_SIZE - 1)
5440+ /* The max size of hash table is PREALLOC_TB_SIZE */
5441+ order = PREALLOC_TB_SIZE - 1;
5442+
5443+ goal_block = ac->ac_g_ex.fe_group * EXT4_BLOCKS_PER_GROUP(ac->ac_sb) +
5444+ ac->ac_g_ex.fe_start +
5445+ le32_to_cpu(EXT4_SB(ac->ac_sb)->s_es->s_first_data_block);
5446+ /*
5447+	 * search for the prealloc space with the minimal
5448+	 * distance from the goal block.
5449+ */
5450+ for (i = order; i < PREALLOC_TB_SIZE; i++) {
5451+ rcu_read_lock();
5452+ list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[i],
5453+ pa_inode_list) {
5454+ spin_lock(&pa->pa_lock);
5455+ if (pa->pa_deleted == 0 &&
5456+ pa->pa_free >= ac->ac_o_ex.fe_len) {
5457
5458- rcu_read_lock();
5459- list_for_each_entry_rcu(pa, &lg->lg_prealloc_list, pa_inode_list) {
5460- spin_lock(&pa->pa_lock);
5461- if (pa->pa_deleted == 0 && pa->pa_free >= ac->ac_o_ex.fe_len) {
5462- atomic_inc(&pa->pa_count);
5463- ext4_mb_use_group_pa(ac, pa);
5464+ cpa = ext4_mb_check_group_pa(goal_block,
5465+ pa, cpa);
5466+ }
5467 spin_unlock(&pa->pa_lock);
5468- ac->ac_criteria = 20;
5469- rcu_read_unlock();
5470- return 1;
5471 }
5472- spin_unlock(&pa->pa_lock);
5473+ rcu_read_unlock();
5474+ }
5475+ if (cpa) {
5476+ ext4_mb_use_group_pa(ac, cpa);
5477+ ac->ac_criteria = 20;
5478+ return 1;
5479 }
5480- rcu_read_unlock();
5481-
5482 return 0;
5483 }
5484
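
The bucket order used in the scan above comes from fls(len) - 1, capped at PREALLOC_TB_SIZE - 1, so a request starts looking in the bucket that matches its own size class. A self-contained sketch (fls_ is a portable stand-in for the kernel's fls()):

#include <stdio.h>

#define PREALLOC_TB_SIZE 10

static int fls_(unsigned int x)		/* portable stand-in for fls() */
{
	int r = 0;

	while (x) {
		r++;
		x >>= 1;
	}
	return r;
}

int main(void)
{
	unsigned int len;

	for (len = 1; len <= 1024; len *= 8) {
		int order = fls_(len) - 1;

		if (order > PREALLOC_TB_SIZE - 1)
			order = PREALLOC_TB_SIZE - 1;
		printf("len %4u -> bucket %d\n", len, order);	/* 0, 3, 6, 9 */
	}
	return 0;
}
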
5485@@ -3396,6 +3612,7 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
5486 pa->pa_free = pa->pa_len;
5487 atomic_set(&pa->pa_count, 1);
5488 spin_lock_init(&pa->pa_lock);
5489+ INIT_LIST_HEAD(&pa->pa_inode_list);
5490 pa->pa_deleted = 0;
5491 pa->pa_linear = 1;
5492
5493@@ -3416,10 +3633,10 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
5494 list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
5495 ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
5496
5497- spin_lock(pa->pa_obj_lock);
5498- list_add_tail_rcu(&pa->pa_inode_list, &lg->lg_prealloc_list);
5499- spin_unlock(pa->pa_obj_lock);
5500-
5501+ /*
5502+ * We will later add the new pa to the right bucket
5503+ * after updating the pa_free in ext4_mb_release_context
5504+ */
5505 return 0;
5506 }
5507
5508@@ -3473,8 +3690,6 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
5509 if (bit >= end)
5510 break;
5511 next = mb_find_next_bit(bitmap_bh->b_data, end, bit);
5512- if (next > end)
5513- next = end;
5514 start = group * EXT4_BLOCKS_PER_GROUP(sb) + bit +
5515 le32_to_cpu(sbi->s_es->s_first_data_block);
5516 mb_debug(" free preallocated %u/%u in group %u\n",
5517@@ -3569,22 +3784,25 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
5518 if (list_empty(&grp->bb_prealloc_list))
5519 return 0;
5520
5521- bitmap_bh = read_block_bitmap(sb, group);
5522+ bitmap_bh = ext4_read_block_bitmap(sb, group);
5523 if (bitmap_bh == NULL) {
5524- /* error handling here */
5525- ext4_mb_release_desc(&e4b);
5526- BUG_ON(bitmap_bh == NULL);
5527+ ext4_error(sb, __func__, "Error in reading block "
5528+ "bitmap for %lu\n", group);
5529+ return 0;
5530 }
5531
5532 err = ext4_mb_load_buddy(sb, group, &e4b);
5533- BUG_ON(err != 0); /* error handling here */
5534+ if (err) {
5535+ ext4_error(sb, __func__, "Error in loading buddy "
5536+ "information for %lu\n", group);
5537+ put_bh(bitmap_bh);
5538+ return 0;
5539+ }
5540
5541 if (needed == 0)
5542 needed = EXT4_BLOCKS_PER_GROUP(sb) + 1;
5543
5544- grp = ext4_get_group_info(sb, group);
5545 INIT_LIST_HEAD(&list);
5546-
5547 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
5548 repeat:
5549 ext4_lock_group(sb, group);
5550@@ -3741,13 +3959,18 @@ repeat:
5551 ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
5552
5553 err = ext4_mb_load_buddy(sb, group, &e4b);
5554- BUG_ON(err != 0); /* error handling here */
5555+ if (err) {
5556+ ext4_error(sb, __func__, "Error in loading buddy "
5557+ "information for %lu\n", group);
5558+ continue;
5559+ }
5560
5561- bitmap_bh = read_block_bitmap(sb, group);
5562+ bitmap_bh = ext4_read_block_bitmap(sb, group);
5563 if (bitmap_bh == NULL) {
5564- /* error handling here */
5565+ ext4_error(sb, __func__, "Error in reading block "
5566+ "bitmap for %lu\n", group);
5567 ext4_mb_release_desc(&e4b);
5568- BUG_ON(bitmap_bh == NULL);
5569+ continue;
5570 }
5571
5572 ext4_lock_group(sb, group);
5573@@ -3950,22 +4173,168 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
5574
5575 }
5576
5577+static noinline_for_stack void
5578+ext4_mb_discard_lg_preallocations(struct super_block *sb,
5579+ struct ext4_locality_group *lg,
5580+ int order, int total_entries)
5581+{
5582+ ext4_group_t group = 0;
5583+ struct ext4_buddy e4b;
5584+ struct list_head discard_list;
5585+ struct ext4_prealloc_space *pa, *tmp;
5586+ struct ext4_allocation_context *ac;
5587+
5588+ mb_debug("discard locality group preallocation\n");
5589+
5590+ INIT_LIST_HEAD(&discard_list);
5591+ ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
5592+
5593+ spin_lock(&lg->lg_prealloc_lock);
5594+ list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order],
5595+ pa_inode_list) {
5596+ spin_lock(&pa->pa_lock);
5597+ if (atomic_read(&pa->pa_count)) {
5598+ /*
5599+ * This is the pa that we just used
5600+			 * for block allocation, so don't
5601+			 * free it
5602+ */
5603+ spin_unlock(&pa->pa_lock);
5604+ continue;
5605+ }
5606+ if (pa->pa_deleted) {
5607+ spin_unlock(&pa->pa_lock);
5608+ continue;
5609+ }
5610+ /* only lg prealloc space */
5611+ BUG_ON(!pa->pa_linear);
5612+
5613+ /* seems this one can be freed ... */
5614+ pa->pa_deleted = 1;
5615+ spin_unlock(&pa->pa_lock);
5616+
5617+ list_del_rcu(&pa->pa_inode_list);
5618+ list_add(&pa->u.pa_tmp_list, &discard_list);
5619+
5620+ total_entries--;
5621+ if (total_entries <= 5) {
5622+ /*
5623+			 * we want to keep only 5 entries,
5624+			 * allowing the list to grow to 8. This
5625+			 * makes sure we don't call discard
5626+			 * again soon for this list.
5627+ */
5628+ break;
5629+ }
5630+ }
5631+ spin_unlock(&lg->lg_prealloc_lock);
5632+
5633+ list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) {
5634+
5635+ ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
5636+ if (ext4_mb_load_buddy(sb, group, &e4b)) {
5637+ ext4_error(sb, __func__, "Error in loading buddy "
5638+ "information for %lu\n", group);
5639+ continue;
5640+ }
5641+ ext4_lock_group(sb, group);
5642+ list_del(&pa->pa_group_list);
5643+ ext4_mb_release_group_pa(&e4b, pa, ac);
5644+ ext4_unlock_group(sb, group);
5645+
5646+ ext4_mb_release_desc(&e4b);
5647+ list_del(&pa->u.pa_tmp_list);
5648+ call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
5649+ }
5650+ if (ac)
5651+ kmem_cache_free(ext4_ac_cachep, ac);
5652+}
5653+
5654+/*
5655+ * We have incremented pa_count. So it cannot be freed at this
5656+ * point. Also we hold lg_mutex. So no parallel allocation is
5657+ * possible from this lg. That means pa_free cannot be updated.
5658+ *
5659+ * A parallel ext4_mb_discard_group_preallocations is possible,
5660+ * which can cause the lg_prealloc_list to be updated.
5661+ */
5662+
5663+static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
5664+{
5665+ int order, added = 0, lg_prealloc_count = 1;
5666+ struct super_block *sb = ac->ac_sb;
5667+ struct ext4_locality_group *lg = ac->ac_lg;
5668+ struct ext4_prealloc_space *tmp_pa, *pa = ac->ac_pa;
5669+
5670+ order = fls(pa->pa_free) - 1;
5671+ if (order > PREALLOC_TB_SIZE - 1)
5672+ /* The max size of hash table is PREALLOC_TB_SIZE */
5673+ order = PREALLOC_TB_SIZE - 1;
5674+ /* Add the prealloc space to lg */
5675+ rcu_read_lock();
5676+ list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order],
5677+ pa_inode_list) {
5678+ spin_lock(&tmp_pa->pa_lock);
5679+ if (tmp_pa->pa_deleted) {
5680+			spin_unlock(&tmp_pa->pa_lock);
5681+ continue;
5682+ }
5683+ if (!added && pa->pa_free < tmp_pa->pa_free) {
5684+ /* Add to the tail of the previous entry */
5685+ list_add_tail_rcu(&pa->pa_inode_list,
5686+ &tmp_pa->pa_inode_list);
5687+ added = 1;
5688+ /*
5689+ * we want to count the total
5690+ * number of entries in the list
5691+ */
5692+ }
5693+ spin_unlock(&tmp_pa->pa_lock);
5694+ lg_prealloc_count++;
5695+ }
5696+ if (!added)
5697+ list_add_tail_rcu(&pa->pa_inode_list,
5698+ &lg->lg_prealloc_list[order]);
5699+ rcu_read_unlock();
5700+
5701+ /* Now trim the list to be not more than 8 elements */
5702+ if (lg_prealloc_count > 8) {
5703+ ext4_mb_discard_lg_preallocations(sb, lg,
5704+ order, lg_prealloc_count);
5705+ return;
5706+ }
5707+	return;
5708+}
5709+
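
ext4_mb_add_n_trim() above therefore keeps each bucket sorted by ascending pa_free and bounded at 8 entries. A toy array-backed model of that insert-then-trim behavior:

#include <stdio.h>

int main(void)
{
	int list[16] = { 2, 5, 9, 30, 44, 61, 70, 93 };	/* sorted by pa_free */
	int count = 8, val = 25, i, pos = count;

	for (i = 0; i < count; i++)
		if (val < list[i]) {
			pos = i;	/* insert before first larger entry */
			break;
		}
	for (i = count; i > pos; i--)
		list[i] = list[i - 1];
	list[pos] = val;
	count++;

	if (count > 8)
		printf("list has %d entries: discard the excess\n", count);
	return 0;
}
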
5710 /*
5711 * release all resource we used in allocation
5712 */
5713 static int ext4_mb_release_context(struct ext4_allocation_context *ac)
5714 {
5715- if (ac->ac_pa) {
5716- if (ac->ac_pa->pa_linear) {
5717+ struct ext4_prealloc_space *pa = ac->ac_pa;
5718+ if (pa) {
5719+ if (pa->pa_linear) {
5720 /* see comment in ext4_mb_use_group_pa() */
5721- spin_lock(&ac->ac_pa->pa_lock);
5722- ac->ac_pa->pa_pstart += ac->ac_b_ex.fe_len;
5723- ac->ac_pa->pa_lstart += ac->ac_b_ex.fe_len;
5724- ac->ac_pa->pa_free -= ac->ac_b_ex.fe_len;
5725- ac->ac_pa->pa_len -= ac->ac_b_ex.fe_len;
5726- spin_unlock(&ac->ac_pa->pa_lock);
5727+ spin_lock(&pa->pa_lock);
5728+ pa->pa_pstart += ac->ac_b_ex.fe_len;
5729+ pa->pa_lstart += ac->ac_b_ex.fe_len;
5730+ pa->pa_free -= ac->ac_b_ex.fe_len;
5731+ pa->pa_len -= ac->ac_b_ex.fe_len;
5732+ spin_unlock(&pa->pa_lock);
5733+ /*
5734+ * We want to add the pa to the right bucket.
5735+			 * Remove it from the old list and, while adding,
5736+			 * make sure the list we are adding to
5737+			 * doesn't grow too big.
5738+ */
5739+ if (likely(pa->pa_free)) {
5740+ spin_lock(pa->pa_obj_lock);
5741+ list_del_rcu(&pa->pa_inode_list);
5742+ spin_unlock(pa->pa_obj_lock);
5743+ ext4_mb_add_n_trim(ac);
5744+ }
5745 }
5746- ext4_mb_put_pa(ac, ac->ac_sb, ac->ac_pa);
5747+ ext4_mb_put_pa(ac, ac->ac_sb, pa);
5748 }
5749 if (ac->ac_bitmap_page)
5750 page_cache_release(ac->ac_bitmap_page);
5751@@ -4011,10 +4380,21 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
5752 sbi = EXT4_SB(sb);
5753
5754 if (!test_opt(sb, MBALLOC)) {
5755- block = ext4_new_blocks_old(handle, ar->inode, ar->goal,
5756+ block = ext4_old_new_blocks(handle, ar->inode, ar->goal,
5757 &(ar->len), errp);
5758 return block;
5759 }
5760+ if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) {
5761+ /*
5762+ * With delalloc we already reserved the blocks
5763+ */
5764+ ar->len = ext4_has_free_blocks(sbi, ar->len);
5765+ }
5766+
5767+ if (ar->len == 0) {
5768+ *errp = -ENOSPC;
5769+ return 0;
5770+ }
5771
5772 while (ar->len && DQUOT_ALLOC_BLOCK(ar->inode, ar->len)) {
5773 ar->flags |= EXT4_MB_HINT_NOPREALLOC;
5774@@ -4026,10 +4406,14 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
5775 }
5776 inquota = ar->len;
5777
5778+ if (EXT4_I(ar->inode)->i_delalloc_reserved_flag)
5779+ ar->flags |= EXT4_MB_DELALLOC_RESERVED;
5780+
5781 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
5782 if (!ac) {
5783+ ar->len = 0;
5784 *errp = -ENOMEM;
5785- return 0;
5786+ goto out1;
5787 }
5788
5789 ext4_mb_poll_new_transaction(sb, handle);
5790@@ -4037,12 +4421,11 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
5791 *errp = ext4_mb_initialize_context(ac, ar);
5792 if (*errp) {
5793 ar->len = 0;
5794- goto out;
5795+ goto out2;
5796 }
5797
5798 ac->ac_op = EXT4_MB_HISTORY_PREALLOC;
5799 if (!ext4_mb_use_preallocated(ac)) {
5800-
5801 ac->ac_op = EXT4_MB_HISTORY_ALLOC;
5802 ext4_mb_normalize_request(ac, ar);
5803 repeat:
5804@@ -4085,11 +4468,12 @@ repeat:
5805
5806 ext4_mb_release_context(ac);
5807
5808-out:
5809+out2:
5810+ kmem_cache_free(ext4_ac_cachep, ac);
5811+out1:
5812 if (ar->len < inquota)
5813 DQUOT_FREE_BLOCK(ar->inode, inquota - ar->len);
5814
5815- kmem_cache_free(ext4_ac_cachep, ac);
5816 return block;
5817 }
5818 static void ext4_mb_poll_new_transaction(struct super_block *sb,
5819@@ -4242,12 +4626,16 @@ do_more:
5820 overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb);
5821 count -= overflow;
5822 }
5823- bitmap_bh = read_block_bitmap(sb, block_group);
5824- if (!bitmap_bh)
5825+ bitmap_bh = ext4_read_block_bitmap(sb, block_group);
5826+ if (!bitmap_bh) {
5827+ err = -EIO;
5828 goto error_return;
5829+ }
5830 gdp = ext4_get_group_desc(sb, block_group, &gd_bh);
5831- if (!gdp)
5832+ if (!gdp) {
5833+ err = -EIO;
5834 goto error_return;
5835+ }
5836
5837 if (in_range(ext4_block_bitmap(sb, gdp), block, count) ||
5838 in_range(ext4_inode_bitmap(sb, gdp), block, count) ||
5839@@ -4309,10 +4697,9 @@ do_more:
5840 ext4_mb_free_metadata(handle, &e4b, block_group, bit, count);
5841 } else {
5842 ext4_lock_group(sb, block_group);
5843- err = mb_free_blocks(inode, &e4b, bit, count);
5844+ mb_free_blocks(inode, &e4b, bit, count);
5845 ext4_mb_return_to_preallocation(inode, &e4b, block, count);
5846 ext4_unlock_group(sb, block_group);
5847- BUG_ON(err != 0);
5848 }
5849
5850 spin_lock(sb_bgl_lock(sbi, block_group));
5851@@ -4321,6 +4708,13 @@ do_more:
5852 spin_unlock(sb_bgl_lock(sbi, block_group));
5853 percpu_counter_add(&sbi->s_freeblocks_counter, count);
5854
5855+ if (sbi->s_log_groups_per_flex) {
5856+ ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
5857+ spin_lock(sb_bgl_lock(sbi, flex_group));
5858+ sbi->s_flex_groups[flex_group].free_blocks += count;
5859+ spin_unlock(sb_bgl_lock(sbi, flex_group));
5860+ }
5861+
5862 ext4_mb_release_desc(&e4b);
5863
5864 *freed += count;
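
The flex-group accounting added in this and the earlier mballoc hunk maps a block group to its flex group with a simple shift; ext4_flex_group() is essentially group >> s_log_groups_per_flex. A sketch with an assumed s_log_groups_per_flex of 4 (16 groups per flex group):

#include <stdio.h>

int main(void)
{
	unsigned int log_groups_per_flex = 4;	/* assumed: 16 groups/flex */
	unsigned long group;

	for (group = 14; group <= 17; group++)
		printf("group %lu -> flex group %lu\n",
		       group, group >> log_groups_per_flex);
	/* prints 14->0, 15->0, 16->1, 17->1 */
	return 0;
}
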
5865diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
5866index bfe6add..c7c9906 100644
5867--- a/fs/ext4/mballoc.h
5868+++ b/fs/ext4/mballoc.h
5869@@ -164,11 +164,17 @@ struct ext4_free_extent {
5870 * Locality group:
5871 * we try to group all related changes together
5872 * so that writeback can flush/allocate them together as well
5873+ * The size of the lg_prealloc_list hash is determined by
5874+ * MB_DEFAULT_GROUP_PREALLOC (512). We store a prealloc space into the
5875+ * hash bucket given by the order of its pa_free count, i.e. fls(pa_free) - 1.
5876 */
5877+#define PREALLOC_TB_SIZE 10
5878 struct ext4_locality_group {
5879 /* for allocator */
5880- struct mutex lg_mutex; /* to serialize allocates */
5881- struct list_head lg_prealloc_list;/* list of preallocations */
5882+ /* to serialize allocates */
5883+ struct mutex lg_mutex;
5884+ /* list of preallocations */
5885+ struct list_head lg_prealloc_list[PREALLOC_TB_SIZE];
5886 spinlock_t lg_prealloc_lock;
5887 };
5888
5889diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
5890index b9e077b..46fc0b5 100644
5891--- a/fs/ext4/migrate.c
5892+++ b/fs/ext4/migrate.c
5893@@ -53,7 +53,8 @@ static int finish_range(handle_t *handle, struct inode *inode,
5894 	 * credit. But below we try not to accumulate too much
5895 * of them by restarting the journal.
5896 */
5897- needed = ext4_ext_calc_credits_for_insert(inode, path);
5898+ needed = ext4_ext_calc_credits_for_single_extent(inode,
5899+ lb->last_block - lb->first_block + 1, path);
5900
5901 /*
5902 	 * Make sure the credit we accumulated is not really high
5903diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
5904index ab16bea..387ad98 100644
5905--- a/fs/ext4/namei.c
5906+++ b/fs/ext4/namei.c
5907@@ -183,6 +183,16 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
5908 struct inode *inode);
5909
5910 /*
5911+ * p is at least 6 bytes before the end of the page
5912+ */
5913+static inline struct ext4_dir_entry_2 *
5914+ext4_next_entry(struct ext4_dir_entry_2 *p)
5915+{
5916+ return (struct ext4_dir_entry_2 *)((char *)p +
5917+ ext4_rec_len_from_disk(p->rec_len));
5918+}
5919+
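
ext4_next_entry() advances through a directory block purely by each entry's rec_len. A user-space toy of that traversal; struct dirent_toy is a made-up stand-in for struct ext4_dir_entry_2 and ignores on-disk endianness:

#include <stdio.h>
#include <stdint.h>
#include <string.h>

struct dirent_toy {			/* made-up stand-in, 16 bytes */
	uint16_t rec_len;
	char name[14];
};

int main(void)
{
	unsigned char block[48] = { 0 };
	struct dirent_toy *de = (struct dirent_toy *)block;

	de->rec_len = 16;
	strcpy(de->name, "a");
	de = (struct dirent_toy *)((char *)de + de->rec_len);
	de->rec_len = 32;		/* last entry covers the rest */
	strcpy(de->name, "bb");

	for (de = (struct dirent_toy *)block;
	     (unsigned char *)de < block + 48;
	     de = (struct dirent_toy *)((char *)de + de->rec_len))
		printf("entry '%s' rec_len %u\n", de->name,
		       (unsigned int)de->rec_len);
	return 0;
}
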
5920+/*
5921 * Future: use high four bits of block for coalesce-on-delete flags
5922 * Mask them off for now.
5923 */
5924@@ -231,13 +241,13 @@ static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize)
5925 {
5926 unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) -
5927 EXT4_DIR_REC_LEN(2) - infosize;
5928- return 0? 20: entry_space / sizeof(struct dx_entry);
5929+ return entry_space / sizeof(struct dx_entry);
5930 }
5931
5932 static inline unsigned dx_node_limit (struct inode *dir)
5933 {
5934 unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0);
5935- return 0? 22: entry_space / sizeof(struct dx_entry);
5936+ return entry_space / sizeof(struct dx_entry);
5937 }
5938
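
With the dead "0 ? 20 :" and "0 ? 22 :" debug branches removed above, the limits are pure arithmetic. A sketch of the htree fanout they yield for a 4 KiB block, assuming sizeof(struct dx_entry) == 8 and the classic EXT4_DIR_REC_LEN(n) = (8 + n + 3) & ~3:

#include <stdio.h>

/* EXT4_DIR_REC_LEN(n) = (8 + n + 3) & ~3 */
static unsigned rec_len(unsigned name_len)
{
	return (8 + name_len + 3) & ~3u;
}

int main(void)
{
	unsigned bs = 4096, dx_entry = 8, infosize = 8;

	/* "." and ".." entries plus dx_root_info precede the root index */
	printf("root limit: %u\n",
	       (bs - rec_len(1) - rec_len(2) - infosize) / dx_entry); /* 508 */
	printf("node limit: %u\n", (bs - rec_len(0)) / dx_entry);     /* 511 */
	return 0;
}
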
5939 /*
5940@@ -554,15 +564,6 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash,
5941
5942
5943 /*
5944- * p is at least 6 bytes before the end of page
5945- */
5946-static inline struct ext4_dir_entry_2 *ext4_next_entry(struct ext4_dir_entry_2 *p)
5947-{
5948- return (struct ext4_dir_entry_2 *)((char *)p +
5949- ext4_rec_len_from_disk(p->rec_len));
5950-}
5951-
5952-/*
5953 * This function fills a red-black tree with information from a
5954 * directory block. It returns the number directory entries loaded
5955 * into the tree. If there is an error it is returned in err.
5956@@ -993,19 +994,21 @@ static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry,
5957 de = (struct ext4_dir_entry_2 *) bh->b_data;
5958 top = (struct ext4_dir_entry_2 *) ((char *) de + sb->s_blocksize -
5959 EXT4_DIR_REC_LEN(0));
5960- for (; de < top; de = ext4_next_entry(de))
5961- if (ext4_match (namelen, name, de)) {
5962- if (!ext4_check_dir_entry("ext4_find_entry",
5963- dir, de, bh,
5964- (block<<EXT4_BLOCK_SIZE_BITS(sb))
5965- +((char *)de - bh->b_data))) {
5966- brelse (bh);
5967+ for (; de < top; de = ext4_next_entry(de)) {
5968+ int off = (block << EXT4_BLOCK_SIZE_BITS(sb))
5969+ + ((char *) de - bh->b_data);
5970+
5971+ if (!ext4_check_dir_entry(__func__, dir, de, bh, off)) {
5972+ brelse(bh);
5973 *err = ERR_BAD_DX_DIR;
5974 goto errout;
5975 }
5976- *res_dir = de;
5977- dx_release (frames);
5978- return bh;
5979+
5980+ if (ext4_match(namelen, name, de)) {
5981+ *res_dir = de;
5982+ dx_release(frames);
5983+ return bh;
5984+ }
5985 }
5986 brelse (bh);
5987 /* Check to see if we should continue to search */
5988diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
5989index 9ff7b1c..b3d3560 100644
5990--- a/fs/ext4/resize.c
5991+++ b/fs/ext4/resize.c
5992@@ -73,7 +73,7 @@ static int verify_group_input(struct super_block *sb,
5993 "Inode bitmap not in group (block %llu)",
5994 (unsigned long long)input->inode_bitmap);
5995 else if (outside(input->inode_table, start, end) ||
5996- outside(itend - 1, start, end))
5997+ outside(itend - 1, start, end))
5998 ext4_warning(sb, __func__,
5999 "Inode table not in group (blocks %llu-%llu)",
6000 (unsigned long long)input->inode_table, itend - 1);
6001@@ -104,7 +104,7 @@ static int verify_group_input(struct super_block *sb,
6002 (unsigned long long)input->inode_bitmap,
6003 start, metaend - 1);
6004 else if (inside(input->inode_table, start, metaend) ||
6005- inside(itend - 1, start, metaend))
6006+ inside(itend - 1, start, metaend))
6007 ext4_warning(sb, __func__,
6008 			     "Inode table (%llu-%llu) overlaps "
6009 "GDT table (%llu-%llu)",
6010@@ -158,9 +158,9 @@ static int extend_or_restart_transaction(handle_t *handle, int thresh,
6011 if (err) {
6012 if ((err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA)))
6013 return err;
6014- if ((err = ext4_journal_get_write_access(handle, bh)))
6015+ if ((err = ext4_journal_get_write_access(handle, bh)))
6016 return err;
6017- }
6018+ }
6019
6020 return 0;
6021 }
6022@@ -416,11 +416,11 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
6023 "EXT4-fs: ext4_add_new_gdb: adding group block %lu\n",
6024 gdb_num);
6025
6026- /*
6027- * If we are not using the primary superblock/GDT copy don't resize,
6028- * because the user tools have no way of handling this. Probably a
6029- * bad time to do it anyways.
6030- */
6031+ /*
6032+ * If we are not using the primary superblock/GDT copy don't resize,
6033+ * because the user tools have no way of handling this. Probably a
6034+ * bad time to do it anyways.
6035+ */
6036 if (EXT4_SB(sb)->s_sbh->b_blocknr !=
6037 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) {
6038 ext4_warning(sb, __func__,
6039@@ -507,14 +507,14 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
6040 return 0;
6041
6042 exit_inode:
6043- //ext4_journal_release_buffer(handle, iloc.bh);
6044+ /* ext4_journal_release_buffer(handle, iloc.bh); */
6045 brelse(iloc.bh);
6046 exit_dindj:
6047- //ext4_journal_release_buffer(handle, dind);
6048+ /* ext4_journal_release_buffer(handle, dind); */
6049 exit_primary:
6050- //ext4_journal_release_buffer(handle, *primary);
6051+ /* ext4_journal_release_buffer(handle, *primary); */
6052 exit_sbh:
6053- //ext4_journal_release_buffer(handle, *primary);
6054+ /* ext4_journal_release_buffer(handle, *primary); */
6055 exit_dind:
6056 brelse(dind);
6057 exit_bh:
6058@@ -773,7 +773,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
6059
6060 if (reserved_gdb || gdb_off == 0) {
6061 if (!EXT4_HAS_COMPAT_FEATURE(sb,
6062- EXT4_FEATURE_COMPAT_RESIZE_INODE)){
6063+ EXT4_FEATURE_COMPAT_RESIZE_INODE)
6064+ || !le16_to_cpu(es->s_reserved_gdt_blocks)) {
6065 ext4_warning(sb, __func__,
6066 "No reserved GDT blocks, can't resize");
6067 return -EPERM;
6068@@ -818,12 +819,12 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
6069 if ((err = ext4_journal_get_write_access(handle, sbi->s_sbh)))
6070 goto exit_journal;
6071
6072- /*
6073- * We will only either add reserved group blocks to a backup group
6074- * or remove reserved blocks for the first group in a new group block.
6075- * Doing both would be mean more complex code, and sane people don't
6076- * use non-sparse filesystems anymore. This is already checked above.
6077- */
6078+ /*
6079+ * We will only either add reserved group blocks to a backup group
6080+ * or remove reserved blocks for the first group in a new group block.
6081+	 * Doing both would mean more complex code, and sane people don't
6082+ * use non-sparse filesystems anymore. This is already checked above.
6083+ */
6084 if (gdb_off) {
6085 primary = sbi->s_group_desc[gdb_num];
6086 if ((err = ext4_journal_get_write_access(handle, primary)))
6087@@ -835,24 +836,24 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
6088 } else if ((err = add_new_gdb(handle, inode, input, &primary)))
6089 goto exit_journal;
6090
6091- /*
6092- * OK, now we've set up the new group. Time to make it active.
6093- *
6094- * Current kernels don't lock all allocations via lock_super(),
6095- * so we have to be safe wrt. concurrent accesses the group
6096- * data. So we need to be careful to set all of the relevant
6097- * group descriptor data etc. *before* we enable the group.
6098- *
6099- * The key field here is sbi->s_groups_count: as long as
6100- * that retains its old value, nobody is going to access the new
6101- * group.
6102- *
6103- * So first we update all the descriptor metadata for the new
6104- * group; then we update the total disk blocks count; then we
6105- * update the groups count to enable the group; then finally we
6106- * update the free space counts so that the system can start
6107- * using the new disk blocks.
6108- */
6109+ /*
6110+ * OK, now we've set up the new group. Time to make it active.
6111+ *
6112+ * Current kernels don't lock all allocations via lock_super(),
6113+ * so we have to be safe wrt. concurrent accesses the group
6114+ * data. So we need to be careful to set all of the relevant
6115+ * group descriptor data etc. *before* we enable the group.
6116+ *
6117+ * The key field here is sbi->s_groups_count: as long as
6118+ * that retains its old value, nobody is going to access the new
6119+ * group.
6120+ *
6121+ * So first we update all the descriptor metadata for the new
6122+ * group; then we update the total disk blocks count; then we
6123+ * update the groups count to enable the group; then finally we
6124+ * update the free space counts so that the system can start
6125+ * using the new disk blocks.
6126+ */
6127
6128 /* Update group descriptor block for new group */
6129 gdp = (struct ext4_group_desc *)((char *)primary->b_data +
6130@@ -866,6 +867,15 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
6131 gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp);
6132
6133 /*
6134+ * We can allocate memory for mb_alloc based on the new group
6135+ * descriptor
6136+ */
6137+ if (test_opt(sb, MBALLOC)) {
6138+ err = ext4_mb_add_more_groupinfo(sb, input->group, gdp);
6139+ if (err)
6140+ goto exit_journal;
6141+ }
6142+ /*
6143 * Make the new blocks and inodes valid next. We do this before
6144 * increasing the group count so that once the group is enabled,
6145 * all of its blocks and inodes are already valid.
6146@@ -937,7 +947,8 @@ exit_put:
6147 return err;
6148 } /* ext4_group_add */
6149
6150-/* Extend the filesystem to the new number of blocks specified. This entry
6151+/*
6152+ * Extend the filesystem to the new number of blocks specified. This entry
6153 * point is only used to extend the current filesystem to the end of the last
6154 * existing group. It can be accessed via ioctl, or by "remount,resize=<size>"
6155 * for emergencies (because it has no dependencies on reserved blocks).
6156@@ -957,6 +968,8 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
6157 handle_t *handle;
6158 int err;
6159 unsigned long freed_blocks;
6160+ ext4_group_t group;
6161+ struct ext4_group_info *grp;
6162
6163 /* We don't need to worry about locking wrt other resizers just
6164 * yet: we're going to revalidate es->s_blocks_count after
6165@@ -988,7 +1001,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
6166 }
6167
6168 /* Handle the remaining blocks in the last group only. */
6169- ext4_get_group_no_and_offset(sb, o_blocks_count, NULL, &last);
6170+ ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last);
6171
6172 if (last == 0) {
6173 ext4_warning(sb, __func__,
6174@@ -1013,7 +1026,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
6175 o_blocks_count + add, add);
6176
6177 /* See if the device is actually as big as what was requested */
6178- bh = sb_bread(sb, o_blocks_count + add -1);
6179+ bh = sb_bread(sb, o_blocks_count + add - 1);
6180 if (!bh) {
6181 ext4_warning(sb, __func__,
6182 "can't read last block, resize aborted");
6183@@ -1060,6 +1073,45 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
6184 o_blocks_count + add);
6185 if ((err = ext4_journal_stop(handle)))
6186 goto exit_put;
6187+
6188+ /*
6189+ * Mark mballoc pages as not up to date so that they will be updated
6190+ * next time they are loaded by ext4_mb_load_buddy.
6191+ */
6192+ if (test_opt(sb, MBALLOC)) {
6193+ struct ext4_sb_info *sbi = EXT4_SB(sb);
6194+ struct inode *inode = sbi->s_buddy_cache;
6195+ int blocks_per_page;
6196+ int block;
6197+ int pnum;
6198+ struct page *page;
6199+
6200+ /* Set buddy page as not up to date */
6201+ blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
6202+ block = group * 2;
6203+ pnum = block / blocks_per_page;
6204+ page = find_get_page(inode->i_mapping, pnum);
6205+ if (page != NULL) {
6206+ ClearPageUptodate(page);
6207+ page_cache_release(page);
6208+ }
6209+
6210+ /* Set bitmap page as not up to date */
6211+ block++;
6212+ pnum = block / blocks_per_page;
6213+ page = find_get_page(inode->i_mapping, pnum);
6214+ if (page != NULL) {
6215+ ClearPageUptodate(page);
6216+ page_cache_release(page);
6217+ }
6218+
6219+ /* Get the info on the last group */
6220+ grp = ext4_get_group_info(sb, group);
6221+
6222+ /* Update free blocks in group info */
6223+ ext4_mb_update_group_info(grp, add);
6224+ }
6225+
6226 if (test_opt(sb, DEBUG))
6227 printk(KERN_DEBUG "EXT4-fs: extended group to %llu blocks\n",
6228 ext4_blocks_count(es));
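
The sb_bread() of block o_blocks_count + add - 1 in ext4_group_extend() above doubles as a device-size probe: if the last block of the grown filesystem cannot be read, the resize is refused. A toy version of that bounds check:

#include <stdio.h>

int main(void)
{
	unsigned long long device_blocks = 1000000;	/* assumed device size */
	unsigned long long o_blocks_count = 950000, add = 60000;

	if (o_blocks_count + add - 1 >= device_blocks)
		printf("can't read last block, resize aborted\n");
	else
		printf("device is large enough, extending\n");
	return 0;
}
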
6229diff --git a/fs/ext4/super.c b/fs/ext4/super.c
6230index 02bf243..ed80f9f 100644
6231--- a/fs/ext4/super.c
6232+++ b/fs/ext4/super.c
6233@@ -49,20 +49,19 @@ static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
6234 unsigned long journal_devnum);
6235 static int ext4_create_journal(struct super_block *, struct ext4_super_block *,
6236 unsigned int);
6237-static void ext4_commit_super (struct super_block * sb,
6238- struct ext4_super_block * es,
6239- int sync);
6240-static void ext4_mark_recovery_complete(struct super_block * sb,
6241- struct ext4_super_block * es);
6242-static void ext4_clear_journal_err(struct super_block * sb,
6243- struct ext4_super_block * es);
6244+static void ext4_commit_super(struct super_block *sb,
6245+ struct ext4_super_block *es, int sync);
6246+static void ext4_mark_recovery_complete(struct super_block *sb,
6247+ struct ext4_super_block *es);
6248+static void ext4_clear_journal_err(struct super_block *sb,
6249+ struct ext4_super_block *es);
6250 static int ext4_sync_fs(struct super_block *sb, int wait);
6251-static const char *ext4_decode_error(struct super_block * sb, int errno,
6252+static const char *ext4_decode_error(struct super_block *sb, int errno,
6253 char nbuf[16]);
6254-static int ext4_remount (struct super_block * sb, int * flags, char * data);
6255-static int ext4_statfs (struct dentry * dentry, struct kstatfs * buf);
6256+static int ext4_remount(struct super_block *sb, int *flags, char *data);
6257+static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf);
6258 static void ext4_unlockfs(struct super_block *sb);
6259-static void ext4_write_super (struct super_block * sb);
6260+static void ext4_write_super(struct super_block *sb);
6261 static void ext4_write_super_lockfs(struct super_block *sb);
6262
6263
6264@@ -211,15 +210,15 @@ static void ext4_handle_error(struct super_block *sb)
6265 if (sb->s_flags & MS_RDONLY)
6266 return;
6267
6268- if (!test_opt (sb, ERRORS_CONT)) {
6269+ if (!test_opt(sb, ERRORS_CONT)) {
6270 journal_t *journal = EXT4_SB(sb)->s_journal;
6271
6272 EXT4_SB(sb)->s_mount_opt |= EXT4_MOUNT_ABORT;
6273 if (journal)
6274 jbd2_journal_abort(journal, -EIO);
6275 }
6276- if (test_opt (sb, ERRORS_RO)) {
6277- printk (KERN_CRIT "Remounting filesystem read-only\n");
6278+ if (test_opt(sb, ERRORS_RO)) {
6279+ printk(KERN_CRIT "Remounting filesystem read-only\n");
6280 sb->s_flags |= MS_RDONLY;
6281 }
6282 ext4_commit_super(sb, es, 1);
6283@@ -228,13 +227,13 @@ static void ext4_handle_error(struct super_block *sb)
6284 sb->s_id);
6285 }
6286
6287-void ext4_error (struct super_block * sb, const char * function,
6288- const char * fmt, ...)
6289+void ext4_error(struct super_block *sb, const char *function,
6290+ const char *fmt, ...)
6291 {
6292 va_list args;
6293
6294 va_start(args, fmt);
6295- printk(KERN_CRIT "EXT4-fs error (device %s): %s: ",sb->s_id, function);
6296+ printk(KERN_CRIT "EXT4-fs error (device %s): %s: ", sb->s_id, function);
6297 vprintk(fmt, args);
6298 printk("\n");
6299 va_end(args);
6300@@ -242,7 +241,7 @@ void ext4_error (struct super_block * sb, const char * function,
6301 ext4_handle_error(sb);
6302 }
6303
6304-static const char *ext4_decode_error(struct super_block * sb, int errno,
6305+static const char *ext4_decode_error(struct super_block *sb, int errno,
6306 char nbuf[16])
6307 {
6308 char *errstr = NULL;
6309@@ -278,8 +277,7 @@ static const char *ext4_decode_error(struct super_block * sb, int errno,
6310 /* __ext4_std_error decodes expected errors from journaling functions
6311 * automatically and invokes the appropriate error response. */
6312
6313-void __ext4_std_error (struct super_block * sb, const char * function,
6314- int errno)
6315+void __ext4_std_error(struct super_block *sb, const char *function, int errno)
6316 {
6317 char nbuf[16];
6318 const char *errstr;
6319@@ -292,8 +290,8 @@ void __ext4_std_error (struct super_block * sb, const char * function,
6320 return;
6321
6322 errstr = ext4_decode_error(sb, errno, nbuf);
6323- printk (KERN_CRIT "EXT4-fs error (device %s) in %s: %s\n",
6324- sb->s_id, function, errstr);
6325+ printk(KERN_CRIT "EXT4-fs error (device %s) in %s: %s\n",
6326+ sb->s_id, function, errstr);
6327
6328 ext4_handle_error(sb);
6329 }
6330@@ -308,15 +306,15 @@ void __ext4_std_error (struct super_block * sb, const char * function,
6331 * case we take the easy way out and panic immediately.
6332 */
6333
6334-void ext4_abort (struct super_block * sb, const char * function,
6335- const char * fmt, ...)
6336+void ext4_abort(struct super_block *sb, const char *function,
6337+ const char *fmt, ...)
6338 {
6339 va_list args;
6340
6341- printk (KERN_CRIT "ext4_abort called.\n");
6342+ printk(KERN_CRIT "ext4_abort called.\n");
6343
6344 va_start(args, fmt);
6345- printk(KERN_CRIT "EXT4-fs error (device %s): %s: ",sb->s_id, function);
6346+ printk(KERN_CRIT "EXT4-fs error (device %s): %s: ", sb->s_id, function);
6347 vprintk(fmt, args);
6348 printk("\n");
6349 va_end(args);
6350@@ -334,8 +332,8 @@ void ext4_abort (struct super_block * sb, const char * function,
6351 jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
6352 }
6353
6354-void ext4_warning (struct super_block * sb, const char * function,
6355- const char * fmt, ...)
6356+void ext4_warning(struct super_block *sb, const char *function,
6357+ const char *fmt, ...)
6358 {
6359 va_list args;
6360
6361@@ -496,7 +494,7 @@ static void dump_orphan_list(struct super_block *sb, struct ext4_sb_info *sbi)
6362 }
6363 }
6364
6365-static void ext4_put_super (struct super_block * sb)
6366+static void ext4_put_super(struct super_block *sb)
6367 {
6368 struct ext4_sb_info *sbi = EXT4_SB(sb);
6369 struct ext4_super_block *es = sbi->s_es;
6370@@ -506,6 +504,7 @@ static void ext4_put_super (struct super_block * sb)
6371 ext4_ext_release(sb);
6372 ext4_xattr_put_super(sb);
6373 jbd2_journal_destroy(sbi->s_journal);
6374+ sbi->s_journal = NULL;
6375 if (!(sb->s_flags & MS_RDONLY)) {
6376 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
6377 es->s_state = cpu_to_le16(sbi->s_mount_state);
6378@@ -517,6 +516,7 @@ static void ext4_put_super (struct super_block * sb)
6379 for (i = 0; i < sbi->s_gdb_count; i++)
6380 brelse(sbi->s_group_desc[i]);
6381 kfree(sbi->s_group_desc);
6382+ kfree(sbi->s_flex_groups);
6383 percpu_counter_destroy(&sbi->s_freeblocks_counter);
6384 percpu_counter_destroy(&sbi->s_freeinodes_counter);
6385 percpu_counter_destroy(&sbi->s_dirs_counter);
6386@@ -568,9 +568,16 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
6387 #endif
6388 ei->i_block_alloc_info = NULL;
6389 ei->vfs_inode.i_version = 1;
6390+ ei->vfs_inode.i_data.writeback_index = 0;
6391 memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache));
6392 INIT_LIST_HEAD(&ei->i_prealloc_list);
6393 spin_lock_init(&ei->i_prealloc_lock);
6394+ jbd2_journal_init_jbd_inode(&ei->jinode, &ei->vfs_inode);
6395+ ei->i_reserved_data_blocks = 0;
6396+ ei->i_reserved_meta_blocks = 0;
6397+ ei->i_allocated_meta_blocks = 0;
6398+ ei->i_delalloc_reserved_flag = 0;
6399+ spin_lock_init(&(ei->i_block_reservation_lock));
6400 return &ei->vfs_inode;
6401 }
6402
6403@@ -635,9 +642,12 @@ static void ext4_clear_inode(struct inode *inode)
6404 EXT4_I(inode)->i_block_alloc_info = NULL;
6405 if (unlikely(rsv))
6406 kfree(rsv);
6407+ jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal,
6408+ &EXT4_I(inode)->jinode);
6409 }
6410
6411-static inline void ext4_show_quota_options(struct seq_file *seq, struct super_block *sb)
6412+static inline void ext4_show_quota_options(struct seq_file *seq,
6413+ struct super_block *sb)
6414 {
6415 #if defined(CONFIG_QUOTA)
6416 struct ext4_sb_info *sbi = EXT4_SB(sb);
6417@@ -671,7 +681,6 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
6418 unsigned long def_mount_opts;
6419 struct super_block *sb = vfs->mnt_sb;
6420 struct ext4_sb_info *sbi = EXT4_SB(sb);
6421- journal_t *journal = sbi->s_journal;
6422 struct ext4_super_block *es = sbi->s_es;
6423
6424 def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
6425@@ -747,6 +756,9 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
6426 seq_puts(seq, ",nomballoc");
6427 if (test_opt(sb, I_VERSION))
6428 seq_puts(seq, ",i_version");
6429+ if (!test_opt(sb, DELALLOC))
6430+ seq_puts(seq, ",nodelalloc");
6431+
6432
6433 if (sbi->s_stripe)
6434 seq_printf(seq, ",stripe=%lu", sbi->s_stripe);
6435@@ -810,8 +822,8 @@ static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid,
6436 }
6437
6438 #ifdef CONFIG_QUOTA
6439-#define QTYPE2NAME(t) ((t)==USRQUOTA?"user":"group")
6440-#define QTYPE2MOPT(on, t) ((t)==USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA))
6441+#define QTYPE2NAME(t) ((t) == USRQUOTA?"user":"group")
6442+#define QTYPE2MOPT(on, t) ((t) == USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA))
6443
6444 static int ext4_dquot_initialize(struct inode *inode, int type);
6445 static int ext4_dquot_drop(struct inode *inode);
6446@@ -894,7 +906,7 @@ enum {
6447 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
6448 Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
6449 Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version,
6450- Opt_mballoc, Opt_nomballoc, Opt_stripe,
6451+ Opt_mballoc, Opt_nomballoc, Opt_stripe, Opt_delalloc, Opt_nodelalloc,
6452 };
6453
6454 static match_table_t tokens = {
6455@@ -953,6 +965,8 @@ static match_table_t tokens = {
6456 {Opt_nomballoc, "nomballoc"},
6457 {Opt_stripe, "stripe=%u"},
6458 {Opt_resize, "resize"},
6459+ {Opt_delalloc, "delalloc"},
6460+ {Opt_nodelalloc, "nodelalloc"},
6461 {Opt_err, NULL},
6462 };
6463
6464@@ -977,12 +991,12 @@ static ext4_fsblk_t get_sb_block(void **data)
6465 return sb_block;
6466 }
6467
6468-static int parse_options (char *options, struct super_block *sb,
6469- unsigned int *inum, unsigned long *journal_devnum,
6470- ext4_fsblk_t *n_blocks_count, int is_remount)
6471+static int parse_options(char *options, struct super_block *sb,
6472+ unsigned int *inum, unsigned long *journal_devnum,
6473+ ext4_fsblk_t *n_blocks_count, int is_remount)
6474 {
6475 struct ext4_sb_info *sbi = EXT4_SB(sb);
6476- char * p;
6477+ char *p;
6478 substring_t args[MAX_OPT_ARGS];
6479 int data_opt = 0;
6480 int option;
6481@@ -990,11 +1004,12 @@ static int parse_options (char *options, struct super_block *sb,
6482 int qtype, qfmt;
6483 char *qname;
6484 #endif
6485+ ext4_fsblk_t last_block;
6486
6487 if (!options)
6488 return 1;
6489
6490- while ((p = strsep (&options, ",")) != NULL) {
6491+ while ((p = strsep(&options, ",")) != NULL) {
6492 int token;
6493 if (!*p)
6494 continue;
6495@@ -1002,16 +1017,16 @@ static int parse_options (char *options, struct super_block *sb,
6496 token = match_token(p, tokens, args);
6497 switch (token) {
6498 case Opt_bsd_df:
6499- clear_opt (sbi->s_mount_opt, MINIX_DF);
6500+ clear_opt(sbi->s_mount_opt, MINIX_DF);
6501 break;
6502 case Opt_minix_df:
6503- set_opt (sbi->s_mount_opt, MINIX_DF);
6504+ set_opt(sbi->s_mount_opt, MINIX_DF);
6505 break;
6506 case Opt_grpid:
6507- set_opt (sbi->s_mount_opt, GRPID);
6508+ set_opt(sbi->s_mount_opt, GRPID);
6509 break;
6510 case Opt_nogrpid:
6511- clear_opt (sbi->s_mount_opt, GRPID);
6512+ clear_opt(sbi->s_mount_opt, GRPID);
6513 break;
6514 case Opt_resuid:
6515 if (match_int(&args[0], &option))
6516@@ -1028,41 +1043,41 @@ static int parse_options (char *options, struct super_block *sb,
6517 /* *sb_block = match_int(&args[0]); */
6518 break;
6519 case Opt_err_panic:
6520- clear_opt (sbi->s_mount_opt, ERRORS_CONT);
6521- clear_opt (sbi->s_mount_opt, ERRORS_RO);
6522- set_opt (sbi->s_mount_opt, ERRORS_PANIC);
6523+ clear_opt(sbi->s_mount_opt, ERRORS_CONT);
6524+ clear_opt(sbi->s_mount_opt, ERRORS_RO);
6525+ set_opt(sbi->s_mount_opt, ERRORS_PANIC);
6526 break;
6527 case Opt_err_ro:
6528- clear_opt (sbi->s_mount_opt, ERRORS_CONT);
6529- clear_opt (sbi->s_mount_opt, ERRORS_PANIC);
6530- set_opt (sbi->s_mount_opt, ERRORS_RO);
6531+ clear_opt(sbi->s_mount_opt, ERRORS_CONT);
6532+ clear_opt(sbi->s_mount_opt, ERRORS_PANIC);
6533+ set_opt(sbi->s_mount_opt, ERRORS_RO);
6534 break;
6535 case Opt_err_cont:
6536- clear_opt (sbi->s_mount_opt, ERRORS_RO);
6537- clear_opt (sbi->s_mount_opt, ERRORS_PANIC);
6538- set_opt (sbi->s_mount_opt, ERRORS_CONT);
6539+ clear_opt(sbi->s_mount_opt, ERRORS_RO);
6540+ clear_opt(sbi->s_mount_opt, ERRORS_PANIC);
6541+ set_opt(sbi->s_mount_opt, ERRORS_CONT);
6542 break;
6543 case Opt_nouid32:
6544- set_opt (sbi->s_mount_opt, NO_UID32);
6545+ set_opt(sbi->s_mount_opt, NO_UID32);
6546 break;
6547 case Opt_nocheck:
6548- clear_opt (sbi->s_mount_opt, CHECK);
6549+ clear_opt(sbi->s_mount_opt, CHECK);
6550 break;
6551 case Opt_debug:
6552- set_opt (sbi->s_mount_opt, DEBUG);
6553+ set_opt(sbi->s_mount_opt, DEBUG);
6554 break;
6555 case Opt_oldalloc:
6556- set_opt (sbi->s_mount_opt, OLDALLOC);
6557+ set_opt(sbi->s_mount_opt, OLDALLOC);
6558 break;
6559 case Opt_orlov:
6560- clear_opt (sbi->s_mount_opt, OLDALLOC);
6561+ clear_opt(sbi->s_mount_opt, OLDALLOC);
6562 break;
6563 #ifdef CONFIG_EXT4DEV_FS_XATTR
6564 case Opt_user_xattr:
6565- set_opt (sbi->s_mount_opt, XATTR_USER);
6566+ set_opt(sbi->s_mount_opt, XATTR_USER);
6567 break;
6568 case Opt_nouser_xattr:
6569- clear_opt (sbi->s_mount_opt, XATTR_USER);
6570+ clear_opt(sbi->s_mount_opt, XATTR_USER);
6571 break;
6572 #else
6573 case Opt_user_xattr:
6574@@ -1100,7 +1115,7 @@ static int parse_options (char *options, struct super_block *sb,
6575 "journal on remount\n");
6576 return 0;
6577 }
6578- set_opt (sbi->s_mount_opt, UPDATE_JOURNAL);
6579+ set_opt(sbi->s_mount_opt, UPDATE_JOURNAL);
6580 break;
6581 case Opt_journal_inum:
6582 if (is_remount) {
6583@@ -1130,7 +1145,7 @@ static int parse_options (char *options, struct super_block *sb,
6584 set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM);
6585 break;
6586 case Opt_noload:
6587- set_opt (sbi->s_mount_opt, NOLOAD);
6588+ set_opt(sbi->s_mount_opt, NOLOAD);
6589 break;
6590 case Opt_commit:
6591 if (match_int(&args[0], &option))
6592@@ -1309,15 +1324,39 @@ set_qf_format:
6593 clear_opt(sbi->s_mount_opt, NOBH);
6594 break;
6595 case Opt_extents:
6596- set_opt (sbi->s_mount_opt, EXTENTS);
6597+ if (!EXT4_HAS_INCOMPAT_FEATURE(sb,
6598+ EXT4_FEATURE_INCOMPAT_EXTENTS)) {
6599+ ext4_warning(sb, __func__,
6600+ "extents feature not enabled "
6601+ "on this filesystem, use tune2fs\n");
6602+ return 0;
6603+ }
6604+ set_opt(sbi->s_mount_opt, EXTENTS);
6605 break;
6606 case Opt_noextents:
6607- clear_opt (sbi->s_mount_opt, EXTENTS);
6608+ /*
6609+ * When e2fsprogs support resizing an already existing
6610+ * ext3 file system to greater than 2**32 we need to
6611+ * add support to block allocator to handle growing
6612+ * already existing block mapped inode so that blocks
6613+ * allocated for them fall within 2**32
6614+ */
6615+ last_block = ext4_blocks_count(sbi->s_es) - 1;
6616+ if (last_block > 0xffffffffULL) {
6617+ printk(KERN_ERR "EXT4-fs: Filesystem too "
6618+ "large to mount with "
6619+					"-o noextents option\n");
6620+ return 0;
6621+ }
6622+ clear_opt(sbi->s_mount_opt, EXTENTS);
6623 break;
6624 case Opt_i_version:
6625 set_opt(sbi->s_mount_opt, I_VERSION);
6626 sb->s_flags |= MS_I_VERSION;
6627 break;
6628+ case Opt_nodelalloc:
6629+ clear_opt(sbi->s_mount_opt, DELALLOC);
6630+ break;
6631 case Opt_mballoc:
6632 set_opt(sbi->s_mount_opt, MBALLOC);
6633 break;
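
The Opt_noextents guard a few cases above enforces the 2^32 block-number limit of block-mapped files; with 4 KiB blocks that is a 16 TiB ceiling. A small sketch of the same test:

#include <stdio.h>

int main(void)
{
	unsigned long long blocks = 5ULL << 30;		/* 5 Gi blocks */
	unsigned long long last_block = blocks - 1;

	if (last_block > 0xffffffffULL)
		printf("too large to mount with -o noextents\n");
	return 0;
}
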
6634@@ -1331,10 +1370,13 @@ set_qf_format:
6635 return 0;
6636 sbi->s_stripe = option;
6637 break;
6638+ case Opt_delalloc:
6639+ set_opt(sbi->s_mount_opt, DELALLOC);
6640+ break;
6641 default:
6642- printk (KERN_ERR
6643- "EXT4-fs: Unrecognized mount option \"%s\" "
6644- "or missing value\n", p);
6645+ printk(KERN_ERR
6646+ "EXT4-fs: Unrecognized mount option \"%s\" "
6647+ "or missing value\n", p);
6648 return 0;
6649 }
6650 }
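
The Opt_noextents case added above refuses the option when the filesystem's last block does not fit in 32 bits, since block-mapped (non-extent) inodes can only address 2**32 blocks. A minimal userspace sketch of that bound check follows; ext4_fsblk_t and the sample block counts here are simplified stand-ins, not taken from the kernel tree.

#include <stdint.h>
#include <stdio.h>

typedef uint64_t ext4_fsblk_t;	/* stand-in for the kernel typedef */

/* A block-mapped inode stores 32-bit block numbers, so disabling
 * extents is only safe when the highest block fits in 32 bits. */
static int noextents_mountable(ext4_fsblk_t blocks_count)
{
	ext4_fsblk_t last_block = blocks_count - 1;

	return last_block <= 0xffffffffULL;
}

int main(void)
{
	printf("%d\n", noextents_mountable(1ULL << 29));	/* 1: ok */
	printf("%d\n", noextents_mountable(1ULL << 33));	/* 0: refuse */
	return 0;
}
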
6651@@ -1381,31 +1423,31 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
6652 int res = 0;
6653
6654 if (le32_to_cpu(es->s_rev_level) > EXT4_MAX_SUPP_REV) {
6655- printk (KERN_ERR "EXT4-fs warning: revision level too high, "
6656- "forcing read-only mode\n");
6657+ printk(KERN_ERR "EXT4-fs warning: revision level too high, "
6658+ "forcing read-only mode\n");
6659 res = MS_RDONLY;
6660 }
6661 if (read_only)
6662 return res;
6663 if (!(sbi->s_mount_state & EXT4_VALID_FS))
6664- printk (KERN_WARNING "EXT4-fs warning: mounting unchecked fs, "
6665- "running e2fsck is recommended\n");
6666+ printk(KERN_WARNING "EXT4-fs warning: mounting unchecked fs, "
6667+ "running e2fsck is recommended\n");
6668 else if ((sbi->s_mount_state & EXT4_ERROR_FS))
6669- printk (KERN_WARNING
6670- "EXT4-fs warning: mounting fs with errors, "
6671- "running e2fsck is recommended\n");
6672+ printk(KERN_WARNING
6673+ "EXT4-fs warning: mounting fs with errors, "
6674+ "running e2fsck is recommended\n");
6675 else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 &&
6676 le16_to_cpu(es->s_mnt_count) >=
6677 (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count))
6678- printk (KERN_WARNING
6679- "EXT4-fs warning: maximal mount count reached, "
6680- "running e2fsck is recommended\n");
6681+ printk(KERN_WARNING
6682+ "EXT4-fs warning: maximal mount count reached, "
6683+ "running e2fsck is recommended\n");
6684 else if (le32_to_cpu(es->s_checkinterval) &&
6685 (le32_to_cpu(es->s_lastcheck) +
6686 le32_to_cpu(es->s_checkinterval) <= get_seconds()))
6687- printk (KERN_WARNING
6688- "EXT4-fs warning: checktime reached, "
6689- "running e2fsck is recommended\n");
6690+ printk(KERN_WARNING
6691+ "EXT4-fs warning: checktime reached, "
6692+ "running e2fsck is recommended\n");
6693 #if 0
6694 /* @@@ We _will_ want to clear the valid bit if we find
6695 * inconsistencies, to force a fsck at reboot. But for
6696@@ -1443,6 +1485,53 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
6697 return res;
6698 }
6699
6700+static int ext4_fill_flex_info(struct super_block *sb)
6701+{
6702+ struct ext4_sb_info *sbi = EXT4_SB(sb);
6703+ struct ext4_group_desc *gdp = NULL;
6704+ struct buffer_head *bh;
6705+ ext4_group_t flex_group_count;
6706+ ext4_group_t flex_group;
6707+ int groups_per_flex = 0;
6708+ __u64 block_bitmap = 0;
6709+ int i;
6710+
6711+ if (!sbi->s_es->s_log_groups_per_flex) {
6712+ sbi->s_log_groups_per_flex = 0;
6713+ return 1;
6714+ }
6715+
6716+ sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
6717+ groups_per_flex = 1 << sbi->s_log_groups_per_flex;
6718+
6719+ flex_group_count = (sbi->s_groups_count + groups_per_flex - 1) /
6720+ groups_per_flex;
6721+ sbi->s_flex_groups = kzalloc(flex_group_count *
6722+ sizeof(struct flex_groups), GFP_KERNEL);
6723+ if (sbi->s_flex_groups == NULL) {
6724+ printk(KERN_ERR "EXT4-fs: not enough memory for "
6725+ "%lu flex groups\n", flex_group_count);
6726+ goto failed;
6727+ }
6728+
6729+ gdp = ext4_get_group_desc(sb, 1, &bh);
6730+ block_bitmap = ext4_block_bitmap(sb, gdp) - 1;
6731+
6732+ for (i = 0; i < sbi->s_groups_count; i++) {
6733+ gdp = ext4_get_group_desc(sb, i, &bh);
6734+
6735+ flex_group = ext4_flex_group(sbi, i);
6736+ sbi->s_flex_groups[flex_group].free_inodes +=
6737+ le16_to_cpu(gdp->bg_free_inodes_count);
6738+ sbi->s_flex_groups[flex_group].free_blocks +=
6739+ le16_to_cpu(gdp->bg_free_blocks_count);
6740+ }
6741+
6742+ return 1;
6743+failed:
6744+ return 0;
6745+}
6746+
6747 __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group,
6748 struct ext4_group_desc *gdp)
6749 {
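
ext4_fill_flex_info() above rolls the per-group free-inode and free-block counts up into per-flex-group totals. A hedged userspace sketch of the same aggregation; the two structures are simplified stand-ins for the on-disk group descriptors and the in-memory flex_groups array.

#include <stdlib.h>

struct group_desc { unsigned free_inodes; unsigned free_blocks; };
struct flex_groups { unsigned free_inodes; unsigned free_blocks; };

/* Fold ngroups descriptors into ceil(ngroups / 2^log_per_flex) flex
 * groups, mirroring the summing loop in ext4_fill_flex_info(). */
static struct flex_groups *fill_flex_info(const struct group_desc *gd,
					  unsigned ngroups, int log_per_flex)
{
	unsigned per_flex = 1U << log_per_flex;
	unsigned nflex = (ngroups + per_flex - 1) / per_flex;
	struct flex_groups *fg = calloc(nflex, sizeof(*fg));
	unsigned i;

	if (!fg)
		return NULL;	/* the kernel path jumps to "failed" */
	for (i = 0; i < ngroups; i++) {
		unsigned flex = i >> log_per_flex;	/* ext4_flex_group() */
		fg[flex].free_inodes += gd[i].free_inodes;
		fg[flex].free_blocks += gd[i].free_blocks;
	}
	return fg;
}
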
6750@@ -1507,16 +1596,14 @@ static int ext4_check_descriptors(struct super_block *sb)
6751 (EXT4_BLOCKS_PER_GROUP(sb) - 1);
6752
6753 block_bitmap = ext4_block_bitmap(sb, gdp);
6754- if (block_bitmap < first_block || block_bitmap > last_block)
6755- {
6756+ if (block_bitmap < first_block || block_bitmap > last_block) {
6757 printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
6758 "Block bitmap for group %lu not in group "
6759 "(block %llu)!", i, block_bitmap);
6760 return 0;
6761 }
6762 inode_bitmap = ext4_inode_bitmap(sb, gdp);
6763- if (inode_bitmap < first_block || inode_bitmap > last_block)
6764- {
6765+ if (inode_bitmap < first_block || inode_bitmap > last_block) {
6766 printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
6767 "Inode bitmap for group %lu not in group "
6768 "(block %llu)!", i, inode_bitmap);
6769@@ -1524,26 +1611,28 @@ static int ext4_check_descriptors(struct super_block *sb)
6770 }
6771 inode_table = ext4_inode_table(sb, gdp);
6772 if (inode_table < first_block ||
6773- inode_table + sbi->s_itb_per_group - 1 > last_block)
6774- {
6775+ inode_table + sbi->s_itb_per_group - 1 > last_block) {
6776 printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
6777 "Inode table for group %lu not in group "
6778 "(block %llu)!", i, inode_table);
6779 return 0;
6780 }
6781+ spin_lock(sb_bgl_lock(sbi, i));
6782 if (!ext4_group_desc_csum_verify(sbi, i, gdp)) {
6783 printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
6784 "Checksum for group %lu failed (%u!=%u)\n",
6785 i, le16_to_cpu(ext4_group_desc_csum(sbi, i,
6786 gdp)), le16_to_cpu(gdp->bg_checksum));
6787- return 0;
6788+ if (!(sb->s_flags & MS_RDONLY))
6789+ return 0;
6790 }
6791+ spin_unlock(sb_bgl_lock(sbi, i));
6792 if (!flexbg_flag)
6793 first_block += EXT4_BLOCKS_PER_GROUP(sb);
6794 }
6795
6796 ext4_free_blocks_count_set(sbi->s_es, ext4_count_free_blocks(sb));
6797- sbi->s_es->s_free_inodes_count=cpu_to_le32(ext4_count_free_inodes(sb));
6798+ sbi->s_es->s_free_inodes_count = cpu_to_le32(ext4_count_free_inodes(sb));
6799 return 1;
6800 }
6801
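
ext4_check_descriptors() above repeats one idea three times: a group's block bitmap, inode bitmap, and inode table must all lie inside the group's own block range. A tiny sketch of that range test, with 64-bit block numbers as a simplifying assumption:

#include <stdint.h>

/* Does a run of 'len' metadata blocks starting at 'block' fall inside
 * the group's [first, last] block range? */
static int in_group(uint64_t block, uint64_t len,
		    uint64_t first, uint64_t last)
{
	return block >= first && block + len - 1 <= last;
}

For the two bitmaps len is 1; for the inode table it is s_itb_per_group, as in the checks above.
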
6802@@ -1564,8 +1653,8 @@ static int ext4_check_descriptors(struct super_block *sb)
6803 * e2fsck was run on this filesystem, and it must have already done the orphan
6804 * inode cleanup for us, so we can safely abort without any further action.
6805 */
6806-static void ext4_orphan_cleanup (struct super_block * sb,
6807- struct ext4_super_block * es)
6808+static void ext4_orphan_cleanup(struct super_block *sb,
6809+ struct ext4_super_block *es)
6810 {
6811 unsigned int s_flags = sb->s_flags;
6812 int nr_orphans = 0, nr_truncates = 0;
6813@@ -1642,7 +1731,7 @@ static void ext4_orphan_cleanup (struct super_block * sb,
6814 iput(inode); /* The delete magic happens here! */
6815 }
6816
6817-#define PLURAL(x) (x), ((x)==1) ? "" : "s"
6818+#define PLURAL(x) (x), ((x) == 1) ? "" : "s"
6819
6820 if (nr_orphans)
6821 printk(KERN_INFO "EXT4-fs: %s: %d orphan inode%s deleted\n",
6822@@ -1809,12 +1898,12 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
6823 return 0;
6824 }
6825
6826-static int ext4_fill_super (struct super_block *sb, void *data, int silent)
6827- __releases(kernel_sem)
6828- __acquires(kernel_sem)
6829+static int ext4_fill_super(struct super_block *sb, void *data, int silent)
6830+ __releases(kernel_lock)
6831+ __acquires(kernel_lock)
6832
6833 {
6834- struct buffer_head * bh;
6835+ struct buffer_head *bh;
6836 struct ext4_super_block *es = NULL;
6837 struct ext4_sb_info *sbi;
6838 ext4_fsblk_t block;
6839@@ -1851,11 +1940,6 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
6840 goto out_fail;
6841 }
6842
6843- if (!sb_set_blocksize(sb, blocksize)) {
6844- printk(KERN_ERR "EXT4-fs: bad blocksize %d.\n", blocksize);
6845- goto out_fail;
6846- }
6847-
6848 /*
6849 * The ext4 superblock will not be buffer aligned for other than 1kB
6850 * block sizes. We need to calculate the offset from buffer start.
6851@@ -1868,7 +1952,7 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
6852 }
6853
6854 if (!(bh = sb_bread(sb, logical_sb_block))) {
6855- printk (KERN_ERR "EXT4-fs: unable to read superblock\n");
6856+ printk(KERN_ERR "EXT4-fs: unable to read superblock\n");
6857 goto out_fail;
6858 }
6859 /*
6860@@ -1919,17 +2003,30 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
6861
6862 /*
6863 * turn on extents feature by default in ext4 filesystem
6864- * User -o noextents to turn it off
6865+ * only if feature flag already set by mkfs or tune2fs.
6866+ * Use -o noextents to turn it off
6867 */
6868- set_opt(sbi->s_mount_opt, EXTENTS);
6869+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS))
6870+ set_opt(sbi->s_mount_opt, EXTENTS);
6871+ else
6872+ ext4_warning(sb, __func__,
6873+ "extents feature not enabled on this filesystem, "
6874+ "use tune2fs.\n");
6875 /*
6876- * turn on mballoc feature by default in ext4 filesystem
6877- * User -o nomballoc to turn it off
6878+ * turn on mballoc code by default in ext4 filesystem
6879+ * Use -o nomballoc to turn it off
6880 */
6881 set_opt(sbi->s_mount_opt, MBALLOC);
6882
6883- if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum,
6884- NULL, 0))
6885+ /*
6886+ * enable delayed allocation by default
6887+ * Use -o nodelalloc to turn it off
6888+ */
6889+ set_opt(sbi->s_mount_opt, DELALLOC);
6890+
6891+
6892+ if (!parse_options((char *) data, sb, &journal_inum, &journal_devnum,
6893+ NULL, 0))
6894 goto failed_mount;
6895
6896 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
6897@@ -2004,7 +2101,7 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
6898 goto failed_mount;
6899 }
6900
6901- brelse (bh);
6902+ brelse(bh);
6903 logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE;
6904 offset = do_div(logical_sb_block, blocksize);
6905 bh = sb_bread(sb, logical_sb_block);
6906@@ -2016,8 +2113,8 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
6907 es = (struct ext4_super_block *)(((char *)bh->b_data) + offset);
6908 sbi->s_es = es;
6909 if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) {
6910- printk (KERN_ERR
6911- "EXT4-fs: Magic mismatch, very weird !\n");
6912+ printk(KERN_ERR
6913+ "EXT4-fs: Magic mismatch, very weird !\n");
6914 goto failed_mount;
6915 }
6916 }
6917@@ -2034,9 +2131,9 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
6918 if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) ||
6919 (!is_power_of_2(sbi->s_inode_size)) ||
6920 (sbi->s_inode_size > blocksize)) {
6921- printk (KERN_ERR
6922- "EXT4-fs: unsupported inode size: %d\n",
6923- sbi->s_inode_size);
6924+ printk(KERN_ERR
6925+ "EXT4-fs: unsupported inode size: %d\n",
6926+ sbi->s_inode_size);
6927 goto failed_mount;
6928 }
6929 if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE)
6930@@ -2068,20 +2165,20 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
6931 sbi->s_mount_state = le16_to_cpu(es->s_state);
6932 sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb));
6933 sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb));
6934- for (i=0; i < 4; i++)
6935+ for (i = 0; i < 4; i++)
6936 sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
6937 sbi->s_def_hash_version = es->s_def_hash_version;
6938
6939 if (sbi->s_blocks_per_group > blocksize * 8) {
6940- printk (KERN_ERR
6941- "EXT4-fs: #blocks per group too big: %lu\n",
6942- sbi->s_blocks_per_group);
6943+ printk(KERN_ERR
6944+ "EXT4-fs: #blocks per group too big: %lu\n",
6945+ sbi->s_blocks_per_group);
6946 goto failed_mount;
6947 }
6948 if (sbi->s_inodes_per_group > blocksize * 8) {
6949- printk (KERN_ERR
6950- "EXT4-fs: #inodes per group too big: %lu\n",
6951- sbi->s_inodes_per_group);
6952+ printk(KERN_ERR
6953+ "EXT4-fs: #inodes per group too big: %lu\n",
6954+ sbi->s_inodes_per_group);
6955 goto failed_mount;
6956 }
6957
6958@@ -2115,10 +2212,10 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
6959 sbi->s_groups_count = blocks_count;
6960 db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) /
6961 EXT4_DESC_PER_BLOCK(sb);
6962- sbi->s_group_desc = kmalloc(db_count * sizeof (struct buffer_head *),
6963+ sbi->s_group_desc = kmalloc(db_count * sizeof(struct buffer_head *),
6964 GFP_KERNEL);
6965 if (sbi->s_group_desc == NULL) {
6966- printk (KERN_ERR "EXT4-fs: not enough memory\n");
6967+ printk(KERN_ERR "EXT4-fs: not enough memory\n");
6968 goto failed_mount;
6969 }
6970
6971@@ -2128,16 +2225,24 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
6972 block = descriptor_loc(sb, logical_sb_block, i);
6973 sbi->s_group_desc[i] = sb_bread(sb, block);
6974 if (!sbi->s_group_desc[i]) {
6975- printk (KERN_ERR "EXT4-fs: "
6976- "can't read group descriptor %d\n", i);
6977+ printk(KERN_ERR "EXT4-fs: "
6978+ "can't read group descriptor %d\n", i);
6979 db_count = i;
6980 goto failed_mount2;
6981 }
6982 }
6983- if (!ext4_check_descriptors (sb)) {
6984+ if (!ext4_check_descriptors(sb)) {
6985 printk(KERN_ERR "EXT4-fs: group descriptors corrupted!\n");
6986 goto failed_mount2;
6987 }
6988+ if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
6989+ if (!ext4_fill_flex_info(sb)) {
6990+ printk(KERN_ERR
6991+ "EXT4-fs: unable to initialize "
6992+ "flex_bg meta info!\n");
6993+ goto failed_mount2;
6994+ }
6995+
6996 sbi->s_gdb_count = db_count;
6997 get_random_bytes(&sbi->s_next_generation, sizeof(u32));
6998 spin_lock_init(&sbi->s_next_gen_lock);
6999@@ -2202,11 +2307,11 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
7000 EXT4_SB(sb)->s_journal->j_failed_commit) {
7001 printk(KERN_CRIT "EXT4-fs error (device %s): "
7002 "ext4_fill_super: Journal transaction "
7003- "%u is corrupt\n", sb->s_id,
7004+ "%u is corrupt\n", sb->s_id,
7005 EXT4_SB(sb)->s_journal->j_failed_commit);
7006- if (test_opt (sb, ERRORS_RO)) {
7007- printk (KERN_CRIT
7008- "Mounting filesystem read-only\n");
7009+ if (test_opt(sb, ERRORS_RO)) {
7010+ printk(KERN_CRIT
7011+ "Mounting filesystem read-only\n");
7012 sb->s_flags |= MS_RDONLY;
7013 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
7014 es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
7015@@ -2226,9 +2331,9 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
7016 goto failed_mount3;
7017 } else {
7018 if (!silent)
7019- printk (KERN_ERR
7020- "ext4: No journal on filesystem on %s\n",
7021- sb->s_id);
7022+ printk(KERN_ERR
7023+ "ext4: No journal on filesystem on %s\n",
7024+ sb->s_id);
7025 goto failed_mount3;
7026 }
7027
7028@@ -2312,7 +2417,7 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
7029 goto failed_mount4;
7030 }
7031
7032- ext4_setup_super (sb, es, sb->s_flags & MS_RDONLY);
7033+ ext4_setup_super(sb, es, sb->s_flags & MS_RDONLY);
7034
7035 /* determine the minimum size of new large inodes, if present */
7036 if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) {
7037@@ -2351,12 +2456,19 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent)
7038 ext4_orphan_cleanup(sb, es);
7039 EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS;
7040 if (needs_recovery)
7041- printk (KERN_INFO "EXT4-fs: recovery complete.\n");
7042+ printk(KERN_INFO "EXT4-fs: recovery complete.\n");
7043 ext4_mark_recovery_complete(sb, es);
7044- printk (KERN_INFO "EXT4-fs: mounted filesystem with %s data mode.\n",
7045- test_opt(sb,DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ? "journal":
7046- test_opt(sb,DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered":
7047- "writeback");
7048+ printk(KERN_INFO "EXT4-fs: mounted filesystem with %s data mode.\n",
7049+ test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ? "journal":
7050+ test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered":
7051+ "writeback");
7052+
7053+ if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
7054+ printk(KERN_WARNING "EXT4-fs: Ignoring delalloc option - "
7055+ "requested data journaling mode\n");
7056+ clear_opt(sbi->s_mount_opt, DELALLOC);
7057+ } else if (test_opt(sb, DELALLOC))
7058+ printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n");
7059
7060 ext4_ext_init(sb);
7061 ext4_mb_init(sb, needs_recovery);
7062@@ -2372,6 +2484,7 @@ cantfind_ext4:
7063
7064 failed_mount4:
7065 jbd2_journal_destroy(sbi->s_journal);
7066+ sbi->s_journal = NULL;
7067 failed_mount3:
7068 percpu_counter_destroy(&sbi->s_freeblocks_counter);
7069 percpu_counter_destroy(&sbi->s_freeinodes_counter);
7070@@ -2461,14 +2574,14 @@ static journal_t *ext4_get_journal(struct super_block *sb,
7071 static journal_t *ext4_get_dev_journal(struct super_block *sb,
7072 dev_t j_dev)
7073 {
7074- struct buffer_head * bh;
7075+ struct buffer_head *bh;
7076 journal_t *journal;
7077 ext4_fsblk_t start;
7078 ext4_fsblk_t len;
7079 int hblock, blocksize;
7080 ext4_fsblk_t sb_block;
7081 unsigned long offset;
7082- struct ext4_super_block * es;
7083+ struct ext4_super_block *es;
7084 struct block_device *bdev;
7085
7086 bdev = ext4_blkdev_get(j_dev);
7087@@ -2583,8 +2696,8 @@ static int ext4_load_journal(struct super_block *sb,
7088 "unavailable, cannot proceed.\n");
7089 return -EROFS;
7090 }
7091- printk (KERN_INFO "EXT4-fs: write access will "
7092- "be enabled during recovery.\n");
7093+ printk(KERN_INFO "EXT4-fs: write access will "
7094+ "be enabled during recovery.\n");
7095 }
7096 }
7097
7098@@ -2637,8 +2750,8 @@ static int ext4_load_journal(struct super_block *sb,
7099 return 0;
7100 }
7101
7102-static int ext4_create_journal(struct super_block * sb,
7103- struct ext4_super_block * es,
7104+static int ext4_create_journal(struct super_block *sb,
7105+ struct ext4_super_block *es,
7106 unsigned int journal_inum)
7107 {
7108 journal_t *journal;
7109@@ -2679,9 +2792,8 @@ static int ext4_create_journal(struct super_block * sb,
7110 return 0;
7111 }
7112
7113-static void ext4_commit_super (struct super_block * sb,
7114- struct ext4_super_block * es,
7115- int sync)
7116+static void ext4_commit_super(struct super_block *sb,
7117+ struct ext4_super_block *es, int sync)
7118 {
7119 struct buffer_head *sbh = EXT4_SB(sb)->s_sbh;
7120
7121@@ -2702,8 +2814,8 @@ static void ext4_commit_super (struct super_block * sb,
7122 * remounting) the filesystem readonly, then we will end up with a
7123 * consistent fs on disk. Record that fact.
7124 */
7125-static void ext4_mark_recovery_complete(struct super_block * sb,
7126- struct ext4_super_block * es)
7127+static void ext4_mark_recovery_complete(struct super_block *sb,
7128+ struct ext4_super_block *es)
7129 {
7130 journal_t *journal = EXT4_SB(sb)->s_journal;
7131
7132@@ -2725,8 +2837,8 @@ static void ext4_mark_recovery_complete(struct super_block * sb,
7133 * has recorded an error from a previous lifetime, move that error to the
7134 * main filesystem now.
7135 */
7136-static void ext4_clear_journal_err(struct super_block * sb,
7137- struct ext4_super_block * es)
7138+static void ext4_clear_journal_err(struct super_block *sb,
7139+ struct ext4_super_block *es)
7140 {
7141 journal_t *journal;
7142 int j_errno;
7143@@ -2751,7 +2863,7 @@ static void ext4_clear_journal_err(struct super_block * sb,
7144
7145 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
7146 es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
7147- ext4_commit_super (sb, es, 1);
7148+ ext4_commit_super(sb, es, 1);
7149
7150 jbd2_journal_clear_err(journal);
7151 }
7152@@ -2784,7 +2896,7 @@ int ext4_force_commit(struct super_block *sb)
7153 * This implicitly triggers the writebehind on sync().
7154 */
7155
7156-static void ext4_write_super (struct super_block * sb)
7157+static void ext4_write_super(struct super_block *sb)
7158 {
7159 if (mutex_trylock(&sb->s_lock) != 0)
7160 BUG();
7161@@ -2840,13 +2952,14 @@ static void ext4_unlockfs(struct super_block *sb)
7162 }
7163 }
7164
7165-static int ext4_remount (struct super_block * sb, int * flags, char * data)
7166+static int ext4_remount(struct super_block *sb, int *flags, char *data)
7167 {
7168- struct ext4_super_block * es;
7169+ struct ext4_super_block *es;
7170 struct ext4_sb_info *sbi = EXT4_SB(sb);
7171 ext4_fsblk_t n_blocks_count = 0;
7172 unsigned long old_sb_flags;
7173 struct ext4_mount_options old_opts;
7174+ ext4_group_t g;
7175 int err;
7176 #ifdef CONFIG_QUOTA
7177 int i;
7178@@ -2925,6 +3038,26 @@ static int ext4_remount (struct super_block * sb, int * flags, char * data)
7179 }
7180
7181 /*
7182+ * Make sure the group descriptor checksums
7183+ * are sane. If they aren't, refuse to
7184+ * remount r/w.
7185+ */
7186+ for (g = 0; g < sbi->s_groups_count; g++) {
7187+ struct ext4_group_desc *gdp =
7188+ ext4_get_group_desc(sb, g, NULL);
7189+
7190+ if (!ext4_group_desc_csum_verify(sbi, g, gdp)) {
7191+ printk(KERN_ERR
7192+ "EXT4-fs: ext4_remount: "
7193+ "Checksum for group %lu failed (%u!=%u)\n",
7194+ g, le16_to_cpu(ext4_group_desc_csum(sbi, g, gdp)),
7195+ le16_to_cpu(gdp->bg_checksum));
7196+ err = -EINVAL;
7197+ goto restore_opts;
7198+ }
7199+ }
7200+
7201+ /*
7202 * If we have an unprocessed orphan list hanging
7203 * around from a previously readonly bdev mount,
7204 * require a full umount/remount for now.
7205@@ -2949,7 +3082,7 @@ static int ext4_remount (struct super_block * sb, int * flags, char * data)
7206 sbi->s_mount_state = le16_to_cpu(es->s_state);
7207 if ((err = ext4_group_extend(sb, es, n_blocks_count)))
7208 goto restore_opts;
7209- if (!ext4_setup_super (sb, es, 0))
7210+ if (!ext4_setup_super(sb, es, 0))
7211 sb->s_flags &= ~MS_RDONLY;
7212 }
7213 }
7214@@ -2979,7 +3112,7 @@ restore_opts:
7215 return err;
7216 }
7217
7218-static int ext4_statfs (struct dentry * dentry, struct kstatfs * buf)
7219+static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
7220 {
7221 struct super_block *sb = dentry->d_sb;
7222 struct ext4_sb_info *sbi = EXT4_SB(sb);
7223@@ -3217,12 +3350,12 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
7224 }
7225 /* Journaling quota? */
7226 if (EXT4_SB(sb)->s_qf_names[type]) {
7227- /* Quotafile not of fs root? */
7228+ /* Quotafile not in fs root? */
7229 if (nd.path.dentry->d_parent->d_inode != sb->s_root->d_inode)
7230 printk(KERN_WARNING
7231 "EXT4-fs: Quota file not on filesystem root. "
7232 "Journaled quota will not work.\n");
7233- }
7234+ }
7235
7236 /*
7237 * When we journal data on quota file, we have to flush journal to see
7238@@ -3325,7 +3458,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
7239 err = ext4_journal_dirty_metadata(handle, bh);
7240 else {
7241 /* Always do at least ordered writes for quotas */
7242- err = ext4_journal_dirty_data(handle, bh);
7243+ err = ext4_jbd2_file_inode(handle, inode);
7244 mark_buffer_dirty(bh);
7245 }
7246 brelse(bh);
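
Stepping back to the mount path: ext4_fill_super() now turns delayed allocation on by default but, as the hunk shortly before this one shows, drops it again when data=journal is in effect. A toy sketch of that precedence rule; the enum and function name are illustrative, not the kernel's.

#include <stdio.h>

enum data_mode { ORDERED_DATA, WRITEBACK_DATA, JOURNAL_DATA };

/* Full data journaling and delalloc are mutually exclusive;
 * journaling wins, matching the warning printed at mount time. */
static int resolve_delalloc(enum data_mode mode, int delalloc)
{
	if (mode == JOURNAL_DATA && delalloc) {
		fprintf(stderr, "Ignoring delalloc option - "
			"requested data journaling mode\n");
		return 0;
	}
	return delalloc;
}
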
7247diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
7248index ff08633..8954208 100644
7249--- a/fs/ext4/xattr.c
7250+++ b/fs/ext4/xattr.c
7251@@ -810,7 +810,7 @@ inserted:
7252 /* We need to allocate a new block */
7253 ext4_fsblk_t goal = ext4_group_first_block_no(sb,
7254 EXT4_I(inode)->i_block_group);
7255- ext4_fsblk_t block = ext4_new_block(handle, inode,
7256+ ext4_fsblk_t block = ext4_new_meta_block(handle, inode,
7257 goal, &error);
7258 if (error)
7259 goto cleanup;
7260@@ -1512,7 +1512,7 @@ static inline void ext4_xattr_hash_entry(struct ext4_xattr_header *header,
7261 char *name = entry->e_name;
7262 int n;
7263
7264- for (n=0; n < entry->e_name_len; n++) {
7265+ for (n = 0; n < entry->e_name_len; n++) {
7266 hash = (hash << NAME_HASH_SHIFT) ^
7267 (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^
7268 *name++;
7269diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c
7270index fff3338..ac1a52c 100644
7271--- a/fs/ext4/xattr_trusted.c
7272+++ b/fs/ext4/xattr_trusted.c
7273@@ -13,13 +13,11 @@
7274 #include "ext4.h"
7275 #include "xattr.h"
7276
7277-#define XATTR_TRUSTED_PREFIX "trusted."
7278-
7279 static size_t
7280 ext4_xattr_trusted_list(struct inode *inode, char *list, size_t list_size,
7281 const char *name, size_t name_len)
7282 {
7283- const size_t prefix_len = sizeof(XATTR_TRUSTED_PREFIX)-1;
7284+ const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
7285 const size_t total_len = prefix_len + name_len + 1;
7286
7287 if (!capable(CAP_SYS_ADMIN))
7288diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c
7289index 67be723..d91aa61 100644
7290--- a/fs/ext4/xattr_user.c
7291+++ b/fs/ext4/xattr_user.c
7292@@ -12,13 +12,11 @@
7293 #include "ext4.h"
7294 #include "xattr.h"
7295
7296-#define XATTR_USER_PREFIX "user."
7297-
7298 static size_t
7299 ext4_xattr_user_list(struct inode *inode, char *list, size_t list_size,
7300 const char *name, size_t name_len)
7301 {
7302- const size_t prefix_len = sizeof(XATTR_USER_PREFIX)-1;
7303+ const size_t prefix_len = XATTR_USER_PREFIX_LEN;
7304 const size_t total_len = prefix_len + name_len + 1;
7305
7306 if (!test_opt(inode->i_sb, XATTR_USER))
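
Both list callbacks above do the same length accounting: each entry written into the listxattr buffer is "<prefix><name>\0", i.e. prefix_len + name_len + 1 bytes, and the callback returns that length even when it only measures. A compilable sketch of the pattern, using the user. prefix as an example:

#include <string.h>

#define XATTR_USER_PREFIX	"user."
#define XATTR_USER_PREFIX_LEN	(sizeof(XATTR_USER_PREFIX) - 1)

/* Emit one "user.<name>\0" entry if it fits; always return the size
 * the entry needs, as the ext4 list callbacks do. */
static size_t xattr_user_list(char *list, size_t list_size,
			      const char *name, size_t name_len)
{
	const size_t total_len = XATTR_USER_PREFIX_LEN + name_len + 1;

	if (list && total_len <= list_size) {
		memcpy(list, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
		memcpy(list + XATTR_USER_PREFIX_LEN, name, name_len);
		list[total_len - 1] = '\0';
	}
	return total_len;
}
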
7307diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
7308index 6914598..91389c8 100644
7309--- a/fs/jbd2/checkpoint.c
7310+++ b/fs/jbd2/checkpoint.c
7311@@ -688,7 +688,6 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact
7312
7313 J_ASSERT(transaction->t_state == T_FINISHED);
7314 J_ASSERT(transaction->t_buffers == NULL);
7315- J_ASSERT(transaction->t_sync_datalist == NULL);
7316 J_ASSERT(transaction->t_forget == NULL);
7317 J_ASSERT(transaction->t_iobuf_list == NULL);
7318 J_ASSERT(transaction->t_shadow_list == NULL);
7319diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
7320index a2ed72f..adf0395 100644
7321--- a/fs/jbd2/commit.c
7322+++ b/fs/jbd2/commit.c
7323@@ -22,6 +22,8 @@
7324 #include <linux/pagemap.h>
7325 #include <linux/jiffies.h>
7326 #include <linux/crc32.h>
7327+#include <linux/writeback.h>
7328+#include <linux/backing-dev.h>
7329
7330 /*
7331 * Default IO end handler for temporary BJ_IO buffer_heads.
7332@@ -37,8 +39,8 @@ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
7333 }
7334
7335 /*
7336- * When an ext3-ordered file is truncated, it is possible that many pages are
7337- * not sucessfully freed, because they are attached to a committing transaction.
7338+ * When an ext4 file is truncated, it is possible that some pages are not
7339+ * successfully freed, because they are attached to a committing transaction.
7340 * After the transaction commits, these pages are left on the LRU, with no
7341 * ->mapping, and with attached buffers. These pages are trivially reclaimable
7342 * by the VM, but their apparent absence upsets the VM accounting, and it makes
7343@@ -80,21 +82,6 @@ nope:
7344 }
7345
7346 /*
7347- * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
7348- * held. For ranking reasons we must trylock. If we lose, schedule away and
7349- * return 0. j_list_lock is dropped in this case.
7350- */
7351-static int inverted_lock(journal_t *journal, struct buffer_head *bh)
7352-{
7353- if (!jbd_trylock_bh_state(bh)) {
7354- spin_unlock(&journal->j_list_lock);
7355- schedule();
7356- return 0;
7357- }
7358- return 1;
7359-}
7360-
7361-/*
7362 * Done it all: now submit the commit record. We should have
7363 * cleaned up our previous buffers by now, so if we are in abort
7364 * mode we can now just skip the rest of the journal write
7365@@ -112,6 +99,7 @@ static int journal_submit_commit_record(journal_t *journal,
7366 struct buffer_head *bh;
7367 int ret;
7368 int barrier_done = 0;
7369+ struct timespec now = current_kernel_time();
7370
7371 if (is_journal_aborted(journal))
7372 return 0;
7373@@ -126,6 +114,8 @@ static int journal_submit_commit_record(journal_t *journal,
7374 tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
7375 tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
7376 tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
7377+ tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
7378+ tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);
7379
7380 if (JBD2_HAS_COMPAT_FEATURE(journal,
7381 JBD2_FEATURE_COMPAT_CHECKSUM)) {
7382@@ -197,159 +187,114 @@ static int journal_wait_on_commit_record(struct buffer_head *bh)
7383 }
7384
7385 /*
7386- * Wait for all submitted IO to complete.
7387+ * Write the filemap data using the writepage() address_space_operation.
7388+ * We don't do block allocation here, even for delalloc: we don't
7389+ * use writepages() because with delayed allocation writepages()
7390+ * may itself allocate blocks.
7391 */
7392-static int journal_wait_on_locked_list(journal_t *journal,
7393- transaction_t *commit_transaction)
7394+static int journal_submit_inode_data_buffers(struct address_space *mapping)
7395 {
7396- int ret = 0;
7397- struct journal_head *jh;
7398-
7399- while (commit_transaction->t_locked_list) {
7400- struct buffer_head *bh;
7401-
7402- jh = commit_transaction->t_locked_list->b_tprev;
7403- bh = jh2bh(jh);
7404- get_bh(bh);
7405- if (buffer_locked(bh)) {
7406- spin_unlock(&journal->j_list_lock);
7407- wait_on_buffer(bh);
7408- if (unlikely(!buffer_uptodate(bh)))
7409- ret = -EIO;
7410- spin_lock(&journal->j_list_lock);
7411- }
7412- if (!inverted_lock(journal, bh)) {
7413- put_bh(bh);
7414- spin_lock(&journal->j_list_lock);
7415- continue;
7416- }
7417- if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
7418- __jbd2_journal_unfile_buffer(jh);
7419- jbd_unlock_bh_state(bh);
7420- jbd2_journal_remove_journal_head(bh);
7421- put_bh(bh);
7422- } else {
7423- jbd_unlock_bh_state(bh);
7424- }
7425- put_bh(bh);
7426- cond_resched_lock(&journal->j_list_lock);
7427- }
7428+ int ret;
7429+ struct writeback_control wbc = {
7430+ .sync_mode = WB_SYNC_ALL,
7431+ .nr_to_write = mapping->nrpages * 2,
7432+ .range_start = 0,
7433+ .range_end = i_size_read(mapping->host),
7434+ .for_writepages = 1,
7435+ };
7436+
7437+ ret = generic_writepages(mapping, &wbc);
7438 return ret;
7439- }
7440+}
7441
7442-static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
7443+/*
7444+ * Submit all the data buffers of inode associated with the transaction to
7445+ * disk.
7446+ *
7447+ * We are in a committing transaction. Therefore no new inode can be added to
7448+ * our inode list. We use the JI_COMMIT_RUNNING flag to protect the inode we
7449+ * are currently operating on from being released while we write out its pages.
7450+ */
7451+static int journal_submit_data_buffers(journal_t *journal,
7452+ transaction_t *commit_transaction)
7453 {
7454- int i;
7455+ struct jbd2_inode *jinode;
7456+ int err, ret = 0;
7457+ struct address_space *mapping;
7458
7459- for (i = 0; i < bufs; i++) {
7460- wbuf[i]->b_end_io = end_buffer_write_sync;
7461- /* We use-up our safety reference in submit_bh() */
7462- submit_bh(WRITE, wbuf[i]);
7463+ spin_lock(&journal->j_list_lock);
7464+ list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
7465+ mapping = jinode->i_vfs_inode->i_mapping;
7466+ jinode->i_flags |= JI_COMMIT_RUNNING;
7467+ spin_unlock(&journal->j_list_lock);
7468+ /*
7469+		 * Submit the inode data buffers. We use writepage
7470+		 * instead of writepages, because writepages can do
7471+		 * block allocation with delalloc and we need to write
7472+		 * only already-allocated blocks here.
7473+ */
7474+ err = journal_submit_inode_data_buffers(mapping);
7475+ if (!ret)
7476+ ret = err;
7477+ spin_lock(&journal->j_list_lock);
7478+ J_ASSERT(jinode->i_transaction == commit_transaction);
7479+ jinode->i_flags &= ~JI_COMMIT_RUNNING;
7480+ wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
7481 }
7482+ spin_unlock(&journal->j_list_lock);
7483+ return ret;
7484 }
7485
7486 /*
7487- * Submit all the data buffers to disk
7488+ * Wait for data submitted for writeout, refile inodes to proper
7489+ * transaction if needed.
7490+ *
7491 */
7492-static void journal_submit_data_buffers(journal_t *journal,
7493- transaction_t *commit_transaction)
7494+static int journal_finish_inode_data_buffers(journal_t *journal,
7495+ transaction_t *commit_transaction)
7496 {
7497- struct journal_head *jh;
7498- struct buffer_head *bh;
7499- int locked;
7500- int bufs = 0;
7501- struct buffer_head **wbuf = journal->j_wbuf;
7502+ struct jbd2_inode *jinode, *next_i;
7503+ int err, ret = 0;
7504
7505- /*
7506- * Whenever we unlock the journal and sleep, things can get added
7507- * onto ->t_sync_datalist, so we have to keep looping back to
7508- * write_out_data until we *know* that the list is empty.
7509- *
7510- * Cleanup any flushed data buffers from the data list. Even in
7511- * abort mode, we want to flush this out as soon as possible.
7512- */
7513-write_out_data:
7514- cond_resched();
7515+ /* For locking, see the comment in journal_submit_data_buffers() */
7516 spin_lock(&journal->j_list_lock);
7517-
7518- while (commit_transaction->t_sync_datalist) {
7519- jh = commit_transaction->t_sync_datalist;
7520- bh = jh2bh(jh);
7521- locked = 0;
7522-
7523- /* Get reference just to make sure buffer does not disappear
7524- * when we are forced to drop various locks */
7525- get_bh(bh);
7526- /* If the buffer is dirty, we need to submit IO and hence
7527- * we need the buffer lock. We try to lock the buffer without
7528- * blocking. If we fail, we need to drop j_list_lock and do
7529- * blocking lock_buffer().
7530- */
7531- if (buffer_dirty(bh)) {
7532- if (test_set_buffer_locked(bh)) {
7533- BUFFER_TRACE(bh, "needs blocking lock");
7534- spin_unlock(&journal->j_list_lock);
7535- /* Write out all data to prevent deadlocks */
7536- journal_do_submit_data(wbuf, bufs);
7537- bufs = 0;
7538- lock_buffer(bh);
7539- spin_lock(&journal->j_list_lock);
7540- }
7541- locked = 1;
7542- }
7543- /* We have to get bh_state lock. Again out of order, sigh. */
7544- if (!inverted_lock(journal, bh)) {
7545- jbd_lock_bh_state(bh);
7546- spin_lock(&journal->j_list_lock);
7547- }
7548- /* Someone already cleaned up the buffer? */
7549- if (!buffer_jbd(bh)
7550- || jh->b_transaction != commit_transaction
7551- || jh->b_jlist != BJ_SyncData) {
7552- jbd_unlock_bh_state(bh);
7553- if (locked)
7554- unlock_buffer(bh);
7555- BUFFER_TRACE(bh, "already cleaned up");
7556- put_bh(bh);
7557- continue;
7558- }
7559- if (locked && test_clear_buffer_dirty(bh)) {
7560- BUFFER_TRACE(bh, "needs writeout, adding to array");
7561- wbuf[bufs++] = bh;
7562- __jbd2_journal_file_buffer(jh, commit_transaction,
7563- BJ_Locked);
7564- jbd_unlock_bh_state(bh);
7565- if (bufs == journal->j_wbufsize) {
7566- spin_unlock(&journal->j_list_lock);
7567- journal_do_submit_data(wbuf, bufs);
7568- bufs = 0;
7569- goto write_out_data;
7570- }
7571- } else if (!locked && buffer_locked(bh)) {
7572- __jbd2_journal_file_buffer(jh, commit_transaction,
7573- BJ_Locked);
7574- jbd_unlock_bh_state(bh);
7575- put_bh(bh);
7576- } else {
7577- BUFFER_TRACE(bh, "writeout complete: unfile");
7578- __jbd2_journal_unfile_buffer(jh);
7579- jbd_unlock_bh_state(bh);
7580- if (locked)
7581- unlock_buffer(bh);
7582- jbd2_journal_remove_journal_head(bh);
7583- /* Once for our safety reference, once for
7584- * jbd2_journal_remove_journal_head() */
7585- put_bh(bh);
7586- put_bh(bh);
7587+ list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
7588+ jinode->i_flags |= JI_COMMIT_RUNNING;
7589+ spin_unlock(&journal->j_list_lock);
7590+ err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
7591+ if (err) {
7592+ /*
7593+ * Because AS_EIO is cleared by
7594+ * wait_on_page_writeback_range(), set it again so
7595+ * that user process can get -EIO from fsync().
7596+ */
7597+ set_bit(AS_EIO,
7598+ &jinode->i_vfs_inode->i_mapping->flags);
7599+
7600+ if (!ret)
7601+ ret = err;
7602 }
7603+ spin_lock(&journal->j_list_lock);
7604+ jinode->i_flags &= ~JI_COMMIT_RUNNING;
7605+ wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
7606+ }
7607
7608- if (need_resched() || spin_needbreak(&journal->j_list_lock)) {
7609- spin_unlock(&journal->j_list_lock);
7610- goto write_out_data;
7611+ /* Now refile inode to proper lists */
7612+ list_for_each_entry_safe(jinode, next_i,
7613+ &commit_transaction->t_inode_list, i_list) {
7614+ list_del(&jinode->i_list);
7615+ if (jinode->i_next_transaction) {
7616+ jinode->i_transaction = jinode->i_next_transaction;
7617+ jinode->i_next_transaction = NULL;
7618+ list_add(&jinode->i_list,
7619+ &jinode->i_transaction->t_inode_list);
7620+ } else {
7621+ jinode->i_transaction = NULL;
7622 }
7623 }
7624 spin_unlock(&journal->j_list_lock);
7625- journal_do_submit_data(wbuf, bufs);
7626+
7627+ return ret;
7628 }
7629
7630 static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
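
The heart of the commit.c rewrite above is journal_submit_inode_data_buffers(), which drives writeback page by page so no delalloc block allocation can happen during commit. The following compilable sketch shows just the writeback_control setup; the structures are deliberately trimmed stand-ins for the kernel types and generic_writepages() is stubbed out.

enum sync_mode { WB_SYNC_NONE, WB_SYNC_ALL };

struct writeback_control {
	enum sync_mode sync_mode;
	long nr_to_write;
	long long range_start;
	long long range_end;
	int for_writepages;
};

struct address_space {
	unsigned long nrpages;
	long long host_size;		/* i_size_read(mapping->host) */
};

/* Stub for the kernel's generic_writepages(), which writes each
 * dirty page in the range via ->writepage(). */
static int generic_writepages(struct address_space *mapping,
			      struct writeback_control *wbc)
{
	(void)mapping; (void)wbc;
	return 0;
}

/* Mirror of journal_submit_inode_data_buffers(): flush everything up
 * to i_size synchronously, without ever calling ->writepages(). */
static int submit_inode_data(struct address_space *mapping)
{
	struct writeback_control wbc = {
		.sync_mode	= WB_SYNC_ALL,
		.nr_to_write	= mapping->nrpages * 2,
		.range_start	= 0,
		.range_end	= mapping->host_size,
		.for_writepages	= 1,
	};

	return generic_writepages(mapping, &wbc);
}
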
7631@@ -524,21 +469,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
7632 * Now start flushing things to disk, in the order they appear
7633 * on the transaction lists. Data blocks go first.
7634 */
7635- err = 0;
7636- journal_submit_data_buffers(journal, commit_transaction);
7637-
7638- /*
7639- * Wait for all previously submitted IO to complete if commit
7640- * record is to be written synchronously.
7641- */
7642- spin_lock(&journal->j_list_lock);
7643- if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
7644- JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
7645- err = journal_wait_on_locked_list(journal,
7646- commit_transaction);
7647-
7648- spin_unlock(&journal->j_list_lock);
7649-
7650+ err = journal_submit_data_buffers(journal, commit_transaction);
7651 if (err)
7652 jbd2_journal_abort(journal, err);
7653
7654@@ -547,16 +478,6 @@ void jbd2_journal_commit_transaction(journal_t *journal)
7655 jbd_debug(3, "JBD: commit phase 2\n");
7656
7657 /*
7658- * If we found any dirty or locked buffers, then we should have
7659- * looped back up to the write_out_data label. If there weren't
7660- * any then journal_clean_data_list should have wiped the list
7661- * clean by now, so check that it is in fact empty.
7662- */
7663- J_ASSERT (commit_transaction->t_sync_datalist == NULL);
7664-
7665- jbd_debug (3, "JBD: commit phase 3\n");
7666-
7667- /*
7668 * Way to go: we have now written out all of the data for a
7669 * transaction! Now comes the tricky part: we need to write out
7670 * metadata. Loop over the transaction's entire buffer list:
7671@@ -574,6 +495,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
7672 J_ASSERT(commit_transaction->t_nr_buffers <=
7673 commit_transaction->t_outstanding_credits);
7674
7675+ err = 0;
7676 descriptor = NULL;
7677 bufs = 0;
7678 while (commit_transaction->t_buffers) {
7679@@ -748,13 +670,23 @@ start_journal_io:
7680 &cbh, crc32_sum);
7681 if (err)
7682 __jbd2_journal_abort_hard(journal);
7683+ }
7684
7685- spin_lock(&journal->j_list_lock);
7686- err = journal_wait_on_locked_list(journal,
7687- commit_transaction);
7688- spin_unlock(&journal->j_list_lock);
7689- if (err)
7690- __jbd2_journal_abort_hard(journal);
7691+ /*
7692+ * This is the right place to wait for data buffers both for ASYNC
7693+ * and !ASYNC commit. If commit is ASYNC, we need to wait only after
7694+ * the commit block went to disk (which happens above). If commit is
7695+	 * SYNC, we need to wait for the data buffers before we start writing
7696+	 * the commit block, which happens below in that case.
7697+ */
7698+ err = journal_finish_inode_data_buffers(journal, commit_transaction);
7699+ if (err) {
7700+ char b[BDEVNAME_SIZE];
7701+
7702+ printk(KERN_WARNING
7703+ "JBD2: Detected IO errors while flushing file data "
7704+ "on %s\n", bdevname(journal->j_fs_dev, b));
7705+ err = 0;
7706 }
7707
7708 /* Lo and behold: we have just managed to send a transaction to
7709@@ -768,7 +700,7 @@ start_journal_io:
7710 so we incur less scheduling load.
7711 */
7712
7713- jbd_debug(3, "JBD: commit phase 4\n");
7714+ jbd_debug(3, "JBD: commit phase 3\n");
7715
7716 /*
7717 * akpm: these are BJ_IO, and j_list_lock is not needed.
7718@@ -827,7 +759,7 @@ wait_for_iobuf:
7719
7720 J_ASSERT (commit_transaction->t_shadow_list == NULL);
7721
7722- jbd_debug(3, "JBD: commit phase 5\n");
7723+ jbd_debug(3, "JBD: commit phase 4\n");
7724
7725 /* Here we wait for the revoke record and descriptor record buffers */
7726 wait_for_ctlbuf:
7727@@ -854,7 +786,7 @@ wait_for_iobuf:
7728 /* AKPM: bforget here */
7729 }
7730
7731- jbd_debug(3, "JBD: commit phase 6\n");
7732+ jbd_debug(3, "JBD: commit phase 5\n");
7733
7734 if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
7735 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
7736@@ -874,9 +806,9 @@ wait_for_iobuf:
7737 transaction can be removed from any checkpoint list it was on
7738 before. */
7739
7740- jbd_debug(3, "JBD: commit phase 7\n");
7741+ jbd_debug(3, "JBD: commit phase 6\n");
7742
7743- J_ASSERT(commit_transaction->t_sync_datalist == NULL);
7744+ J_ASSERT(list_empty(&commit_transaction->t_inode_list));
7745 J_ASSERT(commit_transaction->t_buffers == NULL);
7746 J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
7747 J_ASSERT(commit_transaction->t_iobuf_list == NULL);
7748@@ -997,7 +929,7 @@ restart_loop:
7749
7750 /* Done with this transaction! */
7751
7752- jbd_debug(3, "JBD: commit phase 8\n");
7753+ jbd_debug(3, "JBD: commit phase 7\n");
7754
7755 J_ASSERT(commit_transaction->t_state == T_COMMIT);
7756
7757diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
7758index 2e24567..8207a01 100644
7759--- a/fs/jbd2/journal.c
7760+++ b/fs/jbd2/journal.c
7761@@ -50,7 +50,6 @@ EXPORT_SYMBOL(jbd2_journal_unlock_updates);
7762 EXPORT_SYMBOL(jbd2_journal_get_write_access);
7763 EXPORT_SYMBOL(jbd2_journal_get_create_access);
7764 EXPORT_SYMBOL(jbd2_journal_get_undo_access);
7765-EXPORT_SYMBOL(jbd2_journal_dirty_data);
7766 EXPORT_SYMBOL(jbd2_journal_dirty_metadata);
7767 EXPORT_SYMBOL(jbd2_journal_release_buffer);
7768 EXPORT_SYMBOL(jbd2_journal_forget);
7769@@ -69,7 +68,6 @@ EXPORT_SYMBOL(jbd2_journal_set_features);
7770 EXPORT_SYMBOL(jbd2_journal_create);
7771 EXPORT_SYMBOL(jbd2_journal_load);
7772 EXPORT_SYMBOL(jbd2_journal_destroy);
7773-EXPORT_SYMBOL(jbd2_journal_update_superblock);
7774 EXPORT_SYMBOL(jbd2_journal_abort);
7775 EXPORT_SYMBOL(jbd2_journal_errno);
7776 EXPORT_SYMBOL(jbd2_journal_ack_err);
7777@@ -82,6 +80,10 @@ EXPORT_SYMBOL(jbd2_journal_blocks_per_page);
7778 EXPORT_SYMBOL(jbd2_journal_invalidatepage);
7779 EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers);
7780 EXPORT_SYMBOL(jbd2_journal_force_commit);
7781+EXPORT_SYMBOL(jbd2_journal_file_inode);
7782+EXPORT_SYMBOL(jbd2_journal_init_jbd_inode);
7783+EXPORT_SYMBOL(jbd2_journal_release_jbd_inode);
7784+EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
7785
7786 static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
7787 static void __journal_abort_soft (journal_t *journal, int errno);
7788@@ -2195,6 +2197,54 @@ void jbd2_journal_put_journal_head(struct journal_head *jh)
7789 }
7790
7791 /*
7792+ * Initialize jbd inode head
7793+ */
7794+void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode)
7795+{
7796+ jinode->i_transaction = NULL;
7797+ jinode->i_next_transaction = NULL;
7798+ jinode->i_vfs_inode = inode;
7799+ jinode->i_flags = 0;
7800+ INIT_LIST_HEAD(&jinode->i_list);
7801+}
7802+
7803+/*
7804+ * Function to be called before we start removing an inode from memory (i.e.,
7805+ * clear_inode() is a fine place to call it from). It removes the inode from
7806+ * the transaction's lists.
7807+ */
7808+void jbd2_journal_release_jbd_inode(journal_t *journal,
7809+ struct jbd2_inode *jinode)
7810+{
7811+ int writeout = 0;
7812+
7813+ if (!journal)
7814+ return;
7815+restart:
7816+ spin_lock(&journal->j_list_lock);
7817+ /* Is commit writing out inode - we have to wait */
7818+ if (jinode->i_flags & JI_COMMIT_RUNNING) {
7819+ wait_queue_head_t *wq;
7820+ DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING);
7821+ wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING);
7822+ prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
7823+ spin_unlock(&journal->j_list_lock);
7824+ schedule();
7825+ finish_wait(wq, &wait.wait);
7826+ goto restart;
7827+ }
7828+
7829+ /* Do we need to wait for data writeback? */
7830+ if (journal->j_committing_transaction == jinode->i_transaction)
7831+ writeout = 1;
7832+ if (jinode->i_transaction) {
7833+ list_del(&jinode->i_list);
7834+ jinode->i_transaction = NULL;
7835+ }
7836+ spin_unlock(&journal->j_list_lock);
7837+}
7838+
7839+/*
7840 * debugfs tunables
7841 */
7842 #ifdef CONFIG_JBD2_DEBUG
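
jbd2_journal_release_jbd_inode() above must not tear a jbd2_inode down while commit is writing its pages, so it sleeps on the __JI_COMMIT_RUNNING bit until the commit code clears the flag and calls wake_up_bit(). A userspace analog of that handshake, using a mutex and condition variable where the kernel uses a bit waitqueue:

#include <pthread.h>

struct jbd_inode_state {
	pthread_mutex_t lock;		/* plays j_list_lock */
	pthread_cond_t commit_done;	/* plays the bit waitqueue */
	int commit_running;		/* JI_COMMIT_RUNNING */
};

/* Block until no commit is using the inode (release side). */
static void wait_for_commit(struct jbd_inode_state *s)
{
	pthread_mutex_lock(&s->lock);
	while (s->commit_running)
		pthread_cond_wait(&s->commit_done, &s->lock);
	pthread_mutex_unlock(&s->lock);
}

/* Clear the flag and wake waiters (commit side, cf. wake_up_bit()). */
static void commit_finished(struct jbd_inode_state *s)
{
	pthread_mutex_lock(&s->lock);
	s->commit_running = 0;
	pthread_cond_broadcast(&s->commit_done);
	pthread_mutex_unlock(&s->lock);
}
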
7843diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
7844index d6e006e..4f7cadb 100644
7845--- a/fs/jbd2/transaction.c
7846+++ b/fs/jbd2/transaction.c
7847@@ -41,7 +41,6 @@ static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh);
7848 * new transaction and we can't block without protecting against other
7849 * processes trying to touch the journal while it is in transition.
7850 *
7851- * Called under j_state_lock
7852 */
7853
7854 static transaction_t *
7855@@ -52,6 +51,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
7856 transaction->t_tid = journal->j_transaction_sequence++;
7857 transaction->t_expires = jiffies + journal->j_commit_interval;
7858 spin_lock_init(&transaction->t_handle_lock);
7859+ INIT_LIST_HEAD(&transaction->t_inode_list);
7860
7861 /* Set up the commit timer for the new transaction. */
7862 journal->j_commit_timer.expires = round_jiffies(transaction->t_expires);
7863@@ -943,183 +943,6 @@ out:
7864 }
7865
7866 /**
7867- * int jbd2_journal_dirty_data() - mark a buffer as containing dirty data which
7868- * needs to be flushed before we can commit the
7869- * current transaction.
7870- * @handle: transaction
7871- * @bh: bufferhead to mark
7872- *
7873- * The buffer is placed on the transaction's data list and is marked as
7874- * belonging to the transaction.
7875- *
7876- * Returns error number or 0 on success.
7877- *
7878- * jbd2_journal_dirty_data() can be called via page_launder->ext3_writepage
7879- * by kswapd.
7880- */
7881-int jbd2_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
7882-{
7883- journal_t *journal = handle->h_transaction->t_journal;
7884- int need_brelse = 0;
7885- struct journal_head *jh;
7886-
7887- if (is_handle_aborted(handle))
7888- return 0;
7889-
7890- jh = jbd2_journal_add_journal_head(bh);
7891- JBUFFER_TRACE(jh, "entry");
7892-
7893- /*
7894- * The buffer could *already* be dirty. Writeout can start
7895- * at any time.
7896- */
7897- jbd_debug(4, "jh: %p, tid:%d\n", jh, handle->h_transaction->t_tid);
7898-
7899- /*
7900- * What if the buffer is already part of a running transaction?
7901- *
7902- * There are two cases:
7903- * 1) It is part of the current running transaction. Refile it,
7904- * just in case we have allocated it as metadata, deallocated
7905- * it, then reallocated it as data.
7906- * 2) It is part of the previous, still-committing transaction.
7907- * If all we want to do is to guarantee that the buffer will be
7908- * written to disk before this new transaction commits, then
7909- * being sure that the *previous* transaction has this same
7910- * property is sufficient for us! Just leave it on its old
7911- * transaction.
7912- *
7913- * In case (2), the buffer must not already exist as metadata
7914- * --- that would violate write ordering (a transaction is free
7915- * to write its data at any point, even before the previous
7916- * committing transaction has committed). The caller must
7917- * never, ever allow this to happen: there's nothing we can do
7918- * about it in this layer.
7919- */
7920- jbd_lock_bh_state(bh);
7921- spin_lock(&journal->j_list_lock);
7922-
7923- /* Now that we have bh_state locked, are we really still mapped? */
7924- if (!buffer_mapped(bh)) {
7925- JBUFFER_TRACE(jh, "unmapped buffer, bailing out");
7926- goto no_journal;
7927- }
7928-
7929- if (jh->b_transaction) {
7930- JBUFFER_TRACE(jh, "has transaction");
7931- if (jh->b_transaction != handle->h_transaction) {
7932- JBUFFER_TRACE(jh, "belongs to older transaction");
7933- J_ASSERT_JH(jh, jh->b_transaction ==
7934- journal->j_committing_transaction);
7935-
7936- /* @@@ IS THIS TRUE ? */
7937- /*
7938- * Not any more. Scenario: someone does a write()
7939- * in data=journal mode. The buffer's transaction has
7940- * moved into commit. Then someone does another
7941- * write() to the file. We do the frozen data copyout
7942- * and set b_next_transaction to point to j_running_t.
7943- * And while we're in that state, someone does a
7944- * writepage() in an attempt to pageout the same area
7945- * of the file via a shared mapping. At present that
7946- * calls jbd2_journal_dirty_data(), and we get right here.
7947- * It may be too late to journal the data. Simply
7948- * falling through to the next test will suffice: the
7949- * data will be dirty and wil be checkpointed. The
7950- * ordering comments in the next comment block still
7951- * apply.
7952- */
7953- //J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
7954-
7955- /*
7956- * If we're journalling data, and this buffer was
7957- * subject to a write(), it could be metadata, forget
7958- * or shadow against the committing transaction. Now,
7959- * someone has dirtied the same darn page via a mapping
7960- * and it is being writepage()'d.
7961- * We *could* just steal the page from commit, with some
7962- * fancy locking there. Instead, we just skip it -
7963- * don't tie the page's buffers to the new transaction
7964- * at all.
7965- * Implication: if we crash before the writepage() data
7966- * is written into the filesystem, recovery will replay
7967- * the write() data.
7968- */
7969- if (jh->b_jlist != BJ_None &&
7970- jh->b_jlist != BJ_SyncData &&
7971- jh->b_jlist != BJ_Locked) {
7972- JBUFFER_TRACE(jh, "Not stealing");
7973- goto no_journal;
7974- }
7975-
7976- /*
7977- * This buffer may be undergoing writeout in commit. We
7978- * can't return from here and let the caller dirty it
7979- * again because that can cause the write-out loop in
7980- * commit to never terminate.
7981- */
7982- if (buffer_dirty(bh)) {
7983- get_bh(bh);
7984- spin_unlock(&journal->j_list_lock);
7985- jbd_unlock_bh_state(bh);
7986- need_brelse = 1;
7987- sync_dirty_buffer(bh);
7988- jbd_lock_bh_state(bh);
7989- spin_lock(&journal->j_list_lock);
7990- /* Since we dropped the lock... */
7991- if (!buffer_mapped(bh)) {
7992- JBUFFER_TRACE(jh, "buffer got unmapped");
7993- goto no_journal;
7994- }
7995- /* The buffer may become locked again at any
7996- time if it is redirtied */
7997- }
7998-
7999- /* journal_clean_data_list() may have got there first */
8000- if (jh->b_transaction != NULL) {
8001- JBUFFER_TRACE(jh, "unfile from commit");
8002- __jbd2_journal_temp_unlink_buffer(jh);
8003- /* It still points to the committing
8004- * transaction; move it to this one so
8005- * that the refile assert checks are
8006- * happy. */
8007- jh->b_transaction = handle->h_transaction;
8008- }
8009- /* The buffer will be refiled below */
8010-
8011- }
8012- /*
8013- * Special case --- the buffer might actually have been
8014- * allocated and then immediately deallocated in the previous,
8015- * committing transaction, so might still be left on that
8016- * transaction's metadata lists.
8017- */
8018- if (jh->b_jlist != BJ_SyncData && jh->b_jlist != BJ_Locked) {
8019- JBUFFER_TRACE(jh, "not on correct data list: unfile");
8020- J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow);
8021- __jbd2_journal_temp_unlink_buffer(jh);
8022- jh->b_transaction = handle->h_transaction;
8023- JBUFFER_TRACE(jh, "file as data");
8024- __jbd2_journal_file_buffer(jh, handle->h_transaction,
8025- BJ_SyncData);
8026- }
8027- } else {
8028- JBUFFER_TRACE(jh, "not on a transaction");
8029- __jbd2_journal_file_buffer(jh, handle->h_transaction, BJ_SyncData);
8030- }
8031-no_journal:
8032- spin_unlock(&journal->j_list_lock);
8033- jbd_unlock_bh_state(bh);
8034- if (need_brelse) {
8035- BUFFER_TRACE(bh, "brelse");
8036- __brelse(bh);
8037- }
8038- JBUFFER_TRACE(jh, "exit");
8039- jbd2_journal_put_journal_head(jh);
8040- return 0;
8041-}
8042-
8043-/**
8044 * int jbd2_journal_dirty_metadata() - mark a buffer as containing dirty metadata
8045 * @handle: transaction to add buffer to.
8046 * @bh: buffer to mark
8047@@ -1541,10 +1364,10 @@ __blist_del_buffer(struct journal_head **list, struct journal_head *jh)
8048 * Remove a buffer from the appropriate transaction list.
8049 *
8050 * Note that this function can *change* the value of
8051- * bh->b_transaction->t_sync_datalist, t_buffers, t_forget,
8052- * t_iobuf_list, t_shadow_list, t_log_list or t_reserved_list. If the caller
8053- * is holding onto a copy of one of thee pointers, it could go bad.
8054- * Generally the caller needs to re-read the pointer from the transaction_t.
8055+ * bh->b_transaction->t_buffers, t_forget, t_iobuf_list, t_shadow_list,
8056+ * t_log_list or t_reserved_list. If the caller is holding onto a copy of one
8057+ * of these pointers, it could go bad. Generally the caller needs to re-read
8058+ * the pointer from the transaction_t.
8059 *
8060 * Called under j_list_lock. The journal may not be locked.
8061 */
8062@@ -1566,9 +1389,6 @@ void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
8063 switch (jh->b_jlist) {
8064 case BJ_None:
8065 return;
8066- case BJ_SyncData:
8067- list = &transaction->t_sync_datalist;
8068- break;
8069 case BJ_Metadata:
8070 transaction->t_nr_buffers--;
8071 J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0);
8072@@ -1589,9 +1409,6 @@ void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
8073 case BJ_Reserved:
8074 list = &transaction->t_reserved_list;
8075 break;
8076- case BJ_Locked:
8077- list = &transaction->t_locked_list;
8078- break;
8079 }
8080
8081 __blist_del_buffer(list, jh);
8082@@ -1634,15 +1451,7 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
8083 goto out;
8084
8085 spin_lock(&journal->j_list_lock);
8086- if (jh->b_transaction != NULL && jh->b_cp_transaction == NULL) {
8087- if (jh->b_jlist == BJ_SyncData || jh->b_jlist == BJ_Locked) {
8088- /* A written-back ordered data buffer */
8089- JBUFFER_TRACE(jh, "release data");
8090- __jbd2_journal_unfile_buffer(jh);
8091- jbd2_journal_remove_journal_head(bh);
8092- __brelse(bh);
8093- }
8094- } else if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) {
8095+ if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) {
8096 /* written-back checkpointed metadata buffer */
8097 if (jh->b_jlist == BJ_None) {
8098 JBUFFER_TRACE(jh, "remove from checkpoint list");
8099@@ -1656,12 +1465,43 @@ out:
8100 return;
8101 }
8102
8103+/*
8104+ * jbd2_journal_try_to_free_buffers() could race with
8105+ * jbd2_journal_commit_transaction(). The latter might still hold a
8106+ * reference count on the buffers while inspecting them on
8107+ * t_sync_datalist or t_locked_list.
8108+ *
8109+ * jbd2_journal_try_to_free_buffers() calls this function to
8110+ * wait for the current transaction to finish syncing data buffers,
8111+ * before trying to free those buffers.
8112+ *
8113+ * Takes and releases journal->j_state_lock internally.
8114+ */
8115+static void jbd2_journal_wait_for_transaction_sync_data(journal_t *journal)
8116+{
8117+ transaction_t *transaction;
8118+ tid_t tid;
8119+
8120+ spin_lock(&journal->j_state_lock);
8121+ transaction = journal->j_committing_transaction;
8122+
8123+ if (!transaction) {
8124+ spin_unlock(&journal->j_state_lock);
8125+ return;
8126+ }
8127+
8128+ tid = transaction->t_tid;
8129+ spin_unlock(&journal->j_state_lock);
8130+ jbd2_log_wait_commit(journal, tid);
8131+}
8132
8133 /**
8134 * int jbd2_journal_try_to_free_buffers() - try to free page buffers.
8135 * @journal: journal for operation
8136 * @page: to try and free
8137- * @unused_gfp_mask: unused
8138+ * @gfp_mask: we use the mask to detect how hard we should try to release
8139+ * buffers. If __GFP_WAIT and __GFP_FS are set, we wait for the commit code
8140+ * to release the buffers.
8141 *
8142 *
8143 * For all the buffers on this page,
8144@@ -1690,9 +1530,11 @@ out:
8145 * journal_try_to_free_buffer() is changing its state. But that
8146 * cannot happen because we never reallocate freed data as metadata
8147 * while the data is part of a transaction. Yes?
8148+ *
8149+ * Return 0 on failure, 1 on success
8150 */
8151 int jbd2_journal_try_to_free_buffers(journal_t *journal,
8152- struct page *page, gfp_t unused_gfp_mask)
8153+ struct page *page, gfp_t gfp_mask)
8154 {
8155 struct buffer_head *head;
8156 struct buffer_head *bh;
8157@@ -1708,7 +1550,8 @@ int jbd2_journal_try_to_free_buffers(journal_t *journal,
8158 /*
8159 * We take our own ref against the journal_head here to avoid
8160 * having to add tons of locking around each instance of
8161- * jbd2_journal_remove_journal_head() and jbd2_journal_put_journal_head().
8162+ * jbd2_journal_remove_journal_head() and
8163+ * jbd2_journal_put_journal_head().
8164 */
8165 jh = jbd2_journal_grab_journal_head(bh);
8166 if (!jh)
8167@@ -1721,7 +1564,28 @@ int jbd2_journal_try_to_free_buffers(journal_t *journal,
8168 if (buffer_jbd(bh))
8169 goto busy;
8170 } while ((bh = bh->b_this_page) != head);
8171+
8172 ret = try_to_free_buffers(page);
8173+
8174+ /*
8175+ * There are a number of places where jbd2_journal_try_to_free_buffers()
8176+	 * could race with jbd2_journal_commit_transaction(); the latter may
8177+	 * still hold references to the buffers while processing them, in
8178+	 * which case try_to_free_buffers() fails to free them. Some callers
8179+	 * of releasepage() require the page buffers to be dropped and treat
8180+	 * a failure to free them as an error (generic_file_direct_IO(), for
8181+	 * example).
8182+	 *
8183+	 * So, if the caller of try_to_release_page() wants the synchronous
8184+	 * behaviour (i.e. buffers guaranteed dropped upon return), wait for
8185+	 * the current transaction to finish flushing its dirty data buffers,
8186+	 * then try to free those buffers again, with the journal locked.
8187+ */
8188+ if (ret == 0 && (gfp_mask & __GFP_WAIT) && (gfp_mask & __GFP_FS)) {
8189+ jbd2_journal_wait_for_transaction_sync_data(journal);
8190+ ret = try_to_free_buffers(page);
8191+ }
8192+
8193 busy:
8194 return ret;
8195 }
8196@@ -1823,6 +1687,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
8197 if (!buffer_jbd(bh))
8198 goto zap_buffer_unlocked;
8199
8200+	/* OK, we have a data buffer in journaled mode */
8201 spin_lock(&journal->j_state_lock);
8202 jbd_lock_bh_state(bh);
8203 spin_lock(&journal->j_list_lock);
8204@@ -1886,15 +1751,6 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
8205 }
8206 } else if (transaction == journal->j_committing_transaction) {
8207 JBUFFER_TRACE(jh, "on committing transaction");
8208- if (jh->b_jlist == BJ_Locked) {
8209- /*
8210- * The buffer is on the committing transaction's locked
8211- * list. We have the buffer locked, so I/O has
8212- * completed. So we can nail the buffer now.
8213- */
8214- may_free = __dispose_buffer(jh, transaction);
8215- goto zap_buffer;
8216- }
8217 /*
8218 * If it is committing, we simply cannot touch it. We
8219 * can remove it's next_transaction pointer from the
8220@@ -2027,9 +1883,6 @@ void __jbd2_journal_file_buffer(struct journal_head *jh,
8221 J_ASSERT_JH(jh, !jh->b_committed_data);
8222 J_ASSERT_JH(jh, !jh->b_frozen_data);
8223 return;
8224- case BJ_SyncData:
8225- list = &transaction->t_sync_datalist;
8226- break;
8227 case BJ_Metadata:
8228 transaction->t_nr_buffers++;
8229 list = &transaction->t_buffers;
8230@@ -2049,9 +1902,6 @@ void __jbd2_journal_file_buffer(struct journal_head *jh,
8231 case BJ_Reserved:
8232 list = &transaction->t_reserved_list;
8233 break;
8234- case BJ_Locked:
8235- list = &transaction->t_locked_list;
8236- break;
8237 }
8238
8239 __blist_add_buffer(list, jh);
8240@@ -2141,3 +1991,88 @@ void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh)
8241 spin_unlock(&journal->j_list_lock);
8242 __brelse(bh);
8243 }
8244+
8245+/*
8246+ * File inode in the inode list of the handle's transaction
8247+ */
8248+int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode)
8249+{
8250+ transaction_t *transaction = handle->h_transaction;
8251+ journal_t *journal = transaction->t_journal;
8252+
8253+ if (is_handle_aborted(handle))
8254+ return -EIO;
8255+
8256+ jbd_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino,
8257+ transaction->t_tid);
8258+
8259+ /*
8260+	 * First check whether the inode is already on the transaction's
8261+	 * lists without taking the lock. Note that this check is safe
8262+	 * without the lock as we cannot race with somebody removing the inode
8263+	 * from the transaction. The reason is that we remove the inode from the
8264+	 * transaction only in jbd2_journal_release_jbd_inode() and when we commit
8265+ * the transaction. We are guarded from the first case by holding
8266+ * a reference to the inode. We are safe against the second case
8267+ * because if jinode->i_transaction == transaction, commit code
8268+ * cannot touch the transaction because we hold reference to it,
8269+ * and if jinode->i_next_transaction == transaction, commit code
8270+ * will only file the inode where we want it.
8271+ */
8272+ if (jinode->i_transaction == transaction ||
8273+ jinode->i_next_transaction == transaction)
8274+ return 0;
8275+
8276+ spin_lock(&journal->j_list_lock);
8277+
8278+ if (jinode->i_transaction == transaction ||
8279+ jinode->i_next_transaction == transaction)
8280+ goto done;
8281+
8282+ /* On some different transaction's list - should be
8283+ * the committing one */
8284+ if (jinode->i_transaction) {
8285+ J_ASSERT(jinode->i_next_transaction == NULL);
8286+ J_ASSERT(jinode->i_transaction ==
8287+ journal->j_committing_transaction);
8288+ jinode->i_next_transaction = transaction;
8289+ goto done;
8290+ }
8291+ /* Not on any transaction list... */
8292+ J_ASSERT(!jinode->i_next_transaction);
8293+ jinode->i_transaction = transaction;
8294+ list_add(&jinode->i_list, &transaction->t_inode_list);
8295+done:
8296+ spin_unlock(&journal->j_list_lock);
8297+
8298+ return 0;
8299+}
8300+
8301+/*
8302+ * This function must be called before truncation happens when the inode
8303+ * is journaled in ordered mode. It starts writeout of the truncated part
8304+ * in case that part is in the committing transaction, so that the
8305+ * ordered-mode consistency guarantees are upheld.
8306+ */
8307+int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode,
8308+ loff_t new_size)
8309+{
8310+ journal_t *journal;
8311+ transaction_t *commit_trans;
8312+ int ret = 0;
8313+
8314+ if (!inode->i_transaction && !inode->i_next_transaction)
8315+ goto out;
8316+ journal = inode->i_transaction->t_journal;
8317+ spin_lock(&journal->j_state_lock);
8318+ commit_trans = journal->j_committing_transaction;
8319+ spin_unlock(&journal->j_state_lock);
8320+ if (inode->i_transaction == commit_trans) {
8321+ ret = filemap_fdatawrite_range(inode->i_vfs_inode->i_mapping,
8322+ new_size, LLONG_MAX);
8323+ if (ret)
8324+ jbd2_journal_abort(journal, ret);
8325+ }
8326+out:
8327+ return ret;
8328+}
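
With t_sync_datalist and t_locked_list gone, data=ordered is tracked per
inode rather than per buffer: a filesystem files the inode it is dirtying
into the running transaction and flushes the truncated tail of the file
before a truncate. A minimal sketch of the calling pattern follows; "myfs"
and its inode wrapper are hypothetical, only the jbd2_* calls come from
this patch:

    /* Hypothetical filesystem glue; only the jbd2_* API is from the patch. */
    struct myfs_inode_info {
            struct inode vfs_inode;
            struct jbd2_inode jinode;       /* ordered-mode tracking */
    };

    /* Instead of filing each data buffer on a sync-data list, file the
     * inode once per transaction when its data is dirtied under a handle. */
    static int myfs_dirty_data(handle_t *handle, struct myfs_inode_info *mi)
    {
            return jbd2_journal_file_inode(handle, &mi->jinode);
    }

    /* Before truncating in ordered mode, write out the part of the file
     * the committing transaction may still be responsible for. */
    static int myfs_begin_truncate(struct myfs_inode_info *mi, loff_t new_size)
    {
            return jbd2_journal_begin_ordered_truncate(&mi->jinode, new_size);
    }
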
8329diff --git a/fs/mpage.c b/fs/mpage.c
8330index 235e4d3..dbcc7af 100644
8331--- a/fs/mpage.c
8332+++ b/fs/mpage.c
8333@@ -82,7 +82,7 @@ static void mpage_end_io_write(struct bio *bio, int err)
8334 bio_put(bio);
8335 }
8336
8337-static struct bio *mpage_bio_submit(int rw, struct bio *bio)
8338+struct bio *mpage_bio_submit(int rw, struct bio *bio)
8339 {
8340 bio->bi_end_io = mpage_end_io_read;
8341 if (rw == WRITE)
8342@@ -90,6 +90,7 @@ static struct bio *mpage_bio_submit(int rw, struct bio *bio)
8343 submit_bio(rw, bio);
8344 return NULL;
8345 }
8346+EXPORT_SYMBOL(mpage_bio_submit);
8347
8348 static struct bio *
8349 mpage_alloc(struct block_device *bdev,
8350@@ -435,15 +436,9 @@ EXPORT_SYMBOL(mpage_readpage);
8351 * written, so it can intelligently allocate a suitably-sized BIO. For now,
8352 * just allocate full-size (16-page) BIOs.
8353 */
8354-struct mpage_data {
8355- struct bio *bio;
8356- sector_t last_block_in_bio;
8357- get_block_t *get_block;
8358- unsigned use_writepage;
8359-};
8360
8361-static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
8362- void *data)
8363+int __mpage_writepage(struct page *page, struct writeback_control *wbc,
8364+ void *data)
8365 {
8366 struct mpage_data *mpd = data;
8367 struct bio *bio = mpd->bio;
8368@@ -651,6 +646,7 @@ out:
8369 mpd->bio = bio;
8370 return ret;
8371 }
8372+EXPORT_SYMBOL(__mpage_writepage);
8373
8374 /**
8375 * mpage_writepages - walk the list of dirty pages of the given address space & writepage() all of them
8376diff --git a/include/linux/fs.h b/include/linux/fs.h
8377index d8e2762..97f992a 100644
8378--- a/include/linux/fs.h
8379+++ b/include/linux/fs.h
8380@@ -1740,6 +1740,8 @@ extern int wait_on_page_writeback_range(struct address_space *mapping,
8381 pgoff_t start, pgoff_t end);
8382 extern int __filemap_fdatawrite_range(struct address_space *mapping,
8383 loff_t start, loff_t end, int sync_mode);
8384+extern int filemap_fdatawrite_range(struct address_space *mapping,
8385+ loff_t start, loff_t end);
8386
8387 extern long do_fsync(struct file *file, int datasync);
8388 extern void sync_supers(void);
8389diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
8390index d147f0f..3dd2090 100644
8391--- a/include/linux/jbd2.h
8392+++ b/include/linux/jbd2.h
8393@@ -168,6 +168,8 @@ struct commit_header {
8394 unsigned char h_chksum_size;
8395 unsigned char h_padding[2];
8396 __be32 h_chksum[JBD2_CHECKSUM_BYTES];
8397+ __be64 h_commit_sec;
8398+ __be32 h_commit_nsec;
8399 };
8400
8401 /*
8402@@ -379,6 +381,38 @@ static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh)
8403 bit_spin_unlock(BH_JournalHead, &bh->b_state);
8404 }
8405
8406+/* Flags in jbd2_inode->i_flags */
8407+#define __JI_COMMIT_RUNNING 0
8408+/* Commit of the inode data is in progress. We use this flag to protect us
8409+ * from concurrent deletion of the inode. We cannot use a reference to the
8410+ * inode, since we cannot afford doing the last iput() on behalf of kjournald.
8411+ */
8412+#define JI_COMMIT_RUNNING (1 << __JI_COMMIT_RUNNING)
8413+
8414+/**
8415+ * struct jbd2_inode is the structure linking inodes present in a
8416+ * transaction in ordered mode so that we can sync their data during commit.
8417+ */
8418+struct jbd2_inode {
8419+ /* Which transaction does this inode belong to? Either the running
8420+ * transaction or the committing one. [j_list_lock] */
8421+ transaction_t *i_transaction;
8422+
8423+ /* Pointer to the running transaction modifying inode's data in case
8424+ * there is already a committing transaction touching it. [j_list_lock] */
8425+ transaction_t *i_next_transaction;
8426+
8427+ /* List of inodes in the i_transaction [j_list_lock] */
8428+ struct list_head i_list;
8429+
8430+ /* VFS inode this inode belongs to [constant during the lifetime
8431+ * of the structure] */
8432+ struct inode *i_vfs_inode;
8433+
8434+ /* Flags of inode [j_list_lock] */
8435+ unsigned int i_flags;
8436+};
8437+
8438 struct jbd2_revoke_table_s;
8439
8440 /**
8441@@ -509,24 +543,12 @@ struct transaction_s
8442 struct journal_head *t_reserved_list;
8443
8444 /*
8445- * Doubly-linked circular list of all buffers under writeout during
8446- * commit [j_list_lock]
8447- */
8448- struct journal_head *t_locked_list;
8449-
8450- /*
8451 * Doubly-linked circular list of all metadata buffers owned by this
8452 * transaction [j_list_lock]
8453 */
8454 struct journal_head *t_buffers;
8455
8456 /*
8457- * Doubly-linked circular list of all data buffers still to be
8458- * flushed before this transaction can be committed [j_list_lock]
8459- */
8460- struct journal_head *t_sync_datalist;
8461-
8462- /*
8463 * Doubly-linked circular list of all forget buffers (superseded
8464 * buffers which we can un-checkpoint once this transaction commits)
8465 * [j_list_lock]
8466@@ -565,6 +587,12 @@ struct transaction_s
8467 struct journal_head *t_log_list;
8468
8469 /*
8470+ * List of inodes whose data we've modified in data=ordered mode.
8471+ * [j_list_lock]
8472+ */
8473+ struct list_head t_inode_list;
8474+
8475+ /*
8476 * Protects info related to handles
8477 */
8478 spinlock_t t_handle_lock;
8479@@ -1004,7 +1032,6 @@ extern int jbd2_journal_extend (handle_t *, int nblocks);
8480 extern int jbd2_journal_get_write_access(handle_t *, struct buffer_head *);
8481 extern int jbd2_journal_get_create_access (handle_t *, struct buffer_head *);
8482 extern int jbd2_journal_get_undo_access(handle_t *, struct buffer_head *);
8483-extern int jbd2_journal_dirty_data (handle_t *, struct buffer_head *);
8484 extern int jbd2_journal_dirty_metadata (handle_t *, struct buffer_head *);
8485 extern void jbd2_journal_release_buffer (handle_t *, struct buffer_head *);
8486 extern int jbd2_journal_forget (handle_t *, struct buffer_head *);
8487@@ -1044,6 +1071,10 @@ extern void jbd2_journal_ack_err (journal_t *);
8488 extern int jbd2_journal_clear_err (journal_t *);
8489 extern int jbd2_journal_bmap(journal_t *, unsigned long, unsigned long long *);
8490 extern int jbd2_journal_force_commit(journal_t *);
8491+extern int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *inode);
8492+extern int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode, loff_t new_size);
8493+extern void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode);
8494+extern void jbd2_journal_release_jbd_inode(journal_t *journal, struct jbd2_inode *jinode);
8495
8496 /*
8497 * journal_head management
8498@@ -1179,15 +1210,13 @@ static inline int jbd_space_needed(journal_t *journal)
8499
8500 /* journaling buffer types */
8501 #define BJ_None 0 /* Not journaled */
8502-#define BJ_SyncData 1 /* Normal data: flush before commit */
8503-#define BJ_Metadata 2 /* Normal journaled metadata */
8504-#define BJ_Forget 3 /* Buffer superseded by this transaction */
8505-#define BJ_IO 4 /* Buffer is for temporary IO use */
8506-#define BJ_Shadow 5 /* Buffer contents being shadowed to the log */
8507-#define BJ_LogCtl 6 /* Buffer contains log descriptors */
8508-#define BJ_Reserved 7 /* Buffer is reserved for access by journal */
8509-#define BJ_Locked 8 /* Locked for I/O during commit */
8510-#define BJ_Types 9
8511+#define BJ_Metadata 1 /* Normal journaled metadata */
8512+#define BJ_Forget 2 /* Buffer superseded by this transaction */
8513+#define BJ_IO 3 /* Buffer is for temporary IO use */
8514+#define BJ_Shadow 4 /* Buffer contents being shadowed to the log */
8515+#define BJ_LogCtl 5 /* Buffer contains log descriptors */
8516+#define BJ_Reserved 6 /* Buffer is reserved for access by journal */
8517+#define BJ_Types 7
8518
8519 extern int jbd_blocks_per_page(struct inode *inode);
8520
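
A struct jbd2_inode lives exactly as long as the VFS inode it describes, so
the filesystem initializes it when the in-core inode is set up and releases
it when the inode is torn down; JI_COMMIT_RUNNING is what lets the release
path wait for a commit that is still writing the inode's data. A lifecycle
sketch, reusing the hypothetical myfs wrapper from above:

    /* At in-core inode creation time. */
    static void myfs_setup_jinode(struct myfs_inode_info *mi)
    {
            jbd2_journal_init_jbd_inode(&mi->jinode, &mi->vfs_inode);
    }

    /* At inode teardown; intended to return only once the commit code is
     * done with the inode's data, so the structure can be freed safely. */
    static void myfs_teardown_jinode(journal_t *journal,
                                     struct myfs_inode_info *mi)
    {
            jbd2_journal_release_jbd_inode(journal, &mi->jinode);
    }
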
8521diff --git a/include/linux/mpage.h b/include/linux/mpage.h
8522index 068a0c9..5c42821 100644
8523--- a/include/linux/mpage.h
8524+++ b/include/linux/mpage.h
8525@@ -11,11 +11,21 @@
8526 */
8527 #ifdef CONFIG_BLOCK
8528
8529+struct mpage_data {
8530+ struct bio *bio;
8531+ sector_t last_block_in_bio;
8532+ get_block_t *get_block;
8533+ unsigned use_writepage;
8534+};
8535+
8536 struct writeback_control;
8537
8538+struct bio *mpage_bio_submit(int rw, struct bio *bio);
8539 int mpage_readpages(struct address_space *mapping, struct list_head *pages,
8540 unsigned nr_pages, get_block_t get_block);
8541 int mpage_readpage(struct page *page, get_block_t get_block);
8542+int __mpage_writepage(struct page *page, struct writeback_control *wbc,
8543+ void *data);
8544 int mpage_writepages(struct address_space *mapping,
8545 struct writeback_control *wbc, get_block_t get_block);
8546 int mpage_writepage(struct page *page, get_block_t *get_block,
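
Exporting struct mpage_data together with __mpage_writepage() and
mpage_bio_submit() lets a filesystem drive write_cache_pages() itself
instead of going through mpage_writepages(), e.g. so it can adjust the
writeback_control between passes. A sketch mirroring what
mpage_writepages() does internally; myfs_get_block stands in for the
filesystem's get_block_t:

    static int myfs_writepages(struct address_space *mapping,
                               struct writeback_control *wbc)
    {
            struct mpage_data mpd = {
                    .bio = NULL,
                    .last_block_in_bio = 0,
                    .get_block = myfs_get_block,
                    .use_writepage = 1,
            };
            int ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd);

            /* Submit whatever the last __mpage_writepage() left batched. */
            if (mpd.bio)
                    mpage_bio_submit(WRITE, mpd.bio);
            return ret;
    }
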
8547diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h
8548index 9007ccd..2083888 100644
8549--- a/include/linux/percpu_counter.h
8550+++ b/include/linux/percpu_counter.h
8551@@ -35,7 +35,7 @@ int percpu_counter_init_irq(struct percpu_counter *fbc, s64 amount);
8552 void percpu_counter_destroy(struct percpu_counter *fbc);
8553 void percpu_counter_set(struct percpu_counter *fbc, s64 amount);
8554 void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch);
8555-s64 __percpu_counter_sum(struct percpu_counter *fbc);
8556+s64 __percpu_counter_sum(struct percpu_counter *fbc, int set);
8557
8558 static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount)
8559 {
8560@@ -44,13 +44,19 @@ static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount)
8561
8562 static inline s64 percpu_counter_sum_positive(struct percpu_counter *fbc)
8563 {
8564- s64 ret = __percpu_counter_sum(fbc);
8565+ s64 ret = __percpu_counter_sum(fbc, 0);
8566 return ret < 0 ? 0 : ret;
8567 }
8568
8569+static inline s64 percpu_counter_sum_and_set(struct percpu_counter *fbc)
8570+{
8571+ return __percpu_counter_sum(fbc, 1);
8572+}
8573+
8574+
8575 static inline s64 percpu_counter_sum(struct percpu_counter *fbc)
8576 {
8577- return __percpu_counter_sum(fbc);
8578+ return __percpu_counter_sum(fbc, 0);
8579 }
8580
8581 static inline s64 percpu_counter_read(struct percpu_counter *fbc)
8582diff --git a/include/linux/writeback.h b/include/linux/writeback.h
8583index f462439..0d8573e 100644
8584--- a/include/linux/writeback.h
8585+++ b/include/linux/writeback.h
8586@@ -63,6 +63,7 @@ struct writeback_control {
8587 unsigned for_writepages:1; /* This is a writepages() call */
8588 unsigned range_cyclic:1; /* range_start is cyclic */
8589 unsigned more_io:1; /* more io to be dispatched */
8590+	unsigned range_cont:1;		/* continue writeback where the last pass stopped */
8591 };
8592
8593 /*
8594diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c
8595index 1191744..4a8ba4b 100644
8596--- a/lib/percpu_counter.c
8597+++ b/lib/percpu_counter.c
8598@@ -52,7 +52,7 @@ EXPORT_SYMBOL(__percpu_counter_add);
8599 * Add up all the per-cpu counts, return the result. This is a more accurate
8600 * but much slower version of percpu_counter_read_positive()
8601 */
8602-s64 __percpu_counter_sum(struct percpu_counter *fbc)
8603+s64 __percpu_counter_sum(struct percpu_counter *fbc, int set)
8604 {
8605 s64 ret;
8606 int cpu;
8607@@ -62,7 +62,12 @@ s64 __percpu_counter_sum(struct percpu_counter *fbc)
8608 for_each_online_cpu(cpu) {
8609 s32 *pcount = per_cpu_ptr(fbc->counters, cpu);
8610 ret += *pcount;
8611+ if (set)
8612+ *pcount = 0;
8613 }
8614+ if (set)
8615+ fbc->count = ret;
8616+
8617 spin_unlock(&fbc->lock);
8618 return ret;
8619 }
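
The new "set" argument turns the exact-but-slow sum into a chance to
re-sync the counter: while the per-cpu deltas are added up under fbc->lock
they are zeroed and the total is written back to fbc->count, so later
percpu_counter_read() calls start from an accurate base. Illustrative use
(the counter name is just an example):

    /* Exact value; per-cpu deltas are folded back into fbc->count. */
    s64 exact = percpu_counter_sum_and_set(&sbi->s_freeblocks_counter);

    /* Exact value; per-cpu deltas are left in place. */
    s64 value = percpu_counter_sum(&sbi->s_freeblocks_counter);
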
8620diff --git a/mm/filemap.c b/mm/filemap.c
8621index 1e6a7d3..65d9d9e 100644
8622--- a/mm/filemap.c
8623+++ b/mm/filemap.c
8624@@ -236,11 +236,12 @@ int filemap_fdatawrite(struct address_space *mapping)
8625 }
8626 EXPORT_SYMBOL(filemap_fdatawrite);
8627
8628-static int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
8629+int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
8630 loff_t end)
8631 {
8632 return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
8633 }
8634+EXPORT_SYMBOL(filemap_fdatawrite_range);
8635
8636 /**
8637 * filemap_flush - mostly a non-blocking flush
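
Making filemap_fdatawrite_range() non-static (declared in fs.h above) is
what allows jbd2_journal_begin_ordered_truncate() to flush just the
truncated tail of the file rather than the whole mapping:

    /* As used in the jbd2 hunk above: write back [new_size, EOF]. */
    err = filemap_fdatawrite_range(inode->i_mapping, new_size, LLONG_MAX);
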
8638diff --git a/mm/page-writeback.c b/mm/page-writeback.c
8639index 789b6ad..ded57d5 100644
8640--- a/mm/page-writeback.c
8641+++ b/mm/page-writeback.c
8642@@ -956,6 +956,9 @@ retry:
8643 }
8644 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
8645 mapping->writeback_index = index;
8646+
8647+ if (wbc->range_cont)
8648+ wbc->range_start = index << PAGE_CACHE_SHIFT;
8649 return ret;
8650 }
8651 EXPORT_SYMBOL(write_cache_pages);
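
When range_cont is set, write_cache_pages() stores the index it stopped at
back into wbc->range_start, so a caller can make several bounded passes
over a range without rescanning pages it has already handled. A hedged
sketch; myfs_writepage is a placeholder writepage_t callback:

    int pass;
    struct writeback_control wbc = {
            .sync_mode   = WB_SYNC_NONE,
            .range_start = 0,
            .range_end   = LLONG_MAX,
            .range_cont  = 1,
    };

    for (pass = 0; pass < 2; pass++) {
            wbc.nr_to_write = 1024;
            write_cache_pages(mapping, &wbc, myfs_writepage, NULL);
            /* wbc.range_start now points just past the last page
             * scanned, so the next pass continues from there. */
    }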