]>
Commit | Line | Data |
---|---|---|
02f21861 | 1 | Patchset: 2.6.26-ext4-7 |
2 | ||
3 | This patch was created by combining the ext4-pushed-post-2.6.27-rc1.gz | |
4 | patches with the stable patches in 2.6.27-rc3-ext4-1 series. | |
5 | ||
6 | Documentation/filesystems/ext4.txt | 131 ++- | |
7 | fs/buffer.c | 19 +- | |
8 | fs/ext4/acl.c | 188 ++-- | |
9 | fs/ext4/balloc.c | 221 +++-- | |
10 | fs/ext4/dir.c | 37 +- | |
11 | fs/ext4/ext4.h | 64 +- | |
12 | fs/ext4/ext4_extents.h | 5 +- | |
13 | fs/ext4/ext4_i.h | 10 +- | |
14 | fs/ext4/ext4_jbd2.h | 29 +- | |
15 | fs/ext4/ext4_sb.h | 5 +- | |
16 | fs/ext4/extents.c | 277 +++--- | |
17 | fs/ext4/file.c | 20 +- | |
18 | fs/ext4/fsync.c | 4 + | |
19 | fs/ext4/group.h | 2 +- | |
20 | fs/ext4/ialloc.c | 169 +++- | |
21 | fs/ext4/inode.c | 1931 ++++++++++++++++++++++++++++++------ | |
22 | fs/ext4/mballoc.c | 744 +++++++++++---- | |
23 | fs/ext4/mballoc.h | 10 +- | |
24 | fs/ext4/migrate.c | 3 +- | |
25 | fs/ext4/namei.c | 45 +- | |
26 | fs/ext4/resize.c | 134 ++- | |
27 | fs/ext4/super.c | 451 ++++++--- | |
28 | fs/ext4/xattr.c | 4 +- | |
29 | fs/ext4/xattr_trusted.c | 4 +- | |
30 | fs/ext4/xattr_user.c | 4 +- | |
31 | fs/jbd2/checkpoint.c | 1 - | |
32 | fs/jbd2/commit.c | 308 +++---- | |
33 | fs/jbd2/journal.c | 54 +- | |
34 | fs/jbd2/transaction.c | 365 +++---- | |
35 | fs/mpage.c | 14 +- | |
36 | include/linux/fs.h | 2 + | |
37 | include/linux/jbd2.h | 73 +- | |
38 | include/linux/mpage.h | 10 + | |
39 | include/linux/percpu_counter.h | 12 +- | |
40 | include/linux/writeback.h | 1 + | |
41 | lib/percpu_counter.c | 7 +- | |
42 | mm/filemap.c | 3 +- | |
43 | mm/page-writeback.c | 3 + | |
44 | 38 files changed, 3822 insertions(+), 1542 deletions(-) | |
45 | ||
46 | diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt | |
47 | index 0c5086d..0d53949 100644 | |
48 | --- a/Documentation/filesystems/ext4.txt | |
49 | +++ b/Documentation/filesystems/ext4.txt | |
50 | @@ -13,72 +13,99 @@ Mailing list: linux-ext4@vger.kernel.org | |
51 | 1. Quick usage instructions: | |
52 | =========================== | |
53 | ||
54 | - - Grab updated e2fsprogs from | |
55 | - ftp://ftp.kernel.org/pub/linux/kernel/people/tytso/e2fsprogs-interim/ | |
56 | - This is a patchset on top of e2fsprogs-1.39, which can be found at | |
57 | + - Compile and install the latest version of e2fsprogs (as of this | |
58 | + writing version 1.41) from: | |
59 | + | |
60 | + http://sourceforge.net/project/showfiles.php?group_id=2406 | |
61 | + | |
62 | + or | |
63 | + | |
64 | ftp://ftp.kernel.org/pub/linux/kernel/people/tytso/e2fsprogs/ | |
65 | ||
66 | - - It's still mke2fs -j /dev/hda1 | |
67 | + or grab the latest git repository from: | |
68 | + | |
69 | + git://git.kernel.org/pub/scm/fs/ext2/e2fsprogs.git | |
70 | + | |
71 | + - Note that it is highly important to install the mke2fs.conf file | |
72 | + that comes with the e2fsprogs 1.41.x sources in /etc/mke2fs.conf. If | |
73 | + you have edited the /etc/mke2fs.conf file installed on your system, | |
74 | + you will need to merge your changes with the version from e2fsprogs | |
75 | + 1.41.x. | |
76 | + | |
77 | + - Create a new filesystem using the ext4dev filesystem type: | |
78 | + | |
79 | + # mke2fs -t ext4dev /dev/hda1 | |
80 | + | |
81 | + Or configure an existing ext3 filesystem to support extents and set | |
82 | + the test_fs flag to indicate that it's ok for an in-development | |
83 | + filesystem to touch this filesystem: | |
84 | ||
85 | - - mount /dev/hda1 /wherever -t ext4dev | |
86 | + # tune2fs -O extents -E test_fs /dev/hda1 | |
87 | ||
88 | - - To enable extents, | |
89 | + If the filesystem was created with 128 byte inodes, it can be | |
90 | + converted to use 256 byte for greater efficiency via: | |
91 | ||
92 | - mount /dev/hda1 /wherever -t ext4dev -o extents | |
93 | + # tune2fs -I 256 /dev/hda1 | |
94 | ||
95 | - - The filesystem is compatible with the ext3 driver until you add a file | |
96 | - which has extents (ie: `mount -o extents', then create a file). | |
97 | + (Note: we currently do not have tools to convert an ext4dev | |
98 | + filesystem back to ext3; so please do not do try this on production | |
99 | + filesystems.) | |
100 | ||
101 | - NOTE: The "extents" mount flag is temporary. It will soon go away and | |
102 | - extents will be enabled by the "-o extents" flag to mke2fs or tune2fs | |
103 | + - Mounting: | |
104 | + | |
105 | + # mount -t ext4dev /dev/hda1 /wherever | |
106 | ||
107 | - When comparing performance with other filesystems, remember that | |
108 | - ext3/4 by default offers higher data integrity guarantees than most. So | |
109 | - when comparing with a metadata-only journalling filesystem, use `mount -o | |
110 | - data=writeback'. And you might as well use `mount -o nobh' too along | |
111 | - with it. Making the journal larger than the mke2fs default often helps | |
112 | - performance with metadata-intensive workloads. | |
113 | + ext3/4 by default offers higher data integrity guarantees than most. | |
114 | + So when comparing with a metadata-only journalling filesystem, such | |
115 | + as ext3, use `mount -o data=writeback'. And you might as well use | |
116 | + `mount -o nobh' too along with it. Making the journal larger than | |
117 | + the mke2fs default often helps performance with metadata-intensive | |
118 | + workloads. | |
119 | ||
120 | 2. Features | |
121 | =========== | |
122 | ||
123 | 2.1 Currently available | |
124 | ||
125 | -* ability to use filesystems > 16TB | |
126 | +* ability to use filesystems > 16TB (e2fsprogs support not available yet) | |
127 | * extent format reduces metadata overhead (RAM, IO for access, transactions) | |
128 | * extent format more robust in face of on-disk corruption due to magics, | |
129 | * internal redunancy in tree | |
130 | - | |
131 | -2.1 Previously available, soon to be enabled by default by "mkefs.ext4": | |
132 | - | |
133 | -* dir_index and resize inode will be on by default | |
134 | -* large inodes will be used by default for fast EAs, nsec timestamps, etc | |
135 | +* improved file allocation (multi-block alloc) | |
136 | +* fix 32000 subdirectory limit | |
137 | +* nsec timestamps for mtime, atime, ctime, create time | |
138 | +* inode version field on disk (NFSv4, Lustre) | |
139 | +* reduced e2fsck time via uninit_bg feature | |
140 | +* journal checksumming for robustness, performance | |
141 | +* persistent file preallocation (e.g for streaming media, databases) | |
142 | +* ability to pack bitmaps and inode tables into larger virtual groups via the | |
143 | + flex_bg feature | |
144 | +* large file support | |
145 | +* Inode allocation using large virtual block groups via flex_bg | |
146 | +* delayed allocation | |
147 | +* large block (up to pagesize) support | |
148 | +* efficent new ordered mode in JBD2 and ext4(avoid using buffer head to force | |
149 | + the ordering) | |
150 | ||
151 | 2.2 Candidate features for future inclusion | |
152 | ||
153 | -There are several under discussion, whether they all make it in is | |
154 | -partly a function of how much time everyone has to work on them: | |
155 | +* Online defrag (patches available but not well tested) | |
156 | +* reduced mke2fs time via lazy itable initialization in conjuction with | |
157 | + the uninit_bg feature (capability to do this is available in e2fsprogs | |
158 | + but a kernel thread to do lazy zeroing of unused inode table blocks | |
159 | + after filesystem is first mounted is required for safety) | |
160 | ||
161 | -* improved file allocation (multi-block alloc, delayed alloc; basically done) | |
162 | -* fix 32000 subdirectory limit (patch exists, needs some e2fsck work) | |
163 | -* nsec timestamps for mtime, atime, ctime, create time (patch exists, | |
164 | - needs some e2fsck work) | |
165 | -* inode version field on disk (NFSv4, Lustre; prototype exists) | |
166 | -* reduced mke2fs/e2fsck time via uninitialized groups (prototype exists) | |
167 | -* journal checksumming for robustness, performance (prototype exists) | |
168 | -* persistent file preallocation (e.g for streaming media, databases) | |
169 | +There are several others under discussion, whether they all make it in is | |
170 | +partly a function of how much time everyone has to work on them. Features like | |
171 | +metadata checksumming have been discussed and planned for a bit but no patches | |
172 | +exist yet so I'm not sure they're in the near-term roadmap. | |
173 | ||
174 | -Features like metadata checksumming have been discussed and planned for | |
175 | -a bit but no patches exist yet so I'm not sure they're in the near-term | |
176 | -roadmap. | |
177 | +The big performance win will come with mballoc, delalloc and flex_bg | |
178 | +grouping of bitmaps and inode tables. Some test results available here: | |
179 | ||
180 | -The big performance win will come with mballoc and delalloc. CFS has | |
181 | -been using mballoc for a few years already with Lustre, and IBM + Bull | |
182 | -did a lot of benchmarking on it. The reason it isn't in the first set of | |
183 | -patches is partly a manageability issue, and partly because it doesn't | |
184 | -directly affect the on-disk format (outside of much better allocation) | |
185 | -so it isn't critical to get into the first round of changes. I believe | |
186 | -Alex is working on a new set of patches right now. | |
187 | + - http://www.bullopensource.org/ext4/20080530/ffsb-write-2.6.26-rc2.html | |
188 | + - http://www.bullopensource.org/ext4/20080530/ffsb-readwrite-2.6.26-rc2.html | |
189 | ||
190 | 3. Options | |
191 | ========== | |
192 | @@ -222,9 +249,11 @@ stripe=n Number of filesystem blocks that mballoc will try | |
193 | to use for allocation size and alignment. For RAID5/6 | |
194 | systems this should be the number of data | |
195 | disks * RAID chunk size in file system blocks. | |
196 | - | |
197 | +delalloc (*) Deferring block allocation until write-out time. | |
198 | +nodelalloc Disable delayed allocation. Blocks are allocation | |
199 | + when data is copied from user to page cache. | |
200 | Data Mode | |
201 | ---------- | |
202 | +========= | |
203 | There are 3 different data modes: | |
204 | ||
205 | * writeback mode | |
206 | @@ -236,10 +265,10 @@ typically provide the best ext4 performance. | |
207 | ||
208 | * ordered mode | |
209 | In data=ordered mode, ext4 only officially journals metadata, but it logically | |
210 | -groups metadata and data blocks into a single unit called a transaction. When | |
211 | -it's time to write the new metadata out to disk, the associated data blocks | |
212 | -are written first. In general, this mode performs slightly slower than | |
213 | -writeback but significantly faster than journal mode. | |
214 | +groups metadata information related to data changes with the data blocks into a | |
215 | +single unit called a transaction. When it's time to write the new metadata | |
216 | +out to disk, the associated data blocks are written first. In general, | |
217 | +this mode performs slightly slower than writeback but significantly faster than journal mode. | |
218 | ||
219 | * journal mode | |
220 | data=journal mode provides full data and metadata journaling. All new data is | |
221 | @@ -247,7 +276,8 @@ written to the journal first, and then to its final location. | |
222 | In the event of a crash, the journal can be replayed, bringing both data and | |
223 | metadata into a consistent state. This mode is the slowest except when data | |
224 | needs to be read from and written to disk at the same time where it | |
225 | -outperforms all others modes. | |
226 | +outperforms all others modes. Curently ext4 does not have delayed | |
227 | +allocation support if this data journalling mode is selected. | |
228 | ||
229 | References | |
230 | ========== | |
231 | @@ -256,7 +286,8 @@ kernel source: <file:fs/ext4/> | |
232 | <file:fs/jbd2/> | |
233 | ||
234 | programs: http://e2fsprogs.sourceforge.net/ | |
235 | - http://ext2resize.sourceforge.net | |
236 | ||
237 | useful links: http://fedoraproject.org/wiki/ext3-devel | |
238 | http://www.bullopensource.org/ext4/ | |
239 | + http://ext4.wiki.kernel.org/index.php/Main_Page | |
240 | + http://fedoraproject.org/wiki/Features/Ext4 | |
241 | diff --git a/fs/buffer.c b/fs/buffer.c | |
242 | index 0f51c0f..5fa1512 100644 | |
243 | --- a/fs/buffer.c | |
244 | +++ b/fs/buffer.c | |
245 | @@ -1691,11 +1691,13 @@ static int __block_write_full_page(struct inode *inode, struct page *page, | |
246 | */ | |
247 | clear_buffer_dirty(bh); | |
248 | set_buffer_uptodate(bh); | |
249 | - } else if (!buffer_mapped(bh) && buffer_dirty(bh)) { | |
250 | + } else if ((!buffer_mapped(bh) || buffer_delay(bh)) && | |
251 | + buffer_dirty(bh)) { | |
252 | WARN_ON(bh->b_size != blocksize); | |
253 | err = get_block(inode, block, bh, 1); | |
254 | if (err) | |
255 | goto recover; | |
256 | + clear_buffer_delay(bh); | |
257 | if (buffer_new(bh)) { | |
258 | /* blockdev mappings never come here */ | |
259 | clear_buffer_new(bh); | |
260 | @@ -1774,7 +1776,8 @@ recover: | |
261 | bh = head; | |
262 | /* Recovery: lock and submit the mapped buffers */ | |
263 | do { | |
264 | - if (buffer_mapped(bh) && buffer_dirty(bh)) { | |
265 | + if (buffer_mapped(bh) && buffer_dirty(bh) && | |
266 | + !buffer_delay(bh)) { | |
267 | lock_buffer(bh); | |
268 | mark_buffer_async_write(bh); | |
269 | } else { | |
270 | @@ -2061,6 +2064,7 @@ int generic_write_end(struct file *file, struct address_space *mapping, | |
271 | struct page *page, void *fsdata) | |
272 | { | |
273 | struct inode *inode = mapping->host; | |
274 | + int i_size_changed = 0; | |
275 | ||
276 | copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); | |
277 | ||
278 | @@ -2073,12 +2077,21 @@ int generic_write_end(struct file *file, struct address_space *mapping, | |
279 | */ | |
280 | if (pos+copied > inode->i_size) { | |
281 | i_size_write(inode, pos+copied); | |
282 | - mark_inode_dirty(inode); | |
283 | + i_size_changed = 1; | |
284 | } | |
285 | ||
286 | unlock_page(page); | |
287 | page_cache_release(page); | |
288 | ||
289 | + /* | |
290 | + * Don't mark the inode dirty under page lock. First, it unnecessarily | |
291 | + * makes the holding time of page lock longer. Second, it forces lock | |
292 | + * ordering of page lock and transaction start for journaling | |
293 | + * filesystems. | |
294 | + */ | |
295 | + if (i_size_changed) | |
296 | + mark_inode_dirty(inode); | |
297 | + | |
298 | return copied; | |
299 | } | |
300 | EXPORT_SYMBOL(generic_write_end); | |
301 | diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c | |
302 | index 3c8dab8..a234b54 100644 | |
303 | --- a/fs/ext4/acl.c | |
304 | +++ b/fs/ext4/acl.c | |
305 | @@ -40,34 +40,35 @@ ext4_acl_from_disk(const void *value, size_t size) | |
306 | acl = posix_acl_alloc(count, GFP_NOFS); | |
307 | if (!acl) | |
308 | return ERR_PTR(-ENOMEM); | |
309 | - for (n=0; n < count; n++) { | |
310 | + for (n = 0; n < count; n++) { | |
311 | ext4_acl_entry *entry = | |
312 | (ext4_acl_entry *)value; | |
313 | if ((char *)value + sizeof(ext4_acl_entry_short) > end) | |
314 | goto fail; | |
315 | acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag); | |
316 | acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm); | |
317 | - switch(acl->a_entries[n].e_tag) { | |
318 | - case ACL_USER_OBJ: | |
319 | - case ACL_GROUP_OBJ: | |
320 | - case ACL_MASK: | |
321 | - case ACL_OTHER: | |
322 | - value = (char *)value + | |
323 | - sizeof(ext4_acl_entry_short); | |
324 | - acl->a_entries[n].e_id = ACL_UNDEFINED_ID; | |
325 | - break; | |
326 | - | |
327 | - case ACL_USER: | |
328 | - case ACL_GROUP: | |
329 | - value = (char *)value + sizeof(ext4_acl_entry); | |
330 | - if ((char *)value > end) | |
331 | - goto fail; | |
332 | - acl->a_entries[n].e_id = | |
333 | - le32_to_cpu(entry->e_id); | |
334 | - break; | |
335 | - | |
336 | - default: | |
337 | + | |
338 | + switch (acl->a_entries[n].e_tag) { | |
339 | + case ACL_USER_OBJ: | |
340 | + case ACL_GROUP_OBJ: | |
341 | + case ACL_MASK: | |
342 | + case ACL_OTHER: | |
343 | + value = (char *)value + | |
344 | + sizeof(ext4_acl_entry_short); | |
345 | + acl->a_entries[n].e_id = ACL_UNDEFINED_ID; | |
346 | + break; | |
347 | + | |
348 | + case ACL_USER: | |
349 | + case ACL_GROUP: | |
350 | + value = (char *)value + sizeof(ext4_acl_entry); | |
351 | + if ((char *)value > end) | |
352 | goto fail; | |
353 | + acl->a_entries[n].e_id = | |
354 | + le32_to_cpu(entry->e_id); | |
355 | + break; | |
356 | + | |
357 | + default: | |
358 | + goto fail; | |
359 | } | |
360 | } | |
361 | if (value != end) | |
362 | @@ -96,27 +97,26 @@ ext4_acl_to_disk(const struct posix_acl *acl, size_t *size) | |
363 | return ERR_PTR(-ENOMEM); | |
364 | ext_acl->a_version = cpu_to_le32(EXT4_ACL_VERSION); | |
365 | e = (char *)ext_acl + sizeof(ext4_acl_header); | |
366 | - for (n=0; n < acl->a_count; n++) { | |
367 | + for (n = 0; n < acl->a_count; n++) { | |
368 | ext4_acl_entry *entry = (ext4_acl_entry *)e; | |
369 | entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag); | |
370 | entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm); | |
371 | - switch(acl->a_entries[n].e_tag) { | |
372 | - case ACL_USER: | |
373 | - case ACL_GROUP: | |
374 | - entry->e_id = | |
375 | - cpu_to_le32(acl->a_entries[n].e_id); | |
376 | - e += sizeof(ext4_acl_entry); | |
377 | - break; | |
378 | - | |
379 | - case ACL_USER_OBJ: | |
380 | - case ACL_GROUP_OBJ: | |
381 | - case ACL_MASK: | |
382 | - case ACL_OTHER: | |
383 | - e += sizeof(ext4_acl_entry_short); | |
384 | - break; | |
385 | - | |
386 | - default: | |
387 | - goto fail; | |
388 | + switch (acl->a_entries[n].e_tag) { | |
389 | + case ACL_USER: | |
390 | + case ACL_GROUP: | |
391 | + entry->e_id = cpu_to_le32(acl->a_entries[n].e_id); | |
392 | + e += sizeof(ext4_acl_entry); | |
393 | + break; | |
394 | + | |
395 | + case ACL_USER_OBJ: | |
396 | + case ACL_GROUP_OBJ: | |
397 | + case ACL_MASK: | |
398 | + case ACL_OTHER: | |
399 | + e += sizeof(ext4_acl_entry_short); | |
400 | + break; | |
401 | + | |
402 | + default: | |
403 | + goto fail; | |
404 | } | |
405 | } | |
406 | return (char *)ext_acl; | |
407 | @@ -167,23 +167,23 @@ ext4_get_acl(struct inode *inode, int type) | |
408 | if (!test_opt(inode->i_sb, POSIX_ACL)) | |
409 | return NULL; | |
410 | ||
411 | - switch(type) { | |
412 | - case ACL_TYPE_ACCESS: | |
413 | - acl = ext4_iget_acl(inode, &ei->i_acl); | |
414 | - if (acl != EXT4_ACL_NOT_CACHED) | |
415 | - return acl; | |
416 | - name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS; | |
417 | - break; | |
418 | - | |
419 | - case ACL_TYPE_DEFAULT: | |
420 | - acl = ext4_iget_acl(inode, &ei->i_default_acl); | |
421 | - if (acl != EXT4_ACL_NOT_CACHED) | |
422 | - return acl; | |
423 | - name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT; | |
424 | - break; | |
425 | - | |
426 | - default: | |
427 | - return ERR_PTR(-EINVAL); | |
428 | + switch (type) { | |
429 | + case ACL_TYPE_ACCESS: | |
430 | + acl = ext4_iget_acl(inode, &ei->i_acl); | |
431 | + if (acl != EXT4_ACL_NOT_CACHED) | |
432 | + return acl; | |
433 | + name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS; | |
434 | + break; | |
435 | + | |
436 | + case ACL_TYPE_DEFAULT: | |
437 | + acl = ext4_iget_acl(inode, &ei->i_default_acl); | |
438 | + if (acl != EXT4_ACL_NOT_CACHED) | |
439 | + return acl; | |
440 | + name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT; | |
441 | + break; | |
442 | + | |
443 | + default: | |
444 | + return ERR_PTR(-EINVAL); | |
445 | } | |
446 | retval = ext4_xattr_get(inode, name_index, "", NULL, 0); | |
447 | if (retval > 0) { | |
448 | @@ -201,14 +201,14 @@ ext4_get_acl(struct inode *inode, int type) | |
449 | kfree(value); | |
450 | ||
451 | if (!IS_ERR(acl)) { | |
452 | - switch(type) { | |
453 | - case ACL_TYPE_ACCESS: | |
454 | - ext4_iset_acl(inode, &ei->i_acl, acl); | |
455 | - break; | |
456 | - | |
457 | - case ACL_TYPE_DEFAULT: | |
458 | - ext4_iset_acl(inode, &ei->i_default_acl, acl); | |
459 | - break; | |
460 | + switch (type) { | |
461 | + case ACL_TYPE_ACCESS: | |
462 | + ext4_iset_acl(inode, &ei->i_acl, acl); | |
463 | + break; | |
464 | + | |
465 | + case ACL_TYPE_DEFAULT: | |
466 | + ext4_iset_acl(inode, &ei->i_default_acl, acl); | |
467 | + break; | |
468 | } | |
469 | } | |
470 | return acl; | |
471 | @@ -232,31 +232,31 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type, | |
472 | if (S_ISLNK(inode->i_mode)) | |
473 | return -EOPNOTSUPP; | |
474 | ||
475 | - switch(type) { | |
476 | - case ACL_TYPE_ACCESS: | |
477 | - name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS; | |
478 | - if (acl) { | |
479 | - mode_t mode = inode->i_mode; | |
480 | - error = posix_acl_equiv_mode(acl, &mode); | |
481 | - if (error < 0) | |
482 | - return error; | |
483 | - else { | |
484 | - inode->i_mode = mode; | |
485 | - ext4_mark_inode_dirty(handle, inode); | |
486 | - if (error == 0) | |
487 | - acl = NULL; | |
488 | - } | |
489 | + switch (type) { | |
490 | + case ACL_TYPE_ACCESS: | |
491 | + name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS; | |
492 | + if (acl) { | |
493 | + mode_t mode = inode->i_mode; | |
494 | + error = posix_acl_equiv_mode(acl, &mode); | |
495 | + if (error < 0) | |
496 | + return error; | |
497 | + else { | |
498 | + inode->i_mode = mode; | |
499 | + ext4_mark_inode_dirty(handle, inode); | |
500 | + if (error == 0) | |
501 | + acl = NULL; | |
502 | } | |
503 | - break; | |
504 | + } | |
505 | + break; | |
506 | ||
507 | - case ACL_TYPE_DEFAULT: | |
508 | - name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT; | |
509 | - if (!S_ISDIR(inode->i_mode)) | |
510 | - return acl ? -EACCES : 0; | |
511 | - break; | |
512 | + case ACL_TYPE_DEFAULT: | |
513 | + name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT; | |
514 | + if (!S_ISDIR(inode->i_mode)) | |
515 | + return acl ? -EACCES : 0; | |
516 | + break; | |
517 | ||
518 | - default: | |
519 | - return -EINVAL; | |
520 | + default: | |
521 | + return -EINVAL; | |
522 | } | |
523 | if (acl) { | |
524 | value = ext4_acl_to_disk(acl, &size); | |
525 | @@ -269,14 +269,14 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type, | |
526 | ||
527 | kfree(value); | |
528 | if (!error) { | |
529 | - switch(type) { | |
530 | - case ACL_TYPE_ACCESS: | |
531 | - ext4_iset_acl(inode, &ei->i_acl, acl); | |
532 | - break; | |
533 | - | |
534 | - case ACL_TYPE_DEFAULT: | |
535 | - ext4_iset_acl(inode, &ei->i_default_acl, acl); | |
536 | - break; | |
537 | + switch (type) { | |
538 | + case ACL_TYPE_ACCESS: | |
539 | + ext4_iset_acl(inode, &ei->i_acl, acl); | |
540 | + break; | |
541 | + | |
542 | + case ACL_TYPE_DEFAULT: | |
543 | + ext4_iset_acl(inode, &ei->i_default_acl, acl); | |
544 | + break; | |
545 | } | |
546 | } | |
547 | return error; | |
548 | diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c | |
549 | index 9cc80b9..e9fa960 100644 | |
550 | --- a/fs/ext4/balloc.c | |
551 | +++ b/fs/ext4/balloc.c | |
552 | @@ -47,7 +47,7 @@ static int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block, | |
553 | ext4_group_t block_group) | |
554 | { | |
555 | ext4_group_t actual_group; | |
556 | - ext4_get_group_no_and_offset(sb, block, &actual_group, 0); | |
557 | + ext4_get_group_no_and_offset(sb, block, &actual_group, NULL); | |
558 | if (actual_group == block_group) | |
559 | return 1; | |
560 | return 0; | |
561 | @@ -121,12 +121,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, | |
562 | le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks); | |
563 | } | |
564 | } else { /* For META_BG_BLOCK_GROUPS */ | |
565 | - int group_rel = (block_group - | |
566 | - le32_to_cpu(sbi->s_es->s_first_meta_bg)) % | |
567 | - EXT4_DESC_PER_BLOCK(sb); | |
568 | - if (group_rel == 0 || group_rel == 1 || | |
569 | - (group_rel == EXT4_DESC_PER_BLOCK(sb) - 1)) | |
570 | - bit_max += 1; | |
571 | + bit_max += ext4_bg_num_gdb(sb, block_group); | |
572 | } | |
573 | ||
574 | if (block_group == sbi->s_groups_count - 1) { | |
575 | @@ -295,7 +290,7 @@ err_out: | |
576 | return 0; | |
577 | } | |
578 | /** | |
579 | - * read_block_bitmap() | |
580 | + * ext4_read_block_bitmap() | |
581 | * @sb: super block | |
582 | * @block_group: given block group | |
583 | * | |
584 | @@ -305,7 +300,7 @@ err_out: | |
585 | * Return buffer_head on success or NULL in case of failure. | |
586 | */ | |
587 | struct buffer_head * | |
588 | -read_block_bitmap(struct super_block *sb, ext4_group_t block_group) | |
589 | +ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) | |
590 | { | |
591 | struct ext4_group_desc * desc; | |
592 | struct buffer_head * bh = NULL; | |
593 | @@ -319,25 +314,28 @@ read_block_bitmap(struct super_block *sb, ext4_group_t block_group) | |
594 | if (unlikely(!bh)) { | |
595 | ext4_error(sb, __func__, | |
596 | "Cannot read block bitmap - " | |
597 | - "block_group = %d, block_bitmap = %llu", | |
598 | - (int)block_group, (unsigned long long)bitmap_blk); | |
599 | + "block_group = %lu, block_bitmap = %llu", | |
600 | + block_group, bitmap_blk); | |
601 | return NULL; | |
602 | } | |
603 | if (bh_uptodate_or_lock(bh)) | |
604 | return bh; | |
605 | ||
606 | + spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group)); | |
607 | if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { | |
608 | ext4_init_block_bitmap(sb, bh, block_group, desc); | |
609 | set_buffer_uptodate(bh); | |
610 | unlock_buffer(bh); | |
611 | + spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); | |
612 | return bh; | |
613 | } | |
614 | + spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); | |
615 | if (bh_submit_read(bh) < 0) { | |
616 | put_bh(bh); | |
617 | ext4_error(sb, __func__, | |
618 | "Cannot read block bitmap - " | |
619 | - "block_group = %d, block_bitmap = %llu", | |
620 | - (int)block_group, (unsigned long long)bitmap_blk); | |
621 | + "block_group = %lu, block_bitmap = %llu", | |
622 | + block_group, bitmap_blk); | |
623 | return NULL; | |
624 | } | |
625 | ext4_valid_block_bitmap(sb, desc, block_group, bh); | |
626 | @@ -409,8 +407,7 @@ restart: | |
627 | prev = rsv; | |
628 | } | |
629 | printk("Window map complete.\n"); | |
630 | - if (bad) | |
631 | - BUG(); | |
632 | + BUG_ON(bad); | |
633 | } | |
634 | #define rsv_window_dump(root, verbose) \ | |
635 | __rsv_window_dump((root), (verbose), __func__) | |
636 | @@ -694,7 +691,7 @@ do_more: | |
637 | count -= overflow; | |
638 | } | |
639 | brelse(bitmap_bh); | |
640 | - bitmap_bh = read_block_bitmap(sb, block_group); | |
641 | + bitmap_bh = ext4_read_block_bitmap(sb, block_group); | |
642 | if (!bitmap_bh) | |
643 | goto error_return; | |
644 | desc = ext4_get_group_desc (sb, block_group, &gd_bh); | |
645 | @@ -810,6 +807,13 @@ do_more: | |
646 | spin_unlock(sb_bgl_lock(sbi, block_group)); | |
647 | percpu_counter_add(&sbi->s_freeblocks_counter, count); | |
648 | ||
649 | + if (sbi->s_log_groups_per_flex) { | |
650 | + ext4_group_t flex_group = ext4_flex_group(sbi, block_group); | |
651 | + spin_lock(sb_bgl_lock(sbi, flex_group)); | |
652 | + sbi->s_flex_groups[flex_group].free_blocks += count; | |
653 | + spin_unlock(sb_bgl_lock(sbi, flex_group)); | |
654 | + } | |
655 | + | |
656 | /* We dirtied the bitmap block */ | |
657 | BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); | |
658 | err = ext4_journal_dirty_metadata(handle, bitmap_bh); | |
659 | @@ -1598,23 +1602,38 @@ out: | |
660 | ||
661 | /** | |
662 | * ext4_has_free_blocks() | |
663 | - * @sbi: in-core super block structure. | |
664 | + * @sbi: in-core super block structure. | |
665 | + * @nblocks: number of neeed blocks | |
666 | * | |
667 | - * Check if filesystem has at least 1 free block available for allocation. | |
668 | + * Check if filesystem has free blocks available for allocation. | |
669 | + * Return the number of blocks avaible for allocation for this request | |
670 | + * On success, return nblocks | |
671 | */ | |
672 | -static int ext4_has_free_blocks(struct ext4_sb_info *sbi) | |
673 | +ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi, | |
674 | + ext4_fsblk_t nblocks) | |
675 | { | |
676 | - ext4_fsblk_t free_blocks, root_blocks; | |
677 | + ext4_fsblk_t free_blocks; | |
678 | + ext4_fsblk_t root_blocks = 0; | |
679 | ||
680 | free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter); | |
681 | - root_blocks = ext4_r_blocks_count(sbi->s_es); | |
682 | - if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) && | |
683 | + | |
684 | + if (!capable(CAP_SYS_RESOURCE) && | |
685 | sbi->s_resuid != current->fsuid && | |
686 | - (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) { | |
687 | + (sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid))) | |
688 | + root_blocks = ext4_r_blocks_count(sbi->s_es); | |
689 | +#ifdef CONFIG_SMP | |
690 | + if (free_blocks - root_blocks < FBC_BATCH) | |
691 | + free_blocks = | |
692 | + percpu_counter_sum_and_set(&sbi->s_freeblocks_counter); | |
693 | +#endif | |
694 | + if (free_blocks <= root_blocks) | |
695 | + /* we don't have free space */ | |
696 | return 0; | |
697 | - } | |
698 | - return 1; | |
699 | -} | |
700 | + if (free_blocks - root_blocks < nblocks) | |
701 | + return free_blocks - root_blocks; | |
702 | + return nblocks; | |
703 | + } | |
704 | + | |
705 | ||
706 | /** | |
707 | * ext4_should_retry_alloc() | |
708 | @@ -1630,7 +1649,7 @@ static int ext4_has_free_blocks(struct ext4_sb_info *sbi) | |
709 | */ | |
710 | int ext4_should_retry_alloc(struct super_block *sb, int *retries) | |
711 | { | |
712 | - if (!ext4_has_free_blocks(EXT4_SB(sb)) || (*retries)++ > 3) | |
713 | + if (!ext4_has_free_blocks(EXT4_SB(sb), 1) || (*retries)++ > 3) | |
714 | return 0; | |
715 | ||
716 | jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id); | |
717 | @@ -1639,20 +1658,24 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries) | |
718 | } | |
719 | ||
720 | /** | |
721 | - * ext4_new_blocks_old() -- core block(s) allocation function | |
722 | + * ext4_old_new_blocks() -- core block bitmap based block allocation function | |
723 | + * | |
724 | * @handle: handle to this transaction | |
725 | * @inode: file inode | |
726 | * @goal: given target block(filesystem wide) | |
727 | * @count: target number of blocks to allocate | |
728 | * @errp: error code | |
729 | * | |
730 | - * ext4_new_blocks uses a goal block to assist allocation. It tries to | |
731 | - * allocate block(s) from the block group contains the goal block first. If that | |
732 | - * fails, it will try to allocate block(s) from other block groups without | |
733 | - * any specific goal block. | |
734 | + * ext4_old_new_blocks uses a goal block to assist allocation and look up | |
735 | + * the block bitmap directly to do block allocation. It tries to | |
736 | + * allocate block(s) from the block group contains the goal block first. If | |
737 | + * that fails, it will try to allocate block(s) from other block groups | |
738 | + * without any specific goal block. | |
739 | + * | |
740 | + * This function is called when -o nomballoc mount option is enabled | |
741 | * | |
742 | */ | |
743 | -ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode, | |
744 | +ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode, | |
745 | ext4_fsblk_t goal, unsigned long *count, int *errp) | |
746 | { | |
747 | struct buffer_head *bitmap_bh = NULL; | |
748 | @@ -1676,13 +1699,26 @@ ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode, | |
749 | ext4_group_t ngroups; | |
750 | unsigned long num = *count; | |
751 | ||
752 | - *errp = -ENOSPC; | |
753 | sb = inode->i_sb; | |
754 | if (!sb) { | |
755 | + *errp = -ENODEV; | |
756 | printk("ext4_new_block: nonexistent device"); | |
757 | return 0; | |
758 | } | |
759 | ||
760 | + sbi = EXT4_SB(sb); | |
761 | + if (!EXT4_I(inode)->i_delalloc_reserved_flag) { | |
762 | + /* | |
763 | + * With delalloc we already reserved the blocks | |
764 | + */ | |
765 | + *count = ext4_has_free_blocks(sbi, *count); | |
766 | + } | |
767 | + if (*count == 0) { | |
768 | + *errp = -ENOSPC; | |
769 | + return 0; /*return with ENOSPC error */ | |
770 | + } | |
771 | + num = *count; | |
772 | + | |
773 | /* | |
774 | * Check quota for allocation of this block. | |
775 | */ | |
776 | @@ -1706,11 +1742,6 @@ ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode, | |
777 | if (block_i && ((windowsz = block_i->rsv_window_node.rsv_goal_size) > 0)) | |
778 | my_rsv = &block_i->rsv_window_node; | |
779 | ||
780 | - if (!ext4_has_free_blocks(sbi)) { | |
781 | - *errp = -ENOSPC; | |
782 | - goto out; | |
783 | - } | |
784 | - | |
785 | /* | |
786 | * First, test whether the goal block is free. | |
787 | */ | |
788 | @@ -1734,7 +1765,7 @@ retry_alloc: | |
789 | my_rsv = NULL; | |
790 | ||
791 | if (free_blocks > 0) { | |
792 | - bitmap_bh = read_block_bitmap(sb, group_no); | |
793 | + bitmap_bh = ext4_read_block_bitmap(sb, group_no); | |
794 | if (!bitmap_bh) | |
795 | goto io_error; | |
796 | grp_alloc_blk = ext4_try_to_allocate_with_rsv(sb, handle, | |
797 | @@ -1770,7 +1801,7 @@ retry_alloc: | |
798 | continue; | |
799 | ||
800 | brelse(bitmap_bh); | |
801 | - bitmap_bh = read_block_bitmap(sb, group_no); | |
802 | + bitmap_bh = ext4_read_block_bitmap(sb, group_no); | |
803 | if (!bitmap_bh) | |
804 | goto io_error; | |
805 | /* | |
806 | @@ -1882,7 +1913,15 @@ allocated: | |
807 | le16_add_cpu(&gdp->bg_free_blocks_count, -num); | |
808 | gdp->bg_checksum = ext4_group_desc_csum(sbi, group_no, gdp); | |
809 | spin_unlock(sb_bgl_lock(sbi, group_no)); | |
810 | - percpu_counter_sub(&sbi->s_freeblocks_counter, num); | |
811 | + if (!EXT4_I(inode)->i_delalloc_reserved_flag) | |
812 | + percpu_counter_sub(&sbi->s_freeblocks_counter, num); | |
813 | + | |
814 | + if (sbi->s_log_groups_per_flex) { | |
815 | + ext4_group_t flex_group = ext4_flex_group(sbi, group_no); | |
816 | + spin_lock(sb_bgl_lock(sbi, flex_group)); | |
817 | + sbi->s_flex_groups[flex_group].free_blocks -= num; | |
818 | + spin_unlock(sb_bgl_lock(sbi, flex_group)); | |
819 | + } | |
820 | ||
821 | BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor"); | |
822 | err = ext4_journal_dirty_metadata(handle, gdp_bh); | |
823 | @@ -1915,46 +1954,104 @@ out: | |
824 | return 0; | |
825 | } | |
826 | ||
827 | -ext4_fsblk_t ext4_new_block(handle_t *handle, struct inode *inode, | |
828 | - ext4_fsblk_t goal, int *errp) | |
829 | +#define EXT4_META_BLOCK 0x1 | |
830 | + | |
831 | +static ext4_fsblk_t do_blk_alloc(handle_t *handle, struct inode *inode, | |
832 | + ext4_lblk_t iblock, ext4_fsblk_t goal, | |
833 | + unsigned long *count, int *errp, int flags) | |
834 | { | |
835 | struct ext4_allocation_request ar; | |
836 | ext4_fsblk_t ret; | |
837 | ||
838 | if (!test_opt(inode->i_sb, MBALLOC)) { | |
839 | - unsigned long count = 1; | |
840 | - ret = ext4_new_blocks_old(handle, inode, goal, &count, errp); | |
841 | - return ret; | |
842 | + return ext4_old_new_blocks(handle, inode, goal, count, errp); | |
843 | } | |
844 | ||
845 | memset(&ar, 0, sizeof(ar)); | |
846 | + /* Fill with neighbour allocated blocks */ | |
847 | + | |
848 | ar.inode = inode; | |
849 | ar.goal = goal; | |
850 | - ar.len = 1; | |
851 | + ar.len = *count; | |
852 | + ar.logical = iblock; | |
853 | + | |
854 | + if (S_ISREG(inode->i_mode) && !(flags & EXT4_META_BLOCK)) | |
855 | + /* enable in-core preallocation for data block allocation */ | |
856 | + ar.flags = EXT4_MB_HINT_DATA; | |
857 | + else | |
858 | + /* disable in-core preallocation for non-regular files */ | |
859 | + ar.flags = 0; | |
860 | + | |
861 | ret = ext4_mb_new_blocks(handle, &ar, errp); | |
862 | + *count = ar.len; | |
863 | return ret; | |
864 | } | |
865 | ||
866 | -ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode, | |
867 | +/* | |
868 | + * ext4_new_meta_blocks() -- allocate block for meta data (indexing) blocks | |
869 | + * | |
870 | + * @handle: handle to this transaction | |
871 | + * @inode: file inode | |
872 | + * @goal: given target block(filesystem wide) | |
873 | + * @count: total number of blocks need | |
874 | + * @errp: error code | |
875 | + * | |
876 | + * Return 1st allocated block numberon success, *count stores total account | |
877 | + * error stores in errp pointer | |
878 | + */ | |
879 | +ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, | |
880 | ext4_fsblk_t goal, unsigned long *count, int *errp) | |
881 | { | |
882 | - struct ext4_allocation_request ar; | |
883 | ext4_fsblk_t ret; | |
884 | - | |
885 | - if (!test_opt(inode->i_sb, MBALLOC)) { | |
886 | - ret = ext4_new_blocks_old(handle, inode, goal, count, errp); | |
887 | - return ret; | |
888 | + ret = do_blk_alloc(handle, inode, 0, goal, | |
889 | + count, errp, EXT4_META_BLOCK); | |
890 | + /* | |
891 | + * Account for the allocated meta blocks | |
892 | + */ | |
893 | + if (!(*errp)) { | |
894 | + spin_lock(&EXT4_I(inode)->i_block_reservation_lock); | |
895 | + EXT4_I(inode)->i_allocated_meta_blocks += *count; | |
896 | + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); | |
897 | } | |
898 | - | |
899 | - memset(&ar, 0, sizeof(ar)); | |
900 | - ar.inode = inode; | |
901 | - ar.goal = goal; | |
902 | - ar.len = *count; | |
903 | - ret = ext4_mb_new_blocks(handle, &ar, errp); | |
904 | - *count = ar.len; | |
905 | return ret; | |
906 | } | |
907 | ||
908 | +/* | |
909 | + * ext4_new_meta_block() -- allocate block for meta data (indexing) blocks | |
910 | + * | |
911 | + * @handle: handle to this transaction | |
912 | + * @inode: file inode | |
913 | + * @goal: given target block(filesystem wide) | |
914 | + * @errp: error code | |
915 | + * | |
916 | + * Return allocated block number on success | |
917 | + */ | |
918 | +ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode, | |
919 | + ext4_fsblk_t goal, int *errp) | |
920 | +{ | |
921 | + unsigned long count = 1; | |
922 | + return ext4_new_meta_blocks(handle, inode, goal, &count, errp); | |
923 | +} | |
924 | + | |
925 | +/* | |
926 | + * ext4_new_blocks() -- allocate data blocks | |
927 | + * | |
928 | + * @handle: handle to this transaction | |
929 | + * @inode: file inode | |
930 | + * @goal: given target block(filesystem wide) | |
931 | + * @count: total number of blocks need | |
932 | + * @errp: error code | |
933 | + * | |
934 | + * Return 1st allocated block numberon success, *count stores total account | |
935 | + * error stores in errp pointer | |
936 | + */ | |
937 | + | |
938 | +ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode, | |
939 | + ext4_lblk_t iblock, ext4_fsblk_t goal, | |
940 | + unsigned long *count, int *errp) | |
941 | +{ | |
942 | + return do_blk_alloc(handle, inode, iblock, goal, count, errp, 0); | |
943 | +} | |
944 | ||
945 | /** | |
946 | * ext4_count_free_blocks() -- count filesystem free blocks | |
947 | @@ -1986,7 +2083,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb) | |
948 | continue; | |
949 | desc_count += le16_to_cpu(gdp->bg_free_blocks_count); | |
950 | brelse(bitmap_bh); | |
951 | - bitmap_bh = read_block_bitmap(sb, i); | |
952 | + bitmap_bh = ext4_read_block_bitmap(sb, i); | |
953 | if (bitmap_bh == NULL) | |
954 | continue; | |
955 | ||
956 | diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c | |
957 | index 2bf0331..ec8e33b 100644 | |
958 | --- a/fs/ext4/dir.c | |
959 | +++ b/fs/ext4/dir.c | |
960 | @@ -129,7 +129,8 @@ static int ext4_readdir(struct file * filp, | |
961 | struct buffer_head *bh = NULL; | |
962 | ||
963 | map_bh.b_state = 0; | |
964 | - err = ext4_get_blocks_wrap(NULL, inode, blk, 1, &map_bh, 0, 0); | |
965 | + err = ext4_get_blocks_wrap(NULL, inode, blk, 1, &map_bh, | |
966 | + 0, 0, 0); | |
967 | if (err > 0) { | |
968 | pgoff_t index = map_bh.b_blocknr >> | |
969 | (PAGE_CACHE_SHIFT - inode->i_blkbits); | |
970 | @@ -272,7 +273,7 @@ static void free_rb_tree_fname(struct rb_root *root) | |
971 | ||
972 | while (n) { | |
973 | /* Do the node's children first */ | |
974 | - if ((n)->rb_left) { | |
975 | + if (n->rb_left) { | |
976 | n = n->rb_left; | |
977 | continue; | |
978 | } | |
979 | @@ -301,24 +302,18 @@ static void free_rb_tree_fname(struct rb_root *root) | |
980 | parent->rb_right = NULL; | |
981 | n = parent; | |
982 | } | |
983 | - root->rb_node = NULL; | |
984 | } | |
985 | ||
986 | ||
987 | -static struct dir_private_info *create_dir_info(loff_t pos) | |
988 | +static struct dir_private_info *ext4_htree_create_dir_info(loff_t pos) | |
989 | { | |
990 | struct dir_private_info *p; | |
991 | ||
992 | - p = kmalloc(sizeof(struct dir_private_info), GFP_KERNEL); | |
993 | + p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL); | |
994 | if (!p) | |
995 | return NULL; | |
996 | - p->root.rb_node = NULL; | |
997 | - p->curr_node = NULL; | |
998 | - p->extra_fname = NULL; | |
999 | - p->last_pos = 0; | |
1000 | p->curr_hash = pos2maj_hash(pos); | |
1001 | p->curr_minor_hash = pos2min_hash(pos); | |
1002 | - p->next_hash = 0; | |
1003 | return p; | |
1004 | } | |
1005 | ||
1006 | @@ -416,7 +411,7 @@ static int call_filldir(struct file * filp, void * dirent, | |
1007 | get_dtype(sb, fname->file_type)); | |
1008 | if (error) { | |
1009 | filp->f_pos = curr_pos; | |
1010 | - info->extra_fname = fname->next; | |
1011 | + info->extra_fname = fname; | |
1012 | return error; | |
1013 | } | |
1014 | fname = fname->next; | |
1015 | @@ -433,7 +428,7 @@ static int ext4_dx_readdir(struct file * filp, | |
1016 | int ret; | |
1017 | ||
1018 | if (!info) { | |
1019 | - info = create_dir_info(filp->f_pos); | |
1020 | + info = ext4_htree_create_dir_info(filp->f_pos); | |
1021 | if (!info) | |
1022 | return -ENOMEM; | |
1023 | filp->private_data = info; | |
1024 | @@ -455,11 +450,21 @@ static int ext4_dx_readdir(struct file * filp, | |
1025 | * If there are any leftover names on the hash collision | |
1026 | * chain, return them first. | |
1027 | */ | |
1028 | - if (info->extra_fname && | |
1029 | - call_filldir(filp, dirent, filldir, info->extra_fname)) | |
1030 | - goto finished; | |
1031 | + if (info->extra_fname) { | |
1032 | + if (call_filldir(filp, dirent, filldir, info->extra_fname)) | |
1033 | + goto finished; | |
1034 | ||
1035 | - if (!info->curr_node) | |
1036 | + info->extra_fname = NULL; | |
1037 | + info->curr_node = rb_next(info->curr_node); | |
1038 | + if (!info->curr_node) { | |
1039 | + if (info->next_hash == ~0) { | |
1040 | + filp->f_pos = EXT4_HTREE_EOF; | |
1041 | + goto finished; | |
1042 | + } | |
1043 | + info->curr_hash = info->next_hash; | |
1044 | + info->curr_minor_hash = 0; | |
1045 | + } | |
1046 | + } else if (!info->curr_node) | |
1047 | info->curr_node = rb_first(&info->root); | |
1048 | ||
1049 | while (1) { | |
1050 | diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h | |
1051 | index 8158083..2950032 100644 | |
1052 | --- a/fs/ext4/ext4.h | |
1053 | +++ b/fs/ext4/ext4.h | |
1054 | @@ -22,7 +22,7 @@ | |
1055 | #include "ext4_i.h" | |
1056 | ||
1057 | /* | |
1058 | - * The second extended filesystem constants/structures | |
1059 | + * The fourth extended filesystem constants/structures | |
1060 | */ | |
1061 | ||
1062 | /* | |
1063 | @@ -45,7 +45,7 @@ | |
1064 | #define ext4_debug(f, a...) \ | |
1065 | do { \ | |
1066 | printk (KERN_DEBUG "EXT4-fs DEBUG (%s, %d): %s:", \ | |
1067 | - __FILE__, __LINE__, __FUNCTION__); \ | |
1068 | + __FILE__, __LINE__, __func__); \ | |
1069 | printk (KERN_DEBUG f, ## a); \ | |
1070 | } while (0) | |
1071 | #else | |
1072 | @@ -74,6 +74,9 @@ | |
1073 | #define EXT4_MB_HINT_GOAL_ONLY 256 | |
1074 | /* goal is meaningful */ | |
1075 | #define EXT4_MB_HINT_TRY_GOAL 512 | |
1076 | +/* blocks already pre-reserved by delayed allocation */ | |
1077 | +#define EXT4_MB_DELALLOC_RESERVED 1024 | |
1078 | + | |
1079 | ||
1080 | struct ext4_allocation_request { | |
1081 | /* target inode for block we're allocating */ | |
1082 | @@ -170,6 +173,15 @@ struct ext4_group_desc | |
1083 | __u32 bg_reserved2[3]; | |
1084 | }; | |
1085 | ||
1086 | +/* | |
1087 | + * Structure of a flex block group info | |
1088 | + */ | |
1089 | + | |
1090 | +struct flex_groups { | |
1091 | + __u32 free_inodes; | |
1092 | + __u32 free_blocks; | |
1093 | +}; | |
1094 | + | |
1095 | #define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */ | |
1096 | #define EXT4_BG_BLOCK_UNINIT 0x0002 /* Block bitmap not in use */ | |
1097 | #define EXT4_BG_INODE_ZEROED 0x0004 /* On-disk itable initialized to zero */ | |
1098 | @@ -527,6 +539,7 @@ do { \ | |
1099 | #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ | |
1100 | #define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ | |
1101 | #define EXT4_MOUNT_MBALLOC 0x4000000 /* Buddy allocation support */ | |
1102 | +#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ | |
1103 | /* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */ | |
1104 | #ifndef _LINUX_EXT2_FS_H | |
1105 | #define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt | |
1106 | @@ -647,7 +660,10 @@ struct ext4_super_block { | |
1107 | __le16 s_mmp_interval; /* # seconds to wait in MMP checking */ | |
1108 | __le64 s_mmp_block; /* Block for multi-mount protection */ | |
1109 | __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/ | |
1110 | - __u32 s_reserved[163]; /* Padding to the end of the block */ | |
1111 | + __u8 s_log_groups_per_flex; /* FLEX_BG group size */ | |
1112 | + __u8 s_reserved_char_pad2; | |
1113 | + __le16 s_reserved_pad; | |
1114 | + __u32 s_reserved[162]; /* Padding to the end of the block */ | |
1115 | }; | |
1116 | ||
1117 | #ifdef __KERNEL__ | |
1118 | @@ -958,12 +974,17 @@ extern ext4_grpblk_t ext4_block_group_offset(struct super_block *sb, | |
1119 | extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group); | |
1120 | extern unsigned long ext4_bg_num_gdb(struct super_block *sb, | |
1121 | ext4_group_t group); | |
1122 | -extern ext4_fsblk_t ext4_new_block (handle_t *handle, struct inode *inode, | |
1123 | +extern ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode, | |
1124 | ext4_fsblk_t goal, int *errp); | |
1125 | -extern ext4_fsblk_t ext4_new_blocks (handle_t *handle, struct inode *inode, | |
1126 | +extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, | |
1127 | ext4_fsblk_t goal, unsigned long *count, int *errp); | |
1128 | -extern ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode, | |
1129 | +extern ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode, | |
1130 | + ext4_lblk_t iblock, ext4_fsblk_t goal, | |
1131 | + unsigned long *count, int *errp); | |
1132 | +extern ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode, | |
1133 | ext4_fsblk_t goal, unsigned long *count, int *errp); | |
1134 | +extern ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi, | |
1135 | + ext4_fsblk_t nblocks); | |
1136 | extern void ext4_free_blocks (handle_t *handle, struct inode *inode, | |
1137 | ext4_fsblk_t block, unsigned long count, int metadata); | |
1138 | extern void ext4_free_blocks_sb (handle_t *handle, struct super_block *sb, | |
1139 | @@ -1016,6 +1037,10 @@ extern int __init init_ext4_mballoc(void); | |
1140 | extern void exit_ext4_mballoc(void); | |
1141 | extern void ext4_mb_free_blocks(handle_t *, struct inode *, | |
1142 | unsigned long, unsigned long, int, unsigned long *); | |
1143 | +extern int ext4_mb_add_more_groupinfo(struct super_block *sb, | |
1144 | + ext4_group_t i, struct ext4_group_desc *desc); | |
1145 | +extern void ext4_mb_update_group_info(struct ext4_group_info *grp, | |
1146 | + ext4_grpblk_t add); | |
1147 | ||
1148 | ||
1149 | /* inode.c */ | |
1150 | @@ -1033,19 +1058,25 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, | |
1151 | extern struct inode *ext4_iget(struct super_block *, unsigned long); | |
1152 | extern int ext4_write_inode (struct inode *, int); | |
1153 | extern int ext4_setattr (struct dentry *, struct iattr *); | |
1154 | +extern int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, | |
1155 | + struct kstat *stat); | |
1156 | extern void ext4_delete_inode (struct inode *); | |
1157 | extern int ext4_sync_inode (handle_t *, struct inode *); | |
1158 | extern void ext4_discard_reservation (struct inode *); | |
1159 | extern void ext4_dirty_inode(struct inode *); | |
1160 | extern int ext4_change_inode_journal_flag(struct inode *, int); | |
1161 | extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); | |
1162 | +extern int ext4_can_truncate(struct inode *inode); | |
1163 | extern void ext4_truncate (struct inode *); | |
1164 | extern void ext4_set_inode_flags(struct inode *); | |
1165 | extern void ext4_get_inode_flags(struct ext4_inode_info *); | |
1166 | extern void ext4_set_aops(struct inode *inode); | |
1167 | extern int ext4_writepage_trans_blocks(struct inode *); | |
1168 | -extern int ext4_block_truncate_page(handle_t *handle, struct page *page, | |
1169 | +extern int ext4_meta_trans_blocks(struct inode *, int nrblocks, int idxblocks); | |
1170 | +extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); | |
1171 | +extern int ext4_block_truncate_page(handle_t *handle, | |
1172 | struct address_space *mapping, loff_t from); | |
1173 | +extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page); | |
1174 | ||
1175 | /* ioctl.c */ | |
1176 | extern long ext4_ioctl(struct file *, unsigned int, unsigned long); | |
1177 | @@ -1159,10 +1190,21 @@ struct ext4_group_info *ext4_get_group_info(struct super_block *sb, | |
1178 | } | |
1179 | ||
1180 | ||
1181 | +static inline ext4_group_t ext4_flex_group(struct ext4_sb_info *sbi, | |
1182 | + ext4_group_t block_group) | |
1183 | +{ | |
1184 | + return block_group >> sbi->s_log_groups_per_flex; | |
1185 | +} | |
1186 | + | |
1187 | +static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi) | |
1188 | +{ | |
1189 | + return 1 << sbi->s_log_groups_per_flex; | |
1190 | +} | |
1191 | + | |
1192 | #define ext4_std_error(sb, errno) \ | |
1193 | do { \ | |
1194 | if ((errno)) \ | |
1195 | - __ext4_std_error((sb), __FUNCTION__, (errno)); \ | |
1196 | + __ext4_std_error((sb), __func__, (errno)); \ | |
1197 | } while (0) | |
1198 | ||
1199 | /* | |
1200 | @@ -1187,11 +1229,13 @@ extern const struct inode_operations ext4_fast_symlink_inode_operations; | |
1201 | /* extents.c */ | |
1202 | extern int ext4_ext_tree_init(handle_t *handle, struct inode *); | |
1203 | extern int ext4_ext_writepage_trans_blocks(struct inode *, int); | |
1204 | +extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, | |
1205 | + int chunk); | |
1206 | extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, | |
1207 | ext4_lblk_t iblock, | |
1208 | unsigned long max_blocks, struct buffer_head *bh_result, | |
1209 | int create, int extend_disksize); | |
1210 | -extern void ext4_ext_truncate(struct inode *, struct page *); | |
1211 | +extern void ext4_ext_truncate(struct inode *); | |
1212 | extern void ext4_ext_init(struct super_block *); | |
1213 | extern void ext4_ext_release(struct super_block *); | |
1214 | extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset, | |
1215 | @@ -1199,7 +1243,7 @@ extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset, | |
1216 | extern int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, | |
1217 | sector_t block, unsigned long max_blocks, | |
1218 | struct buffer_head *bh, int create, | |
1219 | - int extend_disksize); | |
1220 | + int extend_disksize, int flag); | |
1221 | #endif /* __KERNEL__ */ | |
1222 | ||
1223 | #endif /* _EXT4_H */ | |
1224 | diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h | |
1225 | index 75333b5..d33dc56 100644 | |
1226 | --- a/fs/ext4/ext4_extents.h | |
1227 | +++ b/fs/ext4/ext4_extents.h | |
1228 | @@ -212,10 +212,13 @@ static inline int ext4_ext_get_actual_len(struct ext4_extent *ext) | |
1229 | (le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN)); | |
1230 | } | |
1231 | ||
1232 | +extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks); | |
1233 | extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *); | |
1234 | extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t); | |
1235 | extern int ext4_extent_tree_init(handle_t *, struct inode *); | |
1236 | -extern int ext4_ext_calc_credits_for_insert(struct inode *, struct ext4_ext_path *); | |
1237 | +extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode, | |
1238 | + int num, | |
1239 | + struct ext4_ext_path *path); | |
1240 | extern int ext4_ext_try_to_merge(struct inode *inode, | |
1241 | struct ext4_ext_path *path, | |
1242 | struct ext4_extent *); | |
1243 | diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h | |
1244 | index 26a4ae2..ef7409f 100644 | |
1245 | --- a/fs/ext4/ext4_i.h | |
1246 | +++ b/fs/ext4/ext4_i.h | |
1247 | @@ -79,7 +79,7 @@ struct ext4_ext_cache { | |
1248 | }; | |
1249 | ||
1250 | /* | |
1251 | - * third extended file system inode data in memory | |
1252 | + * fourth extended file system inode data in memory | |
1253 | */ | |
1254 | struct ext4_inode_info { | |
1255 | __le32 i_data[15]; /* unconverted */ | |
1256 | @@ -150,6 +150,7 @@ struct ext4_inode_info { | |
1257 | */ | |
1258 | struct rw_semaphore i_data_sem; | |
1259 | struct inode vfs_inode; | |
1260 | + struct jbd2_inode jinode; | |
1261 | ||
1262 | unsigned long i_ext_generation; | |
1263 | struct ext4_ext_cache i_cached_extent; | |
1264 | @@ -162,6 +163,13 @@ struct ext4_inode_info { | |
1265 | /* mballoc */ | |
1266 | struct list_head i_prealloc_list; | |
1267 | spinlock_t i_prealloc_lock; | |
1268 | + | |
1269 | + /* allocation reservation info for delalloc */ | |
1270 | + unsigned long i_reserved_data_blocks; | |
1271 | + unsigned long i_reserved_meta_blocks; | |
1272 | + unsigned long i_allocated_meta_blocks; | |
1273 | + unsigned short i_delalloc_reserved_flag; | |
1274 | + spinlock_t i_block_reservation_lock; | |
1275 | }; | |
1276 | ||
1277 | #endif /* _EXT4_I */ | |
1278 | diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h | |
1279 | index 9255a7d..b455c68 100644 | |
1280 | --- a/fs/ext4/ext4_jbd2.h | |
1281 | +++ b/fs/ext4/ext4_jbd2.h | |
1282 | @@ -51,6 +51,14 @@ | |
1283 | EXT4_XATTR_TRANS_BLOCKS - 2 + \ | |
1284 | 2*EXT4_QUOTA_TRANS_BLOCKS(sb)) | |
1285 | ||
1286 | +/* | |
1287 | + * Define the number of metadata blocks we need to account to modify data. | |
1288 | + * | |
1289 | + * This include super block, inode block, quota blocks and xattr blocks | |
1290 | + */ | |
1291 | +#define EXT4_META_TRANS_BLOCKS(sb) (EXT4_XATTR_TRANS_BLOCKS + \ | |
1292 | + 2*EXT4_QUOTA_TRANS_BLOCKS(sb)) | |
1293 | + | |
1294 | /* Delete operations potentially hit one directory's namespace plus an | |
1295 | * entire inode, plus arbitrary amounts of bitmap/indirection data. Be | |
1296 | * generous. We can grow the delete transaction later if necessary. */ | |
1297 | @@ -142,19 +150,17 @@ int __ext4_journal_dirty_metadata(const char *where, | |
1298 | handle_t *handle, struct buffer_head *bh); | |
1299 | ||
1300 | #define ext4_journal_get_undo_access(handle, bh) \ | |
1301 | - __ext4_journal_get_undo_access(__FUNCTION__, (handle), (bh)) | |
1302 | + __ext4_journal_get_undo_access(__func__, (handle), (bh)) | |
1303 | #define ext4_journal_get_write_access(handle, bh) \ | |
1304 | - __ext4_journal_get_write_access(__FUNCTION__, (handle), (bh)) | |
1305 | + __ext4_journal_get_write_access(__func__, (handle), (bh)) | |
1306 | #define ext4_journal_revoke(handle, blocknr, bh) \ | |
1307 | - __ext4_journal_revoke(__FUNCTION__, (handle), (blocknr), (bh)) | |
1308 | + __ext4_journal_revoke(__func__, (handle), (blocknr), (bh)) | |
1309 | #define ext4_journal_get_create_access(handle, bh) \ | |
1310 | - __ext4_journal_get_create_access(__FUNCTION__, (handle), (bh)) | |
1311 | + __ext4_journal_get_create_access(__func__, (handle), (bh)) | |
1312 | #define ext4_journal_dirty_metadata(handle, bh) \ | |
1313 | - __ext4_journal_dirty_metadata(__FUNCTION__, (handle), (bh)) | |
1314 | + __ext4_journal_dirty_metadata(__func__, (handle), (bh)) | |
1315 | #define ext4_journal_forget(handle, bh) \ | |
1316 | - __ext4_journal_forget(__FUNCTION__, (handle), (bh)) | |
1317 | - | |
1318 | -int ext4_journal_dirty_data(handle_t *handle, struct buffer_head *bh); | |
1319 | + __ext4_journal_forget(__func__, (handle), (bh)) | |
1320 | ||
1321 | handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks); | |
1322 | int __ext4_journal_stop(const char *where, handle_t *handle); | |
1323 | @@ -165,7 +171,7 @@ static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks) | |
1324 | } | |
1325 | ||
1326 | #define ext4_journal_stop(handle) \ | |
1327 | - __ext4_journal_stop(__FUNCTION__, (handle)) | |
1328 | + __ext4_journal_stop(__func__, (handle)) | |
1329 | ||
1330 | static inline handle_t *ext4_journal_current_handle(void) | |
1331 | { | |
1332 | @@ -192,6 +198,11 @@ static inline int ext4_journal_force_commit(journal_t *journal) | |
1333 | return jbd2_journal_force_commit(journal); | |
1334 | } | |
1335 | ||
1336 | +static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode) | |
1337 | +{ | |
1338 | + return jbd2_journal_file_inode(handle, &EXT4_I(inode)->jinode); | |
1339 | +} | |
1340 | + | |
1341 | /* super.c */ | |
1342 | int ext4_force_commit(struct super_block *sb); | |
1343 | ||
1344 | diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h | |
1345 | index 5802e69..6300226 100644 | |
1346 | --- a/fs/ext4/ext4_sb.h | |
1347 | +++ b/fs/ext4/ext4_sb.h | |
1348 | @@ -25,7 +25,7 @@ | |
1349 | #include <linux/rbtree.h> | |
1350 | ||
1351 | /* | |
1352 | - * third extended-fs super-block data in memory | |
1353 | + * fourth extended-fs super-block data in memory | |
1354 | */ | |
1355 | struct ext4_sb_info { | |
1356 | unsigned long s_desc_size; /* Size of a group descriptor in bytes */ | |
1357 | @@ -143,6 +143,9 @@ struct ext4_sb_info { | |
1358 | ||
1359 | /* locality groups */ | |
1360 | struct ext4_locality_group *s_locality_groups; | |
1361 | + | |
1362 | + unsigned int s_log_groups_per_flex; | |
1363 | + struct flex_groups *s_flex_groups; | |
1364 | }; | |
1365 | ||
1366 | #endif /* _EXT4_SB */ | |
1367 | diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c | |
1368 | index 47929c4..b24d3c5 100644 | |
1369 | --- a/fs/ext4/extents.c | |
1370 | +++ b/fs/ext4/extents.c | |
1371 | @@ -92,17 +92,16 @@ static void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb) | |
1372 | ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff); | |
1373 | } | |
1374 | ||
1375 | -static handle_t *ext4_ext_journal_restart(handle_t *handle, int needed) | |
1376 | +static int ext4_ext_journal_restart(handle_t *handle, int needed) | |
1377 | { | |
1378 | int err; | |
1379 | ||
1380 | if (handle->h_buffer_credits > needed) | |
1381 | - return handle; | |
1382 | - if (!ext4_journal_extend(handle, needed)) | |
1383 | - return handle; | |
1384 | - err = ext4_journal_restart(handle, needed); | |
1385 | - | |
1386 | - return handle; | |
1387 | + return 0; | |
1388 | + err = ext4_journal_extend(handle, needed); | |
1389 | + if (err <= 0) | |
1390 | + return err; | |
1391 | + return ext4_journal_restart(handle, needed); | |
1392 | } | |
1393 | ||
1394 | /* | |
1395 | @@ -180,15 +179,18 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, | |
1396 | return bg_start + colour + block; | |
1397 | } | |
1398 | ||
1399 | +/* | |
1400 | + * Allocation for a meta data block | |
1401 | + */ | |
1402 | static ext4_fsblk_t | |
1403 | -ext4_ext_new_block(handle_t *handle, struct inode *inode, | |
1404 | +ext4_ext_new_meta_block(handle_t *handle, struct inode *inode, | |
1405 | struct ext4_ext_path *path, | |
1406 | struct ext4_extent *ex, int *err) | |
1407 | { | |
1408 | ext4_fsblk_t goal, newblock; | |
1409 | ||
1410 | goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block)); | |
1411 | - newblock = ext4_new_block(handle, inode, goal, err); | |
1412 | + newblock = ext4_new_meta_block(handle, inode, goal, err); | |
1413 | return newblock; | |
1414 | } | |
1415 | ||
1416 | @@ -246,6 +248,36 @@ static int ext4_ext_space_root_idx(struct inode *inode) | |
1417 | return size; | |
1418 | } | |
1419 | ||
1420 | +/* | |
1421 | + * Calculate the number of metadata blocks needed | |
1422 | + * to allocate @blocks | |
1423 | + * Worse case is one block per extent | |
1424 | + */ | |
1425 | +int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks) | |
1426 | +{ | |
1427 | + int lcap, icap, rcap, leafs, idxs, num; | |
1428 | + int newextents = blocks; | |
1429 | + | |
1430 | + rcap = ext4_ext_space_root_idx(inode); | |
1431 | + lcap = ext4_ext_space_block(inode); | |
1432 | + icap = ext4_ext_space_block_idx(inode); | |
1433 | + | |
1434 | + /* number of new leaf blocks needed */ | |
1435 | + num = leafs = (newextents + lcap - 1) / lcap; | |
1436 | + | |
1437 | + /* | |
1438 | + * Worse case, we need separate index block(s) | |
1439 | + * to link all new leaf blocks | |
1440 | + */ | |
1441 | + idxs = (leafs + icap - 1) / icap; | |
1442 | + do { | |
1443 | + num += idxs; | |
1444 | + idxs = (idxs + icap - 1) / icap; | |
1445 | + } while (idxs > rcap); | |
1446 | + | |
1447 | + return num; | |
1448 | +} | |
1449 | + | |
1450 | static int | |
1451 | ext4_ext_max_entries(struct inode *inode, int depth) | |
1452 | { | |
1453 | @@ -524,6 +556,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, | |
1454 | alloc = 1; | |
1455 | } | |
1456 | path[0].p_hdr = eh; | |
1457 | + path[0].p_bh = NULL; | |
1458 | ||
1459 | i = depth; | |
1460 | /* walk through the tree */ | |
1461 | @@ -552,12 +585,14 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, | |
1462 | } | |
1463 | ||
1464 | path[ppos].p_depth = i; | |
1465 | - path[ppos].p_hdr = eh; | |
1466 | path[ppos].p_ext = NULL; | |
1467 | path[ppos].p_idx = NULL; | |
1468 | ||
1469 | /* find extent */ | |
1470 | ext4_ext_binsearch(inode, path + ppos, block); | |
1471 | + /* if not an empty leaf */ | |
1472 | + if (path[ppos].p_ext) | |
1473 | + path[ppos].p_block = ext_pblock(path[ppos].p_ext); | |
1474 | ||
1475 | ext4_ext_show_path(inode, path); | |
1476 | ||
1477 | @@ -688,7 +723,8 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, | |
1478 | /* allocate all needed blocks */ | |
1479 | ext_debug("allocate %d blocks for indexes/leaf\n", depth - at); | |
1480 | for (a = 0; a < depth - at; a++) { | |
1481 | - newblock = ext4_ext_new_block(handle, inode, path, newext, &err); | |
1482 | + newblock = ext4_ext_new_meta_block(handle, inode, path, | |
1483 | + newext, &err); | |
1484 | if (newblock == 0) | |
1485 | goto cleanup; | |
1486 | ablocks[a] = newblock; | |
1487 | @@ -884,7 +920,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, | |
1488 | ext4_fsblk_t newblock; | |
1489 | int err = 0; | |
1490 | ||
1491 | - newblock = ext4_ext_new_block(handle, inode, path, newext, &err); | |
1492 | + newblock = ext4_ext_new_meta_block(handle, inode, path, newext, &err); | |
1493 | if (newblock == 0) | |
1494 | return err; | |
1495 | ||
1496 | @@ -981,6 +1017,8 @@ repeat: | |
1497 | /* if we found index with free entry, then use that | |
1498 | * entry: create all needed subtree and add new leaf */ | |
1499 | err = ext4_ext_split(handle, inode, path, newext, i); | |
1500 | + if (err) | |
1501 | + goto out; | |
1502 | ||
1503 | /* refill path */ | |
1504 | ext4_ext_drop_refs(path); | |
1505 | @@ -1403,7 +1441,7 @@ unsigned int ext4_ext_check_overlap(struct inode *inode, | |
1506 | ||
1507 | /* | |
1508 | * get the next allocated block if the extent in the path | |
1509 | - * is before the requested block(s) | |
1510 | + * is before the requested block(s) | |
1511 | */ | |
1512 | if (b2 < b1) { | |
1513 | b2 = ext4_ext_next_allocated_block(path); | |
1514 | @@ -1709,54 +1747,61 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, | |
1515 | } | |
1516 | ||
1517 | /* | |
1518 | - * ext4_ext_calc_credits_for_insert: | |
1519 | - * This routine returns max. credits that the extent tree can consume. | |
1520 | - * It should be OK for low-performance paths like ->writepage() | |
1521 | - * To allow many writing processes to fit into a single transaction, | |
1522 | - * the caller should calculate credits under i_data_sem and | |
1523 | - * pass the actual path. | |
1524 | + * ext4_ext_calc_credits_for_single_extent: | |
1525 | + * This routine returns max. credits that needed to insert an extent | |
1526 | + * to the extent tree. | |
1527 | + * When pass the actual path, the caller should calculate credits | |
1528 | + * under i_data_sem. | |
1529 | */ | |
1530 | -int ext4_ext_calc_credits_for_insert(struct inode *inode, | |
1531 | +int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks, | |
1532 | struct ext4_ext_path *path) | |
1533 | { | |
1534 | - int depth, needed; | |
1535 | - | |
1536 | if (path) { | |
1537 | + int depth = ext_depth(inode); | |
1538 | + int ret = 0; | |
1539 | + | |
1540 | /* probably there is space in leaf? */ | |
1541 | - depth = ext_depth(inode); | |
1542 | if (le16_to_cpu(path[depth].p_hdr->eh_entries) | |
1543 | - < le16_to_cpu(path[depth].p_hdr->eh_max)) | |
1544 | - return 1; | |
1545 | - } | |
1546 | + < le16_to_cpu(path[depth].p_hdr->eh_max)) { | |
1547 | ||
1548 | - /* | |
1549 | - * given 32-bit logical block (4294967296 blocks), max. tree | |
1550 | - * can be 4 levels in depth -- 4 * 340^4 == 53453440000. | |
1551 | - * Let's also add one more level for imbalance. | |
1552 | - */ | |
1553 | - depth = 5; | |
1554 | - | |
1555 | - /* allocation of new data block(s) */ | |
1556 | - needed = 2; | |
1557 | + /* | |
1558 | + * There are some space in the leaf tree, no | |
1559 | + * need to account for leaf block credit | |
1560 | + * | |
1561 | + * bitmaps and block group descriptor blocks | |
1562 | + * and other metadat blocks still need to be | |
1563 | + * accounted. | |
1564 | + */ | |
1565 | + /* 1 bitmap, 1 block group descriptor */ | |
1566 | + ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb); | |
1567 | + } | |
1568 | + } | |
1569 | ||
1570 | - /* | |
1571 | - * tree can be full, so it would need to grow in depth: | |
1572 | - * we need one credit to modify old root, credits for | |
1573 | - * new root will be added in split accounting | |
1574 | - */ | |
1575 | - needed += 1; | |
1576 | + return ext4_chunk_trans_blocks(inode, nrblocks); | |
1577 | +} | |
1578 | ||
1579 | - /* | |
1580 | - * Index split can happen, we would need: | |
1581 | - * allocate intermediate indexes (bitmap + group) | |
1582 | - * + change two blocks at each level, but root (already included) | |
1583 | - */ | |
1584 | - needed += (depth * 2) + (depth * 2); | |
1585 | +/* | |
1586 | + * How many index/leaf blocks need to change/allocate to modify nrblocks? | |
1587 | + * | |
1588 | + * if nrblocks are fit in a single extent (chunk flag is 1), then | |
1589 | + * in the worse case, each tree level index/leaf need to be changed | |
1590 | + * if the tree split due to insert a new extent, then the old tree | |
1591 | + * index/leaf need to be updated too | |
1592 | + * | |
1593 | + * If the nrblocks are discontiguous, they could cause | |
1594 | + * the whole tree split more than once, but this is really rare. | |
1595 | + */ | |
1596 | +int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) | |
1597 | +{ | |
1598 | + int index; | |
1599 | + int depth = ext_depth(inode); | |
1600 | ||
1601 | - /* any allocation modifies superblock */ | |
1602 | - needed += 1; | |
1603 | + if (chunk) | |
1604 | + index = depth * 2; | |
1605 | + else | |
1606 | + index = depth * 3; | |
1607 | ||
1608 | - return needed; | |
1609 | + return index; | |
1610 | } | |
1611 | ||
1612 | static int ext4_remove_blocks(handle_t *handle, struct inode *inode, | |
1613 | @@ -1872,22 +1917,22 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, | |
1614 | BUG_ON(b != ex_ee_block + ex_ee_len - 1); | |
1615 | } | |
1616 | ||
1617 | - /* at present, extent can't cross block group: */ | |
1618 | - /* leaf + bitmap + group desc + sb + inode */ | |
1619 | - credits = 5; | |
1620 | + /* | |
1621 | + * 3 for leaf, sb, and inode plus 2 (bmap and group | |
1622 | + * descriptor) for each block group; assume two block | |
1623 | + * groups plus ex_ee_len/blocks_per_block_group for | |
1624 | + * the worst case | |
1625 | + */ | |
1626 | + credits = 7 + 2*(ex_ee_len/EXT4_BLOCKS_PER_GROUP(inode->i_sb)); | |
1627 | if (ex == EXT_FIRST_EXTENT(eh)) { | |
1628 | correct_index = 1; | |
1629 | credits += (ext_depth(inode)) + 1; | |
1630 | } | |
1631 | -#ifdef CONFIG_QUOTA | |
1632 | credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); | |
1633 | -#endif | |
1634 | ||
1635 | - handle = ext4_ext_journal_restart(handle, credits); | |
1636 | - if (IS_ERR(handle)) { | |
1637 | - err = PTR_ERR(handle); | |
1638 | + err = ext4_ext_journal_restart(handle, credits); | |
1639 | + if (err) | |
1640 | goto out; | |
1641 | - } | |
1642 | ||
1643 | err = ext4_ext_get_access(handle, inode, path + depth); | |
1644 | if (err) | |
1645 | @@ -2287,7 +2332,10 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, | |
1646 | unsigned int newdepth; | |
1647 | /* If extent has less than EXT4_EXT_ZERO_LEN zerout directly */ | |
1648 | if (allocated <= EXT4_EXT_ZERO_LEN) { | |
1649 | - /* Mark first half uninitialized. | |
1650 | + /* | |
1651 | + * iblock == ee_block is handled by the zerouout | |
1652 | + * at the beginning. | |
1653 | + * Mark first half uninitialized. | |
1654 | * Mark second half initialized and zero out the | |
1655 | * initialized extent | |
1656 | */ | |
1657 | @@ -2310,7 +2358,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, | |
1658 | ex->ee_len = orig_ex.ee_len; | |
1659 | ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); | |
1660 | ext4_ext_dirty(handle, inode, path + depth); | |
1661 | - /* zeroed the full extent */ | |
1662 | + /* blocks available from iblock */ | |
1663 | return allocated; | |
1664 | ||
1665 | } else if (err) | |
1666 | @@ -2338,6 +2386,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, | |
1667 | err = PTR_ERR(path); | |
1668 | return err; | |
1669 | } | |
1670 | + /* get the second half extent details */ | |
1671 | ex = path[depth].p_ext; | |
1672 | err = ext4_ext_get_access(handle, inode, | |
1673 | path + depth); | |
1674 | @@ -2367,6 +2416,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, | |
1675 | ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); | |
1676 | ext4_ext_dirty(handle, inode, path + depth); | |
1677 | /* zeroed the full extent */ | |
1678 | + /* blocks available from iblock */ | |
1679 | return allocated; | |
1680 | ||
1681 | } else if (err) | |
1682 | @@ -2382,23 +2432,22 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, | |
1683 | */ | |
1684 | orig_ex.ee_len = cpu_to_le16(ee_len - | |
1685 | ext4_ext_get_actual_len(ex3)); | |
1686 | - if (newdepth != depth) { | |
1687 | - depth = newdepth; | |
1688 | - ext4_ext_drop_refs(path); | |
1689 | - path = ext4_ext_find_extent(inode, iblock, path); | |
1690 | - if (IS_ERR(path)) { | |
1691 | - err = PTR_ERR(path); | |
1692 | - goto out; | |
1693 | - } | |
1694 | - eh = path[depth].p_hdr; | |
1695 | - ex = path[depth].p_ext; | |
1696 | - if (ex2 != &newex) | |
1697 | - ex2 = ex; | |
1698 | - | |
1699 | - err = ext4_ext_get_access(handle, inode, path + depth); | |
1700 | - if (err) | |
1701 | - goto out; | |
1702 | + depth = newdepth; | |
1703 | + ext4_ext_drop_refs(path); | |
1704 | + path = ext4_ext_find_extent(inode, iblock, path); | |
1705 | + if (IS_ERR(path)) { | |
1706 | + err = PTR_ERR(path); | |
1707 | + goto out; | |
1708 | } | |
1709 | + eh = path[depth].p_hdr; | |
1710 | + ex = path[depth].p_ext; | |
1711 | + if (ex2 != &newex) | |
1712 | + ex2 = ex; | |
1713 | + | |
1714 | + err = ext4_ext_get_access(handle, inode, path + depth); | |
1715 | + if (err) | |
1716 | + goto out; | |
1717 | + | |
1718 | allocated = max_blocks; | |
1719 | ||
1720 | /* If extent has less than EXT4_EXT_ZERO_LEN and we are trying | |
1721 | @@ -2416,6 +2465,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, | |
1722 | ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); | |
1723 | ext4_ext_dirty(handle, inode, path + depth); | |
1724 | /* zero out the first half */ | |
1725 | + /* blocks available from iblock */ | |
1726 | return allocated; | |
1727 | } | |
1728 | } | |
1729 | @@ -2529,6 +2579,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, | |
1730 | int err = 0, depth, ret; | |
1731 | unsigned long allocated = 0; | |
1732 | struct ext4_allocation_request ar; | |
1733 | + loff_t disksize; | |
1734 | ||
1735 | __clear_bit(BH_New, &bh_result->b_state); | |
1736 | ext_debug("blocks %u/%lu requested for inode %u\n", | |
1737 | @@ -2616,8 +2667,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, | |
1738 | */ | |
1739 | if (allocated > max_blocks) | |
1740 | allocated = max_blocks; | |
1741 | - /* mark the buffer unwritten */ | |
1742 | - __set_bit(BH_Unwritten, &bh_result->b_state); | |
1743 | + set_buffer_unwritten(bh_result); | |
1744 | goto out2; | |
1745 | } | |
1746 | ||
1747 | @@ -2716,14 +2766,19 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, | |
1748 | goto out2; | |
1749 | } | |
1750 | ||
1751 | - if (extend_disksize && inode->i_size > EXT4_I(inode)->i_disksize) | |
1752 | - EXT4_I(inode)->i_disksize = inode->i_size; | |
1753 | - | |
1754 | /* previous routine could use block we allocated */ | |
1755 | newblock = ext_pblock(&newex); | |
1756 | allocated = ext4_ext_get_actual_len(&newex); | |
1757 | outnew: | |
1758 | - __set_bit(BH_New, &bh_result->b_state); | |
1759 | + if (extend_disksize) { | |
1760 | + disksize = ((loff_t) iblock + ar.len) << inode->i_blkbits; | |
1761 | + if (disksize > i_size_read(inode)) | |
1762 | + disksize = i_size_read(inode); | |
1763 | + if (disksize > EXT4_I(inode)->i_disksize) | |
1764 | + EXT4_I(inode)->i_disksize = disksize; | |
1765 | + } | |
1766 | + | |
1767 | + set_buffer_new(bh_result); | |
1768 | ||
1769 | /* Cache only when it is _not_ an uninitialized extent */ | |
1770 | if (create != EXT4_CREATE_UNINITIALIZED_EXT) | |
1771 | @@ -2733,7 +2788,7 @@ out: | |
1772 | if (allocated > max_blocks) | |
1773 | allocated = max_blocks; | |
1774 | ext4_ext_show_leaf(inode, path); | |
1775 | - __set_bit(BH_Mapped, &bh_result->b_state); | |
1776 | + set_buffer_mapped(bh_result); | |
1777 | bh_result->b_bdev = inode->i_sb->s_bdev; | |
1778 | bh_result->b_blocknr = newblock; | |
1779 | out2: | |
1780 | @@ -2744,7 +2799,7 @@ out2: | |
1781 | return err ? err : allocated; | |
1782 | } | |
1783 | ||
1784 | -void ext4_ext_truncate(struct inode * inode, struct page *page) | |
1785 | +void ext4_ext_truncate(struct inode *inode) | |
1786 | { | |
1787 | struct address_space *mapping = inode->i_mapping; | |
1788 | struct super_block *sb = inode->i_sb; | |
1789 | @@ -2755,33 +2810,27 @@ void ext4_ext_truncate(struct inode * inode, struct page *page) | |
1790 | /* | |
1791 | * probably first extent we're gonna free will be last in block | |
1792 | */ | |
1793 | - err = ext4_writepage_trans_blocks(inode) + 3; | |
1794 | + err = ext4_writepage_trans_blocks(inode); | |
1795 | handle = ext4_journal_start(inode, err); | |
1796 | - if (IS_ERR(handle)) { | |
1797 | - if (page) { | |
1798 | - clear_highpage(page); | |
1799 | - flush_dcache_page(page); | |
1800 | - unlock_page(page); | |
1801 | - page_cache_release(page); | |
1802 | - } | |
1803 | + if (IS_ERR(handle)) | |
1804 | return; | |
1805 | - } | |
1806 | ||
1807 | - if (page) | |
1808 | - ext4_block_truncate_page(handle, page, mapping, inode->i_size); | |
1809 | + if (inode->i_size & (sb->s_blocksize - 1)) | |
1810 | + ext4_block_truncate_page(handle, mapping, inode->i_size); | |
1811 | + | |
1812 | + if (ext4_orphan_add(handle, inode)) | |
1813 | + goto out_stop; | |
1814 | ||
1815 | down_write(&EXT4_I(inode)->i_data_sem); | |
1816 | ext4_ext_invalidate_cache(inode); | |
1817 | ||
1818 | - ext4_mb_discard_inode_preallocations(inode); | |
1819 | + ext4_discard_reservation(inode); | |
1820 | ||
1821 | /* | |
1822 | * TODO: optimization is possible here. | |
1823 | * Probably we need not scan at all, | |
1824 | * because page truncation is enough. | |
1825 | */ | |
1826 | - if (ext4_orphan_add(handle, inode)) | |
1827 | - goto out_stop; | |
1828 | ||
1829 | /* we have to know where to truncate from in crash case */ | |
1830 | EXT4_I(inode)->i_disksize = inode->i_size; | |
1831 | @@ -2798,6 +2847,7 @@ void ext4_ext_truncate(struct inode * inode, struct page *page) | |
1832 | handle->h_sync = 1; | |
1833 | ||
1834 | out_stop: | |
1835 | + up_write(&EXT4_I(inode)->i_data_sem); | |
1836 | /* | |
1837 | * If this was a simple ftruncate() and the file will remain alive, | |
1838 | * then we need to clear up the orphan record which we created above. | |
1839 | @@ -2808,33 +2858,11 @@ out_stop: | |
1840 | if (inode->i_nlink) | |
1841 | ext4_orphan_del(handle, inode); | |
1842 | ||
1843 | - up_write(&EXT4_I(inode)->i_data_sem); | |
1844 | inode->i_mtime = inode->i_ctime = ext4_current_time(inode); | |
1845 | ext4_mark_inode_dirty(handle, inode); | |
1846 | ext4_journal_stop(handle); | |
1847 | } | |
1848 | ||
1849 | -/* | |
1850 | - * ext4_ext_writepage_trans_blocks: | |
1851 | - * calculate max number of blocks we could modify | |
1852 | - * in order to allocate new block for an inode | |
1853 | - */ | |
1854 | -int ext4_ext_writepage_trans_blocks(struct inode *inode, int num) | |
1855 | -{ | |
1856 | - int needed; | |
1857 | - | |
1858 | - needed = ext4_ext_calc_credits_for_insert(inode, NULL); | |
1859 | - | |
1860 | - /* caller wants to allocate num blocks, but note it includes sb */ | |
1861 | - needed = needed * num - (num - 1); | |
1862 | - | |
1863 | -#ifdef CONFIG_QUOTA | |
1864 | - needed += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); | |
1865 | -#endif | |
1866 | - | |
1867 | - return needed; | |
1868 | -} | |
1869 | - | |
1870 | static void ext4_falloc_update_inode(struct inode *inode, | |
1871 | int mode, loff_t new_size, int update_ctime) | |
1872 | { | |
1873 | @@ -2895,10 +2923,9 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len) | |
1874 | max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) | |
1875 | - block; | |
1876 | /* | |
1877 | - * credits to insert 1 extent into extent tree + buffers to be able to | |
1878 | - * modify 1 super block, 1 block bitmap and 1 group descriptor. | |
1879 | + * credits to insert 1 extent into extent tree | |
1880 | */ | |
1881 | - credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + 3; | |
1882 | + credits = ext4_chunk_trans_blocks(inode, max_blocks); | |
1883 | mutex_lock(&inode->i_mutex); | |
1884 | retry: | |
1885 | while (ret >= 0 && ret < max_blocks) { | |
1886 | @@ -2911,7 +2938,7 @@ retry: | |
1887 | } | |
1888 | ret = ext4_get_blocks_wrap(handle, inode, block, | |
1889 | max_blocks, &map_bh, | |
1890 | - EXT4_CREATE_UNINITIALIZED_EXT, 0); | |
1891 | + EXT4_CREATE_UNINITIALIZED_EXT, 0, 0); | |
1892 | if (ret <= 0) { | |
1893 | #ifdef EXT4FS_DEBUG | |
1894 | WARN_ON(ret <= 0); | |
1895 | diff --git a/fs/ext4/file.c b/fs/ext4/file.c | |
1896 | index 4159be6..430eb79 100644 | |
1897 | --- a/fs/ext4/file.c | |
1898 | +++ b/fs/ext4/file.c | |
1899 | @@ -123,6 +123,23 @@ force_commit: | |
1900 | return ret; | |
1901 | } | |
1902 | ||
1903 | +static struct vm_operations_struct ext4_file_vm_ops = { | |
1904 | + .fault = filemap_fault, | |
1905 | + .page_mkwrite = ext4_page_mkwrite, | |
1906 | +}; | |
1907 | + | |
1908 | +static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) | |
1909 | +{ | |
1910 | + struct address_space *mapping = file->f_mapping; | |
1911 | + | |
1912 | + if (!mapping->a_ops->readpage) | |
1913 | + return -ENOEXEC; | |
1914 | + file_accessed(file); | |
1915 | + vma->vm_ops = &ext4_file_vm_ops; | |
1916 | + vma->vm_flags |= VM_CAN_NONLINEAR; | |
1917 | + return 0; | |
1918 | +} | |
1919 | + | |
1920 | const struct file_operations ext4_file_operations = { | |
1921 | .llseek = generic_file_llseek, | |
1922 | .read = do_sync_read, | |
1923 | @@ -133,7 +150,7 @@ const struct file_operations ext4_file_operations = { | |
1924 | #ifdef CONFIG_COMPAT | |
1925 | .compat_ioctl = ext4_compat_ioctl, | |
1926 | #endif | |
1927 | - .mmap = generic_file_mmap, | |
1928 | + .mmap = ext4_file_mmap, | |
1929 | .open = generic_file_open, | |
1930 | .release = ext4_release_file, | |
1931 | .fsync = ext4_sync_file, | |
1932 | @@ -144,6 +161,7 @@ const struct file_operations ext4_file_operations = { | |
1933 | const struct inode_operations ext4_file_inode_operations = { | |
1934 | .truncate = ext4_truncate, | |
1935 | .setattr = ext4_setattr, | |
1936 | + .getattr = ext4_getattr, | |
1937 | #ifdef CONFIG_EXT4DEV_FS_XATTR | |
1938 | .setxattr = generic_setxattr, | |
1939 | .getxattr = generic_getxattr, | |
1940 | diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c | |
1941 | index 1c8ba48..a45c373 100644 | |
1942 | --- a/fs/ext4/fsync.c | |
1943 | +++ b/fs/ext4/fsync.c | |
1944 | @@ -27,6 +27,7 @@ | |
1945 | #include <linux/sched.h> | |
1946 | #include <linux/writeback.h> | |
1947 | #include <linux/jbd2.h> | |
1948 | +#include <linux/blkdev.h> | |
1949 | #include "ext4.h" | |
1950 | #include "ext4_jbd2.h" | |
1951 | ||
1952 | @@ -45,6 +46,7 @@ | |
1953 | int ext4_sync_file(struct file * file, struct dentry *dentry, int datasync) | |
1954 | { | |
1955 | struct inode *inode = dentry->d_inode; | |
1956 | + journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; | |
1957 | int ret = 0; | |
1958 | ||
1959 | J_ASSERT(ext4_journal_current_handle() == NULL); | |
1960 | @@ -85,6 +87,8 @@ int ext4_sync_file(struct file * file, struct dentry *dentry, int datasync) | |
1961 | .nr_to_write = 0, /* sys_fsync did this */ | |
1962 | }; | |
1963 | ret = sync_inode(inode, &wbc); | |
1964 | + if (journal && (journal->j_flags & JBD2_BARRIER)) | |
1965 | + blkdev_issue_flush(inode->i_sb->s_bdev, NULL); | |
1966 | } | |
1967 | out: | |
1968 | return ret; | |
1969 | diff --git a/fs/ext4/group.h b/fs/ext4/group.h | |
1970 | index 7eb0604..c2c0a8d 100644 | |
1971 | --- a/fs/ext4/group.h | |
1972 | +++ b/fs/ext4/group.h | |
1973 | @@ -13,7 +13,7 @@ extern __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 group, | |
1974 | struct ext4_group_desc *gdp); | |
1975 | extern int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 group, | |
1976 | struct ext4_group_desc *gdp); | |
1977 | -struct buffer_head *read_block_bitmap(struct super_block *sb, | |
1978 | +struct buffer_head *ext4_read_block_bitmap(struct super_block *sb, | |
1979 | ext4_group_t block_group); | |
1980 | extern unsigned ext4_init_block_bitmap(struct super_block *sb, | |
1981 | struct buffer_head *bh, | |
1982 | diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c | |
1983 | index c6efbab..f344834 100644 | |
1984 | --- a/fs/ext4/ialloc.c | |
1985 | +++ b/fs/ext4/ialloc.c | |
1986 | @@ -97,34 +97,44 @@ unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh, | |
1987 | * Return buffer_head of bitmap on success or NULL. | |
1988 | */ | |
1989 | static struct buffer_head * | |
1990 | -read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) | |
1991 | +ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) | |
1992 | { | |
1993 | struct ext4_group_desc *desc; | |
1994 | struct buffer_head *bh = NULL; | |
1995 | + ext4_fsblk_t bitmap_blk; | |
1996 | ||
1997 | desc = ext4_get_group_desc(sb, block_group, NULL); | |
1998 | if (!desc) | |
1999 | - goto error_out; | |
2000 | + return NULL; | |
2001 | + bitmap_blk = ext4_inode_bitmap(sb, desc); | |
2002 | + bh = sb_getblk(sb, bitmap_blk); | |
2003 | + if (unlikely(!bh)) { | |
2004 | + ext4_error(sb, __func__, | |
2005 | + "Cannot read inode bitmap - " | |
2006 | + "block_group = %lu, inode_bitmap = %llu", | |
2007 | + block_group, bitmap_blk); | |
2008 | + return NULL; | |
2009 | + } | |
2010 | + if (bh_uptodate_or_lock(bh)) | |
2011 | + return bh; | |
2012 | + | |
2013 | + spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group)); | |
2014 | if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { | |
2015 | - bh = sb_getblk(sb, ext4_inode_bitmap(sb, desc)); | |
2016 | - if (!buffer_uptodate(bh)) { | |
2017 | - lock_buffer(bh); | |
2018 | - if (!buffer_uptodate(bh)) { | |
2019 | - ext4_init_inode_bitmap(sb, bh, block_group, | |
2020 | - desc); | |
2021 | - set_buffer_uptodate(bh); | |
2022 | - } | |
2023 | - unlock_buffer(bh); | |
2024 | - } | |
2025 | - } else { | |
2026 | - bh = sb_bread(sb, ext4_inode_bitmap(sb, desc)); | |
2027 | + ext4_init_inode_bitmap(sb, bh, block_group, desc); | |
2028 | + set_buffer_uptodate(bh); | |
2029 | + unlock_buffer(bh); | |
2030 | + spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); | |
2031 | + return bh; | |
2032 | } | |
2033 | - if (!bh) | |
2034 | - ext4_error(sb, "read_inode_bitmap", | |
2035 | + spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); | |
2036 | + if (bh_submit_read(bh) < 0) { | |
2037 | + put_bh(bh); | |
2038 | + ext4_error(sb, __func__, | |
2039 | "Cannot read inode bitmap - " | |
2040 | "block_group = %lu, inode_bitmap = %llu", | |
2041 | - block_group, ext4_inode_bitmap(sb, desc)); | |
2042 | -error_out: | |
2043 | + block_group, bitmap_blk); | |
2044 | + return NULL; | |
2045 | + } | |
2046 | return bh; | |
2047 | } | |
2048 | ||
2049 | @@ -157,6 +167,7 @@ void ext4_free_inode (handle_t *handle, struct inode * inode) | |
2050 | struct ext4_super_block * es; | |
2051 | struct ext4_sb_info *sbi; | |
2052 | int fatal = 0, err; | |
2053 | + ext4_group_t flex_group; | |
2054 | ||
2055 | if (atomic_read(&inode->i_count) > 1) { | |
2056 | printk ("ext4_free_inode: inode has count=%d\n", | |
2057 | @@ -199,7 +210,7 @@ void ext4_free_inode (handle_t *handle, struct inode * inode) | |
2058 | } | |
2059 | block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); | |
2060 | bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb); | |
2061 | - bitmap_bh = read_inode_bitmap(sb, block_group); | |
2062 | + bitmap_bh = ext4_read_inode_bitmap(sb, block_group); | |
2063 | if (!bitmap_bh) | |
2064 | goto error_return; | |
2065 | ||
2066 | @@ -232,6 +243,12 @@ void ext4_free_inode (handle_t *handle, struct inode * inode) | |
2067 | if (is_directory) | |
2068 | percpu_counter_dec(&sbi->s_dirs_counter); | |
2069 | ||
2070 | + if (sbi->s_log_groups_per_flex) { | |
2071 | + flex_group = ext4_flex_group(sbi, block_group); | |
2072 | + spin_lock(sb_bgl_lock(sbi, flex_group)); | |
2073 | + sbi->s_flex_groups[flex_group].free_inodes++; | |
2074 | + spin_unlock(sb_bgl_lock(sbi, flex_group)); | |
2075 | + } | |
2076 | } | |
2077 | BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata"); | |
2078 | err = ext4_journal_dirty_metadata(handle, bh2); | |
2079 | @@ -286,6 +303,80 @@ static int find_group_dir(struct super_block *sb, struct inode *parent, | |
2080 | return ret; | |
2081 | } | |
2082 | ||
2083 | +#define free_block_ratio 10 | |
2084 | + | |
2085 | +static int find_group_flex(struct super_block *sb, struct inode *parent, | |
2086 | + ext4_group_t *best_group) | |
2087 | +{ | |
2088 | + struct ext4_sb_info *sbi = EXT4_SB(sb); | |
2089 | + struct ext4_group_desc *desc; | |
2090 | + struct buffer_head *bh; | |
2091 | + struct flex_groups *flex_group = sbi->s_flex_groups; | |
2092 | + ext4_group_t parent_group = EXT4_I(parent)->i_block_group; | |
2093 | + ext4_group_t parent_fbg_group = ext4_flex_group(sbi, parent_group); | |
2094 | + ext4_group_t ngroups = sbi->s_groups_count; | |
2095 | + int flex_size = ext4_flex_bg_size(sbi); | |
2096 | + ext4_group_t best_flex = parent_fbg_group; | |
2097 | + int blocks_per_flex = sbi->s_blocks_per_group * flex_size; | |
2098 | + int flexbg_free_blocks; | |
2099 | + int flex_freeb_ratio; | |
2100 | + ext4_group_t n_fbg_groups; | |
2101 | + ext4_group_t i; | |
2102 | + | |
2103 | + n_fbg_groups = (sbi->s_groups_count + flex_size - 1) >> | |
2104 | + sbi->s_log_groups_per_flex; | |
2105 | + | |
2106 | +find_close_to_parent: | |
2107 | + flexbg_free_blocks = flex_group[best_flex].free_blocks; | |
2108 | + flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex; | |
2109 | + if (flex_group[best_flex].free_inodes && | |
2110 | + flex_freeb_ratio > free_block_ratio) | |
2111 | + goto found_flexbg; | |
2112 | + | |
2113 | + if (best_flex && best_flex == parent_fbg_group) { | |
2114 | + best_flex--; | |
2115 | + goto find_close_to_parent; | |
2116 | + } | |
2117 | + | |
2118 | + for (i = 0; i < n_fbg_groups; i++) { | |
2119 | + if (i == parent_fbg_group || i == parent_fbg_group - 1) | |
2120 | + continue; | |
2121 | + | |
2122 | + flexbg_free_blocks = flex_group[i].free_blocks; | |
2123 | + flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex; | |
2124 | + | |
2125 | + if (flex_freeb_ratio > free_block_ratio && | |
2126 | + flex_group[i].free_inodes) { | |
2127 | + best_flex = i; | |
2128 | + goto found_flexbg; | |
2129 | + } | |
2130 | + | |
2131 | + if (flex_group[best_flex].free_inodes == 0 || | |
2132 | + (flex_group[i].free_blocks > | |
2133 | + flex_group[best_flex].free_blocks && | |
2134 | + flex_group[i].free_inodes)) | |
2135 | + best_flex = i; | |
2136 | + } | |
2137 | + | |
2138 | + if (!flex_group[best_flex].free_inodes || | |
2139 | + !flex_group[best_flex].free_blocks) | |
2140 | + return -1; | |
2141 | + | |
2142 | +found_flexbg: | |
2143 | + for (i = best_flex * flex_size; i < ngroups && | |
2144 | + i < (best_flex + 1) * flex_size; i++) { | |
2145 | + desc = ext4_get_group_desc(sb, i, &bh); | |
2146 | + if (le16_to_cpu(desc->bg_free_inodes_count)) { | |
2147 | + *best_group = i; | |
2148 | + goto out; | |
2149 | + } | |
2150 | + } | |
2151 | + | |
2152 | + return -1; | |
2153 | +out: | |
2154 | + return 0; | |
2155 | +} | |
2156 | + | |
2157 | /* | |
2158 | * Orlov's allocator for directories. | |
2159 | * | |
2160 | @@ -501,6 +592,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode) | |
2161 | struct inode *ret; | |
2162 | ext4_group_t i; | |
2163 | int free = 0; | |
2164 | + ext4_group_t flex_group; | |
2165 | ||
2166 | /* Cannot create files in a deleted directory */ | |
2167 | if (!dir || !dir->i_nlink) | |
2168 | @@ -514,6 +606,12 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode) | |
2169 | ||
2170 | sbi = EXT4_SB(sb); | |
2171 | es = sbi->s_es; | |
2172 | + | |
2173 | + if (sbi->s_log_groups_per_flex) { | |
2174 | + ret2 = find_group_flex(sb, dir, &group); | |
2175 | + goto got_group; | |
2176 | + } | |
2177 | + | |
2178 | if (S_ISDIR(mode)) { | |
2179 | if (test_opt (sb, OLDALLOC)) | |
2180 | ret2 = find_group_dir(sb, dir, &group); | |
2181 | @@ -522,6 +620,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode) | |
2182 | } else | |
2183 | ret2 = find_group_other(sb, dir, &group); | |
2184 | ||
2185 | +got_group: | |
2186 | err = -ENOSPC; | |
2187 | if (ret2 == -1) | |
2188 | goto out; | |
2189 | @@ -534,7 +633,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode) | |
2190 | goto fail; | |
2191 | ||
2192 | brelse(bitmap_bh); | |
2193 | - bitmap_bh = read_inode_bitmap(sb, group); | |
2194 | + bitmap_bh = ext4_read_inode_bitmap(sb, group); | |
2195 | if (!bitmap_bh) | |
2196 | goto fail; | |
2197 | ||
2198 | @@ -600,7 +699,7 @@ got: | |
2199 | /* We may have to initialize the block bitmap if it isn't already */ | |
2200 | if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) && | |
2201 | gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { | |
2202 | - struct buffer_head *block_bh = read_block_bitmap(sb, group); | |
2203 | + struct buffer_head *block_bh = ext4_read_block_bitmap(sb, group); | |
2204 | ||
2205 | BUFFER_TRACE(block_bh, "get block bitmap access"); | |
2206 | err = ext4_journal_get_write_access(handle, block_bh); | |
2207 | @@ -639,7 +738,7 @@ got: | |
2208 | ||
2209 | /* When marking the block group with | |
2210 | * ~EXT4_BG_INODE_UNINIT we don't want to depend | |
2211 | - * on the value of bg_itable_unsed even though | |
2212 | + * on the value of bg_itable_unused even though | |
2213 | * mke2fs could have initialized the same for us. | |
2214 | * Instead we calculated the value below | |
2215 | */ | |
2216 | @@ -676,6 +775,13 @@ got: | |
2217 | percpu_counter_inc(&sbi->s_dirs_counter); | |
2218 | sb->s_dirt = 1; | |
2219 | ||
2220 | + if (sbi->s_log_groups_per_flex) { | |
2221 | + flex_group = ext4_flex_group(sbi, group); | |
2222 | + spin_lock(sb_bgl_lock(sbi, flex_group)); | |
2223 | + sbi->s_flex_groups[flex_group].free_inodes--; | |
2224 | + spin_unlock(sb_bgl_lock(sbi, flex_group)); | |
2225 | + } | |
2226 | + | |
2227 | inode->i_uid = current->fsuid; | |
2228 | if (test_opt (sb, GRPID)) | |
2229 | inode->i_gid = dir->i_gid; | |
2230 | @@ -740,14 +846,10 @@ got: | |
2231 | goto fail_free_drop; | |
2232 | ||
2233 | if (test_opt(sb, EXTENTS)) { | |
2234 | - /* set extent flag only for diretory, file and normal symlink*/ | |
2235 | + /* set extent flag only for directory, file and normal symlink*/ | |
2236 | if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) { | |
2237 | EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL; | |
2238 | ext4_ext_tree_init(handle, inode); | |
2239 | - err = ext4_update_incompat_feature(handle, sb, | |
2240 | - EXT4_FEATURE_INCOMPAT_EXTENTS); | |
2241 | - if (err) | |
2242 | - goto fail_free_drop; | |
2243 | } | |
2244 | } | |
2245 | ||
2246 | @@ -799,7 +901,7 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino) | |
2247 | ||
2248 | block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); | |
2249 | bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb); | |
2250 | - bitmap_bh = read_inode_bitmap(sb, block_group); | |
2251 | + bitmap_bh = ext4_read_inode_bitmap(sb, block_group); | |
2252 | if (!bitmap_bh) { | |
2253 | ext4_warning(sb, __func__, | |
2254 | "inode bitmap error for orphan %lu", ino); | |
2255 | @@ -817,6 +919,14 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino) | |
2256 | if (IS_ERR(inode)) | |
2257 | goto iget_failed; | |
2258 | ||
2259 | + /* | |
2260 | + * If the orphans has i_nlinks > 0 then it should be able to be | |
2261 | + * truncated, otherwise it won't be removed from the orphan list | |
2262 | + * during processing and an infinite loop will result. | |
2263 | + */ | |
2264 | + if (inode->i_nlink && !ext4_can_truncate(inode)) | |
2265 | + goto bad_orphan; | |
2266 | + | |
2267 | if (NEXT_ORPHAN(inode) > max_ino) | |
2268 | goto bad_orphan; | |
2269 | brelse(bitmap_bh); | |
2270 | @@ -838,6 +948,7 @@ bad_orphan: | |
2271 | printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%u\n", | |
2272 | NEXT_ORPHAN(inode)); | |
2273 | printk(KERN_NOTICE "max_ino=%lu\n", max_ino); | |
2274 | + printk(KERN_NOTICE "i_nlink=%u\n", inode->i_nlink); | |
2275 | /* Avoid freeing blocks if we got a bad deleted inode */ | |
2276 | if (inode->i_nlink == 0) | |
2277 | inode->i_blocks = 0; | |
2278 | @@ -868,7 +979,7 @@ unsigned long ext4_count_free_inodes (struct super_block * sb) | |
2279 | continue; | |
2280 | desc_count += le16_to_cpu(gdp->bg_free_inodes_count); | |
2281 | brelse(bitmap_bh); | |
2282 | - bitmap_bh = read_inode_bitmap(sb, i); | |
2283 | + bitmap_bh = ext4_read_inode_bitmap(sb, i); | |
2284 | if (!bitmap_bh) | |
2285 | continue; | |
2286 | ||
2287 | diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c | |
2288 | index 8d97077..3c0195a 100644 | |
2289 | --- a/fs/ext4/inode.c | |
2290 | +++ b/fs/ext4/inode.c | |
2291 | @@ -32,12 +32,25 @@ | |
2292 | #include <linux/string.h> | |
2293 | #include <linux/buffer_head.h> | |
2294 | #include <linux/writeback.h> | |
2295 | +#include <linux/pagevec.h> | |
2296 | #include <linux/mpage.h> | |
2297 | #include <linux/uio.h> | |
2298 | #include <linux/bio.h> | |
2299 | #include "ext4_jbd2.h" | |
2300 | #include "xattr.h" | |
2301 | #include "acl.h" | |
2302 | +#include "ext4_extents.h" | |
2303 | + | |
2304 | +#define MPAGE_DA_EXTENT_TAIL 0x01 | |
2305 | + | |
2306 | +static inline int ext4_begin_ordered_truncate(struct inode *inode, | |
2307 | + loff_t new_size) | |
2308 | +{ | |
2309 | + return jbd2_journal_begin_ordered_truncate(&EXT4_I(inode)->jinode, | |
2310 | + new_size); | |
2311 | +} | |
2312 | + | |
2313 | +static void ext4_invalidatepage(struct page *page, unsigned long offset); | |
2314 | ||
2315 | /* | |
2316 | * Test whether an inode is a fast symlink. | |
2317 | @@ -180,14 +193,18 @@ static int ext4_journal_test_restart(handle_t *handle, struct inode *inode) | |
2318 | void ext4_delete_inode (struct inode * inode) | |
2319 | { | |
2320 | handle_t *handle; | |
2321 | + int err; | |
2322 | ||
2323 | + if (ext4_should_order_data(inode)) | |
2324 | + ext4_begin_ordered_truncate(inode, 0); | |
2325 | truncate_inode_pages(&inode->i_data, 0); | |
2326 | ||
2327 | if (is_bad_inode(inode)) | |
2328 | goto no_delete; | |
2329 | ||
2330 | - handle = start_transaction(inode); | |
2331 | + handle = ext4_journal_start(inode, blocks_for_truncate(inode)+3); | |
2332 | if (IS_ERR(handle)) { | |
2333 | + ext4_std_error(inode->i_sb, PTR_ERR(handle)); | |
2334 | /* | |
2335 | * If we're going to skip the normal cleanup, we still need to | |
2336 | * make sure that the in-core orphan linked list is properly | |
2337 | @@ -200,8 +217,34 @@ void ext4_delete_inode (struct inode * inode) | |
2338 | if (IS_SYNC(inode)) | |
2339 | handle->h_sync = 1; | |
2340 | inode->i_size = 0; | |
2341 | + err = ext4_mark_inode_dirty(handle, inode); | |
2342 | + if (err) { | |
2343 | + ext4_warning(inode->i_sb, __func__, | |
2344 | + "couldn't mark inode dirty (err %d)", err); | |
2345 | + goto stop_handle; | |
2346 | + } | |
2347 | if (inode->i_blocks) | |
2348 | ext4_truncate(inode); | |
2349 | + | |
2350 | + /* | |
2351 | + * ext4_ext_truncate() doesn't reserve any slop when it | |
2352 | + * restarts journal transactions; therefore there may not be | |
2353 | + * enough credits left in the handle to remove the inode from | |
2354 | + * the orphan list and set the dtime field. | |
2355 | + */ | |
2356 | + if (handle->h_buffer_credits < 3) { | |
2357 | + err = ext4_journal_extend(handle, 3); | |
2358 | + if (err > 0) | |
2359 | + err = ext4_journal_restart(handle, 3); | |
2360 | + if (err != 0) { | |
2361 | + ext4_warning(inode->i_sb, __func__, | |
2362 | + "couldn't extend journal (err %d)", err); | |
2363 | + stop_handle: | |
2364 | + ext4_journal_stop(handle); | |
2365 | + goto no_delete; | |
2366 | + } | |
2367 | + } | |
2368 | + | |
2369 | /* | |
2370 | * Kill off the orphan record which ext4_truncate created. | |
2371 | * AKPM: I think this can be inside the above `if'. | |
2372 | @@ -508,11 +551,12 @@ static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned long blks, | |
2373 | * direct blocks | |
2374 | */ | |
2375 | static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, | |
2376 | - ext4_fsblk_t goal, int indirect_blks, int blks, | |
2377 | - ext4_fsblk_t new_blocks[4], int *err) | |
2378 | + ext4_lblk_t iblock, ext4_fsblk_t goal, | |
2379 | + int indirect_blks, int blks, | |
2380 | + ext4_fsblk_t new_blocks[4], int *err) | |
2381 | { | |
2382 | int target, i; | |
2383 | - unsigned long count = 0; | |
2384 | + unsigned long count = 0, blk_allocated = 0; | |
2385 | int index = 0; | |
2386 | ext4_fsblk_t current_block = 0; | |
2387 | int ret = 0; | |
2388 | @@ -525,12 +569,13 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, | |
2389 | * the first direct block of this branch. That's the | |
2390 | * minimum number of blocks need to allocate(required) | |
2391 | */ | |
2392 | - target = blks + indirect_blks; | |
2393 | - | |
2394 | - while (1) { | |
2395 | + /* first we try to allocate the indirect blocks */ | |
2396 | + target = indirect_blks; | |
2397 | + while (target > 0) { | |
2398 | count = target; | |
2399 | /* allocating blocks for indirect blocks and direct blocks */ | |
2400 | - current_block = ext4_new_blocks(handle,inode,goal,&count,err); | |
2401 | + current_block = ext4_new_meta_blocks(handle, inode, | |
2402 | + goal, &count, err); | |
2403 | if (*err) | |
2404 | goto failed_out; | |
2405 | ||
2406 | @@ -540,16 +585,48 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, | |
2407 | new_blocks[index++] = current_block++; | |
2408 | count--; | |
2409 | } | |
2410 | - | |
2411 | - if (count > 0) | |
2412 | + if (count > 0) { | |
2413 | + /* | |
2414 | + * save the new block number | |
2415 | + * for the first direct block | |
2416 | + */ | |
2417 | + new_blocks[index] = current_block; | |
2418 | + printk(KERN_INFO "%s returned more blocks than " | |
2419 | + "requested\n", __func__); | |
2420 | + WARN_ON(1); | |
2421 | break; | |
2422 | + } | |
2423 | } | |
2424 | ||
2425 | - /* save the new block number for the first direct block */ | |
2426 | - new_blocks[index] = current_block; | |
2427 | - | |
2428 | + target = blks - count ; | |
2429 | + blk_allocated = count; | |
2430 | + if (!target) | |
2431 | + goto allocated; | |
2432 | + /* Now allocate data blocks */ | |
2433 | + count = target; | |
2434 | + /* allocating blocks for data blocks */ | |
2435 | + current_block = ext4_new_blocks(handle, inode, iblock, | |
2436 | + goal, &count, err); | |
2437 | + if (*err && (target == blks)) { | |
2438 | + /* | |
2439 | + * if the allocation failed and we didn't allocate | |
2440 | + * any blocks before | |
2441 | + */ | |
2442 | + goto failed_out; | |
2443 | + } | |
2444 | + if (!*err) { | |
2445 | + if (target == blks) { | |
2446 | + /* | |
2447 | + * save the new block number | |
2448 | + * for the first direct block | |
2449 | + */ | |
2450 | + new_blocks[index] = current_block; | |
2451 | + } | |
2452 | + blk_allocated += count; | |
2453 | + } | |
2454 | +allocated: | |
2455 | /* total number of blocks allocated for direct blocks */ | |
2456 | - ret = count; | |
2457 | + ret = blk_allocated; | |
2458 | *err = 0; | |
2459 | return ret; | |
2460 | failed_out: | |
2461 | @@ -584,8 +661,9 @@ failed_out: | |
2462 | * as described above and return 0. | |
2463 | */ | |
2464 | static int ext4_alloc_branch(handle_t *handle, struct inode *inode, | |
2465 | - int indirect_blks, int *blks, ext4_fsblk_t goal, | |
2466 | - ext4_lblk_t *offsets, Indirect *branch) | |
2467 | + ext4_lblk_t iblock, int indirect_blks, | |
2468 | + int *blks, ext4_fsblk_t goal, | |
2469 | + ext4_lblk_t *offsets, Indirect *branch) | |
2470 | { | |
2471 | int blocksize = inode->i_sb->s_blocksize; | |
2472 | int i, n = 0; | |
2473 | @@ -595,7 +673,7 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode, | |
2474 | ext4_fsblk_t new_blocks[4]; | |
2475 | ext4_fsblk_t current_block; | |
2476 | ||
2477 | - num = ext4_alloc_blocks(handle, inode, goal, indirect_blks, | |
2478 | + num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks, | |
2479 | *blks, new_blocks, &err); | |
2480 | if (err) | |
2481 | return err; | |
2482 | @@ -799,6 +877,7 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, | |
2483 | struct ext4_inode_info *ei = EXT4_I(inode); | |
2484 | int count = 0; | |
2485 | ext4_fsblk_t first_block = 0; | |
2486 | + loff_t disksize; | |
2487 | ||
2488 | ||
2489 | J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)); | |
2490 | @@ -855,8 +934,9 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, | |
2491 | /* | |
2492 | * Block out ext4_truncate while we alter the tree | |
2493 | */ | |
2494 | - err = ext4_alloc_branch(handle, inode, indirect_blks, &count, goal, | |
2495 | - offsets + (partial - chain), partial); | |
2496 | + err = ext4_alloc_branch(handle, inode, iblock, indirect_blks, | |
2497 | + &count, goal, | |
2498 | + offsets + (partial - chain), partial); | |
2499 | ||
2500 | /* | |
2501 | * The ext4_splice_branch call will free and forget any buffers | |
2502 | @@ -873,8 +953,13 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, | |
2503 | * protect it if you're about to implement concurrent | |
2504 | * ext4_get_block() -bzzz | |
2505 | */ | |
2506 | - if (!err && extend_disksize && inode->i_size > ei->i_disksize) | |
2507 | - ei->i_disksize = inode->i_size; | |
2508 | + if (!err && extend_disksize) { | |
2509 | + disksize = ((loff_t) iblock + count) << inode->i_blkbits; | |
2510 | + if (disksize > i_size_read(inode)) | |
2511 | + disksize = i_size_read(inode); | |
2512 | + if (disksize > ei->i_disksize) | |
2513 | + ei->i_disksize = disksize; | |
2514 | + } | |
2515 | if (err) | |
2516 | goto cleanup; | |
2517 | ||
2518 | @@ -897,23 +982,74 @@ out: | |
2519 | return err; | |
2520 | } | |
2521 | ||
2522 | -/* Maximum number of blocks we map for direct IO at once. */ | |
2523 | -#define DIO_MAX_BLOCKS 4096 | |
2524 | /* | |
2525 | - * Number of credits we need for writing DIO_MAX_BLOCKS: | |
2526 | - * We need sb + group descriptor + bitmap + inode -> 4 | |
2527 | - * For B blocks with A block pointers per block we need: | |
2528 | - * 1 (triple ind.) + (B/A/A + 2) (doubly ind.) + (B/A + 2) (indirect). | |
2529 | - * If we plug in 4096 for B and 256 for A (for 1KB block size), we get 25. | |
2530 | + * Calculate the number of metadata blocks need to reserve | |
2531 | + * to allocate @blocks for non extent file based file | |
2532 | */ | |
2533 | -#define DIO_CREDITS 25 | |
2534 | +static int ext4_indirect_calc_metadata_amount(struct inode *inode, int blocks) | |
2535 | +{ | |
2536 | + int icap = EXT4_ADDR_PER_BLOCK(inode->i_sb); | |
2537 | + int ind_blks, dind_blks, tind_blks; | |
2538 | + | |
2539 | + /* number of new indirect blocks needed */ | |
2540 | + ind_blks = (blocks + icap - 1) / icap; | |
2541 | + | |
2542 | + dind_blks = (ind_blks + icap - 1) / icap; | |
2543 | ||
2544 | + tind_blks = 1; | |
2545 | + | |
2546 | + return ind_blks + dind_blks + tind_blks; | |
2547 | +} | |
2548 | ||
2549 | /* | |
2550 | + * Calculate the number of metadata blocks need to reserve | |
2551 | + * to allocate given number of blocks | |
2552 | + */ | |
2553 | +static int ext4_calc_metadata_amount(struct inode *inode, int blocks) | |
2554 | +{ | |
2555 | + if (!blocks) | |
2556 | + return 0; | |
2557 | + | |
2558 | + if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) | |
2559 | + return ext4_ext_calc_metadata_amount(inode, blocks); | |
2560 | + | |
2561 | + return ext4_indirect_calc_metadata_amount(inode, blocks); | |
2562 | +} | |
2563 | + | |
2564 | +static void ext4_da_update_reserve_space(struct inode *inode, int used) | |
2565 | +{ | |
2566 | + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | |
2567 | + int total, mdb, mdb_free; | |
2568 | + | |
2569 | + spin_lock(&EXT4_I(inode)->i_block_reservation_lock); | |
2570 | + /* recalculate the number of metablocks still need to be reserved */ | |
2571 | + total = EXT4_I(inode)->i_reserved_data_blocks - used; | |
2572 | + mdb = ext4_calc_metadata_amount(inode, total); | |
2573 | + | |
2574 | + /* figure out how many metablocks to release */ | |
2575 | + BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); | |
2576 | + mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb; | |
2577 | + | |
2578 | + /* Account for allocated meta_blocks */ | |
2579 | + mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks; | |
2580 | + | |
2581 | + /* update fs free blocks counter for truncate case */ | |
2582 | + percpu_counter_add(&sbi->s_freeblocks_counter, mdb_free); | |
2583 | + | |
2584 | + /* update per-inode reservations */ | |
2585 | + BUG_ON(used > EXT4_I(inode)->i_reserved_data_blocks); | |
2586 | + EXT4_I(inode)->i_reserved_data_blocks -= used; | |
2587 | + | |
2588 | + BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); | |
2589 | + EXT4_I(inode)->i_reserved_meta_blocks = mdb; | |
2590 | + EXT4_I(inode)->i_allocated_meta_blocks = 0; | |
2591 | + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); | |
2592 | +} | |
2593 | + | |
2594 | +/* | |
2595 | + * The ext4_get_blocks_wrap() function try to look up the requested blocks, | |
2596 | + * and returns if the blocks are already mapped. | |
2597 | * | |
2598 | - * | |
2599 | - * ext4_ext4 get_block() wrapper function | |
2600 | - * It will do a look up first, and returns if the blocks already mapped. | |
2601 | * Otherwise it takes the write lock of the i_data_sem and allocate blocks | |
2602 | * and store the allocated blocks in the result buffer head and mark it | |
2603 | * mapped. | |
2604 | @@ -934,7 +1070,7 @@ out: | |
2605 | */ | |
2606 | int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, | |
2607 | unsigned long max_blocks, struct buffer_head *bh, | |
2608 | - int create, int extend_disksize) | |
2609 | + int create, int extend_disksize, int flag) | |
2610 | { | |
2611 | int retval; | |
2612 | ||
2613 | @@ -975,6 +1111,15 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, | |
2614 | * with create == 1 flag. | |
2615 | */ | |
2616 | down_write((&EXT4_I(inode)->i_data_sem)); | |
2617 | + | |
2618 | + /* | |
2619 | + * if the caller is from delayed allocation writeout path | |
2620 | + * we have already reserved fs blocks for allocation | |
2621 | + * let the underlying get_block() function know to | |
2622 | + * avoid double accounting | |
2623 | + */ | |
2624 | + if (flag) | |
2625 | + EXT4_I(inode)->i_delalloc_reserved_flag = 1; | |
2626 | /* | |
2627 | * We need to check for EXT4 here because migrate | |
2628 | * could have changed the inode type in between | |
2629 | @@ -996,23 +1141,39 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, | |
2630 | ~EXT4_EXT_MIGRATE; | |
2631 | } | |
2632 | } | |
2633 | + | |
2634 | + if (flag) { | |
2635 | + EXT4_I(inode)->i_delalloc_reserved_flag = 0; | |
2636 | + /* | |
2637 | + * Update reserved blocks/metadata blocks | |
2638 | + * after successful block allocation | |
2639 | + * which were deferred till now | |
2640 | + */ | |
2641 | + if ((retval > 0) && buffer_delay(bh)) | |
2642 | + ext4_da_update_reserve_space(inode, retval); | |
2643 | + } | |
2644 | + | |
2645 | up_write((&EXT4_I(inode)->i_data_sem)); | |
2646 | return retval; | |
2647 | } | |
2648 | ||
2649 | +/* Maximum number of blocks we map for direct IO at once. */ | |
2650 | +#define DIO_MAX_BLOCKS 4096 | |
2651 | + | |
2652 | static int ext4_get_block(struct inode *inode, sector_t iblock, | |
2653 | struct buffer_head *bh_result, int create) | |
2654 | { | |
2655 | handle_t *handle = ext4_journal_current_handle(); | |
2656 | int ret = 0, started = 0; | |
2657 | unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; | |
2658 | + int dio_credits; | |
2659 | ||
2660 | if (create && !handle) { | |
2661 | /* Direct IO write... */ | |
2662 | if (max_blocks > DIO_MAX_BLOCKS) | |
2663 | max_blocks = DIO_MAX_BLOCKS; | |
2664 | - handle = ext4_journal_start(inode, DIO_CREDITS + | |
2665 | - 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb)); | |
2666 | + dio_credits = ext4_chunk_trans_blocks(inode, max_blocks); | |
2667 | + handle = ext4_journal_start(inode, dio_credits); | |
2668 | if (IS_ERR(handle)) { | |
2669 | ret = PTR_ERR(handle); | |
2670 | goto out; | |
2671 | @@ -1021,7 +1182,7 @@ static int ext4_get_block(struct inode *inode, sector_t iblock, | |
2672 | } | |
2673 | ||
2674 | ret = ext4_get_blocks_wrap(handle, inode, iblock, | |
2675 | - max_blocks, bh_result, create, 0); | |
2676 | + max_blocks, bh_result, create, 0, 0); | |
2677 | if (ret > 0) { | |
2678 | bh_result->b_size = (ret << inode->i_blkbits); | |
2679 | ret = 0; | |
2680 | @@ -1047,7 +1208,7 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, | |
2681 | dummy.b_blocknr = -1000; | |
2682 | buffer_trace_init(&dummy.b_history); | |
2683 | err = ext4_get_blocks_wrap(handle, inode, block, 1, | |
2684 | - &dummy, create, 1); | |
2685 | + &dummy, create, 1, 0); | |
2686 | /* | |
2687 | * ext4_get_blocks_handle() returns number of blocks | |
2688 | * mapped. 0 in case of a HOLE. | |
2689 | @@ -1203,19 +1364,20 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, | |
2690 | to = from + len; | |
2691 | ||
2692 | retry: | |
2693 | - page = __grab_cache_page(mapping, index); | |
2694 | - if (!page) | |
2695 | - return -ENOMEM; | |
2696 | - *pagep = page; | |
2697 | - | |
2698 | handle = ext4_journal_start(inode, needed_blocks); | |
2699 | if (IS_ERR(handle)) { | |
2700 | - unlock_page(page); | |
2701 | - page_cache_release(page); | |
2702 | ret = PTR_ERR(handle); | |
2703 | goto out; | |
2704 | } | |
2705 | ||
2706 | + page = __grab_cache_page(mapping, index); | |
2707 | + if (!page) { | |
2708 | + ext4_journal_stop(handle); | |
2709 | + ret = -ENOMEM; | |
2710 | + goto out; | |
2711 | + } | |
2712 | + *pagep = page; | |
2713 | + | |
2714 | ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, | |
2715 | ext4_get_block); | |
2716 | ||
2717 | @@ -1225,8 +1387,8 @@ retry: | |
2718 | } | |
2719 | ||
2720 | if (ret) { | |
2721 | - ext4_journal_stop(handle); | |
2722 | unlock_page(page); | |
2723 | + ext4_journal_stop(handle); | |
2724 | page_cache_release(page); | |
2725 | } | |
2726 | ||
2727 | @@ -1236,15 +1398,6 @@ out: | |
2728 | return ret; | |
2729 | } | |
2730 | ||
2731 | -int ext4_journal_dirty_data(handle_t *handle, struct buffer_head *bh) | |
2732 | -{ | |
2733 | - int err = jbd2_journal_dirty_data(handle, bh); | |
2734 | - if (err) | |
2735 | - ext4_journal_abort_handle(__func__, __func__, | |
2736 | - bh, handle, err); | |
2737 | - return err; | |
2738 | -} | |
2739 | - | |
2740 | /* For write_end() in data=journal mode */ | |
2741 | static int write_end_fn(handle_t *handle, struct buffer_head *bh) | |
2742 | { | |
2743 | @@ -1255,29 +1408,6 @@ static int write_end_fn(handle_t *handle, struct buffer_head *bh) | |
2744 | } | |
2745 | ||
2746 | /* | |
2747 | - * Generic write_end handler for ordered and writeback ext4 journal modes. | |
2748 | - * We can't use generic_write_end, because that unlocks the page and we need to | |
2749 | - * unlock the page after ext4_journal_stop, but ext4_journal_stop must run | |
2750 | - * after block_write_end. | |
2751 | - */ | |
2752 | -static int ext4_generic_write_end(struct file *file, | |
2753 | - struct address_space *mapping, | |
2754 | - loff_t pos, unsigned len, unsigned copied, | |
2755 | - struct page *page, void *fsdata) | |
2756 | -{ | |
2757 | - struct inode *inode = file->f_mapping->host; | |
2758 | - | |
2759 | - copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); | |
2760 | - | |
2761 | - if (pos+copied > inode->i_size) { | |
2762 | - i_size_write(inode, pos+copied); | |
2763 | - mark_inode_dirty(inode); | |
2764 | - } | |
2765 | - | |
2766 | - return copied; | |
2767 | -} | |
2768 | - | |
2769 | -/* | |
2770 | * We need to pick up the new inode size which generic_commit_write gave us | |
2771 | * `file' can be NULL - eg, when called from page_symlink(). | |
2772 | * | |
2773 | @@ -1290,15 +1420,10 @@ static int ext4_ordered_write_end(struct file *file, | |
2774 | struct page *page, void *fsdata) | |
2775 | { | |
2776 | handle_t *handle = ext4_journal_current_handle(); | |
2777 | - struct inode *inode = file->f_mapping->host; | |
2778 | - unsigned from, to; | |
2779 | + struct inode *inode = mapping->host; | |
2780 | int ret = 0, ret2; | |
2781 | ||
2782 | - from = pos & (PAGE_CACHE_SIZE - 1); | |
2783 | - to = from + len; | |
2784 | - | |
2785 | - ret = walk_page_buffers(handle, page_buffers(page), | |
2786 | - from, to, NULL, ext4_journal_dirty_data); | |
2787 | + ret = ext4_jbd2_file_inode(handle, inode); | |
2788 | ||
2789 | if (ret == 0) { | |
2790 | /* | |
2791 | @@ -1311,7 +1436,7 @@ static int ext4_ordered_write_end(struct file *file, | |
2792 | new_i_size = pos + copied; | |
2793 | if (new_i_size > EXT4_I(inode)->i_disksize) | |
2794 | EXT4_I(inode)->i_disksize = new_i_size; | |
2795 | - ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, | |
2796 | + ret2 = generic_write_end(file, mapping, pos, len, copied, | |
2797 | page, fsdata); | |
2798 | copied = ret2; | |
2799 | if (ret2 < 0) | |
2800 | @@ -1320,8 +1445,6 @@ static int ext4_ordered_write_end(struct file *file, | |
2801 | ret2 = ext4_journal_stop(handle); | |
2802 | if (!ret) | |
2803 | ret = ret2; | |
2804 | - unlock_page(page); | |
2805 | - page_cache_release(page); | |
2806 | ||
2807 | return ret ? ret : copied; | |
2808 | } | |
2809 | @@ -1332,7 +1455,7 @@ static int ext4_writeback_write_end(struct file *file, | |
2810 | struct page *page, void *fsdata) | |
2811 | { | |
2812 | handle_t *handle = ext4_journal_current_handle(); | |
2813 | - struct inode *inode = file->f_mapping->host; | |
2814 | + struct inode *inode = mapping->host; | |
2815 | int ret = 0, ret2; | |
2816 | loff_t new_i_size; | |
2817 | ||
2818 | @@ -1340,7 +1463,7 @@ static int ext4_writeback_write_end(struct file *file, | |
2819 | if (new_i_size > EXT4_I(inode)->i_disksize) | |
2820 | EXT4_I(inode)->i_disksize = new_i_size; | |
2821 | ||
2822 | - ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, | |
2823 | + ret2 = generic_write_end(file, mapping, pos, len, copied, | |
2824 | page, fsdata); | |
2825 | copied = ret2; | |
2826 | if (ret2 < 0) | |
2827 | @@ -1349,8 +1472,6 @@ static int ext4_writeback_write_end(struct file *file, | |
2828 | ret2 = ext4_journal_stop(handle); | |
2829 | if (!ret) | |
2830 | ret = ret2; | |
2831 | - unlock_page(page); | |
2832 | - page_cache_release(page); | |
2833 | ||
2834 | return ret ? ret : copied; | |
2835 | } | |
2836 | @@ -1389,15 +1510,1028 @@ static int ext4_journalled_write_end(struct file *file, | |
2837 | ret = ret2; | |
2838 | } | |
2839 | ||
2840 | + unlock_page(page); | |
2841 | ret2 = ext4_journal_stop(handle); | |
2842 | if (!ret) | |
2843 | ret = ret2; | |
2844 | - unlock_page(page); | |
2845 | page_cache_release(page); | |
2846 | ||
2847 | return ret ? ret : copied; | |
2848 | } | |
2849 | ||
2850 | +static int ext4_da_reserve_space(struct inode *inode, int nrblocks) | |
2851 | +{ | |
2852 | + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | |
2853 | + unsigned long md_needed, mdblocks, total = 0; | |
2854 | + | |
2855 | + /* | |
2856 | + * recalculate the amount of metadata blocks to reserve | |
2857 | + * in order to allocate nrblocks | |
2858 | + * worse case is one extent per block | |
2859 | + */ | |
2860 | + spin_lock(&EXT4_I(inode)->i_block_reservation_lock); | |
2861 | + total = EXT4_I(inode)->i_reserved_data_blocks + nrblocks; | |
2862 | + mdblocks = ext4_calc_metadata_amount(inode, total); | |
2863 | + BUG_ON(mdblocks < EXT4_I(inode)->i_reserved_meta_blocks); | |
2864 | + | |
2865 | + md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks; | |
2866 | + total = md_needed + nrblocks; | |
2867 | + | |
2868 | + if (ext4_has_free_blocks(sbi, total) < total) { | |
2869 | + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); | |
2870 | + return -ENOSPC; | |
2871 | + } | |
2872 | + /* reduce fs free blocks counter */ | |
2873 | + percpu_counter_sub(&sbi->s_freeblocks_counter, total); | |
2874 | + | |
2875 | + EXT4_I(inode)->i_reserved_data_blocks += nrblocks; | |
2876 | + EXT4_I(inode)->i_reserved_meta_blocks = mdblocks; | |
2877 | + | |
2878 | + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); | |
2879 | + return 0; /* success */ | |
2880 | +} | |
2881 | + | |
2882 | +static void ext4_da_release_space(struct inode *inode, int to_free) | |
2883 | +{ | |
2884 | + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | |
2885 | + int total, mdb, mdb_free, release; | |
2886 | + | |
2887 | + if (!to_free) | |
2888 | + return; /* Nothing to release, exit */ | |
2889 | + | |
2890 | + spin_lock(&EXT4_I(inode)->i_block_reservation_lock); | |
2891 | + | |
2892 | + if (!EXT4_I(inode)->i_reserved_data_blocks) { | |
2893 | + /* | |
2894 | + * if there is no reserved blocks, but we try to free some | |
2895 | + * then the counter is messed up somewhere. | |
2896 | + * but since this function is called from invalidate | |
2897 | + * page, it's harmless to return without any action | |
2898 | + */ | |
2899 | + printk(KERN_INFO "ext4 delalloc try to release %d reserved " | |
2900 | + "blocks for inode %lu, but there is no reserved " | |
2901 | + "data blocks\n", to_free, inode->i_ino); | |
2902 | + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); | |
2903 | + return; | |
2904 | + } | |
2905 | + | |
2906 | + /* recalculate the number of metablocks still need to be reserved */ | |
2907 | + total = EXT4_I(inode)->i_reserved_data_blocks - to_free; | |
2908 | + mdb = ext4_calc_metadata_amount(inode, total); | |
2909 | + | |
2910 | + /* figure out how many metablocks to release */ | |
2911 | + BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); | |
2912 | + mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb; | |
2913 | + | |
2914 | + release = to_free + mdb_free; | |
2915 | + | |
2916 | + /* update fs free blocks counter for truncate case */ | |
2917 | + percpu_counter_add(&sbi->s_freeblocks_counter, release); | |
2918 | + | |
2919 | + /* update per-inode reservations */ | |
2920 | + BUG_ON(to_free > EXT4_I(inode)->i_reserved_data_blocks); | |
2921 | + EXT4_I(inode)->i_reserved_data_blocks -= to_free; | |
2922 | + | |
2923 | + BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); | |
2924 | + EXT4_I(inode)->i_reserved_meta_blocks = mdb; | |
2925 | + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); | |
2926 | +} | |
2927 | + | |
2928 | +static void ext4_da_page_release_reservation(struct page *page, | |
2929 | + unsigned long offset) | |
2930 | +{ | |
2931 | + int to_release = 0; | |
2932 | + struct buffer_head *head, *bh; | |
2933 | + unsigned int curr_off = 0; | |
2934 | + | |
2935 | + head = page_buffers(page); | |
2936 | + bh = head; | |
2937 | + do { | |
2938 | + unsigned int next_off = curr_off + bh->b_size; | |
2939 | + | |
2940 | + if ((offset <= curr_off) && (buffer_delay(bh))) { | |
2941 | + to_release++; | |
2942 | + clear_buffer_delay(bh); | |
2943 | + } | |
2944 | + curr_off = next_off; | |
2945 | + } while ((bh = bh->b_this_page) != head); | |
2946 | + ext4_da_release_space(page->mapping->host, to_release); | |
2947 | +} | |
2948 | + | |
2949 | +/* | |
2950 | + * Delayed allocation stuff | |
2951 | + */ | |
2952 | + | |
2953 | +struct mpage_da_data { | |
2954 | + struct inode *inode; | |
2955 | + struct buffer_head lbh; /* extent of blocks */ | |
2956 | + unsigned long first_page, next_page; /* extent of pages */ | |
2957 | + get_block_t *get_block; | |
2958 | + struct writeback_control *wbc; | |
2959 | + int io_done; | |
2960 | + long pages_written; | |
2961 | +}; | |
2962 | + | |
2963 | +/* | |
2964 | + * mpage_da_submit_io - walks through extent of pages and try to write | |
2965 | + * them with writepage() call back | |
2966 | + * | |
2967 | + * @mpd->inode: inode | |
2968 | + * @mpd->first_page: first page of the extent | |
2969 | + * @mpd->next_page: page after the last page of the extent | |
2970 | + * @mpd->get_block: the filesystem's block mapper function | |
2971 | + * | |
2972 | + * By the time mpage_da_submit_io() is called we expect all blocks | |
2973 | + * to be allocated. this may be wrong if allocation failed. | |
2974 | + * | |
2975 | + * As pages are already locked by write_cache_pages(), we can't use it | |
2976 | + */ | |
2977 | +static int mpage_da_submit_io(struct mpage_da_data *mpd) | |
2978 | +{ | |
2979 | + struct address_space *mapping = mpd->inode->i_mapping; | |
2980 | + int ret = 0, err, nr_pages, i; | |
2981 | + unsigned long index, end; | |
2982 | + struct pagevec pvec; | |
2983 | + | |
2984 | + BUG_ON(mpd->next_page <= mpd->first_page); | |
2985 | + pagevec_init(&pvec, 0); | |
2986 | + index = mpd->first_page; | |
2987 | + end = mpd->next_page - 1; | |
2988 | + | |
2989 | + while (index <= end) { | |
2990 | + /* XXX: optimize tail */ | |
2991 | + nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); | |
2992 | + if (nr_pages == 0) | |
2993 | + break; | |
2994 | + for (i = 0; i < nr_pages; i++) { | |
2995 | + struct page *page = pvec.pages[i]; | |
2996 | + | |
2997 | + index = page->index; | |
2998 | + if (index > end) | |
2999 | + break; | |
3000 | + index++; | |
3001 | + | |
3002 | + err = mapping->a_ops->writepage(page, mpd->wbc); | |
3003 | + if (!err) | |
3004 | + mpd->pages_written++; | |
3005 | + /* | |
3006 | + * In error case, we have to continue because | |
3007 | + * remaining pages are still locked | |
3008 | + * XXX: unlock and re-dirty them? | |
3009 | + */ | |
3010 | + if (ret == 0) | |
3011 | + ret = err; | |
3012 | + } | |
3013 | + pagevec_release(&pvec); | |
3014 | + } | |
3015 | + return ret; | |
3016 | +} | |
3017 | + | |
3018 | +/* | |
3019 | + * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers | |
3020 | + * | |
3021 | + * @mpd->inode - inode to walk through | |
3022 | + * @exbh->b_blocknr - first block on a disk | |
3023 | + * @exbh->b_size - amount of space in bytes | |
3024 | + * @logical - first logical block to start assignment with | |
3025 | + * | |
3026 | + * the function goes through all passed space and put actual disk | |
3027 | + * block numbers into buffer heads, dropping BH_Delay | |
3028 | + */ | |
3029 | +static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical, | |
3030 | + struct buffer_head *exbh) | |
3031 | +{ | |
3032 | + struct inode *inode = mpd->inode; | |
3033 | + struct address_space *mapping = inode->i_mapping; | |
3034 | + int blocks = exbh->b_size >> inode->i_blkbits; | |
3035 | + sector_t pblock = exbh->b_blocknr, cur_logical; | |
3036 | + struct buffer_head *head, *bh; | |
3037 | + pgoff_t index, end; | |
3038 | + struct pagevec pvec; | |
3039 | + int nr_pages, i; | |
3040 | + | |
3041 | + index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits); | |
3042 | + end = (logical + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits); | |
3043 | + cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); | |
3044 | + | |
3045 | + pagevec_init(&pvec, 0); | |
3046 | + | |
3047 | + while (index <= end) { | |
3048 | + /* XXX: optimize tail */ | |
3049 | + nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); | |
3050 | + if (nr_pages == 0) | |
3051 | + break; | |
3052 | + for (i = 0; i < nr_pages; i++) { | |
3053 | + struct page *page = pvec.pages[i]; | |
3054 | + | |
3055 | + index = page->index; | |
3056 | + if (index > end) | |
3057 | + break; | |
3058 | + index++; | |
3059 | + | |
3060 | + BUG_ON(!PageLocked(page)); | |
3061 | + BUG_ON(PageWriteback(page)); | |
3062 | + BUG_ON(!page_has_buffers(page)); | |
3063 | + | |
3064 | + bh = page_buffers(page); | |
3065 | + head = bh; | |
3066 | + | |
3067 | + /* skip blocks out of the range */ | |
3068 | + do { | |
3069 | + if (cur_logical >= logical) | |
3070 | + break; | |
3071 | + cur_logical++; | |
3072 | + } while ((bh = bh->b_this_page) != head); | |
3073 | + | |
3074 | + do { | |
3075 | + if (cur_logical >= logical + blocks) | |
3076 | + break; | |
3077 | + if (buffer_delay(bh)) { | |
3078 | + bh->b_blocknr = pblock; | |
3079 | + clear_buffer_delay(bh); | |
3080 | + bh->b_bdev = inode->i_sb->s_bdev; | |
3081 | + } else if (buffer_unwritten(bh)) { | |
3082 | + bh->b_blocknr = pblock; | |
3083 | + clear_buffer_unwritten(bh); | |
3084 | + set_buffer_mapped(bh); | |
3085 | + set_buffer_new(bh); | |
3086 | + bh->b_bdev = inode->i_sb->s_bdev; | |
3087 | + } else if (buffer_mapped(bh)) | |
3088 | + BUG_ON(bh->b_blocknr != pblock); | |
3089 | + | |
3090 | + cur_logical++; | |
3091 | + pblock++; | |
3092 | + } while ((bh = bh->b_this_page) != head); | |
3093 | + } | |
3094 | + pagevec_release(&pvec); | |
3095 | + } | |
3096 | +} | |
3097 | + | |
3098 | + | |
3099 | +/* | |
3100 | + * __unmap_underlying_blocks - just a helper function to unmap | |
3101 | + * set of blocks described by @bh | |
3102 | + */ | |
3103 | +static inline void __unmap_underlying_blocks(struct inode *inode, | |
3104 | + struct buffer_head *bh) | |
3105 | +{ | |
3106 | + struct block_device *bdev = inode->i_sb->s_bdev; | |
3107 | + int blocks, i; | |
3108 | + | |
3109 | + blocks = bh->b_size >> inode->i_blkbits; | |
3110 | + for (i = 0; i < blocks; i++) | |
3111 | + unmap_underlying_metadata(bdev, bh->b_blocknr + i); | |
3112 | +} | |
3113 | + | |
3114 | +/* | |
3115 | + * mpage_da_map_blocks - go through given space | |
3116 | + * | |
3117 | + * @mpd->lbh - bh describing space | |
3118 | + * @mpd->get_block - the filesystem's block mapper function | |
3119 | + * | |
3120 | + * The function skips space we know is already mapped to disk blocks. | |
3121 | + * | |
3122 | + */ | |
3123 | +static void mpage_da_map_blocks(struct mpage_da_data *mpd) | |
3124 | +{ | |
3125 | + int err = 0; | |
3126 | + struct buffer_head *lbh = &mpd->lbh; | |
3127 | + sector_t next = lbh->b_blocknr; | |
3128 | + struct buffer_head new; | |
3129 | + | |
3130 | + /* | |
3131 | + * We consider only non-mapped and non-allocated blocks | |
3132 | + */ | |
3133 | + if (buffer_mapped(lbh) && !buffer_delay(lbh)) | |
3134 | + return; | |
3135 | + | |
3136 | + new.b_state = lbh->b_state; | |
3137 | + new.b_blocknr = 0; | |
3138 | + new.b_size = lbh->b_size; | |
3139 | + | |
3140 | + /* | |
3141 | + * If we didn't accumulate anything | |
3142 | + * to write simply return | |
3143 | + */ | |
3144 | + if (!new.b_size) | |
3145 | + return; | |
3146 | + err = mpd->get_block(mpd->inode, next, &new, 1); | |
3147 | + if (err) | |
3148 | + return; | |
3149 | + BUG_ON(new.b_size == 0); | |
3150 | + | |
3151 | + if (buffer_new(&new)) | |
3152 | + __unmap_underlying_blocks(mpd->inode, &new); | |
3153 | + | |
3154 | + /* | |
3155 | + * If blocks are delayed marked, we need to | |
3156 | + * put actual blocknr and drop delayed bit | |
3157 | + */ | |
3158 | + if (buffer_delay(lbh) || buffer_unwritten(lbh)) | |
3159 | + mpage_put_bnr_to_bhs(mpd, next, &new); | |
3160 | + | |
3161 | + return; | |
3162 | +} | |
3163 | + | |
3164 | +#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \ | |
3165 | + (1 << BH_Delay) | (1 << BH_Unwritten)) | |
3166 | + | |
3167 | +/* | |
3168 | + * mpage_add_bh_to_extent - try to add one more block to extent of blocks | |
3169 | + * | |
3170 | + * @mpd->lbh - extent of blocks | |
3171 | + * @logical - logical number of the block in the file | |
3172 | + * @bh - bh of the block (used to access block's state) | |
3173 | + * | |
3174 | + * the function is used to collect contig. blocks in same state | |
3175 | + */ | |
3176 | +static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, | |
3177 | + sector_t logical, struct buffer_head *bh) | |
3178 | +{ | |
3179 | + sector_t next; | |
3180 | + size_t b_size = bh->b_size; | |
3181 | + struct buffer_head *lbh = &mpd->lbh; | |
3182 | + int nrblocks = lbh->b_size >> mpd->inode->i_blkbits; | |
3183 | + | |
3184 | + /* check if thereserved journal credits might overflow */ | |
3185 | + if (!(EXT4_I(mpd->inode)->i_flags & EXT4_EXTENTS_FL)) { | |
3186 | + if (nrblocks >= EXT4_MAX_TRANS_DATA) { | |
3187 | + /* | |
3188 | + * With non-extent format we are limited by the journal | |
3189 | + * credit available. Total credit needed to insert | |
3190 | + * nrblocks contiguous blocks is dependent on the | |
3191 | + * nrblocks. So limit nrblocks. | |
3192 | + */ | |
3193 | + goto flush_it; | |
3194 | + } else if ((nrblocks + (b_size >> mpd->inode->i_blkbits)) > | |
3195 | + EXT4_MAX_TRANS_DATA) { | |
3196 | + /* | |
3197 | + * Adding the new buffer_head would make it cross the | |
3198 | + * allowed limit for which we have journal credit | |
3199 | + * reserved. So limit the new bh->b_size | |
3200 | + */ | |
3201 | + b_size = (EXT4_MAX_TRANS_DATA - nrblocks) << | |
3202 | + mpd->inode->i_blkbits; | |
3203 | + /* we will do mpage_da_submit_io in the next loop */ | |
3204 | + } | |
3205 | + } | |
3206 | + /* | |
3207 | + * First block in the extent | |
3208 | + */ | |
3209 | + if (lbh->b_size == 0) { | |
3210 | + lbh->b_blocknr = logical; | |
3211 | + lbh->b_size = b_size; | |
3212 | + lbh->b_state = bh->b_state & BH_FLAGS; | |
3213 | + return; | |
3214 | + } | |
3215 | + | |
3216 | + next = lbh->b_blocknr + nrblocks; | |
3217 | + /* | |
3218 | + * Can we merge the block to our big extent? | |
3219 | + */ | |
3220 | + if (logical == next && (bh->b_state & BH_FLAGS) == lbh->b_state) { | |
3221 | + lbh->b_size += b_size; | |
3222 | + return; | |
3223 | + } | |
3224 | + | |
3225 | +flush_it: | |
3226 | + /* | |
3227 | + * We couldn't merge the block to our extent, so we | |
3228 | + * need to flush current extent and start new one | |
3229 | + */ | |
3230 | + mpage_da_map_blocks(mpd); | |
3231 | + mpage_da_submit_io(mpd); | |
3232 | + mpd->io_done = 1; | |
3233 | + return; | |
3234 | +} | |
3235 | + | |
3236 | +/* | |
3237 | + * __mpage_da_writepage - finds extent of pages and blocks | |
3238 | + * | |
3239 | + * @page: page to consider | |
3240 | + * @wbc: not used, we just follow rules | |
3241 | + * @data: context | |
3242 | + * | |
3243 | + * The function finds extents of pages and scan them for all blocks. | |
3244 | + */ | |
3245 | +static int __mpage_da_writepage(struct page *page, | |
3246 | + struct writeback_control *wbc, void *data) | |
3247 | +{ | |
3248 | + struct mpage_da_data *mpd = data; | |
3249 | + struct inode *inode = mpd->inode; | |
3250 | + struct buffer_head *bh, *head, fake; | |
3251 | + sector_t logical; | |
3252 | + | |
3253 | + if (mpd->io_done) { | |
3254 | + /* | |
3255 | + * Rest of the page in the page_vec | |
3256 | + * redirty then and skip then. We will | |
3257 | + * try to to write them again after | |
3258 | + * starting a new transaction | |
3259 | + */ | |
3260 | + redirty_page_for_writepage(wbc, page); | |
3261 | + unlock_page(page); | |
3262 | + return MPAGE_DA_EXTENT_TAIL; | |
3263 | + } | |
3264 | + /* | |
3265 | + * Can we merge this page to current extent? | |
3266 | + */ | |
3267 | + if (mpd->next_page != page->index) { | |
3268 | + /* | |
3269 | + * Nope, we can't. So, we map non-allocated blocks | |
3270 | + * and start IO on them using writepage() | |
3271 | + */ | |
3272 | + if (mpd->next_page != mpd->first_page) { | |
3273 | + mpage_da_map_blocks(mpd); | |
3274 | + mpage_da_submit_io(mpd); | |
3275 | + /* | |
3276 | + * skip rest of the page in the page_vec | |
3277 | + */ | |
3278 | + mpd->io_done = 1; | |
3279 | + redirty_page_for_writepage(wbc, page); | |
3280 | + unlock_page(page); | |
3281 | + return MPAGE_DA_EXTENT_TAIL; | |
3282 | + } | |
3283 | + | |
3284 | + /* | |
3285 | + * Start next extent of pages ... | |
3286 | + */ | |
3287 | + mpd->first_page = page->index; | |
3288 | + | |
3289 | + /* | |
3290 | + * ... and blocks | |
3291 | + */ | |
3292 | + mpd->lbh.b_size = 0; | |
3293 | + mpd->lbh.b_state = 0; | |
3294 | + mpd->lbh.b_blocknr = 0; | |
3295 | + } | |
3296 | + | |
3297 | + mpd->next_page = page->index + 1; | |
3298 | + logical = (sector_t) page->index << | |
3299 | + (PAGE_CACHE_SHIFT - inode->i_blkbits); | |
3300 | + | |
3301 | + if (!page_has_buffers(page)) { | |
3302 | + /* | |
3303 | + * There is no attached buffer heads yet (mmap?) | |
3304 | + * we treat the page asfull of dirty blocks | |
3305 | + */ | |
3306 | + bh = &fake; | |
3307 | + bh->b_size = PAGE_CACHE_SIZE; | |
3308 | + bh->b_state = 0; | |
3309 | + set_buffer_dirty(bh); | |
3310 | + set_buffer_uptodate(bh); | |
3311 | + mpage_add_bh_to_extent(mpd, logical, bh); | |
3312 | + if (mpd->io_done) | |
3313 | + return MPAGE_DA_EXTENT_TAIL; | |
3314 | + } else { | |
3315 | + /* | |
3316 | + * Page with regular buffer heads, just add all dirty ones | |
3317 | + */ | |
3318 | + head = page_buffers(page); | |
3319 | + bh = head; | |
3320 | + do { | |
3321 | + BUG_ON(buffer_locked(bh)); | |
3322 | + if (buffer_dirty(bh) && | |
3323 | + (!buffer_mapped(bh) || buffer_delay(bh))) { | |
3324 | + mpage_add_bh_to_extent(mpd, logical, bh); | |
3325 | + if (mpd->io_done) | |
3326 | + return MPAGE_DA_EXTENT_TAIL; | |
3327 | + } | |
3328 | + logical++; | |
3329 | + } while ((bh = bh->b_this_page) != head); | |
3330 | + } | |
3331 | + | |
3332 | + return 0; | |
3333 | +} | |
3334 | + | |
3335 | +/* | |
3336 | + * mpage_da_writepages - walk the list of dirty pages of the given | |
3337 | + * address space, allocates non-allocated blocks, maps newly-allocated | |
3338 | + * blocks to existing bhs and issue IO them | |
3339 | + * | |
3340 | + * @mapping: address space structure to write | |
3341 | + * @wbc: subtract the number of written pages from *@wbc->nr_to_write | |
3342 | + * @get_block: the filesystem's block mapper function. | |
3343 | + * | |
3344 | + * This is a library function, which implements the writepages() | |
3345 | + * address_space_operation. | |
3346 | + */ | |
3347 | +static int mpage_da_writepages(struct address_space *mapping, | |
3348 | + struct writeback_control *wbc, | |
3349 | + get_block_t get_block) | |
3350 | +{ | |
3351 | + struct mpage_da_data mpd; | |
3352 | + long to_write; | |
3353 | + int ret; | |
3354 | + | |
3355 | + if (!get_block) | |
3356 | + return generic_writepages(mapping, wbc); | |
3357 | + | |
3358 | + mpd.wbc = wbc; | |
3359 | + mpd.inode = mapping->host; | |
3360 | + mpd.lbh.b_size = 0; | |
3361 | + mpd.lbh.b_state = 0; | |
3362 | + mpd.lbh.b_blocknr = 0; | |
3363 | + mpd.first_page = 0; | |
3364 | + mpd.next_page = 0; | |
3365 | + mpd.get_block = get_block; | |
3366 | + mpd.io_done = 0; | |
3367 | + mpd.pages_written = 0; | |
3368 | + | |
3369 | + to_write = wbc->nr_to_write; | |
3370 | + | |
3371 | + ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, &mpd); | |
3372 | + | |
3373 | + /* | |
3374 | + * Handle last extent of pages | |
3375 | + */ | |
3376 | + if (!mpd.io_done && mpd.next_page != mpd.first_page) { | |
3377 | + mpage_da_map_blocks(&mpd); | |
3378 | + mpage_da_submit_io(&mpd); | |
3379 | + } | |
3380 | + | |
3381 | + wbc->nr_to_write = to_write - mpd.pages_written; | |
3382 | + return ret; | |
3383 | +} | |
3384 | + | |
3385 | +/* | |
3386 | + * this is a special callback for ->write_begin() only | |
3387 | + * it's intention is to return mapped block or reserve space | |
3388 | + */ | |
3389 | +static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, | |
3390 | + struct buffer_head *bh_result, int create) | |
3391 | +{ | |
3392 | + int ret = 0; | |
3393 | + | |
3394 | + BUG_ON(create == 0); | |
3395 | + BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize); | |
3396 | + | |
3397 | + /* | |
3398 | + * first, we need to know whether the block is allocated already | |
3399 | + * preallocated blocks are unmapped but should treated | |
3400 | + * the same as allocated blocks. | |
3401 | + */ | |
3402 | + ret = ext4_get_blocks_wrap(NULL, inode, iblock, 1, bh_result, 0, 0, 0); | |
3403 | + if ((ret == 0) && !buffer_delay(bh_result)) { | |
3404 | + /* the block isn't (pre)allocated yet, let's reserve space */ | |
3405 | + /* | |
3406 | + * XXX: __block_prepare_write() unmaps passed block, | |
3407 | + * is it OK? | |
3408 | + */ | |
3409 | + ret = ext4_da_reserve_space(inode, 1); | |
3410 | + if (ret) | |
3411 | + /* not enough space to reserve */ | |
3412 | + return ret; | |
3413 | + | |
3414 | + map_bh(bh_result, inode->i_sb, 0); | |
3415 | + set_buffer_new(bh_result); | |
3416 | + set_buffer_delay(bh_result); | |
3417 | + } else if (ret > 0) { | |
3418 | + bh_result->b_size = (ret << inode->i_blkbits); | |
3419 | + ret = 0; | |
3420 | + } | |
3421 | + | |
3422 | + return ret; | |
3423 | +} | |
3424 | +#define EXT4_DELALLOC_RSVED 1 | |
3425 | +static int ext4_da_get_block_write(struct inode *inode, sector_t iblock, | |
3426 | + struct buffer_head *bh_result, int create) | |
3427 | +{ | |
3428 | + int ret; | |
3429 | + unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; | |
3430 | + loff_t disksize = EXT4_I(inode)->i_disksize; | |
3431 | + handle_t *handle = NULL; | |
3432 | + | |
3433 | + handle = ext4_journal_current_handle(); | |
3434 | + if (!handle) { | |
3435 | + ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks, | |
3436 | + bh_result, 0, 0, 0); | |
3437 | + BUG_ON(!ret); | |
3438 | + } else { | |
3439 | + ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks, | |
3440 | + bh_result, create, 0, EXT4_DELALLOC_RSVED); | |
3441 | + } | |
3442 | + | |
3443 | + if (ret > 0) { | |
3444 | + bh_result->b_size = (ret << inode->i_blkbits); | |
3445 | + | |
3446 | + /* | |
3447 | + * Update on-disk size along with block allocation | |
3448 | + * we don't use 'extend_disksize' as size may change | |
3449 | + * within already allocated block -bzzz | |
3450 | + */ | |
3451 | + disksize = ((loff_t) iblock + ret) << inode->i_blkbits; | |
3452 | + if (disksize > i_size_read(inode)) | |
3453 | + disksize = i_size_read(inode); | |
3454 | + if (disksize > EXT4_I(inode)->i_disksize) { | |
3455 | + /* | |
3456 | + * XXX: replace with spinlock if seen contended -bzzz | |
3457 | + */ | |
3458 | + down_write(&EXT4_I(inode)->i_data_sem); | |
3459 | + if (disksize > EXT4_I(inode)->i_disksize) | |
3460 | + EXT4_I(inode)->i_disksize = disksize; | |
3461 | + up_write(&EXT4_I(inode)->i_data_sem); | |
3462 | + | |
3463 | + if (EXT4_I(inode)->i_disksize == disksize) { | |
3464 | + ret = ext4_mark_inode_dirty(handle, inode); | |
3465 | + return ret; | |
3466 | + } | |
3467 | + } | |
3468 | + ret = 0; | |
3469 | + } | |
3470 | + return ret; | |
3471 | +} | |
3472 | + | |
3473 | +static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh) | |
3474 | +{ | |
3475 | + /* | |
3476 | + * unmapped buffer is possible for holes. | |
3477 | + * delay buffer is possible with delayed allocation | |
3478 | + */ | |
3479 | + return ((!buffer_mapped(bh) || buffer_delay(bh)) && buffer_dirty(bh)); | |
3480 | +} | |
3481 | + | |
3482 | +static int ext4_normal_get_block_write(struct inode *inode, sector_t iblock, | |
3483 | + struct buffer_head *bh_result, int create) | |
3484 | +{ | |
3485 | + int ret = 0; | |
3486 | + unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; | |
3487 | + | |
3488 | + /* | |
3489 | + * we don't want to do block allocation in writepage | |
3490 | + * so call get_block_wrap with create = 0 | |
3491 | + */ | |
3492 | + ret = ext4_get_blocks_wrap(NULL, inode, iblock, max_blocks, | |
3493 | + bh_result, 0, 0, 0); | |
3494 | + if (ret > 0) { | |
3495 | + bh_result->b_size = (ret << inode->i_blkbits); | |
3496 | + ret = 0; | |
3497 | + } | |
3498 | + return ret; | |
3499 | +} | |
3500 | + | |
3501 | +/* | |
3502 | + * get called vi ext4_da_writepages after taking page lock (have journal handle) | |
3503 | + * get called via journal_submit_inode_data_buffers (no journal handle) | |
3504 | + * get called via shrink_page_list via pdflush (no journal handle) | |
3505 | + * or grab_page_cache when doing write_begin (have journal handle) | |
3506 | + */ | |
3507 | +static int ext4_da_writepage(struct page *page, | |
3508 | + struct writeback_control *wbc) | |
3509 | +{ | |
3510 | + int ret = 0; | |
3511 | + loff_t size; | |
3512 | + unsigned long len; | |
3513 | + struct buffer_head *page_bufs; | |
3514 | + struct inode *inode = page->mapping->host; | |
3515 | + | |
3516 | + size = i_size_read(inode); | |
3517 | + if (page->index == size >> PAGE_CACHE_SHIFT) | |
3518 | + len = size & ~PAGE_CACHE_MASK; | |
3519 | + else | |
3520 | + len = PAGE_CACHE_SIZE; | |
3521 | + | |
3522 | + if (page_has_buffers(page)) { | |
3523 | + page_bufs = page_buffers(page); | |
3524 | + if (walk_page_buffers(NULL, page_bufs, 0, len, NULL, | |
3525 | + ext4_bh_unmapped_or_delay)) { | |
3526 | + /* | |
3527 | + * We don't want to do block allocation | |
3528 | + * So redirty the page and return | |
3529 | + * We may reach here when we do a journal commit | |
3530 | + * via journal_submit_inode_data_buffers. | |
3531 | + * If we don't have mapping block we just ignore | |
3532 | + * them. We can also reach here via shrink_page_list | |
3533 | + */ | |
3534 | + redirty_page_for_writepage(wbc, page); | |
3535 | + unlock_page(page); | |
3536 | + return 0; | |
3537 | + } | |
3538 | + } else { | |
3539 | + /* | |
3540 | + * The test for page_has_buffers() is subtle: | |
3541 | + * We know the page is dirty but it lost buffers. That means | |
3542 | + * that at some moment in time after write_begin()/write_end() | |
3543 | + * has been called all buffers have been clean and thus they | |
3544 | + * must have been written at least once. So they are all | |
3545 | + * mapped and we can happily proceed with mapping them | |
3546 | + * and writing the page. | |
3547 | + * | |
3548 | + * Try to initialize the buffer_heads and check whether | |
3549 | + * all are mapped and non delay. We don't want to | |
3550 | + * do block allocation here. | |
3551 | + */ | |
3552 | + ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, | |
3553 | + ext4_normal_get_block_write); | |
3554 | + if (!ret) { | |
3555 | + page_bufs = page_buffers(page); | |
3556 | + /* check whether all are mapped and non delay */ | |
3557 | + if (walk_page_buffers(NULL, page_bufs, 0, len, NULL, | |
3558 | + ext4_bh_unmapped_or_delay)) { | |
3559 | + redirty_page_for_writepage(wbc, page); | |
3560 | + unlock_page(page); | |
3561 | + return 0; | |
3562 | + } | |
3563 | + } else { | |
3564 | + /* | |
3565 | + * We can't do block allocation here | |
3566 | + * so just redity the page and unlock | |
3567 | + * and return | |
3568 | + */ | |
3569 | + redirty_page_for_writepage(wbc, page); | |
3570 | + unlock_page(page); | |
3571 | + return 0; | |
3572 | + } | |
3573 | + } | |
3574 | + | |
3575 | + if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) | |
3576 | + ret = nobh_writepage(page, ext4_normal_get_block_write, wbc); | |
3577 | + else | |
3578 | + ret = block_write_full_page(page, | |
3579 | + ext4_normal_get_block_write, | |
3580 | + wbc); | |
3581 | + | |
3582 | + return ret; | |
3583 | +} | |
3584 | + | |
3585 | +/* | |
3586 | + * This is called via ext4_da_writepages() to | |
3587 | + * calulate the total number of credits to reserve to fit | |
3588 | + * a single extent allocation into a single transaction, | |
3589 | + * ext4_da_writpeages() will loop calling this before | |
3590 | + * the block allocation. | |
3591 | + */ | |
3592 | + | |
3593 | +static int ext4_da_writepages_trans_blocks(struct inode *inode) | |
3594 | +{ | |
3595 | + int max_blocks = EXT4_I(inode)->i_reserved_data_blocks; | |
3596 | + | |
3597 | + /* | |
3598 | + * With non-extent format the journal credit needed to | |
3599 | + * insert nrblocks contiguous block is dependent on | |
3600 | + * number of contiguous block. So we will limit | |
3601 | + * number of contiguous block to a sane value | |
3602 | + */ | |
3603 | + if (!(inode->i_flags & EXT4_EXTENTS_FL) && | |
3604 | + (max_blocks > EXT4_MAX_TRANS_DATA)) | |
3605 | + max_blocks = EXT4_MAX_TRANS_DATA; | |
3606 | + | |
3607 | + return ext4_chunk_trans_blocks(inode, max_blocks); | |
3608 | +} | |
3609 | + | |
3610 | +static int ext4_da_writepages(struct address_space *mapping, | |
3611 | + struct writeback_control *wbc) | |
3612 | +{ | |
3613 | + handle_t *handle = NULL; | |
3614 | + loff_t range_start = 0; | |
3615 | + struct inode *inode = mapping->host; | |
3616 | + int needed_blocks, ret = 0, nr_to_writebump = 0; | |
3617 | + long to_write, pages_skipped = 0; | |
3618 | + struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); | |
3619 | + | |
3620 | + /* | |
3621 | + * No pages to write? This is mainly a kludge to avoid starting | |
3622 | + * a transaction for special inodes like journal inode on last iput() | |
3623 | + * because that could violate lock ordering on umount | |
3624 | + */ | |
3625 | + if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) | |
3626 | + return 0; | |
3627 | + /* | |
3628 | + * Make sure nr_to_write is >= sbi->s_mb_stream_request | |
3629 | + * This make sure small files blocks are allocated in | |
3630 | + * single attempt. This ensure that small files | |
3631 | + * get less fragmented. | |
3632 | + */ | |
3633 | + if (wbc->nr_to_write < sbi->s_mb_stream_request) { | |
3634 | + nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write; | |
3635 | + wbc->nr_to_write = sbi->s_mb_stream_request; | |
3636 | + } | |
3637 | + | |
3638 | + if (!wbc->range_cyclic) | |
3639 | + /* | |
3640 | + * If range_cyclic is not set force range_cont | |
3641 | + * and save the old writeback_index | |
3642 | + */ | |
3643 | + wbc->range_cont = 1; | |
3644 | + | |
3645 | + range_start = wbc->range_start; | |
3646 | + pages_skipped = wbc->pages_skipped; | |
3647 | + | |
3648 | +restart_loop: | |
3649 | + to_write = wbc->nr_to_write; | |
3650 | + while (!ret && to_write > 0) { | |
3651 | + | |
3652 | + /* | |
3653 | + * we insert one extent at a time. So we need | |
3654 | + * credit needed for single extent allocation. | |
3655 | + * journalled mode is currently not supported | |
3656 | + * by delalloc | |
3657 | + */ | |
3658 | + BUG_ON(ext4_should_journal_data(inode)); | |
3659 | + needed_blocks = ext4_da_writepages_trans_blocks(inode); | |
3660 | + | |
3661 | + /* start a new transaction*/ | |
3662 | + handle = ext4_journal_start(inode, needed_blocks); | |
3663 | + if (IS_ERR(handle)) { | |
3664 | + ret = PTR_ERR(handle); | |
3665 | + printk(KERN_EMERG "%s: jbd2_start: " | |
3666 | + "%ld pages, ino %lu; err %d\n", __func__, | |
3667 | + wbc->nr_to_write, inode->i_ino, ret); | |
3668 | + dump_stack(); | |
3669 | + goto out_writepages; | |
3670 | + } | |
3671 | + if (ext4_should_order_data(inode)) { | |
3672 | + /* | |
3673 | + * With ordered mode we need to add | |
3674 | + * the inode to the journal handl | |
3675 | + * when we do block allocation. | |
3676 | + */ | |
3677 | + ret = ext4_jbd2_file_inode(handle, inode); | |
3678 | + if (ret) { | |
3679 | + ext4_journal_stop(handle); | |
3680 | + goto out_writepages; | |
3681 | + } | |
3682 | + } | |
3683 | + | |
3684 | + to_write -= wbc->nr_to_write; | |
3685 | + ret = mpage_da_writepages(mapping, wbc, | |
3686 | + ext4_da_get_block_write); | |
3687 | + ext4_journal_stop(handle); | |
3688 | + if (ret == MPAGE_DA_EXTENT_TAIL) { | |
3689 | + /* | |
3690 | + * got one extent now try with | |
3691 | + * rest of the pages | |
3692 | + */ | |
3693 | + to_write += wbc->nr_to_write; | |
3694 | + ret = 0; | |
3695 | + } else if (wbc->nr_to_write) { | |
3696 | + /* | |
3697 | + * There is no more writeout needed | |
3698 | + * or we requested for a noblocking writeout | |
3699 | + * and we found the device congested | |
3700 | + */ | |
3701 | + to_write += wbc->nr_to_write; | |
3702 | + break; | |
3703 | + } | |
3704 | + wbc->nr_to_write = to_write; | |
3705 | + } | |
3706 | + | |
3707 | + if (wbc->range_cont && (pages_skipped != wbc->pages_skipped)) { | |
3708 | + /* We skipped pages in this loop */ | |
3709 | + wbc->range_start = range_start; | |
3710 | + wbc->nr_to_write = to_write + | |
3711 | + wbc->pages_skipped - pages_skipped; | |
3712 | + wbc->pages_skipped = pages_skipped; | |
3713 | + goto restart_loop; | |
3714 | + } | |
3715 | + | |
3716 | +out_writepages: | |
3717 | + wbc->nr_to_write = to_write - nr_to_writebump; | |
3718 | + wbc->range_start = range_start; | |
3719 | + return ret; | |
3720 | +} | |
3721 | + | |
3722 | +static int ext4_da_write_begin(struct file *file, struct address_space *mapping, | |
3723 | + loff_t pos, unsigned len, unsigned flags, | |
3724 | + struct page **pagep, void **fsdata) | |
3725 | +{ | |
3726 | + int ret, retries = 0; | |
3727 | + struct page *page; | |
3728 | + pgoff_t index; | |
3729 | + unsigned from, to; | |
3730 | + struct inode *inode = mapping->host; | |
3731 | + handle_t *handle; | |
3732 | + | |
3733 | + index = pos >> PAGE_CACHE_SHIFT; | |
3734 | + from = pos & (PAGE_CACHE_SIZE - 1); | |
3735 | + to = from + len; | |
3736 | + | |
3737 | +retry: | |
3738 | + /* | |
3739 | + * With delayed allocation, we don't log the i_disksize update | |
3740 | + * if there is delayed block allocation. But we still need | |
3741 | + * to journalling the i_disksize update if writes to the end | |
3742 | + * of file which has an already mapped buffer. | |
3743 | + */ | |
3744 | + handle = ext4_journal_start(inode, 1); | |
3745 | + if (IS_ERR(handle)) { | |
3746 | + ret = PTR_ERR(handle); | |
3747 | + goto out; | |
3748 | + } | |
3749 | + | |
3750 | + page = __grab_cache_page(mapping, index); | |
3751 | + if (!page) { | |
3752 | + ext4_journal_stop(handle); | |
3753 | + ret = -ENOMEM; | |
3754 | + goto out; | |
3755 | + } | |
3756 | + *pagep = page; | |
3757 | + | |
3758 | + ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, | |
3759 | + ext4_da_get_block_prep); | |
3760 | + if (ret < 0) { | |
3761 | + unlock_page(page); | |
3762 | + ext4_journal_stop(handle); | |
3763 | + page_cache_release(page); | |
3764 | + } | |
3765 | + | |
3766 | + if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) | |
3767 | + goto retry; | |
3768 | +out: | |
3769 | + return ret; | |
3770 | +} | |
3771 | + | |
3772 | +/* | |
3773 | + * Check if we should update i_disksize | |
3774 | + * when write to the end of file but not require block allocation | |
3775 | + */ | |
3776 | +static int ext4_da_should_update_i_disksize(struct page *page, | |
3777 | + unsigned long offset) | |
3778 | +{ | |
3779 | + struct buffer_head *bh; | |
3780 | + struct inode *inode = page->mapping->host; | |
3781 | + unsigned int idx; | |
3782 | + int i; | |
3783 | + | |
3784 | + bh = page_buffers(page); | |
3785 | + idx = offset >> inode->i_blkbits; | |
3786 | + | |
3787 | + for (i=0; i < idx; i++) | |
3788 | + bh = bh->b_this_page; | |
3789 | + | |
3790 | + if (!buffer_mapped(bh) || (buffer_delay(bh))) | |
3791 | + return 0; | |
3792 | + return 1; | |
3793 | +} | |
3794 | + | |
3795 | +static int ext4_da_write_end(struct file *file, | |
3796 | + struct address_space *mapping, | |
3797 | + loff_t pos, unsigned len, unsigned copied, | |
3798 | + struct page *page, void *fsdata) | |
3799 | +{ | |
3800 | + struct inode *inode = mapping->host; | |
3801 | + int ret = 0, ret2; | |
3802 | + handle_t *handle = ext4_journal_current_handle(); | |
3803 | + loff_t new_i_size; | |
3804 | + unsigned long start, end; | |
3805 | + | |
3806 | + start = pos & (PAGE_CACHE_SIZE - 1); | |
3807 | + end = start + copied -1; | |
3808 | + | |
3809 | + /* | |
3810 | + * generic_write_end() will run mark_inode_dirty() if i_size | |
3811 | + * changes. So let's piggyback the i_disksize mark_inode_dirty | |
3812 | + * into that. | |
3813 | + */ | |
3814 | + | |
3815 | + new_i_size = pos + copied; | |
3816 | + if (new_i_size > EXT4_I(inode)->i_disksize) { | |
3817 | + if (ext4_da_should_update_i_disksize(page, end)) { | |
3818 | + down_write(&EXT4_I(inode)->i_data_sem); | |
3819 | + if (new_i_size > EXT4_I(inode)->i_disksize) { | |
3820 | + /* | |
3821 | + * Updating i_disksize when extending file | |
3822 | + * without needing block allocation | |
3823 | + */ | |
3824 | + if (ext4_should_order_data(inode)) | |
3825 | + ret = ext4_jbd2_file_inode(handle, | |
3826 | + inode); | |
3827 | + | |
3828 | + EXT4_I(inode)->i_disksize = new_i_size; | |
3829 | + } | |
3830 | + up_write(&EXT4_I(inode)->i_data_sem); | |
3831 | + } | |
3832 | + } | |
3833 | + ret2 = generic_write_end(file, mapping, pos, len, copied, | |
3834 | + page, fsdata); | |
3835 | + copied = ret2; | |
3836 | + if (ret2 < 0) | |
3837 | + ret = ret2; | |
3838 | + ret2 = ext4_journal_stop(handle); | |
3839 | + if (!ret) | |
3840 | + ret = ret2; | |
3841 | + | |
3842 | + return ret ? ret : copied; | |
3843 | +} | |
3844 | + | |
3845 | +static void ext4_da_invalidatepage(struct page *page, unsigned long offset) | |
3846 | +{ | |
3847 | + /* | |
3848 | + * Drop reserved blocks | |
3849 | + */ | |
3850 | + BUG_ON(!PageLocked(page)); | |
3851 | + if (!page_has_buffers(page)) | |
3852 | + goto out; | |
3853 | + | |
3854 | + ext4_da_page_release_reservation(page, offset); | |
3855 | + | |
3856 | +out: | |
3857 | + ext4_invalidatepage(page, offset); | |
3858 | + | |
3859 | + return; | |
3860 | +} | |
3861 | + | |
3862 | + | |
3863 | /* | |
3864 | * bmap() is special. It gets used by applications such as lilo and by | |
3865 | * the swapper to find the on-disk block of a specific piece of data. | |
3866 | @@ -1418,6 +2552,16 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block) | |
3867 | journal_t *journal; | |
3868 | int err; | |
3869 | ||
3870 | + if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) && | |
3871 | + test_opt(inode->i_sb, DELALLOC)) { | |
3872 | + /* | |
3873 | + * With delalloc we want to sync the file | |
3874 | + * so that we can make sure we allocate | |
3875 | + * blocks for file | |
3876 | + */ | |
3877 | + filemap_write_and_wait(mapping); | |
3878 | + } | |
3879 | + | |
3880 | if (EXT4_I(inode)->i_state & EXT4_STATE_JDATA) { | |
3881 | /* | |
3882 | * This is a REALLY heavyweight approach, but the use of | |
3883 | @@ -1462,21 +2606,17 @@ static int bput_one(handle_t *handle, struct buffer_head *bh) | |
3884 | return 0; | |
3885 | } | |
3886 | ||
3887 | -static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh) | |
3888 | -{ | |
3889 | - if (buffer_mapped(bh)) | |
3890 | - return ext4_journal_dirty_data(handle, bh); | |
3891 | - return 0; | |
3892 | -} | |
3893 | - | |
3894 | /* | |
3895 | - * Note that we always start a transaction even if we're not journalling | |
3896 | - * data. This is to preserve ordering: any hole instantiation within | |
3897 | - * __block_write_full_page -> ext4_get_block() should be journalled | |
3898 | - * along with the data so we don't crash and then get metadata which | |
3899 | - * refers to old data. | |
3900 | + * Note that we don't need to start a transaction unless we're journaling data | |
3901 | + * because we should have holes filled from ext4_page_mkwrite(). We even don't | |
3902 | + * need to file the inode to the transaction's list in ordered mode because if | |
3903 | + * we are writing back data added by write(), the inode is already there and if | |
3904 | + * we are writing back data modified via mmap(), noone guarantees in which | |
3905 | + * transaction the data will hit the disk. In case we are journaling data, we | |
3906 | + * cannot start transaction directly because transaction start ranks above page | |
3907 | + * lock so we have to do some magic. | |
3908 | * | |
3909 | - * In all journalling modes block_write_full_page() will start the I/O. | |
3910 | + * In all journaling modes block_write_full_page() will start the I/O. | |
3911 | * | |
3912 | * Problem: | |
3913 | * | |
3914 | @@ -1518,105 +2658,103 @@ static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh) | |
3915 | * disastrous. Any write() or metadata operation will sync the fs for | |
3916 | * us. | |
3917 | * | |
3918 | - * AKPM2: if all the page's buffers are mapped to disk and !data=journal, | |
3919 | - * we don't need to open a transaction here. | |
3920 | */ | |
3921 | -static int ext4_ordered_writepage(struct page *page, | |
3922 | +static int __ext4_normal_writepage(struct page *page, | |
3923 | struct writeback_control *wbc) | |
3924 | { | |
3925 | struct inode *inode = page->mapping->host; | |
3926 | - struct buffer_head *page_bufs; | |
3927 | - handle_t *handle = NULL; | |
3928 | - int ret = 0; | |
3929 | - int err; | |
3930 | ||
3931 | - J_ASSERT(PageLocked(page)); | |
3932 | - | |
3933 | - /* | |
3934 | - * We give up here if we're reentered, because it might be for a | |
3935 | - * different filesystem. | |
3936 | - */ | |
3937 | - if (ext4_journal_current_handle()) | |
3938 | - goto out_fail; | |
3939 | - | |
3940 | - handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); | |
3941 | + if (test_opt(inode->i_sb, NOBH)) | |
3942 | + return nobh_writepage(page, | |
3943 | + ext4_normal_get_block_write, wbc); | |
3944 | + else | |
3945 | + return block_write_full_page(page, | |
3946 | + ext4_normal_get_block_write, | |
3947 | + wbc); | |
3948 | +} | |
3949 | ||
3950 | - if (IS_ERR(handle)) { | |
3951 | - ret = PTR_ERR(handle); | |
3952 | - goto out_fail; | |
3953 | - } | |
3954 | +static int ext4_normal_writepage(struct page *page, | |
3955 | + struct writeback_control *wbc) | |
3956 | +{ | |
3957 | + struct inode *inode = page->mapping->host; | |
3958 | + loff_t size = i_size_read(inode); | |
3959 | + loff_t len; | |
3960 | ||
3961 | - if (!page_has_buffers(page)) { | |
3962 | - create_empty_buffers(page, inode->i_sb->s_blocksize, | |
3963 | - (1 << BH_Dirty)|(1 << BH_Uptodate)); | |
3964 | + J_ASSERT(PageLocked(page)); | |
3965 | + if (page->index == size >> PAGE_CACHE_SHIFT) | |
3966 | + len = size & ~PAGE_CACHE_MASK; | |
3967 | + else | |
3968 | + len = PAGE_CACHE_SIZE; | |
3969 | + | |
3970 | + if (page_has_buffers(page)) { | |
3971 | + /* if page has buffers it should all be mapped | |
3972 | + * and allocated. If there are not buffers attached | |
3973 | + * to the page we know the page is dirty but it lost | |
3974 | + * buffers. That means that at some moment in time | |
3975 | + * after write_begin() / write_end() has been called | |
3976 | + * all buffers have been clean and thus they must have been | |
3977 | + * written at least once. So they are all mapped and we can | |
3978 | + * happily proceed with mapping them and writing the page. | |
3979 | + */ | |
3980 | + BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, | |
3981 | + ext4_bh_unmapped_or_delay)); | |
3982 | } | |
3983 | - page_bufs = page_buffers(page); | |
3984 | - walk_page_buffers(handle, page_bufs, 0, | |
3985 | - PAGE_CACHE_SIZE, NULL, bget_one); | |
3986 | - | |
3987 | - ret = block_write_full_page(page, ext4_get_block, wbc); | |
3988 | - | |
3989 | - /* | |
3990 | - * The page can become unlocked at any point now, and | |
3991 | - * truncate can then come in and change things. So we | |
3992 | - * can't touch *page from now on. But *page_bufs is | |
3993 | - * safe due to elevated refcount. | |
3994 | - */ | |
3995 | ||
3996 | - /* | |
3997 | - * And attach them to the current transaction. But only if | |
3998 | - * block_write_full_page() succeeded. Otherwise they are unmapped, | |
3999 | - * and generally junk. | |
4000 | - */ | |
4001 | - if (ret == 0) { | |
4002 | - err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, | |
4003 | - NULL, jbd2_journal_dirty_data_fn); | |
4004 | - if (!ret) | |
4005 | - ret = err; | |
4006 | - } | |
4007 | - walk_page_buffers(handle, page_bufs, 0, | |
4008 | - PAGE_CACHE_SIZE, NULL, bput_one); | |
4009 | - err = ext4_journal_stop(handle); | |
4010 | - if (!ret) | |
4011 | - ret = err; | |
4012 | - return ret; | |
4013 | + if (!ext4_journal_current_handle()) | |
4014 | + return __ext4_normal_writepage(page, wbc); | |
4015 | ||
4016 | -out_fail: | |
4017 | redirty_page_for_writepage(wbc, page); | |
4018 | unlock_page(page); | |
4019 | - return ret; | |
4020 | + return 0; | |
4021 | } | |
4022 | ||
4023 | -static int ext4_writeback_writepage(struct page *page, | |
4024 | +static int __ext4_journalled_writepage(struct page *page, | |
4025 | struct writeback_control *wbc) | |
4026 | { | |
4027 | - struct inode *inode = page->mapping->host; | |
4028 | + struct address_space *mapping = page->mapping; | |
4029 | + struct inode *inode = mapping->host; | |
4030 | + struct buffer_head *page_bufs; | |
4031 | handle_t *handle = NULL; | |
4032 | int ret = 0; | |
4033 | int err; | |
4034 | ||
4035 | - if (ext4_journal_current_handle()) | |
4036 | - goto out_fail; | |
4037 | + ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, | |
4038 | + ext4_normal_get_block_write); | |
4039 | + if (ret != 0) | |
4040 | + goto out_unlock; | |
4041 | + | |
4042 | + page_bufs = page_buffers(page); | |
4043 | + walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL, | |
4044 | + bget_one); | |
4045 | + /* As soon as we unlock the page, it can go away, but we have | |
4046 | + * references to buffers so we are safe */ | |
4047 | + unlock_page(page); | |
4048 | ||
4049 | handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); | |
4050 | if (IS_ERR(handle)) { | |
4051 | ret = PTR_ERR(handle); | |
4052 | - goto out_fail; | |
4053 | + goto out; | |
4054 | } | |
4055 | ||
4056 | - if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) | |
4057 | - ret = nobh_writepage(page, ext4_get_block, wbc); | |
4058 | - else | |
4059 | - ret = block_write_full_page(page, ext4_get_block, wbc); | |
4060 | + ret = walk_page_buffers(handle, page_bufs, 0, | |
4061 | + PAGE_CACHE_SIZE, NULL, do_journal_get_write_access); | |
4062 | ||
4063 | + err = walk_page_buffers(handle, page_bufs, 0, | |
4064 | + PAGE_CACHE_SIZE, NULL, write_end_fn); | |
4065 | + if (ret == 0) | |
4066 | + ret = err; | |
4067 | err = ext4_journal_stop(handle); | |
4068 | if (!ret) | |
4069 | ret = err; | |
4070 | - return ret; | |
4071 | ||
4072 | -out_fail: | |
4073 | - redirty_page_for_writepage(wbc, page); | |
4074 | + walk_page_buffers(handle, page_bufs, 0, | |
4075 | + PAGE_CACHE_SIZE, NULL, bput_one); | |
4076 | + EXT4_I(inode)->i_state |= EXT4_STATE_JDATA; | |
4077 | + goto out; | |
4078 | + | |
4079 | +out_unlock: | |
4080 | unlock_page(page); | |
4081 | +out: | |
4082 | return ret; | |
4083 | } | |
4084 | ||
4085 | @@ -1624,59 +2762,53 @@ static int ext4_journalled_writepage(struct page *page, | |
4086 | struct writeback_control *wbc) | |
4087 | { | |
4088 | struct inode *inode = page->mapping->host; | |
4089 | - handle_t *handle = NULL; | |
4090 | - int ret = 0; | |
4091 | - int err; | |
4092 | + loff_t size = i_size_read(inode); | |
4093 | + loff_t len; | |
4094 | ||
4095 | - if (ext4_journal_current_handle()) | |
4096 | - goto no_write; | |
4097 | + J_ASSERT(PageLocked(page)); | |
4098 | + if (page->index == size >> PAGE_CACHE_SHIFT) | |
4099 | + len = size & ~PAGE_CACHE_MASK; | |
4100 | + else | |
4101 | + len = PAGE_CACHE_SIZE; | |
4102 | + | |
4103 | + if (page_has_buffers(page)) { | |
4104 | + /* if page has buffers it should all be mapped | |
4105 | + * and allocated. If there are not buffers attached | |
4106 | + * to the page we know the page is dirty but it lost | |
4107 | + * buffers. That means that at some moment in time | |
4108 | + * after write_begin() / write_end() has been called | |
4109 | + * all buffers have been clean and thus they must have been | |
4110 | + * written at least once. So they are all mapped and we can | |
4111 | + * happily proceed with mapping them and writing the page. | |
4112 | + */ | |
4113 | + BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, | |
4114 | + ext4_bh_unmapped_or_delay)); | |
4115 | + } | |
4116 | ||
4117 | - handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); | |
4118 | - if (IS_ERR(handle)) { | |
4119 | - ret = PTR_ERR(handle); | |
4120 | + if (ext4_journal_current_handle()) | |
4121 | goto no_write; | |
4122 | - } | |
4123 | ||
4124 | - if (!page_has_buffers(page) || PageChecked(page)) { | |
4125 | + if (PageChecked(page)) { | |
4126 | /* | |
4127 | * It's mmapped pagecache. Add buffers and journal it. There | |
4128 | * doesn't seem much point in redirtying the page here. | |
4129 | */ | |
4130 | ClearPageChecked(page); | |
4131 | - ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, | |
4132 | - ext4_get_block); | |
4133 | - if (ret != 0) { | |
4134 | - ext4_journal_stop(handle); | |
4135 | - goto out_unlock; | |
4136 | - } | |
4137 | - ret = walk_page_buffers(handle, page_buffers(page), 0, | |
4138 | - PAGE_CACHE_SIZE, NULL, do_journal_get_write_access); | |
4139 | - | |
4140 | - err = walk_page_buffers(handle, page_buffers(page), 0, | |
4141 | - PAGE_CACHE_SIZE, NULL, write_end_fn); | |
4142 | - if (ret == 0) | |
4143 | - ret = err; | |
4144 | - EXT4_I(inode)->i_state |= EXT4_STATE_JDATA; | |
4145 | - unlock_page(page); | |
4146 | + return __ext4_journalled_writepage(page, wbc); | |
4147 | } else { | |
4148 | /* | |
4149 | * It may be a page full of checkpoint-mode buffers. We don't | |
4150 | * really know unless we go poke around in the buffer_heads. | |
4151 | * But block_write_full_page will do the right thing. | |
4152 | */ | |
4153 | - ret = block_write_full_page(page, ext4_get_block, wbc); | |
4154 | + return block_write_full_page(page, | |
4155 | + ext4_normal_get_block_write, | |
4156 | + wbc); | |
4157 | } | |
4158 | - err = ext4_journal_stop(handle); | |
4159 | - if (!ret) | |
4160 | - ret = err; | |
4161 | -out: | |
4162 | - return ret; | |
4163 | - | |
4164 | no_write: | |
4165 | redirty_page_for_writepage(wbc, page); | |
4166 | -out_unlock: | |
4167 | unlock_page(page); | |
4168 | - goto out; | |
4169 | + return 0; | |
4170 | } | |
4171 | ||
4172 | static int ext4_readpage(struct file *file, struct page *page) | |
4173 | @@ -1819,7 +2951,7 @@ static int ext4_journalled_set_page_dirty(struct page *page) | |
4174 | static const struct address_space_operations ext4_ordered_aops = { | |
4175 | .readpage = ext4_readpage, | |
4176 | .readpages = ext4_readpages, | |
4177 | - .writepage = ext4_ordered_writepage, | |
4178 | + .writepage = ext4_normal_writepage, | |
4179 | .sync_page = block_sync_page, | |
4180 | .write_begin = ext4_write_begin, | |
4181 | .write_end = ext4_ordered_write_end, | |
4182 | @@ -1833,7 +2965,7 @@ static const struct address_space_operations ext4_ordered_aops = { | |
4183 | static const struct address_space_operations ext4_writeback_aops = { | |
4184 | .readpage = ext4_readpage, | |
4185 | .readpages = ext4_readpages, | |
4186 | - .writepage = ext4_writeback_writepage, | |
4187 | + .writepage = ext4_normal_writepage, | |
4188 | .sync_page = block_sync_page, | |
4189 | .write_begin = ext4_write_begin, | |
4190 | .write_end = ext4_writeback_write_end, | |
4191 | @@ -1857,10 +2989,31 @@ static const struct address_space_operations ext4_journalled_aops = { | |
4192 | .releasepage = ext4_releasepage, | |
4193 | }; | |
4194 | ||
4195 | +static const struct address_space_operations ext4_da_aops = { | |
4196 | + .readpage = ext4_readpage, | |
4197 | + .readpages = ext4_readpages, | |
4198 | + .writepage = ext4_da_writepage, | |
4199 | + .writepages = ext4_da_writepages, | |
4200 | + .sync_page = block_sync_page, | |
4201 | + .write_begin = ext4_da_write_begin, | |
4202 | + .write_end = ext4_da_write_end, | |
4203 | + .bmap = ext4_bmap, | |
4204 | + .invalidatepage = ext4_da_invalidatepage, | |
4205 | + .releasepage = ext4_releasepage, | |
4206 | + .direct_IO = ext4_direct_IO, | |
4207 | + .migratepage = buffer_migrate_page, | |
4208 | +}; | |
4209 | + | |
4210 | void ext4_set_aops(struct inode *inode) | |
4211 | { | |
4212 | - if (ext4_should_order_data(inode)) | |
4213 | + if (ext4_should_order_data(inode) && | |
4214 | + test_opt(inode->i_sb, DELALLOC)) | |
4215 | + inode->i_mapping->a_ops = &ext4_da_aops; | |
4216 | + else if (ext4_should_order_data(inode)) | |
4217 | inode->i_mapping->a_ops = &ext4_ordered_aops; | |
4218 | + else if (ext4_should_writeback_data(inode) && | |
4219 | + test_opt(inode->i_sb, DELALLOC)) | |
4220 | + inode->i_mapping->a_ops = &ext4_da_aops; | |
4221 | else if (ext4_should_writeback_data(inode)) | |
4222 | inode->i_mapping->a_ops = &ext4_writeback_aops; | |
4223 | else | |
4224 | @@ -1873,7 +3026,7 @@ void ext4_set_aops(struct inode *inode) | |
4225 | * This required during truncate. We need to physically zero the tail end | |
4226 | * of that block so it doesn't yield old data if the file is later grown. | |
4227 | */ | |
4228 | -int ext4_block_truncate_page(handle_t *handle, struct page *page, | |
4229 | +int ext4_block_truncate_page(handle_t *handle, | |
4230 | struct address_space *mapping, loff_t from) | |
4231 | { | |
4232 | ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; | |
4233 | @@ -1882,8 +3035,13 @@ int ext4_block_truncate_page(handle_t *handle, struct page *page, | |
4234 | ext4_lblk_t iblock; | |
4235 | struct inode *inode = mapping->host; | |
4236 | struct buffer_head *bh; | |
4237 | + struct page *page; | |
4238 | int err = 0; | |
4239 | ||
4240 | + page = grab_cache_page(mapping, from >> PAGE_CACHE_SHIFT); | |
4241 | + if (!page) | |
4242 | + return -EINVAL; | |
4243 | + | |
4244 | blocksize = inode->i_sb->s_blocksize; | |
4245 | length = blocksize - (offset & (blocksize - 1)); | |
4246 | iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); | |
4247 | @@ -1956,7 +3114,7 @@ int ext4_block_truncate_page(handle_t *handle, struct page *page, | |
4248 | err = ext4_journal_dirty_metadata(handle, bh); | |
4249 | } else { | |
4250 | if (ext4_should_order_data(inode)) | |
4251 | - err = ext4_journal_dirty_data(handle, bh); | |
4252 | + err = ext4_jbd2_file_inode(handle, inode); | |
4253 | mark_buffer_dirty(bh); | |
4254 | } | |
4255 | ||
4256 | @@ -2179,7 +3337,21 @@ static void ext4_free_data(handle_t *handle, struct inode *inode, | |
4257 | ||
4258 | if (this_bh) { | |
4259 | BUFFER_TRACE(this_bh, "call ext4_journal_dirty_metadata"); | |
4260 | - ext4_journal_dirty_metadata(handle, this_bh); | |
4261 | + | |
4262 | + /* | |
4263 | + * The buffer head should have an attached journal head at this | |
4264 | + * point. However, if the data is corrupted and an indirect | |
4265 | + * block pointed to itself, it would have been detached when | |
4266 | + * the block was cleared. Check for this instead of OOPSing. | |
4267 | + */ | |
4268 | + if (bh2jh(this_bh)) | |
4269 | + ext4_journal_dirty_metadata(handle, this_bh); | |
4270 | + else | |
4271 | + ext4_error(inode->i_sb, __func__, | |
4272 | + "circular indirect block detected, " | |
4273 | + "inode=%lu, block=%llu", | |
4274 | + inode->i_ino, | |
4275 | + (unsigned long long) this_bh->b_blocknr); | |
4276 | } | |
4277 | } | |
4278 | ||
4279 | @@ -2305,6 +3477,19 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, | |
4280 | } | |
4281 | } | |
4282 | ||
4283 | +int ext4_can_truncate(struct inode *inode) | |
4284 | +{ | |
4285 | + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) | |
4286 | + return 0; | |
4287 | + if (S_ISREG(inode->i_mode)) | |
4288 | + return 1; | |
4289 | + if (S_ISDIR(inode->i_mode)) | |
4290 | + return 1; | |
4291 | + if (S_ISLNK(inode->i_mode)) | |
4292 | + return !ext4_inode_is_fast_symlink(inode); | |
4293 | + return 0; | |
4294 | +} | |
4295 | + | |
4296 | /* | |
4297 | * ext4_truncate() | |
4298 | * | |
4299 | @@ -2347,51 +3532,25 @@ void ext4_truncate(struct inode *inode) | |
4300 | int n; | |
4301 | ext4_lblk_t last_block; | |
4302 | unsigned blocksize = inode->i_sb->s_blocksize; | |
4303 | - struct page *page; | |
4304 | ||
4305 | - if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || | |
4306 | - S_ISLNK(inode->i_mode))) | |
4307 | - return; | |
4308 | - if (ext4_inode_is_fast_symlink(inode)) | |
4309 | - return; | |
4310 | - if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) | |
4311 | + if (!ext4_can_truncate(inode)) | |
4312 | return; | |
4313 | ||
4314 | - /* | |
4315 | - * We have to lock the EOF page here, because lock_page() nests | |
4316 | - * outside jbd2_journal_start(). | |
4317 | - */ | |
4318 | - if ((inode->i_size & (blocksize - 1)) == 0) { | |
4319 | - /* Block boundary? Nothing to do */ | |
4320 | - page = NULL; | |
4321 | - } else { | |
4322 | - page = grab_cache_page(mapping, | |
4323 | - inode->i_size >> PAGE_CACHE_SHIFT); | |
4324 | - if (!page) | |
4325 | - return; | |
4326 | - } | |
4327 | - | |
4328 | if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { | |
4329 | - ext4_ext_truncate(inode, page); | |
4330 | + ext4_ext_truncate(inode); | |
4331 | return; | |
4332 | } | |
4333 | ||
4334 | handle = start_transaction(inode); | |
4335 | - if (IS_ERR(handle)) { | |
4336 | - if (page) { | |
4337 | - clear_highpage(page); | |
4338 | - flush_dcache_page(page); | |
4339 | - unlock_page(page); | |
4340 | - page_cache_release(page); | |
4341 | - } | |
4342 | + if (IS_ERR(handle)) | |
4343 | return; /* AKPM: return what? */ | |
4344 | - } | |
4345 | ||
4346 | last_block = (inode->i_size + blocksize-1) | |
4347 | >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); | |
4348 | ||
4349 | - if (page) | |
4350 | - ext4_block_truncate_page(handle, page, mapping, inode->i_size); | |
4351 | + if (inode->i_size & (blocksize - 1)) | |
4352 | + if (ext4_block_truncate_page(handle, mapping, inode->i_size)) | |
4353 | + goto out_stop; | |
4354 | ||
4355 | n = ext4_block_to_path(inode, last_block, offsets, NULL); | |
4356 | if (n == 0) | |
4357 | @@ -2410,6 +3569,14 @@ void ext4_truncate(struct inode *inode) | |
4358 | goto out_stop; | |
4359 | ||
4360 | /* | |
4361 | + * From here we block out all ext4_get_block() callers who want to | |
4362 | + * modify the block allocation tree. | |
4363 | + */ | |
4364 | + down_write(&ei->i_data_sem); | |
4365 | + | |
4366 | + ext4_discard_reservation(inode); | |
4367 | + | |
4368 | + /* | |
4369 | * The orphan list entry will now protect us from any crash which | |
4370 | * occurs before the truncate completes, so it is now safe to propagate | |
4371 | * the new, shorter inode size (held for now in i_size) into the | |
4372 | @@ -2418,12 +3585,6 @@ void ext4_truncate(struct inode *inode) | |
4373 | */ | |
4374 | ei->i_disksize = inode->i_size; | |
4375 | ||
4376 | - /* | |
4377 | - * From here we block out all ext4_get_block() callers who want to | |
4378 | - * modify the block allocation tree. | |
4379 | - */ | |
4380 | - down_write(&ei->i_data_sem); | |
4381 | - | |
4382 | if (n == 1) { /* direct blocks */ | |
4383 | ext4_free_data(handle, inode, NULL, i_data+offsets[0], | |
4384 | i_data + EXT4_NDIR_BLOCKS); | |
4385 | @@ -2484,8 +3645,6 @@ do_indirects: | |
4386 | ; | |
4387 | } | |
4388 | ||
4389 | - ext4_discard_reservation(inode); | |
4390 | - | |
4391 | up_write(&ei->i_data_sem); | |
4392 | inode->i_mtime = inode->i_ctime = ext4_current_time(inode); | |
4393 | ext4_mark_inode_dirty(handle, inode); | |
4394 | @@ -2571,6 +3730,16 @@ static int __ext4_get_inode_loc(struct inode *inode, | |
4395 | } | |
4396 | if (!buffer_uptodate(bh)) { | |
4397 | lock_buffer(bh); | |
4398 | + | |
4399 | + /* | |
4400 | + * If the buffer has the write error flag, we have failed | |
4401 | + * to write out another inode in the same block. In this | |
4402 | + * case, we don't have to read the block because we may | |
4403 | + * read the old inode data successfully. | |
4404 | + */ | |
4405 | + if (buffer_write_io_error(bh) && !buffer_uptodate(bh)) | |
4406 | + set_buffer_uptodate(bh); | |
4407 | + | |
4408 | if (buffer_uptodate(bh)) { | |
4409 | /* someone brought it uptodate while we waited */ | |
4410 | unlock_buffer(bh); | |
4411 | @@ -3107,7 +4276,14 @@ int ext4_write_inode(struct inode *inode, int wait) | |
4412 | * be freed, so we have a strong guarantee that no future commit will | |
4413 | * leave these blocks visible to the user.) | |
4414 | * | |
4415 | - * Called with inode->sem down. | |
4416 | + * Another thing we have to assure is that if we are in ordered mode | |
4417 | + * and inode is still attached to the committing transaction, we must | |
4418 | + * we start writeout of all the dirty pages which are being truncated. | |
4419 | + * This way we are sure that all the data written in the previous | |
4420 | + * transaction are already on disk (truncate waits for pages under | |
4421 | + * writeback). | |
4422 | + * | |
4423 | + * Called with inode->i_mutex down. | |
4424 | */ | |
4425 | int ext4_setattr(struct dentry *dentry, struct iattr *attr) | |
4426 | { | |
4427 | @@ -3173,6 +4349,22 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) | |
4428 | if (!error) | |
4429 | error = rc; | |
4430 | ext4_journal_stop(handle); | |
4431 | + | |
4432 | + if (ext4_should_order_data(inode)) { | |
4433 | + error = ext4_begin_ordered_truncate(inode, | |
4434 | + attr->ia_size); | |
4435 | + if (error) { | |
4436 | + /* Do as much error cleanup as possible */ | |
4437 | + handle = ext4_journal_start(inode, 3); | |
4438 | + if (IS_ERR(handle)) { | |
4439 | + ext4_orphan_del(NULL, inode); | |
4440 | + goto err_out; | |
4441 | + } | |
4442 | + ext4_orphan_del(handle, inode); | |
4443 | + ext4_journal_stop(handle); | |
4444 | + goto err_out; | |
4445 | + } | |
4446 | + } | |
4447 | } | |
4448 | ||
4449 | rc = inode_setattr(inode, attr); | |
4450 | @@ -3193,58 +4385,156 @@ err_out: | |
4451 | return error; | |
4452 | } | |
4453 | ||
4454 | +int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, | |
4455 | + struct kstat *stat) | |
4456 | +{ | |
4457 | + struct inode *inode; | |
4458 | + unsigned long delalloc_blocks; | |
4459 | + | |
4460 | + inode = dentry->d_inode; | |
4461 | + generic_fillattr(inode, stat); | |
4462 | ||
4463 | + /* | |
4464 | + * We can't update i_blocks if the block allocation is delayed | |
4465 | + * otherwise in the case of system crash before the real block | |
4466 | + * allocation is done, we will have i_blocks inconsistent with | |
4467 | + * on-disk file blocks. | |
4468 | + * We always keep i_blocks updated together with real | |
4469 | + * allocation. But to not confuse with user, stat | |
4470 | + * will return the blocks that include the delayed allocation | |
4471 | + * blocks for this file. | |
4472 | + */ | |
4473 | + spin_lock(&EXT4_I(inode)->i_block_reservation_lock); | |
4474 | + delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks; | |
4475 | + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); | |
4476 | + | |
4477 | + stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9; | |
4478 | + return 0; | |
4479 | +} | |
4480 | + | |
4481 | +static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks, | |
4482 | + int chunk) | |
4483 | +{ | |
4484 | + int indirects; | |
4485 | + | |
4486 | + /* if nrblocks are contiguous */ | |
4487 | + if (chunk) { | |
4488 | + /* | |
4489 | + * With N contiguous data blocks, it need at most | |
4490 | + * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) indirect blocks | |
4491 | + * 2 dindirect blocks | |
4492 | + * 1 tindirect block | |
4493 | + */ | |
4494 | + indirects = nrblocks / EXT4_ADDR_PER_BLOCK(inode->i_sb); | |
4495 | + return indirects + 3; | |
4496 | + } | |
4497 | + /* | |
4498 | + * if nrblocks are not contiguous, worse case, each block touch | |
4499 | + * a indirect block, and each indirect block touch a double indirect | |
4500 | + * block, plus a triple indirect block | |
4501 | + */ | |
4502 | + indirects = nrblocks * 2 + 1; | |
4503 | + return indirects; | |
4504 | +} | |
4505 | + | |
4506 | +static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) | |
4507 | +{ | |
4508 | + if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) | |
4509 | + return ext4_indirect_trans_blocks(inode, nrblocks, 0); | |
4510 | + return ext4_ext_index_trans_blocks(inode, nrblocks, 0); | |
4511 | +} | |
4512 | /* | |
4513 | - * How many blocks doth make a writepage()? | |
4514 | - * | |
4515 | - * With N blocks per page, it may be: | |
4516 | - * N data blocks | |
4517 | - * 2 indirect block | |
4518 | - * 2 dindirect | |
4519 | - * 1 tindirect | |
4520 | - * N+5 bitmap blocks (from the above) | |
4521 | - * N+5 group descriptor summary blocks | |
4522 | - * 1 inode block | |
4523 | - * 1 superblock. | |
4524 | - * 2 * EXT4_SINGLEDATA_TRANS_BLOCKS for the quote files | |
4525 | - * | |
4526 | - * 3 * (N + 5) + 2 + 2 * EXT4_SINGLEDATA_TRANS_BLOCKS | |
4527 | - * | |
4528 | - * With ordered or writeback data it's the same, less the N data blocks. | |
4529 | - * | |
4530 | - * If the inode's direct blocks can hold an integral number of pages then a | |
4531 | - * page cannot straddle two indirect blocks, and we can only touch one indirect | |
4532 | - * and dindirect block, and the "5" above becomes "3". | |
4533 | - * | |
4534 | - * This still overestimates under most circumstances. If we were to pass the | |
4535 | - * start and end offsets in here as well we could do block_to_path() on each | |
4536 | - * block and work out the exact number of indirects which are touched. Pah. | |
4537 | + * Account for index blocks, block groups bitmaps and block group | |
4538 | + * descriptor blocks if modify datablocks and index blocks | |
4539 | + * worse case, the indexs blocks spread over different block groups | |
4540 | + * | |
4541 | + * If datablocks are discontiguous, they are possible to spread over | |
4542 | + * different block groups too. If they are contiugous, with flexbg, | |
4543 | + * they could still across block group boundary. | |
4544 | + * | |
4545 | + * Also account for superblock, inode, quota and xattr blocks | |
4546 | */ | |
4547 | +int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) | |
4548 | +{ | |
4549 | + int groups, gdpblocks; | |
4550 | + int idxblocks; | |
4551 | + int ret = 0; | |
4552 | + | |
4553 | + /* | |
4554 | + * How many index blocks need to touch to modify nrblocks? | |
4555 | + * The "Chunk" flag indicating whether the nrblocks is | |
4556 | + * physically contiguous on disk | |
4557 | + * | |
4558 | + * For Direct IO and fallocate, they calls get_block to allocate | |
4559 | + * one single extent at a time, so they could set the "Chunk" flag | |
4560 | + */ | |
4561 | + idxblocks = ext4_index_trans_blocks(inode, nrblocks, chunk); | |
4562 | ||
4563 | + ret = idxblocks; | |
4564 | + | |
4565 | + /* | |
4566 | + * Now let's see how many group bitmaps and group descriptors need | |
4567 | + * to account | |
4568 | + */ | |
4569 | + groups = idxblocks; | |
4570 | + if (chunk) | |
4571 | + groups += 1; | |
4572 | + else | |
4573 | + groups += nrblocks; | |
4574 | + | |
4575 | + gdpblocks = groups; | |
4576 | + if (groups > EXT4_SB(inode->i_sb)->s_groups_count) | |
4577 | + groups = EXT4_SB(inode->i_sb)->s_groups_count; | |
4578 | + if (groups > EXT4_SB(inode->i_sb)->s_gdb_count) | |
4579 | + gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count; | |
4580 | + | |
4581 | + /* bitmaps and block group descriptor blocks */ | |
4582 | + ret += groups + gdpblocks; | |
4583 | + | |
4584 | + /* Blocks for super block, inode, quota and xattr blocks */ | |
4585 | + ret += EXT4_META_TRANS_BLOCKS(inode->i_sb); | |
4586 | + | |
4587 | + return ret; | |
4588 | +} | |
4589 | + | |
4590 | +/* | |
4591 | + * Calulate the total number of credits to reserve to fit | |
4592 | + * the modification of a single pages into a single transaction, | |
4593 | + * which may include multiple chunks of block allocations. | |
4594 | + * | |
4595 | + * This could be called via ext4_write_begin() | |
4596 | + * | |
4597 | + * We need to consider the worse case, when | |
4598 | + * one new block per extent. | |
4599 | + */ | |
4600 | int ext4_writepage_trans_blocks(struct inode *inode) | |
4601 | { | |
4602 | int bpp = ext4_journal_blocks_per_page(inode); | |
4603 | - int indirects = (EXT4_NDIR_BLOCKS % bpp) ? 5 : 3; | |
4604 | int ret; | |
4605 | ||
4606 | - if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) | |
4607 | - return ext4_ext_writepage_trans_blocks(inode, bpp); | |
4608 | + ret = ext4_meta_trans_blocks(inode, bpp, 0); | |
4609 | ||
4610 | + /* Account for data blocks for journalled mode */ | |
4611 | if (ext4_should_journal_data(inode)) | |
4612 | - ret = 3 * (bpp + indirects) + 2; | |
4613 | - else | |
4614 | - ret = 2 * (bpp + indirects) + 2; | |
4615 | - | |
4616 | -#ifdef CONFIG_QUOTA | |
4617 | - /* We know that structure was already allocated during DQUOT_INIT so | |
4618 | - * we will be updating only the data blocks + inodes */ | |
4619 | - ret += 2*EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); | |
4620 | -#endif | |
4621 | - | |
4622 | + ret += bpp; | |
4623 | return ret; | |
4624 | } | |
4625 | ||
4626 | /* | |
4627 | + * Calculate the journal credits for a chunk of data modification. | |
4628 | + * | |
4629 | + * This is called from DIO, fallocate or whoever calling | |
4630 | + * ext4_get_blocks_wrap() to map/allocate a chunk of contigous disk blocks. | |
4631 | + * | |
4632 | + * journal buffers for data blocks are not included here, as DIO | |
4633 | + * and fallocate do no need to journal data buffers. | |
4634 | + */ | |
4635 | +int ext4_chunk_trans_blocks(struct inode *inode, int nrblocks) | |
4636 | +{ | |
4637 | + return ext4_meta_trans_blocks(inode, nrblocks, 1); | |
4638 | +} | |
4639 | + | |
4640 | +/* | |
4641 | * The caller must have previously called ext4_reserve_inode_write(). | |
4642 | * Give this, we know that the caller already has write access to iloc->bh. | |
4643 | */ | |
4644 | @@ -3506,3 +4796,64 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) | |
4645 | ||
4646 | return err; | |
4647 | } | |
4648 | + | |
4649 | +static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh) | |
4650 | +{ | |
4651 | + return !buffer_mapped(bh); | |
4652 | +} | |
4653 | + | |
4654 | +int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page) | |
4655 | +{ | |
4656 | + loff_t size; | |
4657 | + unsigned long len; | |
4658 | + int ret = -EINVAL; | |
4659 | + struct file *file = vma->vm_file; | |
4660 | + struct inode *inode = file->f_path.dentry->d_inode; | |
4661 | + struct address_space *mapping = inode->i_mapping; | |
4662 | + | |
4663 | + /* | |
4664 | + * Get i_alloc_sem to stop truncates messing with the inode. We cannot | |
4665 | + * get i_mutex because we are already holding mmap_sem. | |
4666 | + */ | |
4667 | + down_read(&inode->i_alloc_sem); | |
4668 | + size = i_size_read(inode); | |
4669 | + if (page->mapping != mapping || size <= page_offset(page) | |
4670 | + || !PageUptodate(page)) { | |
4671 | + /* page got truncated from under us? */ | |
4672 | + goto out_unlock; | |
4673 | + } | |
4674 | + ret = 0; | |
4675 | + if (PageMappedToDisk(page)) | |
4676 | + goto out_unlock; | |
4677 | + | |
4678 | + if (page->index == size >> PAGE_CACHE_SHIFT) | |
4679 | + len = size & ~PAGE_CACHE_MASK; | |
4680 | + else | |
4681 | + len = PAGE_CACHE_SIZE; | |
4682 | + | |
4683 | + if (page_has_buffers(page)) { | |
4684 | + /* return if we have all the buffers mapped */ | |
4685 | + if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, | |
4686 | + ext4_bh_unmapped)) | |
4687 | + goto out_unlock; | |
4688 | + } | |
4689 | + /* | |
4690 | + * OK, we need to fill the hole... Do write_begin write_end | |
4691 | + * to do block allocation/reservation.We are not holding | |
4692 | + * inode.i__mutex here. That allow * parallel write_begin, | |
4693 | + * write_end call. lock_page prevent this from happening | |
4694 | + * on the same page though | |
4695 | + */ | |
4696 | + ret = mapping->a_ops->write_begin(file, mapping, page_offset(page), | |
4697 | + len, AOP_FLAG_UNINTERRUPTIBLE, &page, NULL); | |
4698 | + if (ret < 0) | |
4699 | + goto out_unlock; | |
4700 | + ret = mapping->a_ops->write_end(file, mapping, page_offset(page), | |
4701 | + len, len, page, NULL); | |
4702 | + if (ret < 0) | |
4703 | + goto out_unlock; | |
4704 | + ret = 0; | |
4705 | +out_unlock: | |
4706 | + up_read(&inode->i_alloc_sem); | |
4707 | + return ret; | |
4708 | +} | |
4709 | diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c | |
4710 | index c9900aa..e0e3a5e 100644 | |
4711 | --- a/fs/ext4/mballoc.c | |
4712 | +++ b/fs/ext4/mballoc.c | |
4713 | @@ -381,22 +381,28 @@ static inline void mb_clear_bit_atomic(spinlock_t *lock, int bit, void *addr) | |
4714 | ||
4715 | static inline int mb_find_next_zero_bit(void *addr, int max, int start) | |
4716 | { | |
4717 | - int fix = 0; | |
4718 | + int fix = 0, ret, tmpmax; | |
4719 | addr = mb_correct_addr_and_bit(&fix, addr); | |
4720 | - max += fix; | |
4721 | + tmpmax = max + fix; | |
4722 | start += fix; | |
4723 | ||
4724 | - return ext4_find_next_zero_bit(addr, max, start) - fix; | |
4725 | + ret = ext4_find_next_zero_bit(addr, tmpmax, start) - fix; | |
4726 | + if (ret > max) | |
4727 | + return max; | |
4728 | + return ret; | |
4729 | } | |
4730 | ||
4731 | static inline int mb_find_next_bit(void *addr, int max, int start) | |
4732 | { | |
4733 | - int fix = 0; | |
4734 | + int fix = 0, ret, tmpmax; | |
4735 | addr = mb_correct_addr_and_bit(&fix, addr); | |
4736 | - max += fix; | |
4737 | + tmpmax = max + fix; | |
4738 | start += fix; | |
4739 | ||
4740 | - return ext4_find_next_bit(addr, max, start) - fix; | |
4741 | + ret = ext4_find_next_bit(addr, tmpmax, start) - fix; | |
4742 | + if (ret > max) | |
4743 | + return max; | |
4744 | + return ret; | |
4745 | } | |
4746 | ||
4747 | static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max) | |
4748 | @@ -781,13 +787,16 @@ static int ext4_mb_init_cache(struct page *page, char *incore) | |
4749 | if (bh_uptodate_or_lock(bh[i])) | |
4750 | continue; | |
4751 | ||
4752 | + spin_lock(sb_bgl_lock(EXT4_SB(sb), first_group + i)); | |
4753 | if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { | |
4754 | ext4_init_block_bitmap(sb, bh[i], | |
4755 | first_group + i, desc); | |
4756 | set_buffer_uptodate(bh[i]); | |
4757 | unlock_buffer(bh[i]); | |
4758 | + spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i)); | |
4759 | continue; | |
4760 | } | |
4761 | + spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i)); | |
4762 | get_bh(bh[i]); | |
4763 | bh[i]->b_end_io = end_buffer_read_sync; | |
4764 | submit_bh(READ, bh[i]); | |
4765 | @@ -803,6 +812,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore) | |
4766 | if (!buffer_uptodate(bh[i])) | |
4767 | goto out; | |
4768 | ||
4769 | + err = 0; | |
4770 | first_block = page->index * blocks_per_page; | |
4771 | for (i = 0; i < blocks_per_page; i++) { | |
4772 | int group; | |
4773 | @@ -883,6 +893,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, | |
4774 | int pnum; | |
4775 | int poff; | |
4776 | struct page *page; | |
4777 | + int ret; | |
4778 | ||
4779 | mb_debug("load group %lu\n", group); | |
4780 | ||
4781 | @@ -914,15 +925,21 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, | |
4782 | if (page) { | |
4783 | BUG_ON(page->mapping != inode->i_mapping); | |
4784 | if (!PageUptodate(page)) { | |
4785 | - ext4_mb_init_cache(page, NULL); | |
4786 | + ret = ext4_mb_init_cache(page, NULL); | |
4787 | + if (ret) { | |
4788 | + unlock_page(page); | |
4789 | + goto err; | |
4790 | + } | |
4791 | mb_cmp_bitmaps(e4b, page_address(page) + | |
4792 | (poff * sb->s_blocksize)); | |
4793 | } | |
4794 | unlock_page(page); | |
4795 | } | |
4796 | } | |
4797 | - if (page == NULL || !PageUptodate(page)) | |
4798 | + if (page == NULL || !PageUptodate(page)) { | |
4799 | + ret = -EIO; | |
4800 | goto err; | |
4801 | + } | |
4802 | e4b->bd_bitmap_page = page; | |
4803 | e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); | |
4804 | mark_page_accessed(page); | |
4805 | @@ -938,14 +955,20 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, | |
4806 | page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); | |
4807 | if (page) { | |
4808 | BUG_ON(page->mapping != inode->i_mapping); | |
4809 | - if (!PageUptodate(page)) | |
4810 | - ext4_mb_init_cache(page, e4b->bd_bitmap); | |
4811 | - | |
4812 | + if (!PageUptodate(page)) { | |
4813 | + ret = ext4_mb_init_cache(page, e4b->bd_bitmap); | |
4814 | + if (ret) { | |
4815 | + unlock_page(page); | |
4816 | + goto err; | |
4817 | + } | |
4818 | + } | |
4819 | unlock_page(page); | |
4820 | } | |
4821 | } | |
4822 | - if (page == NULL || !PageUptodate(page)) | |
4823 | + if (page == NULL || !PageUptodate(page)) { | |
4824 | + ret = -EIO; | |
4825 | goto err; | |
4826 | + } | |
4827 | e4b->bd_buddy_page = page; | |
4828 | e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize); | |
4829 | mark_page_accessed(page); | |
4830 | @@ -962,7 +985,7 @@ err: | |
4831 | page_cache_release(e4b->bd_buddy_page); | |
4832 | e4b->bd_buddy = NULL; | |
4833 | e4b->bd_bitmap = NULL; | |
4834 | - return -EIO; | |
4835 | + return ret; | |
4836 | } | |
4837 | ||
4838 | static void ext4_mb_release_desc(struct ext4_buddy *e4b) | |
4839 | @@ -1031,7 +1054,7 @@ static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len) | |
4840 | } | |
4841 | } | |
4842 | ||
4843 | -static int mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, | |
4844 | +static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, | |
4845 | int first, int count) | |
4846 | { | |
4847 | int block = 0; | |
4848 | @@ -1071,11 +1094,12 @@ static int mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, | |
4849 | blocknr += block; | |
4850 | blocknr += | |
4851 | le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); | |
4852 | - | |
4853 | + ext4_unlock_group(sb, e4b->bd_group); | |
4854 | ext4_error(sb, __func__, "double-free of inode" | |
4855 | " %lu's block %llu(bit %u in group %lu)\n", | |
4856 | inode ? inode->i_ino : 0, blocknr, block, | |
4857 | e4b->bd_group); | |
4858 | + ext4_lock_group(sb, e4b->bd_group); | |
4859 | } | |
4860 | mb_clear_bit(block, EXT4_MB_BITMAP(e4b)); | |
4861 | e4b->bd_info->bb_counters[order]++; | |
4862 | @@ -1113,8 +1137,6 @@ static int mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, | |
4863 | } while (1); | |
4864 | } | |
4865 | mb_check_buddy(e4b); | |
4866 | - | |
4867 | - return 0; | |
4868 | } | |
4869 | ||
4870 | static int mb_find_extent(struct ext4_buddy *e4b, int order, int block, | |
4871 | @@ -1730,10 +1752,6 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) | |
4872 | ac->ac_g_ex.fe_start = sbi->s_mb_last_start; | |
4873 | spin_unlock(&sbi->s_md_lock); | |
4874 | } | |
4875 | - | |
4876 | - /* searching for the right group start from the goal value specified */ | |
4877 | - group = ac->ac_g_ex.fe_group; | |
4878 | - | |
4879 | /* Let's just scan groups to find more-less suitable blocks */ | |
4880 | cr = ac->ac_2order ? 0 : 1; | |
4881 | /* | |
4882 | @@ -1743,6 +1761,12 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) | |
4883 | repeat: | |
4884 | for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) { | |
4885 | ac->ac_criteria = cr; | |
4886 | + /* | |
4887 | + * searching for the right group start | |
4888 | + * from the goal value specified | |
4889 | + */ | |
4890 | + group = ac->ac_g_ex.fe_group; | |
4891 | + | |
4892 | for (i = 0; i < EXT4_SB(sb)->s_groups_count; group++, i++) { | |
4893 | struct ext4_group_info *grp; | |
4894 | struct ext4_group_desc *desc; | |
4895 | @@ -1963,6 +1987,8 @@ static int ext4_mb_seq_history_open(struct inode *inode, struct file *file) | |
4896 | int rc; | |
4897 | int size; | |
4898 | ||
4899 | + if (unlikely(sbi->s_mb_history == NULL)) | |
4900 | + return -ENOMEM; | |
4901 | s = kmalloc(sizeof(*s), GFP_KERNEL); | |
4902 | if (s == NULL) | |
4903 | return -ENOMEM; | |
4904 | @@ -2165,9 +2191,7 @@ static void ext4_mb_history_init(struct super_block *sb) | |
4905 | sbi->s_mb_history_cur = 0; | |
4906 | spin_lock_init(&sbi->s_mb_history_lock); | |
4907 | i = sbi->s_mb_history_max * sizeof(struct ext4_mb_history); | |
4908 | - sbi->s_mb_history = kmalloc(i, GFP_KERNEL); | |
4909 | - if (likely(sbi->s_mb_history != NULL)) | |
4910 | - memset(sbi->s_mb_history, 0, i); | |
4911 | + sbi->s_mb_history = kzalloc(i, GFP_KERNEL); | |
4912 | /* if we can't allocate history, then we simple won't use it */ | |
4913 | } | |
4914 | ||
4915 | @@ -2215,21 +2239,192 @@ ext4_mb_store_history(struct ext4_allocation_context *ac) | |
4916 | #define ext4_mb_history_init(sb) | |
4917 | #endif | |
4918 | ||
4919 | + | |
4920 | +/* Create and initialize ext4_group_info data for the given group. */ | |
4921 | +int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, | |
4922 | + struct ext4_group_desc *desc) | |
4923 | +{ | |
4924 | + int i, len; | |
4925 | + int metalen = 0; | |
4926 | + struct ext4_sb_info *sbi = EXT4_SB(sb); | |
4927 | + struct ext4_group_info **meta_group_info; | |
4928 | + | |
4929 | + /* | |
4930 | + * First check if this group is the first of a reserved block. | |
4931 | + * If it's true, we have to allocate a new table of pointers | |
4932 | + * to ext4_group_info structures | |
4933 | + */ | |
4934 | + if (group % EXT4_DESC_PER_BLOCK(sb) == 0) { | |
4935 | + metalen = sizeof(*meta_group_info) << | |
4936 | + EXT4_DESC_PER_BLOCK_BITS(sb); | |
4937 | + meta_group_info = kmalloc(metalen, GFP_KERNEL); | |
4938 | + if (meta_group_info == NULL) { | |
4939 | + printk(KERN_ERR "EXT4-fs: can't allocate mem for a " | |
4940 | + "buddy group\n"); | |
4941 | + goto exit_meta_group_info; | |
4942 | + } | |
4943 | + sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] = | |
4944 | + meta_group_info; | |
4945 | + } | |
4946 | + | |
4947 | + /* | |
4948 | + * calculate needed size. if change bb_counters size, | |
4949 | + * don't forget about ext4_mb_generate_buddy() | |
4950 | + */ | |
4951 | + len = offsetof(typeof(**meta_group_info), | |
4952 | + bb_counters[sb->s_blocksize_bits + 2]); | |
4953 | + | |
4954 | + meta_group_info = | |
4955 | + sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]; | |
4956 | + i = group & (EXT4_DESC_PER_BLOCK(sb) - 1); | |
4957 | + | |
4958 | + meta_group_info[i] = kzalloc(len, GFP_KERNEL); | |
4959 | + if (meta_group_info[i] == NULL) { | |
4960 | + printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n"); | |
4961 | + goto exit_group_info; | |
4962 | + } | |
4963 | + set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, | |
4964 | + &(meta_group_info[i]->bb_state)); | |
4965 | + | |
4966 | + /* | |
4967 | + * initialize bb_free to be able to skip | |
4968 | + * empty groups without initialization | |
4969 | + */ | |
4970 | + if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { | |
4971 | + meta_group_info[i]->bb_free = | |
4972 | + ext4_free_blocks_after_init(sb, group, desc); | |
4973 | + } else { | |
4974 | + meta_group_info[i]->bb_free = | |
4975 | + le16_to_cpu(desc->bg_free_blocks_count); | |
4976 | + } | |
4977 | + | |
4978 | + INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); | |
4979 | + | |
4980 | +#ifdef DOUBLE_CHECK | |
4981 | + { | |
4982 | + struct buffer_head *bh; | |
4983 | + meta_group_info[i]->bb_bitmap = | |
4984 | + kmalloc(sb->s_blocksize, GFP_KERNEL); | |
4985 | + BUG_ON(meta_group_info[i]->bb_bitmap == NULL); | |
4986 | + bh = ext4_read_block_bitmap(sb, group); | |
4987 | + BUG_ON(bh == NULL); | |
4988 | + memcpy(meta_group_info[i]->bb_bitmap, bh->b_data, | |
4989 | + sb->s_blocksize); | |
4990 | + put_bh(bh); | |
4991 | + } | |
4992 | +#endif | |
4993 | + | |
4994 | + return 0; | |
4995 | + | |
4996 | +exit_group_info: | |
4997 | + /* If a meta_group_info table has been allocated, release it now */ | |
4998 | + if (group % EXT4_DESC_PER_BLOCK(sb) == 0) | |
4999 | + kfree(sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]); | |
5000 | +exit_meta_group_info: | |
5001 | + return -ENOMEM; | |
5002 | +} /* ext4_mb_add_groupinfo */ | |
5003 | + | |
5004 | +/* | |
5005 | + * Add a group to the existing groups. | |
5006 | + * This function is used for online resize | |
5007 | + */ | |
5008 | +int ext4_mb_add_more_groupinfo(struct super_block *sb, ext4_group_t group, | |
5009 | + struct ext4_group_desc *desc) | |
5010 | +{ | |
5011 | + struct ext4_sb_info *sbi = EXT4_SB(sb); | |
5012 | + struct inode *inode = sbi->s_buddy_cache; | |
5013 | + int blocks_per_page; | |
5014 | + int block; | |
5015 | + int pnum; | |
5016 | + struct page *page; | |
5017 | + int err; | |
5018 | + | |
5019 | + /* Add group based on group descriptor*/ | |
5020 | + err = ext4_mb_add_groupinfo(sb, group, desc); | |
5021 | + if (err) | |
5022 | + return err; | |
5023 | + | |
5024 | + /* | |
5025 | + * Cache pages containing dynamic mb_alloc datas (buddy and bitmap | |
5026 | + * datas) are set not up to date so that they will be re-initilaized | |
5027 | + * during the next call to ext4_mb_load_buddy | |
5028 | + */ | |
5029 | + | |
5030 | + /* Set buddy page as not up to date */ | |
5031 | + blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; | |
5032 | + block = group * 2; | |
5033 | + pnum = block / blocks_per_page; | |
5034 | + page = find_get_page(inode->i_mapping, pnum); | |
5035 | + if (page != NULL) { | |
5036 | + ClearPageUptodate(page); | |
5037 | + page_cache_release(page); | |
5038 | + } | |
5039 | + | |
5040 | + /* Set bitmap page as not up to date */ | |
5041 | + block++; | |
5042 | + pnum = block / blocks_per_page; | |
5043 | + page = find_get_page(inode->i_mapping, pnum); | |
5044 | + if (page != NULL) { | |
5045 | + ClearPageUptodate(page); | |
5046 | + page_cache_release(page); | |
5047 | + } | |
5048 | + | |
5049 | + return 0; | |
5050 | +} | |
5051 | + | |
5052 | +/* | |
5053 | + * Update an existing group. | |
5054 | + * This function is used for online resize | |
5055 | + */ | |
5056 | +void ext4_mb_update_group_info(struct ext4_group_info *grp, ext4_grpblk_t add) | |
5057 | +{ | |
5058 | + grp->bb_free += add; | |
5059 | +} | |
5060 | + | |
5061 | static int ext4_mb_init_backend(struct super_block *sb) | |
5062 | { | |
5063 | ext4_group_t i; | |
5064 | - int j, len, metalen; | |
5065 | + int metalen; | |
5066 | struct ext4_sb_info *sbi = EXT4_SB(sb); | |
5067 | - int num_meta_group_infos = | |
5068 | - (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) >> | |
5069 | - EXT4_DESC_PER_BLOCK_BITS(sb); | |
5070 | + struct ext4_super_block *es = sbi->s_es; | |
5071 | + int num_meta_group_infos; | |
5072 | + int num_meta_group_infos_max; | |
5073 | + int array_size; | |
5074 | struct ext4_group_info **meta_group_info; | |
5075 | + struct ext4_group_desc *desc; | |
5076 | ||
5077 | + /* This is the number of blocks used by GDT */ | |
5078 | + num_meta_group_infos = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - | |
5079 | + 1) >> EXT4_DESC_PER_BLOCK_BITS(sb); | |
5080 | + | |
5081 | + /* | |
5082 | + * This is the total number of blocks used by GDT including | |
5083 | + * the number of reserved blocks for GDT. | |
5084 | + * The s_group_info array is allocated with this value | |
5085 | + * to allow a clean online resize without a complex | |
5086 | + * manipulation of pointer. | |
5087 | + * The drawback is the unused memory when no resize | |
5088 | + * occurs but it's very low in terms of pages | |
5089 | + * (see comments below) | |
5090 | + * Need to handle this properly when META_BG resizing is allowed | |
5091 | + */ | |
5092 | + num_meta_group_infos_max = num_meta_group_infos + | |
5093 | + le16_to_cpu(es->s_reserved_gdt_blocks); | |
5094 | + | |
5095 | + /* | |
5096 | + * array_size is the size of s_group_info array. We round it | |
5097 | + * to the next power of two because this approximation is done | |
5098 | + * internally by kmalloc so we can have some more memory | |
5099 | + * for free here (e.g. may be used for META_BG resize). | |
5100 | + */ | |
5101 | + array_size = 1; | |
5102 | + while (array_size < sizeof(*sbi->s_group_info) * | |
5103 | + num_meta_group_infos_max) | |
5104 | + array_size = array_size << 1; | |
5105 | /* An 8TB filesystem with 64-bit pointers requires a 4096 byte | |
5106 | * kmalloc. A 128kb malloc should suffice for a 256TB filesystem. | |
5107 | * So a two level scheme suffices for now. */ | |
5108 | - sbi->s_group_info = kmalloc(sizeof(*sbi->s_group_info) * | |
5109 | - num_meta_group_infos, GFP_KERNEL); | |
5110 | + sbi->s_group_info = kmalloc(array_size, GFP_KERNEL); | |
5111 | if (sbi->s_group_info == NULL) { | |
5112 | printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n"); | |
5113 | return -ENOMEM; | |
5114 | @@ -2256,63 +2451,15 @@ static int ext4_mb_init_backend(struct super_block *sb) | |
5115 | sbi->s_group_info[i] = meta_group_info; | |
5116 | } | |
5117 | ||
5118 | - /* | |
5119 | - * calculate needed size. if change bb_counters size, | |
5120 | - * don't forget about ext4_mb_generate_buddy() | |
5121 | - */ | |
5122 | - len = sizeof(struct ext4_group_info); | |
5123 | - len += sizeof(unsigned short) * (sb->s_blocksize_bits + 2); | |
5124 | for (i = 0; i < sbi->s_groups_count; i++) { | |
5125 | - struct ext4_group_desc *desc; | |
5126 | - | |
5127 | - meta_group_info = | |
5128 | - sbi->s_group_info[i >> EXT4_DESC_PER_BLOCK_BITS(sb)]; | |
5129 | - j = i & (EXT4_DESC_PER_BLOCK(sb) - 1); | |
5130 | - | |
5131 | - meta_group_info[j] = kzalloc(len, GFP_KERNEL); | |
5132 | - if (meta_group_info[j] == NULL) { | |
5133 | - printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n"); | |
5134 | - goto err_freebuddy; | |
5135 | - } | |
5136 | desc = ext4_get_group_desc(sb, i, NULL); | |
5137 | if (desc == NULL) { | |
5138 | printk(KERN_ERR | |
5139 | "EXT4-fs: can't read descriptor %lu\n", i); | |
5140 | - i++; | |
5141 | goto err_freebuddy; | |
5142 | } | |
5143 | - memset(meta_group_info[j], 0, len); | |
5144 | - set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, | |
5145 | - &(meta_group_info[j]->bb_state)); | |
5146 | - | |
5147 | - /* | |
5148 | - * initialize bb_free to be able to skip | |
5149 | - * empty groups without initialization | |
5150 | - */ | |
5151 | - if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { | |
5152 | - meta_group_info[j]->bb_free = | |
5153 | - ext4_free_blocks_after_init(sb, i, desc); | |
5154 | - } else { | |
5155 | - meta_group_info[j]->bb_free = | |
5156 | - le16_to_cpu(desc->bg_free_blocks_count); | |
5157 | - } | |
5158 | - | |
5159 | - INIT_LIST_HEAD(&meta_group_info[j]->bb_prealloc_list); | |
5160 | - | |
5161 | -#ifdef DOUBLE_CHECK | |
5162 | - { | |
5163 | - struct buffer_head *bh; | |
5164 | - meta_group_info[j]->bb_bitmap = | |
5165 | - kmalloc(sb->s_blocksize, GFP_KERNEL); | |
5166 | - BUG_ON(meta_group_info[j]->bb_bitmap == NULL); | |
5167 | - bh = read_block_bitmap(sb, i); | |
5168 | - BUG_ON(bh == NULL); | |
5169 | - memcpy(meta_group_info[j]->bb_bitmap, bh->b_data, | |
5170 | - sb->s_blocksize); | |
5171 | - put_bh(bh); | |
5172 | - } | |
5173 | -#endif | |
5174 | - | |
5175 | + if (ext4_mb_add_groupinfo(sb, i, desc) != 0) | |
5176 | + goto err_freebuddy; | |
5177 | } | |
5178 | ||
5179 | return 0; | |
5180 | @@ -2333,9 +2480,10 @@ err_freesgi: | |
5181 | int ext4_mb_init(struct super_block *sb, int needs_recovery) | |
5182 | { | |
5183 | struct ext4_sb_info *sbi = EXT4_SB(sb); | |
5184 | - unsigned i; | |
5185 | + unsigned i, j; | |
5186 | unsigned offset; | |
5187 | unsigned max; | |
5188 | + int ret; | |
5189 | ||
5190 | if (!test_opt(sb, MBALLOC)) | |
5191 | return 0; | |
5192 | @@ -2370,12 +2518,12 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) | |
5193 | } while (i <= sb->s_blocksize_bits + 1); | |
5194 | ||
5195 | /* init file for buddy data */ | |
5196 | - i = ext4_mb_init_backend(sb); | |
5197 | - if (i) { | |
5198 | + ret = ext4_mb_init_backend(sb); | |
5199 | + if (ret != 0) { | |
5200 | clear_opt(sbi->s_mount_opt, MBALLOC); | |
5201 | kfree(sbi->s_mb_offsets); | |
5202 | kfree(sbi->s_mb_maxs); | |
5203 | - return i; | |
5204 | + return ret; | |
5205 | } | |
5206 | ||
5207 | spin_lock_init(&sbi->s_md_lock); | |
5208 | @@ -2392,7 +2540,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) | |
5209 | sbi->s_mb_history_filter = EXT4_MB_HISTORY_DEFAULT; | |
5210 | sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC; | |
5211 | ||
5212 | - i = sizeof(struct ext4_locality_group) * NR_CPUS; | |
5213 | + i = sizeof(struct ext4_locality_group) * nr_cpu_ids; | |
5214 | sbi->s_locality_groups = kmalloc(i, GFP_KERNEL); | |
5215 | if (sbi->s_locality_groups == NULL) { | |
5216 | clear_opt(sbi->s_mount_opt, MBALLOC); | |
5217 | @@ -2400,11 +2548,12 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) | |
5218 | kfree(sbi->s_mb_maxs); | |
5219 | return -ENOMEM; | |
5220 | } | |
5221 | - for (i = 0; i < NR_CPUS; i++) { | |
5222 | + for (i = 0; i < nr_cpu_ids; i++) { | |
5223 | struct ext4_locality_group *lg; | |
5224 | lg = &sbi->s_locality_groups[i]; | |
5225 | mutex_init(&lg->lg_mutex); | |
5226 | - INIT_LIST_HEAD(&lg->lg_prealloc_list); | |
5227 | + for (j = 0; j < PREALLOC_TB_SIZE; j++) | |
5228 | + INIT_LIST_HEAD(&lg->lg_prealloc_list[j]); | |
5229 | spin_lock_init(&lg->lg_prealloc_lock); | |
5230 | } | |
5231 | ||
5232 | @@ -2548,8 +2697,7 @@ ext4_mb_free_committed_blocks(struct super_block *sb) | |
5233 | ext4_lock_group(sb, md->group); | |
5234 | for (i = 0; i < md->num; i++) { | |
5235 | mb_debug(" %u", md->blocks[i]); | |
5236 | - err = mb_free_blocks(NULL, &e4b, md->blocks[i], 1); | |
5237 | - BUG_ON(err != 0); | |
5238 | + mb_free_blocks(NULL, &e4b, md->blocks[i], 1); | |
5239 | } | |
5240 | mb_debug("\n"); | |
5241 | ext4_unlock_group(sb, md->group); | |
5242 | @@ -2575,25 +2723,24 @@ ext4_mb_free_committed_blocks(struct super_block *sb) | |
5243 | ||
5244 | ||
5245 | ||
5246 | -#define MB_PROC_VALUE_READ(name) \ | |
5247 | -static int ext4_mb_read_##name(char *page, char **start, \ | |
5248 | - off_t off, int count, int *eof, void *data) \ | |
5249 | +#define MB_PROC_FOPS(name) \ | |
5250 | +static int ext4_mb_##name##_proc_show(struct seq_file *m, void *v) \ | |
5251 | { \ | |
5252 | - struct ext4_sb_info *sbi = data; \ | |
5253 | - int len; \ | |
5254 | - *eof = 1; \ | |
5255 | - if (off != 0) \ | |
5256 | - return 0; \ | |
5257 | - len = sprintf(page, "%ld\n", sbi->s_mb_##name); \ | |
5258 | - *start = page; \ | |
5259 | - return len; \ | |
5260 | -} | |
5261 | - | |
5262 | -#define MB_PROC_VALUE_WRITE(name) \ | |
5263 | -static int ext4_mb_write_##name(struct file *file, \ | |
5264 | - const char __user *buf, unsigned long cnt, void *data) \ | |
5265 | + struct ext4_sb_info *sbi = m->private; \ | |
5266 | + \ | |
5267 | + seq_printf(m, "%ld\n", sbi->s_mb_##name); \ | |
5268 | + return 0; \ | |
5269 | +} \ | |
5270 | + \ | |
5271 | +static int ext4_mb_##name##_proc_open(struct inode *inode, struct file *file)\ | |
5272 | { \ | |
5273 | - struct ext4_sb_info *sbi = data; \ | |
5274 | + return single_open(file, ext4_mb_##name##_proc_show, PDE(inode)->data);\ | |
5275 | +} \ | |
5276 | + \ | |
5277 | +static ssize_t ext4_mb_##name##_proc_write(struct file *file, \ | |
5278 | + const char __user *buf, size_t cnt, loff_t *ppos) \ | |
5279 | +{ \ | |
5280 | + struct ext4_sb_info *sbi = PDE(file->f_path.dentry->d_inode)->data;\ | |
5281 | char str[32]; \ | |
5282 | long value; \ | |
5283 | if (cnt >= sizeof(str)) \ | |
5284 | @@ -2605,31 +2752,32 @@ static int ext4_mb_write_##name(struct file *file, \ | |
5285 | return -ERANGE; \ | |
5286 | sbi->s_mb_##name = value; \ | |
5287 | return cnt; \ | |
5288 | -} | |
5289 | +} \ | |
5290 | + \ | |
5291 | +static const struct file_operations ext4_mb_##name##_proc_fops = { \ | |
5292 | + .owner = THIS_MODULE, \ | |
5293 | + .open = ext4_mb_##name##_proc_open, \ | |
5294 | + .read = seq_read, \ | |
5295 | + .llseek = seq_lseek, \ | |
5296 | + .release = single_release, \ | |
5297 | + .write = ext4_mb_##name##_proc_write, \ | |
5298 | +}; | |
5299 | ||
5300 | -MB_PROC_VALUE_READ(stats); | |
5301 | -MB_PROC_VALUE_WRITE(stats); | |
5302 | -MB_PROC_VALUE_READ(max_to_scan); | |
5303 | -MB_PROC_VALUE_WRITE(max_to_scan); | |
5304 | -MB_PROC_VALUE_READ(min_to_scan); | |
5305 | -MB_PROC_VALUE_WRITE(min_to_scan); | |
5306 | -MB_PROC_VALUE_READ(order2_reqs); | |
5307 | -MB_PROC_VALUE_WRITE(order2_reqs); | |
5308 | -MB_PROC_VALUE_READ(stream_request); | |
5309 | -MB_PROC_VALUE_WRITE(stream_request); | |
5310 | -MB_PROC_VALUE_READ(group_prealloc); | |
5311 | -MB_PROC_VALUE_WRITE(group_prealloc); | |
5312 | +MB_PROC_FOPS(stats); | |
5313 | +MB_PROC_FOPS(max_to_scan); | |
5314 | +MB_PROC_FOPS(min_to_scan); | |
5315 | +MB_PROC_FOPS(order2_reqs); | |
5316 | +MB_PROC_FOPS(stream_request); | |
5317 | +MB_PROC_FOPS(group_prealloc); | |
5318 | ||
5319 | #define MB_PROC_HANDLER(name, var) \ | |
5320 | do { \ | |
5321 | - proc = create_proc_entry(name, mode, sbi->s_mb_proc); \ | |
5322 | + proc = proc_create_data(name, mode, sbi->s_mb_proc, \ | |
5323 | + &ext4_mb_##var##_proc_fops, sbi); \ | |
5324 | if (proc == NULL) { \ | |
5325 | printk(KERN_ERR "EXT4-fs: can't to create %s\n", name); \ | |
5326 | goto err_out; \ | |
5327 | } \ | |
5328 | - proc->data = sbi; \ | |
5329 | - proc->read_proc = ext4_mb_read_##var ; \ | |
5330 | - proc->write_proc = ext4_mb_write_##var; \ | |
5331 | } while (0) | |
5332 | ||
5333 | static int ext4_mb_init_per_dev_proc(struct super_block *sb) | |
5334 | @@ -2639,6 +2787,10 @@ static int ext4_mb_init_per_dev_proc(struct super_block *sb) | |
5335 | struct proc_dir_entry *proc; | |
5336 | char devname[64]; | |
5337 | ||
5338 | + if (proc_root_ext4 == NULL) { | |
5339 | + sbi->s_mb_proc = NULL; | |
5340 | + return -EINVAL; | |
5341 | + } | |
5342 | bdevname(sb->s_bdev, devname); | |
5343 | sbi->s_mb_proc = proc_mkdir(devname, proc_root_ext4); | |
5344 | ||
5345 | @@ -2747,7 +2899,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, | |
5346 | ||
5347 | ||
5348 | err = -EIO; | |
5349 | - bitmap_bh = read_block_bitmap(sb, ac->ac_b_ex.fe_group); | |
5350 | + bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group); | |
5351 | if (!bitmap_bh) | |
5352 | goto out_err; | |
5353 | ||
5354 | @@ -2816,7 +2968,23 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, | |
5355 | le16_add_cpu(&gdp->bg_free_blocks_count, -ac->ac_b_ex.fe_len); | |
5356 | gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp); | |
5357 | spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); | |
5358 | - percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len); | |
5359 | + | |
5360 | + /* | |
5361 | + * free blocks account has already be reduced/reserved | |
5362 | + * at write_begin() time for delayed allocation | |
5363 | + * do not double accounting | |
5364 | + */ | |
5365 | + if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED)) | |
5366 | + percpu_counter_sub(&sbi->s_freeblocks_counter, | |
5367 | + ac->ac_b_ex.fe_len); | |
5368 | + | |
5369 | + if (sbi->s_log_groups_per_flex) { | |
5370 | + ext4_group_t flex_group = ext4_flex_group(sbi, | |
5371 | + ac->ac_b_ex.fe_group); | |
5372 | + spin_lock(sb_bgl_lock(sbi, flex_group)); | |
5373 | + sbi->s_flex_groups[flex_group].free_blocks -= ac->ac_b_ex.fe_len; | |
5374 | + spin_unlock(sb_bgl_lock(sbi, flex_group)); | |
5375 | + } | |
5376 | ||
5377 | err = ext4_journal_dirty_metadata(handle, bitmap_bh); | |
5378 | if (err) | |
5379 | @@ -3096,6 +3264,7 @@ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac, | |
5380 | struct ext4_prealloc_space *pa) | |
5381 | { | |
5382 | unsigned int len = ac->ac_o_ex.fe_len; | |
5383 | + | |
5384 | ext4_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart, | |
5385 | &ac->ac_b_ex.fe_group, | |
5386 | &ac->ac_b_ex.fe_start); | |
5387 | @@ -3113,14 +3282,45 @@ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac, | |
5388 | } | |
5389 | ||
5390 | /* | |
5391 | + * Return the prealloc space that have minimal distance | |
5392 | + * from the goal block. @cpa is the prealloc | |
5393 | + * space that is having currently known minimal distance | |
5394 | + * from the goal block. | |
5395 | + */ | |
5396 | +static struct ext4_prealloc_space * | |
5397 | +ext4_mb_check_group_pa(ext4_fsblk_t goal_block, | |
5398 | + struct ext4_prealloc_space *pa, | |
5399 | + struct ext4_prealloc_space *cpa) | |
5400 | +{ | |
5401 | + ext4_fsblk_t cur_distance, new_distance; | |
5402 | + | |
5403 | + if (cpa == NULL) { | |
5404 | + atomic_inc(&pa->pa_count); | |
5405 | + return pa; | |
5406 | + } | |
5407 | + cur_distance = abs(goal_block - cpa->pa_pstart); | |
5408 | + new_distance = abs(goal_block - pa->pa_pstart); | |
5409 | + | |
5410 | + if (cur_distance < new_distance) | |
5411 | + return cpa; | |
5412 | + | |
5413 | + /* drop the previous reference */ | |
5414 | + atomic_dec(&cpa->pa_count); | |
5415 | + atomic_inc(&pa->pa_count); | |
5416 | + return pa; | |
5417 | +} | |
5418 | + | |
5419 | +/* | |
5420 | * search goal blocks in preallocated space | |
5421 | */ | |
5422 | static noinline_for_stack int | |
5423 | ext4_mb_use_preallocated(struct ext4_allocation_context *ac) | |
5424 | { | |
5425 | + int order, i; | |
5426 | struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); | |
5427 | struct ext4_locality_group *lg; | |
5428 | - struct ext4_prealloc_space *pa; | |
5429 | + struct ext4_prealloc_space *pa, *cpa = NULL; | |
5430 | + ext4_fsblk_t goal_block; | |
5431 | ||
5432 | /* only data can be preallocated */ | |
5433 | if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) | |
5434 | @@ -3158,22 +3358,38 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) | |
5435 | lg = ac->ac_lg; | |
5436 | if (lg == NULL) | |
5437 | return 0; | |
5438 | + order = fls(ac->ac_o_ex.fe_len) - 1; | |
5439 | + if (order > PREALLOC_TB_SIZE - 1) | |
5440 | + /* The max size of hash table is PREALLOC_TB_SIZE */ | |
5441 | + order = PREALLOC_TB_SIZE - 1; | |
5442 | + | |
5443 | + goal_block = ac->ac_g_ex.fe_group * EXT4_BLOCKS_PER_GROUP(ac->ac_sb) + | |
5444 | + ac->ac_g_ex.fe_start + | |
5445 | + le32_to_cpu(EXT4_SB(ac->ac_sb)->s_es->s_first_data_block); | |
5446 | + /* | |
5447 | + * search for the prealloc space that is having | |
5448 | + * minimal distance from the goal block. | |
5449 | + */ | |
5450 | + for (i = order; i < PREALLOC_TB_SIZE; i++) { | |
5451 | + rcu_read_lock(); | |
5452 | + list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[i], | |
5453 | + pa_inode_list) { | |
5454 | + spin_lock(&pa->pa_lock); | |
5455 | + if (pa->pa_deleted == 0 && | |
5456 | + pa->pa_free >= ac->ac_o_ex.fe_len) { | |
5457 | ||
5458 | - rcu_read_lock(); | |
5459 | - list_for_each_entry_rcu(pa, &lg->lg_prealloc_list, pa_inode_list) { | |
5460 | - spin_lock(&pa->pa_lock); | |
5461 | - if (pa->pa_deleted == 0 && pa->pa_free >= ac->ac_o_ex.fe_len) { | |
5462 | - atomic_inc(&pa->pa_count); | |
5463 | - ext4_mb_use_group_pa(ac, pa); | |
5464 | + cpa = ext4_mb_check_group_pa(goal_block, | |
5465 | + pa, cpa); | |
5466 | + } | |
5467 | spin_unlock(&pa->pa_lock); | |
5468 | - ac->ac_criteria = 20; | |
5469 | - rcu_read_unlock(); | |
5470 | - return 1; | |
5471 | } | |
5472 | - spin_unlock(&pa->pa_lock); | |
5473 | + rcu_read_unlock(); | |
5474 | + } | |
5475 | + if (cpa) { | |
5476 | + ext4_mb_use_group_pa(ac, cpa); | |
5477 | + ac->ac_criteria = 20; | |
5478 | + return 1; | |
5479 | } | |
5480 | - rcu_read_unlock(); | |
5481 | - | |
5482 | return 0; | |
5483 | } | |
5484 | ||
5485 | @@ -3396,6 +3612,7 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac) | |
5486 | pa->pa_free = pa->pa_len; | |
5487 | atomic_set(&pa->pa_count, 1); | |
5488 | spin_lock_init(&pa->pa_lock); | |
5489 | + INIT_LIST_HEAD(&pa->pa_inode_list); | |
5490 | pa->pa_deleted = 0; | |
5491 | pa->pa_linear = 1; | |
5492 | ||
5493 | @@ -3416,10 +3633,10 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac) | |
5494 | list_add(&pa->pa_group_list, &grp->bb_prealloc_list); | |
5495 | ext4_unlock_group(sb, ac->ac_b_ex.fe_group); | |
5496 | ||
5497 | - spin_lock(pa->pa_obj_lock); | |
5498 | - list_add_tail_rcu(&pa->pa_inode_list, &lg->lg_prealloc_list); | |
5499 | - spin_unlock(pa->pa_obj_lock); | |
5500 | - | |
5501 | + /* | |
5502 | + * We will later add the new pa to the right bucket | |
5503 | + * after updating the pa_free in ext4_mb_release_context | |
5504 | + */ | |
5505 | return 0; | |
5506 | } | |
5507 | ||
5508 | @@ -3473,8 +3690,6 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, | |
5509 | if (bit >= end) | |
5510 | break; | |
5511 | next = mb_find_next_bit(bitmap_bh->b_data, end, bit); | |
5512 | - if (next > end) | |
5513 | - next = end; | |
5514 | start = group * EXT4_BLOCKS_PER_GROUP(sb) + bit + | |
5515 | le32_to_cpu(sbi->s_es->s_first_data_block); | |
5516 | mb_debug(" free preallocated %u/%u in group %u\n", | |
5517 | @@ -3569,22 +3784,25 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, | |
5518 | if (list_empty(&grp->bb_prealloc_list)) | |
5519 | return 0; | |
5520 | ||
5521 | - bitmap_bh = read_block_bitmap(sb, group); | |
5522 | + bitmap_bh = ext4_read_block_bitmap(sb, group); | |
5523 | if (bitmap_bh == NULL) { | |
5524 | - /* error handling here */ | |
5525 | - ext4_mb_release_desc(&e4b); | |
5526 | - BUG_ON(bitmap_bh == NULL); | |
5527 | + ext4_error(sb, __func__, "Error in reading block " | |
5528 | + "bitmap for %lu\n", group); | |
5529 | + return 0; | |
5530 | } | |
5531 | ||
5532 | err = ext4_mb_load_buddy(sb, group, &e4b); | |
5533 | - BUG_ON(err != 0); /* error handling here */ | |
5534 | + if (err) { | |
5535 | + ext4_error(sb, __func__, "Error in loading buddy " | |
5536 | + "information for %lu\n", group); | |
5537 | + put_bh(bitmap_bh); | |
5538 | + return 0; | |
5539 | + } | |
5540 | ||
5541 | if (needed == 0) | |
5542 | needed = EXT4_BLOCKS_PER_GROUP(sb) + 1; | |
5543 | ||
5544 | - grp = ext4_get_group_info(sb, group); | |
5545 | INIT_LIST_HEAD(&list); | |
5546 | - | |
5547 | ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); | |
5548 | repeat: | |
5549 | ext4_lock_group(sb, group); | |
5550 | @@ -3741,13 +3959,18 @@ repeat: | |
5551 | ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL); | |
5552 | ||
5553 | err = ext4_mb_load_buddy(sb, group, &e4b); | |
5554 | - BUG_ON(err != 0); /* error handling here */ | |
5555 | + if (err) { | |
5556 | + ext4_error(sb, __func__, "Error in loading buddy " | |
5557 | + "information for %lu\n", group); | |
5558 | + continue; | |
5559 | + } | |
5560 | ||
5561 | - bitmap_bh = read_block_bitmap(sb, group); | |
5562 | + bitmap_bh = ext4_read_block_bitmap(sb, group); | |
5563 | if (bitmap_bh == NULL) { | |
5564 | - /* error handling here */ | |
5565 | + ext4_error(sb, __func__, "Error in reading block " | |
5566 | + "bitmap for %lu\n", group); | |
5567 | ext4_mb_release_desc(&e4b); | |
5568 | - BUG_ON(bitmap_bh == NULL); | |
5569 | + continue; | |
5570 | } | |
5571 | ||
5572 | ext4_lock_group(sb, group); | |
5573 | @@ -3950,22 +4173,168 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac, | |
5574 | ||
5575 | } | |
5576 | ||
5577 | +static noinline_for_stack void | |
5578 | +ext4_mb_discard_lg_preallocations(struct super_block *sb, | |
5579 | + struct ext4_locality_group *lg, | |
5580 | + int order, int total_entries) | |
5581 | +{ | |
5582 | + ext4_group_t group = 0; | |
5583 | + struct ext4_buddy e4b; | |
5584 | + struct list_head discard_list; | |
5585 | + struct ext4_prealloc_space *pa, *tmp; | |
5586 | + struct ext4_allocation_context *ac; | |
5587 | + | |
5588 | + mb_debug("discard locality group preallocation\n"); | |
5589 | + | |
5590 | + INIT_LIST_HEAD(&discard_list); | |
5591 | + ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); | |
5592 | + | |
5593 | + spin_lock(&lg->lg_prealloc_lock); | |
5594 | + list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order], | |
5595 | + pa_inode_list) { | |
5596 | + spin_lock(&pa->pa_lock); | |
5597 | + if (atomic_read(&pa->pa_count)) { | |
5598 | + /* | |
5599 | + * This is the pa that we just used | |
5600 | + * for block allocation. So don't | |
5601 | + * free that | |
5602 | + */ | |
5603 | + spin_unlock(&pa->pa_lock); | |
5604 | + continue; | |
5605 | + } | |
5606 | + if (pa->pa_deleted) { | |
5607 | + spin_unlock(&pa->pa_lock); | |
5608 | + continue; | |
5609 | + } | |
5610 | + /* only lg prealloc space */ | |
5611 | + BUG_ON(!pa->pa_linear); | |
5612 | + | |
5613 | + /* seems this one can be freed ... */ | |
5614 | + pa->pa_deleted = 1; | |
5615 | + spin_unlock(&pa->pa_lock); | |
5616 | + | |
5617 | + list_del_rcu(&pa->pa_inode_list); | |
5618 | + list_add(&pa->u.pa_tmp_list, &discard_list); | |
5619 | + | |
5620 | + total_entries--; | |
5621 | + if (total_entries <= 5) { | |
5622 | + /* | |
5623 | + * we want to keep only 5 entries | |
5624 | + * allowing it to grow to 8. This | |
5625 | + * mak sure we don't call discard | |
5626 | + * soon for this list. | |
5627 | + */ | |
5628 | + break; | |
5629 | + } | |
5630 | + } | |
5631 | + spin_unlock(&lg->lg_prealloc_lock); | |
5632 | + | |
5633 | + list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) { | |
5634 | + | |
5635 | + ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL); | |
5636 | + if (ext4_mb_load_buddy(sb, group, &e4b)) { | |
5637 | + ext4_error(sb, __func__, "Error in loading buddy " | |
5638 | + "information for %lu\n", group); | |
5639 | + continue; | |
5640 | + } | |
5641 | + ext4_lock_group(sb, group); | |
5642 | + list_del(&pa->pa_group_list); | |
5643 | + ext4_mb_release_group_pa(&e4b, pa, ac); | |
5644 | + ext4_unlock_group(sb, group); | |
5645 | + | |
5646 | + ext4_mb_release_desc(&e4b); | |
5647 | + list_del(&pa->u.pa_tmp_list); | |
5648 | + call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); | |
5649 | + } | |
5650 | + if (ac) | |
5651 | + kmem_cache_free(ext4_ac_cachep, ac); | |
5652 | +} | |
5653 | + | |
5654 | +/* | |
5655 | + * We have incremented pa_count. So it cannot be freed at this | |
5656 | + * point. Also we hold lg_mutex. So no parallel allocation is | |
5657 | + * possible from this lg. That means pa_free cannot be updated. | |
5658 | + * | |
5659 | + * A parallel ext4_mb_discard_group_preallocations is possible. | |
5660 | + * which can cause the lg_prealloc_list to be updated. | |
5661 | + */ | |
5662 | + | |
5663 | +static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac) | |
5664 | +{ | |
5665 | + int order, added = 0, lg_prealloc_count = 1; | |
5666 | + struct super_block *sb = ac->ac_sb; | |
5667 | + struct ext4_locality_group *lg = ac->ac_lg; | |
5668 | + struct ext4_prealloc_space *tmp_pa, *pa = ac->ac_pa; | |
5669 | + | |
5670 | + order = fls(pa->pa_free) - 1; | |
5671 | + if (order > PREALLOC_TB_SIZE - 1) | |
5672 | + /* The max size of hash table is PREALLOC_TB_SIZE */ | |
5673 | + order = PREALLOC_TB_SIZE - 1; | |
5674 | + /* Add the prealloc space to lg */ | |
5675 | + rcu_read_lock(); | |
5676 | + list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order], | |
5677 | + pa_inode_list) { | |
5678 | + spin_lock(&tmp_pa->pa_lock); | |
5679 | + if (tmp_pa->pa_deleted) { | |
5680 | + spin_unlock(&pa->pa_lock); | |
5681 | + continue; | |
5682 | + } | |
5683 | + if (!added && pa->pa_free < tmp_pa->pa_free) { | |
5684 | + /* Add to the tail of the previous entry */ | |
5685 | + list_add_tail_rcu(&pa->pa_inode_list, | |
5686 | + &tmp_pa->pa_inode_list); | |
5687 | + added = 1; | |
5688 | + /* | |
5689 | + * we want to count the total | |
5690 | + * number of entries in the list | |
5691 | + */ | |
5692 | + } | |
5693 | + spin_unlock(&tmp_pa->pa_lock); | |
5694 | + lg_prealloc_count++; | |
5695 | + } | |
5696 | + if (!added) | |
5697 | + list_add_tail_rcu(&pa->pa_inode_list, | |
5698 | + &lg->lg_prealloc_list[order]); | |
5699 | + rcu_read_unlock(); | |
5700 | + | |
5701 | + /* Now trim the list to be not more than 8 elements */ | |
5702 | + if (lg_prealloc_count > 8) { | |
5703 | + ext4_mb_discard_lg_preallocations(sb, lg, | |
5704 | + order, lg_prealloc_count); | |
5705 | + return; | |
5706 | + } | |
5707 | + return ; | |
5708 | +} | |
5709 | + | |
5710 | /* | |
5711 | * release all resource we used in allocation | |
5712 | */ | |
5713 | static int ext4_mb_release_context(struct ext4_allocation_context *ac) | |
5714 | { | |
5715 | - if (ac->ac_pa) { | |
5716 | - if (ac->ac_pa->pa_linear) { | |
5717 | + struct ext4_prealloc_space *pa = ac->ac_pa; | |
5718 | + if (pa) { | |
5719 | + if (pa->pa_linear) { | |
5720 | /* see comment in ext4_mb_use_group_pa() */ | |
5721 | - spin_lock(&ac->ac_pa->pa_lock); | |
5722 | - ac->ac_pa->pa_pstart += ac->ac_b_ex.fe_len; | |
5723 | - ac->ac_pa->pa_lstart += ac->ac_b_ex.fe_len; | |
5724 | - ac->ac_pa->pa_free -= ac->ac_b_ex.fe_len; | |
5725 | - ac->ac_pa->pa_len -= ac->ac_b_ex.fe_len; | |
5726 | - spin_unlock(&ac->ac_pa->pa_lock); | |
5727 | + spin_lock(&pa->pa_lock); | |
5728 | + pa->pa_pstart += ac->ac_b_ex.fe_len; | |
5729 | + pa->pa_lstart += ac->ac_b_ex.fe_len; | |
5730 | + pa->pa_free -= ac->ac_b_ex.fe_len; | |
5731 | + pa->pa_len -= ac->ac_b_ex.fe_len; | |
5732 | + spin_unlock(&pa->pa_lock); | |
5733 | + /* | |
5734 | + * We want to add the pa to the right bucket. | |
5735 | + * Remove it from the list and while adding | |
5736 | + * make sure the list to which we are adding | |
5737 | + * doesn't grow big. | |
5738 | + */ | |
5739 | + if (likely(pa->pa_free)) { | |
5740 | + spin_lock(pa->pa_obj_lock); | |
5741 | + list_del_rcu(&pa->pa_inode_list); | |
5742 | + spin_unlock(pa->pa_obj_lock); | |
5743 | + ext4_mb_add_n_trim(ac); | |
5744 | + } | |
5745 | } | |
5746 | - ext4_mb_put_pa(ac, ac->ac_sb, ac->ac_pa); | |
5747 | + ext4_mb_put_pa(ac, ac->ac_sb, pa); | |
5748 | } | |
5749 | if (ac->ac_bitmap_page) | |
5750 | page_cache_release(ac->ac_bitmap_page); | |
5751 | @@ -4011,10 +4380,21 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, | |
5752 | sbi = EXT4_SB(sb); | |
5753 | ||
5754 | if (!test_opt(sb, MBALLOC)) { | |
5755 | - block = ext4_new_blocks_old(handle, ar->inode, ar->goal, | |
5756 | + block = ext4_old_new_blocks(handle, ar->inode, ar->goal, | |
5757 | &(ar->len), errp); | |
5758 | return block; | |
5759 | } | |
5760 | + if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) { | |
5761 | + /* | |
5762 | + * With delalloc we already reserved the blocks | |
5763 | + */ | |
5764 | + ar->len = ext4_has_free_blocks(sbi, ar->len); | |
5765 | + } | |
5766 | + | |
5767 | + if (ar->len == 0) { | |
5768 | + *errp = -ENOSPC; | |
5769 | + return 0; | |
5770 | + } | |
5771 | ||
5772 | while (ar->len && DQUOT_ALLOC_BLOCK(ar->inode, ar->len)) { | |
5773 | ar->flags |= EXT4_MB_HINT_NOPREALLOC; | |
5774 | @@ -4026,10 +4406,14 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, | |
5775 | } | |
5776 | inquota = ar->len; | |
5777 | ||
5778 | + if (EXT4_I(ar->inode)->i_delalloc_reserved_flag) | |
5779 | + ar->flags |= EXT4_MB_DELALLOC_RESERVED; | |
5780 | + | |
5781 | ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); | |
5782 | if (!ac) { | |
5783 | + ar->len = 0; | |
5784 | *errp = -ENOMEM; | |
5785 | - return 0; | |
5786 | + goto out1; | |
5787 | } | |
5788 | ||
5789 | ext4_mb_poll_new_transaction(sb, handle); | |
5790 | @@ -4037,12 +4421,11 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, | |
5791 | *errp = ext4_mb_initialize_context(ac, ar); | |
5792 | if (*errp) { | |
5793 | ar->len = 0; | |
5794 | - goto out; | |
5795 | + goto out2; | |
5796 | } | |
5797 | ||
5798 | ac->ac_op = EXT4_MB_HISTORY_PREALLOC; | |
5799 | if (!ext4_mb_use_preallocated(ac)) { | |
5800 | - | |
5801 | ac->ac_op = EXT4_MB_HISTORY_ALLOC; | |
5802 | ext4_mb_normalize_request(ac, ar); | |
5803 | repeat: | |
5804 | @@ -4085,11 +4468,12 @@ repeat: | |
5805 | ||
5806 | ext4_mb_release_context(ac); | |
5807 | ||
5808 | -out: | |
5809 | +out2: | |
5810 | + kmem_cache_free(ext4_ac_cachep, ac); | |
5811 | +out1: | |
5812 | if (ar->len < inquota) | |
5813 | DQUOT_FREE_BLOCK(ar->inode, inquota - ar->len); | |
5814 | ||
5815 | - kmem_cache_free(ext4_ac_cachep, ac); | |
5816 | return block; | |
5817 | } | |
5818 | static void ext4_mb_poll_new_transaction(struct super_block *sb, | |
5819 | @@ -4242,12 +4626,16 @@ do_more: | |
5820 | overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb); | |
5821 | count -= overflow; | |
5822 | } | |
5823 | - bitmap_bh = read_block_bitmap(sb, block_group); | |
5824 | - if (!bitmap_bh) | |
5825 | + bitmap_bh = ext4_read_block_bitmap(sb, block_group); | |
5826 | + if (!bitmap_bh) { | |
5827 | + err = -EIO; | |
5828 | goto error_return; | |
5829 | + } | |
5830 | gdp = ext4_get_group_desc(sb, block_group, &gd_bh); | |
5831 | - if (!gdp) | |
5832 | + if (!gdp) { | |
5833 | + err = -EIO; | |
5834 | goto error_return; | |
5835 | + } | |
5836 | ||
5837 | if (in_range(ext4_block_bitmap(sb, gdp), block, count) || | |
5838 | in_range(ext4_inode_bitmap(sb, gdp), block, count) || | |
5839 | @@ -4309,10 +4697,9 @@ do_more: | |
5840 | ext4_mb_free_metadata(handle, &e4b, block_group, bit, count); | |
5841 | } else { | |
5842 | ext4_lock_group(sb, block_group); | |
5843 | - err = mb_free_blocks(inode, &e4b, bit, count); | |
5844 | + mb_free_blocks(inode, &e4b, bit, count); | |
5845 | ext4_mb_return_to_preallocation(inode, &e4b, block, count); | |
5846 | ext4_unlock_group(sb, block_group); | |
5847 | - BUG_ON(err != 0); | |
5848 | } | |
5849 | ||
5850 | spin_lock(sb_bgl_lock(sbi, block_group)); | |
5851 | @@ -4321,6 +4708,13 @@ do_more: | |
5852 | spin_unlock(sb_bgl_lock(sbi, block_group)); | |
5853 | percpu_counter_add(&sbi->s_freeblocks_counter, count); | |
5854 | ||
5855 | + if (sbi->s_log_groups_per_flex) { | |
5856 | + ext4_group_t flex_group = ext4_flex_group(sbi, block_group); | |
5857 | + spin_lock(sb_bgl_lock(sbi, flex_group)); | |
5858 | + sbi->s_flex_groups[flex_group].free_blocks += count; | |
5859 | + spin_unlock(sb_bgl_lock(sbi, flex_group)); | |
5860 | + } | |
5861 | + | |
5862 | ext4_mb_release_desc(&e4b); | |
5863 | ||
5864 | *freed += count; | |
5865 | diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h | |
5866 | index bfe6add..c7c9906 100644 | |
5867 | --- a/fs/ext4/mballoc.h | |
5868 | +++ b/fs/ext4/mballoc.h | |
5869 | @@ -164,11 +164,17 @@ struct ext4_free_extent { | |
5870 | * Locality group: | |
5871 | * we try to group all related changes together | |
5872 | * so that writeback can flush/allocate them together as well | |
5873 | + * Size of lg_prealloc_list hash is determined by MB_DEFAULT_GROUP_PREALLOC | |
5874 | + * (512). We store prealloc space into the hash based on the pa_free blocks | |
5875 | + * order value.ie, fls(pa_free)-1; | |
5876 | */ | |
5877 | +#define PREALLOC_TB_SIZE 10 | |
5878 | struct ext4_locality_group { | |
5879 | /* for allocator */ | |
5880 | - struct mutex lg_mutex; /* to serialize allocates */ | |
5881 | - struct list_head lg_prealloc_list;/* list of preallocations */ | |
5882 | + /* to serialize allocates */ | |
5883 | + struct mutex lg_mutex; | |
5884 | + /* list of preallocations */ | |
5885 | + struct list_head lg_prealloc_list[PREALLOC_TB_SIZE]; | |
5886 | spinlock_t lg_prealloc_lock; | |
5887 | }; | |
5888 | ||
5889 | diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c | |
5890 | index b9e077b..46fc0b5 100644 | |
5891 | --- a/fs/ext4/migrate.c | |
5892 | +++ b/fs/ext4/migrate.c | |
5893 | @@ -53,7 +53,8 @@ static int finish_range(handle_t *handle, struct inode *inode, | |
5894 | * credit. But below we try to not accumalate too much | |
5895 | * of them by restarting the journal. | |
5896 | */ | |
5897 | - needed = ext4_ext_calc_credits_for_insert(inode, path); | |
5898 | + needed = ext4_ext_calc_credits_for_single_extent(inode, | |
5899 | + lb->last_block - lb->first_block + 1, path); | |
5900 | ||
5901 | /* | |
5902 | * Make sure the credit we accumalated is not really high | |
5903 | diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c | |
5904 | index ab16bea..387ad98 100644 | |
5905 | --- a/fs/ext4/namei.c | |
5906 | +++ b/fs/ext4/namei.c | |
5907 | @@ -183,6 +183,16 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, | |
5908 | struct inode *inode); | |
5909 | ||
5910 | /* | |
5911 | + * p is at least 6 bytes before the end of page | |
5912 | + */ | |
5913 | +static inline struct ext4_dir_entry_2 * | |
5914 | +ext4_next_entry(struct ext4_dir_entry_2 *p) | |
5915 | +{ | |
5916 | + return (struct ext4_dir_entry_2 *)((char *)p + | |
5917 | + ext4_rec_len_from_disk(p->rec_len)); | |
5918 | +} | |
5919 | + | |
5920 | +/* | |
5921 | * Future: use high four bits of block for coalesce-on-delete flags | |
5922 | * Mask them off for now. | |
5923 | */ | |
5924 | @@ -231,13 +241,13 @@ static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize) | |
5925 | { | |
5926 | unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) - | |
5927 | EXT4_DIR_REC_LEN(2) - infosize; | |
5928 | - return 0? 20: entry_space / sizeof(struct dx_entry); | |
5929 | + return entry_space / sizeof(struct dx_entry); | |
5930 | } | |
5931 | ||
5932 | static inline unsigned dx_node_limit (struct inode *dir) | |
5933 | { | |
5934 | unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0); | |
5935 | - return 0? 22: entry_space / sizeof(struct dx_entry); | |
5936 | + return entry_space / sizeof(struct dx_entry); | |
5937 | } | |
5938 | ||
5939 | /* | |
5940 | @@ -554,15 +564,6 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash, | |
5941 | ||
5942 | ||
5943 | /* | |
5944 | - * p is at least 6 bytes before the end of page | |
5945 | - */ | |
5946 | -static inline struct ext4_dir_entry_2 *ext4_next_entry(struct ext4_dir_entry_2 *p) | |
5947 | -{ | |
5948 | - return (struct ext4_dir_entry_2 *)((char *)p + | |
5949 | - ext4_rec_len_from_disk(p->rec_len)); | |
5950 | -} | |
5951 | - | |
5952 | -/* | |
5953 | * This function fills a red-black tree with information from a | |
5954 | * directory block. It returns the number directory entries loaded | |
5955 | * into the tree. If there is an error it is returned in err. | |
5956 | @@ -993,19 +994,21 @@ static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry, | |
5957 | de = (struct ext4_dir_entry_2 *) bh->b_data; | |
5958 | top = (struct ext4_dir_entry_2 *) ((char *) de + sb->s_blocksize - | |
5959 | EXT4_DIR_REC_LEN(0)); | |
5960 | - for (; de < top; de = ext4_next_entry(de)) | |
5961 | - if (ext4_match (namelen, name, de)) { | |
5962 | - if (!ext4_check_dir_entry("ext4_find_entry", | |
5963 | - dir, de, bh, | |
5964 | - (block<<EXT4_BLOCK_SIZE_BITS(sb)) | |
5965 | - +((char *)de - bh->b_data))) { | |
5966 | - brelse (bh); | |
5967 | + for (; de < top; de = ext4_next_entry(de)) { | |
5968 | + int off = (block << EXT4_BLOCK_SIZE_BITS(sb)) | |
5969 | + + ((char *) de - bh->b_data); | |
5970 | + | |
5971 | + if (!ext4_check_dir_entry(__func__, dir, de, bh, off)) { | |
5972 | + brelse(bh); | |
5973 | *err = ERR_BAD_DX_DIR; | |
5974 | goto errout; | |
5975 | } | |
5976 | - *res_dir = de; | |
5977 | - dx_release (frames); | |
5978 | - return bh; | |
5979 | + | |
5980 | + if (ext4_match(namelen, name, de)) { | |
5981 | + *res_dir = de; | |
5982 | + dx_release(frames); | |
5983 | + return bh; | |
5984 | + } | |
5985 | } | |
5986 | brelse (bh); | |
5987 | /* Check to see if we should continue to search */ | |
5988 | diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c | |
5989 | index 9ff7b1c..b3d3560 100644 | |
5990 | --- a/fs/ext4/resize.c | |
5991 | +++ b/fs/ext4/resize.c | |
5992 | @@ -73,7 +73,7 @@ static int verify_group_input(struct super_block *sb, | |
5993 | "Inode bitmap not in group (block %llu)", | |
5994 | (unsigned long long)input->inode_bitmap); | |
5995 | else if (outside(input->inode_table, start, end) || | |
5996 | - outside(itend - 1, start, end)) | |
5997 | + outside(itend - 1, start, end)) | |
5998 | ext4_warning(sb, __func__, | |
5999 | "Inode table not in group (blocks %llu-%llu)", | |
6000 | (unsigned long long)input->inode_table, itend - 1); | |
6001 | @@ -104,7 +104,7 @@ static int verify_group_input(struct super_block *sb, | |
6002 | (unsigned long long)input->inode_bitmap, | |
6003 | start, metaend - 1); | |
6004 | else if (inside(input->inode_table, start, metaend) || | |
6005 | - inside(itend - 1, start, metaend)) | |
6006 | + inside(itend - 1, start, metaend)) | |
6007 | ext4_warning(sb, __func__, | |
6008 | "Inode table (%llu-%llu) overlaps" | |
6009 | "GDT table (%llu-%llu)", | |
6010 | @@ -158,9 +158,9 @@ static int extend_or_restart_transaction(handle_t *handle, int thresh, | |
6011 | if (err) { | |
6012 | if ((err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA))) | |
6013 | return err; | |
6014 | - if ((err = ext4_journal_get_write_access(handle, bh))) | |
6015 | + if ((err = ext4_journal_get_write_access(handle, bh))) | |
6016 | return err; | |
6017 | - } | |
6018 | + } | |
6019 | ||
6020 | return 0; | |
6021 | } | |
6022 | @@ -416,11 +416,11 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, | |
6023 | "EXT4-fs: ext4_add_new_gdb: adding group block %lu\n", | |
6024 | gdb_num); | |
6025 | ||
6026 | - /* | |
6027 | - * If we are not using the primary superblock/GDT copy don't resize, | |
6028 | - * because the user tools have no way of handling this. Probably a | |
6029 | - * bad time to do it anyways. | |
6030 | - */ | |
6031 | + /* | |
6032 | + * If we are not using the primary superblock/GDT copy don't resize, | |
6033 | + * because the user tools have no way of handling this. Probably a | |
6034 | + * bad time to do it anyways. | |
6035 | + */ | |
6036 | if (EXT4_SB(sb)->s_sbh->b_blocknr != | |
6037 | le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) { | |
6038 | ext4_warning(sb, __func__, | |
6039 | @@ -507,14 +507,14 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, | |
6040 | return 0; | |
6041 | ||
6042 | exit_inode: | |
6043 | - //ext4_journal_release_buffer(handle, iloc.bh); | |
6044 | + /* ext4_journal_release_buffer(handle, iloc.bh); */ | |
6045 | brelse(iloc.bh); | |
6046 | exit_dindj: | |
6047 | - //ext4_journal_release_buffer(handle, dind); | |
6048 | + /* ext4_journal_release_buffer(handle, dind); */ | |
6049 | exit_primary: | |
6050 | - //ext4_journal_release_buffer(handle, *primary); | |
6051 | + /* ext4_journal_release_buffer(handle, *primary); */ | |
6052 | exit_sbh: | |
6053 | - //ext4_journal_release_buffer(handle, *primary); | |
6054 | + /* ext4_journal_release_buffer(handle, *primary); */ | |
6055 | exit_dind: | |
6056 | brelse(dind); | |
6057 | exit_bh: | |
6058 | @@ -773,7 +773,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) | |
6059 | ||
6060 | if (reserved_gdb || gdb_off == 0) { | |
6061 | if (!EXT4_HAS_COMPAT_FEATURE(sb, | |
6062 | - EXT4_FEATURE_COMPAT_RESIZE_INODE)){ | |
6063 | + EXT4_FEATURE_COMPAT_RESIZE_INODE) | |
6064 | + || !le16_to_cpu(es->s_reserved_gdt_blocks)) { | |
6065 | ext4_warning(sb, __func__, | |
6066 | "No reserved GDT blocks, can't resize"); | |
6067 | return -EPERM; | |
6068 | @@ -818,12 +819,12 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) | |
6069 | if ((err = ext4_journal_get_write_access(handle, sbi->s_sbh))) | |
6070 | goto exit_journal; | |
6071 | ||
6072 | - /* | |
6073 | - * We will only either add reserved group blocks to a backup group | |
6074 | - * or remove reserved blocks for the first group in a new group block. | |
6075 | - * Doing both would be mean more complex code, and sane people don't | |
6076 | - * use non-sparse filesystems anymore. This is already checked above. | |
6077 | - */ | |
6078 | + /* | |
6079 | + * We will only either add reserved group blocks to a backup group | |
6080 | + * or remove reserved blocks for the first group in a new group block. | |
6081 | + * Doing both would be mean more complex code, and sane people don't | |
6082 | + * use non-sparse filesystems anymore. This is already checked above. | |
6083 | + */ | |
6084 | if (gdb_off) { | |
6085 | primary = sbi->s_group_desc[gdb_num]; | |
6086 | if ((err = ext4_journal_get_write_access(handle, primary))) | |
6087 | @@ -835,24 +836,24 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) | |
6088 | } else if ((err = add_new_gdb(handle, inode, input, &primary))) | |
6089 | goto exit_journal; | |
6090 | ||
6091 | - /* | |
6092 | - * OK, now we've set up the new group. Time to make it active. | |
6093 | - * | |
6094 | - * Current kernels don't lock all allocations via lock_super(), | |
6095 | - * so we have to be safe wrt. concurrent accesses the group | |
6096 | - * data. So we need to be careful to set all of the relevant | |
6097 | - * group descriptor data etc. *before* we enable the group. | |
6098 | - * | |
6099 | - * The key field here is sbi->s_groups_count: as long as | |
6100 | - * that retains its old value, nobody is going to access the new | |
6101 | - * group. | |
6102 | - * | |
6103 | - * So first we update all the descriptor metadata for the new | |
6104 | - * group; then we update the total disk blocks count; then we | |
6105 | - * update the groups count to enable the group; then finally we | |
6106 | - * update the free space counts so that the system can start | |
6107 | - * using the new disk blocks. | |
6108 | - */ | |
6109 | + /* | |
6110 | + * OK, now we've set up the new group. Time to make it active. | |
6111 | + * | |
6112 | + * Current kernels don't lock all allocations via lock_super(), | |
6113 | + * so we have to be safe wrt. concurrent accesses the group | |
6114 | + * data. So we need to be careful to set all of the relevant | |
6115 | + * group descriptor data etc. *before* we enable the group. | |
6116 | + * | |
6117 | + * The key field here is sbi->s_groups_count: as long as | |
6118 | + * that retains its old value, nobody is going to access the new | |
6119 | + * group. | |
6120 | + * | |
6121 | + * So first we update all the descriptor metadata for the new | |
6122 | + * group; then we update the total disk blocks count; then we | |
6123 | + * update the groups count to enable the group; then finally we | |
6124 | + * update the free space counts so that the system can start | |
6125 | + * using the new disk blocks. | |
6126 | + */ | |
6127 | ||
6128 | /* Update group descriptor block for new group */ | |
6129 | gdp = (struct ext4_group_desc *)((char *)primary->b_data + | |
6130 | @@ -866,6 +867,15 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) | |
6131 | gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp); | |
6132 | ||
6133 | /* | |
6134 | + * We can allocate memory for mb_alloc based on the new group | |
6135 | + * descriptor | |
6136 | + */ | |
6137 | + if (test_opt(sb, MBALLOC)) { | |
6138 | + err = ext4_mb_add_more_groupinfo(sb, input->group, gdp); | |
6139 | + if (err) | |
6140 | + goto exit_journal; | |
6141 | + } | |
6142 | + /* | |
6143 | * Make the new blocks and inodes valid next. We do this before | |
6144 | * increasing the group count so that once the group is enabled, | |
6145 | * all of its blocks and inodes are already valid. | |
6146 | @@ -937,7 +947,8 @@ exit_put: | |
6147 | return err; | |
6148 | } /* ext4_group_add */ | |
6149 | ||
6150 | -/* Extend the filesystem to the new number of blocks specified. This entry | |
6151 | +/* | |
6152 | + * Extend the filesystem to the new number of blocks specified. This entry | |
6153 | * point is only used to extend the current filesystem to the end of the last | |
6154 | * existing group. It can be accessed via ioctl, or by "remount,resize=<size>" | |
6155 | * for emergencies (because it has no dependencies on reserved blocks). | |
6156 | @@ -957,6 +968,8 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, | |
6157 | handle_t *handle; | |
6158 | int err; | |
6159 | unsigned long freed_blocks; | |
6160 | + ext4_group_t group; | |
6161 | + struct ext4_group_info *grp; | |
6162 | ||
6163 | /* We don't need to worry about locking wrt other resizers just | |
6164 | * yet: we're going to revalidate es->s_blocks_count after | |
6165 | @@ -988,7 +1001,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, | |
6166 | } | |
6167 | ||
6168 | /* Handle the remaining blocks in the last group only. */ | |
6169 | - ext4_get_group_no_and_offset(sb, o_blocks_count, NULL, &last); | |
6170 | + ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last); | |
6171 | ||
6172 | if (last == 0) { | |
6173 | ext4_warning(sb, __func__, | |
6174 | @@ -1013,7 +1026,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, | |
6175 | o_blocks_count + add, add); | |
6176 | ||
6177 | /* See if the device is actually as big as what was requested */ | |
6178 | - bh = sb_bread(sb, o_blocks_count + add -1); | |
6179 | + bh = sb_bread(sb, o_blocks_count + add - 1); | |
6180 | if (!bh) { | |
6181 | ext4_warning(sb, __func__, | |
6182 | "can't read last block, resize aborted"); | |
6183 | @@ -1060,6 +1073,45 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, | |
6184 | o_blocks_count + add); | |
6185 | if ((err = ext4_journal_stop(handle))) | |
6186 | goto exit_put; | |
6187 | + | |
6188 | + /* | |
6189 | + * Mark mballoc pages as not up to date so that they will be updated | |
6190 | + * next time they are loaded by ext4_mb_load_buddy. | |
6191 | + */ | |
6192 | + if (test_opt(sb, MBALLOC)) { | |
6193 | + struct ext4_sb_info *sbi = EXT4_SB(sb); | |
6194 | + struct inode *inode = sbi->s_buddy_cache; | |
6195 | + int blocks_per_page; | |
6196 | + int block; | |
6197 | + int pnum; | |
6198 | + struct page *page; | |
6199 | + | |
6200 | + /* Set buddy page as not up to date */ | |
6201 | + blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; | |
6202 | + block = group * 2; | |
6203 | + pnum = block / blocks_per_page; | |
6204 | + page = find_get_page(inode->i_mapping, pnum); | |
6205 | + if (page != NULL) { | |
6206 | + ClearPageUptodate(page); | |
6207 | + page_cache_release(page); | |
6208 | + } | |
6209 | + | |
6210 | + /* Set bitmap page as not up to date */ | |
6211 | + block++; | |
6212 | + pnum = block / blocks_per_page; | |
6213 | + page = find_get_page(inode->i_mapping, pnum); | |
6214 | + if (page != NULL) { | |
6215 | + ClearPageUptodate(page); | |
6216 | + page_cache_release(page); | |
6217 | + } | |
6218 | + | |
6219 | + /* Get the info on the last group */ | |
6220 | + grp = ext4_get_group_info(sb, group); | |
6221 | + | |
6222 | + /* Update free blocks in group info */ | |
6223 | + ext4_mb_update_group_info(grp, add); | |
6224 | + } | |
6225 | + | |
6226 | if (test_opt(sb, DEBUG)) | |
6227 | printk(KERN_DEBUG "EXT4-fs: extended group to %llu blocks\n", | |
6228 | ext4_blocks_count(es)); | |
6229 | diff --git a/fs/ext4/super.c b/fs/ext4/super.c | |
6230 | index 02bf243..ed80f9f 100644 | |
6231 | --- a/fs/ext4/super.c | |
6232 | +++ b/fs/ext4/super.c | |
6233 | @@ -49,20 +49,19 @@ static int ext4_load_journal(struct super_block *, struct ext4_super_block *, | |
6234 | unsigned long journal_devnum); | |
6235 | static int ext4_create_journal(struct super_block *, struct ext4_super_block *, | |
6236 | unsigned int); | |
6237 | -static void ext4_commit_super (struct super_block * sb, | |
6238 | - struct ext4_super_block * es, | |
6239 | - int sync); | |
6240 | -static void ext4_mark_recovery_complete(struct super_block * sb, | |
6241 | - struct ext4_super_block * es); | |
6242 | -static void ext4_clear_journal_err(struct super_block * sb, | |
6243 | - struct ext4_super_block * es); | |
6244 | +static void ext4_commit_super(struct super_block *sb, | |
6245 | + struct ext4_super_block *es, int sync); | |
6246 | +static void ext4_mark_recovery_complete(struct super_block *sb, | |
6247 | + struct ext4_super_block *es); | |
6248 | +static void ext4_clear_journal_err(struct super_block *sb, | |
6249 | + struct ext4_super_block *es); | |
6250 | static int ext4_sync_fs(struct super_block *sb, int wait); | |
6251 | -static const char *ext4_decode_error(struct super_block * sb, int errno, | |
6252 | +static const char *ext4_decode_error(struct super_block *sb, int errno, | |
6253 | char nbuf[16]); | |
6254 | -static int ext4_remount (struct super_block * sb, int * flags, char * data); | |
6255 | -static int ext4_statfs (struct dentry * dentry, struct kstatfs * buf); | |
6256 | +static int ext4_remount(struct super_block *sb, int *flags, char *data); | |
6257 | +static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf); | |
6258 | static void ext4_unlockfs(struct super_block *sb); | |
6259 | -static void ext4_write_super (struct super_block * sb); | |
6260 | +static void ext4_write_super(struct super_block *sb); | |
6261 | static void ext4_write_super_lockfs(struct super_block *sb); | |
6262 | ||
6263 | ||
6264 | @@ -211,15 +210,15 @@ static void ext4_handle_error(struct super_block *sb) | |
6265 | if (sb->s_flags & MS_RDONLY) | |
6266 | return; | |
6267 | ||
6268 | - if (!test_opt (sb, ERRORS_CONT)) { | |
6269 | + if (!test_opt(sb, ERRORS_CONT)) { | |
6270 | journal_t *journal = EXT4_SB(sb)->s_journal; | |
6271 | ||
6272 | EXT4_SB(sb)->s_mount_opt |= EXT4_MOUNT_ABORT; | |
6273 | if (journal) | |
6274 | jbd2_journal_abort(journal, -EIO); | |
6275 | } | |
6276 | - if (test_opt (sb, ERRORS_RO)) { | |
6277 | - printk (KERN_CRIT "Remounting filesystem read-only\n"); | |
6278 | + if (test_opt(sb, ERRORS_RO)) { | |
6279 | + printk(KERN_CRIT "Remounting filesystem read-only\n"); | |
6280 | sb->s_flags |= MS_RDONLY; | |
6281 | } | |
6282 | ext4_commit_super(sb, es, 1); | |
6283 | @@ -228,13 +227,13 @@ static void ext4_handle_error(struct super_block *sb) | |
6284 | sb->s_id); | |
6285 | } | |
6286 | ||
6287 | -void ext4_error (struct super_block * sb, const char * function, | |
6288 | - const char * fmt, ...) | |
6289 | +void ext4_error(struct super_block *sb, const char *function, | |
6290 | + const char *fmt, ...) | |
6291 | { | |
6292 | va_list args; | |
6293 | ||
6294 | va_start(args, fmt); | |
6295 | - printk(KERN_CRIT "EXT4-fs error (device %s): %s: ",sb->s_id, function); | |
6296 | + printk(KERN_CRIT "EXT4-fs error (device %s): %s: ", sb->s_id, function); | |
6297 | vprintk(fmt, args); | |
6298 | printk("\n"); | |
6299 | va_end(args); | |
6300 | @@ -242,7 +241,7 @@ void ext4_error (struct super_block * sb, const char * function, | |
6301 | ext4_handle_error(sb); | |
6302 | } | |
6303 | ||
6304 | -static const char *ext4_decode_error(struct super_block * sb, int errno, | |
6305 | +static const char *ext4_decode_error(struct super_block *sb, int errno, | |
6306 | char nbuf[16]) | |
6307 | { | |
6308 | char *errstr = NULL; | |
6309 | @@ -278,8 +277,7 @@ static const char *ext4_decode_error(struct super_block * sb, int errno, | |
6310 | /* __ext4_std_error decodes expected errors from journaling functions | |
6311 | * automatically and invokes the appropriate error response. */ | |
6312 | ||
6313 | -void __ext4_std_error (struct super_block * sb, const char * function, | |
6314 | - int errno) | |
6315 | +void __ext4_std_error(struct super_block *sb, const char *function, int errno) | |
6316 | { | |
6317 | char nbuf[16]; | |
6318 | const char *errstr; | |
6319 | @@ -292,8 +290,8 @@ void __ext4_std_error (struct super_block * sb, const char * function, | |
6320 | return; | |
6321 | ||
6322 | errstr = ext4_decode_error(sb, errno, nbuf); | |
6323 | - printk (KERN_CRIT "EXT4-fs error (device %s) in %s: %s\n", | |
6324 | - sb->s_id, function, errstr); | |
6325 | + printk(KERN_CRIT "EXT4-fs error (device %s) in %s: %s\n", | |
6326 | + sb->s_id, function, errstr); | |
6327 | ||
6328 | ext4_handle_error(sb); | |
6329 | } | |
6330 | @@ -308,15 +306,15 @@ void __ext4_std_error (struct super_block * sb, const char * function, | |
6331 | * case we take the easy way out and panic immediately. | |
6332 | */ | |
6333 | ||
6334 | -void ext4_abort (struct super_block * sb, const char * function, | |
6335 | - const char * fmt, ...) | |
6336 | +void ext4_abort(struct super_block *sb, const char *function, | |
6337 | + const char *fmt, ...) | |
6338 | { | |
6339 | va_list args; | |
6340 | ||
6341 | - printk (KERN_CRIT "ext4_abort called.\n"); | |
6342 | + printk(KERN_CRIT "ext4_abort called.\n"); | |
6343 | ||
6344 | va_start(args, fmt); | |
6345 | - printk(KERN_CRIT "EXT4-fs error (device %s): %s: ",sb->s_id, function); | |
6346 | + printk(KERN_CRIT "EXT4-fs error (device %s): %s: ", sb->s_id, function); | |
6347 | vprintk(fmt, args); | |
6348 | printk("\n"); | |
6349 | va_end(args); | |
6350 | @@ -334,8 +332,8 @@ void ext4_abort (struct super_block * sb, const char * function, | |
6351 | jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO); | |
6352 | } | |
6353 | ||
6354 | -void ext4_warning (struct super_block * sb, const char * function, | |
6355 | - const char * fmt, ...) | |
6356 | +void ext4_warning(struct super_block *sb, const char *function, | |
6357 | + const char *fmt, ...) | |
6358 | { | |
6359 | va_list args; | |
6360 | ||
6361 | @@ -496,7 +494,7 @@ static void dump_orphan_list(struct super_block *sb, struct ext4_sb_info *sbi) | |
6362 | } | |
6363 | } | |
6364 | ||
6365 | -static void ext4_put_super (struct super_block * sb) | |
6366 | +static void ext4_put_super(struct super_block *sb) | |
6367 | { | |
6368 | struct ext4_sb_info *sbi = EXT4_SB(sb); | |
6369 | struct ext4_super_block *es = sbi->s_es; | |
6370 | @@ -506,6 +504,7 @@ static void ext4_put_super (struct super_block * sb) | |
6371 | ext4_ext_release(sb); | |
6372 | ext4_xattr_put_super(sb); | |
6373 | jbd2_journal_destroy(sbi->s_journal); | |
6374 | + sbi->s_journal = NULL; | |
6375 | if (!(sb->s_flags & MS_RDONLY)) { | |
6376 | EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); | |
6377 | es->s_state = cpu_to_le16(sbi->s_mount_state); | |
6378 | @@ -517,6 +516,7 @@ static void ext4_put_super (struct super_block * sb) | |
6379 | for (i = 0; i < sbi->s_gdb_count; i++) | |
6380 | brelse(sbi->s_group_desc[i]); | |
6381 | kfree(sbi->s_group_desc); | |
6382 | + kfree(sbi->s_flex_groups); | |
6383 | percpu_counter_destroy(&sbi->s_freeblocks_counter); | |
6384 | percpu_counter_destroy(&sbi->s_freeinodes_counter); | |
6385 | percpu_counter_destroy(&sbi->s_dirs_counter); | |
6386 | @@ -568,9 +568,16 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) | |
6387 | #endif | |
6388 | ei->i_block_alloc_info = NULL; | |
6389 | ei->vfs_inode.i_version = 1; | |
6390 | + ei->vfs_inode.i_data.writeback_index = 0; | |
6391 | memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache)); | |
6392 | INIT_LIST_HEAD(&ei->i_prealloc_list); | |
6393 | spin_lock_init(&ei->i_prealloc_lock); | |
6394 | + jbd2_journal_init_jbd_inode(&ei->jinode, &ei->vfs_inode); | |
6395 | + ei->i_reserved_data_blocks = 0; | |
6396 | + ei->i_reserved_meta_blocks = 0; | |
6397 | + ei->i_allocated_meta_blocks = 0; | |
6398 | + ei->i_delalloc_reserved_flag = 0; | |
6399 | + spin_lock_init(&(ei->i_block_reservation_lock)); | |
6400 | return &ei->vfs_inode; | |
6401 | } | |
6402 | ||
6403 | @@ -635,9 +642,12 @@ static void ext4_clear_inode(struct inode *inode) | |
6404 | EXT4_I(inode)->i_block_alloc_info = NULL; | |
6405 | if (unlikely(rsv)) | |
6406 | kfree(rsv); | |
6407 | + jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal, | |
6408 | + &EXT4_I(inode)->jinode); | |
6409 | } | |
6410 | ||
6411 | -static inline void ext4_show_quota_options(struct seq_file *seq, struct super_block *sb) | |
6412 | +static inline void ext4_show_quota_options(struct seq_file *seq, | |
6413 | + struct super_block *sb) | |
6414 | { | |
6415 | #if defined(CONFIG_QUOTA) | |
6416 | struct ext4_sb_info *sbi = EXT4_SB(sb); | |
6417 | @@ -671,7 +681,6 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) | |
6418 | unsigned long def_mount_opts; | |
6419 | struct super_block *sb = vfs->mnt_sb; | |
6420 | struct ext4_sb_info *sbi = EXT4_SB(sb); | |
6421 | - journal_t *journal = sbi->s_journal; | |
6422 | struct ext4_super_block *es = sbi->s_es; | |
6423 | ||
6424 | def_mount_opts = le32_to_cpu(es->s_default_mount_opts); | |
6425 | @@ -747,6 +756,9 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) | |
6426 | seq_puts(seq, ",nomballoc"); | |
6427 | if (test_opt(sb, I_VERSION)) | |
6428 | seq_puts(seq, ",i_version"); | |
6429 | + if (!test_opt(sb, DELALLOC)) | |
6430 | + seq_puts(seq, ",nodelalloc"); | |
6431 | + | |
6432 | ||
6433 | if (sbi->s_stripe) | |
6434 | seq_printf(seq, ",stripe=%lu", sbi->s_stripe); | |
6435 | @@ -810,8 +822,8 @@ static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid, | |
6436 | } | |
6437 | ||
6438 | #ifdef CONFIG_QUOTA | |
6439 | -#define QTYPE2NAME(t) ((t)==USRQUOTA?"user":"group") | |
6440 | -#define QTYPE2MOPT(on, t) ((t)==USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA)) | |
6441 | +#define QTYPE2NAME(t) ((t) == USRQUOTA?"user":"group") | |
6442 | +#define QTYPE2MOPT(on, t) ((t) == USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA)) | |
6443 | ||
6444 | static int ext4_dquot_initialize(struct inode *inode, int type); | |
6445 | static int ext4_dquot_drop(struct inode *inode); | |
6446 | @@ -894,7 +906,7 @@ enum { | |
6447 | Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, | |
6448 | Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, | |
6449 | Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version, | |
6450 | - Opt_mballoc, Opt_nomballoc, Opt_stripe, | |
6451 | + Opt_mballoc, Opt_nomballoc, Opt_stripe, Opt_delalloc, Opt_nodelalloc, | |
6452 | }; | |
6453 | ||
6454 | static match_table_t tokens = { | |
6455 | @@ -953,6 +965,8 @@ static match_table_t tokens = { | |
6456 | {Opt_nomballoc, "nomballoc"}, | |
6457 | {Opt_stripe, "stripe=%u"}, | |
6458 | {Opt_resize, "resize"}, | |
6459 | + {Opt_delalloc, "delalloc"}, | |
6460 | + {Opt_nodelalloc, "nodelalloc"}, | |
6461 | {Opt_err, NULL}, | |
6462 | }; | |
6463 | ||
6464 | @@ -977,12 +991,12 @@ static ext4_fsblk_t get_sb_block(void **data) | |
6465 | return sb_block; | |
6466 | } | |
6467 | ||
6468 | -static int parse_options (char *options, struct super_block *sb, | |
6469 | - unsigned int *inum, unsigned long *journal_devnum, | |
6470 | - ext4_fsblk_t *n_blocks_count, int is_remount) | |
6471 | +static int parse_options(char *options, struct super_block *sb, | |
6472 | + unsigned int *inum, unsigned long *journal_devnum, | |
6473 | + ext4_fsblk_t *n_blocks_count, int is_remount) | |
6474 | { | |
6475 | struct ext4_sb_info *sbi = EXT4_SB(sb); | |
6476 | - char * p; | |
6477 | + char *p; | |
6478 | substring_t args[MAX_OPT_ARGS]; | |
6479 | int data_opt = 0; | |
6480 | int option; | |
6481 | @@ -990,11 +1004,12 @@ static int parse_options (char *options, struct super_block *sb, | |
6482 | int qtype, qfmt; | |
6483 | char *qname; | |
6484 | #endif | |
6485 | + ext4_fsblk_t last_block; | |
6486 | ||
6487 | if (!options) | |
6488 | return 1; | |
6489 | ||
6490 | - while ((p = strsep (&options, ",")) != NULL) { | |
6491 | + while ((p = strsep(&options, ",")) != NULL) { | |
6492 | int token; | |
6493 | if (!*p) | |
6494 | continue; | |
6495 | @@ -1002,16 +1017,16 @@ static int parse_options (char *options, struct super_block *sb, | |
6496 | token = match_token(p, tokens, args); | |
6497 | switch (token) { | |
6498 | case Opt_bsd_df: | |
6499 | - clear_opt (sbi->s_mount_opt, MINIX_DF); | |
6500 | + clear_opt(sbi->s_mount_opt, MINIX_DF); | |
6501 | break; | |
6502 | case Opt_minix_df: | |
6503 | - set_opt (sbi->s_mount_opt, MINIX_DF); | |
6504 | + set_opt(sbi->s_mount_opt, MINIX_DF); | |
6505 | break; | |
6506 | case Opt_grpid: | |
6507 | - set_opt (sbi->s_mount_opt, GRPID); | |
6508 | + set_opt(sbi->s_mount_opt, GRPID); | |
6509 | break; | |
6510 | case Opt_nogrpid: | |
6511 | - clear_opt (sbi->s_mount_opt, GRPID); | |
6512 | + clear_opt(sbi->s_mount_opt, GRPID); | |
6513 | break; | |
6514 | case Opt_resuid: | |
6515 | if (match_int(&args[0], &option)) | |
6516 | @@ -1028,41 +1043,41 @@ static int parse_options (char *options, struct super_block *sb, | |
6517 | /* *sb_block = match_int(&args[0]); */ | |
6518 | break; | |
6519 | case Opt_err_panic: | |
6520 | - clear_opt (sbi->s_mount_opt, ERRORS_CONT); | |
6521 | - clear_opt (sbi->s_mount_opt, ERRORS_RO); | |
6522 | - set_opt (sbi->s_mount_opt, ERRORS_PANIC); | |
6523 | + clear_opt(sbi->s_mount_opt, ERRORS_CONT); | |
6524 | + clear_opt(sbi->s_mount_opt, ERRORS_RO); | |
6525 | + set_opt(sbi->s_mount_opt, ERRORS_PANIC); | |
6526 | break; | |
6527 | case Opt_err_ro: | |
6528 | - clear_opt (sbi->s_mount_opt, ERRORS_CONT); | |
6529 | - clear_opt (sbi->s_mount_opt, ERRORS_PANIC); | |
6530 | - set_opt (sbi->s_mount_opt, ERRORS_RO); | |
6531 | + clear_opt(sbi->s_mount_opt, ERRORS_CONT); | |
6532 | + clear_opt(sbi->s_mount_opt, ERRORS_PANIC); | |
6533 | + set_opt(sbi->s_mount_opt, ERRORS_RO); | |
6534 | break; | |
6535 | case Opt_err_cont: | |
6536 | - clear_opt (sbi->s_mount_opt, ERRORS_RO); | |
6537 | - clear_opt (sbi->s_mount_opt, ERRORS_PANIC); | |
6538 | - set_opt (sbi->s_mount_opt, ERRORS_CONT); | |
6539 | + clear_opt(sbi->s_mount_opt, ERRORS_RO); | |
6540 | + clear_opt(sbi->s_mount_opt, ERRORS_PANIC); | |
6541 | + set_opt(sbi->s_mount_opt, ERRORS_CONT); | |
6542 | break; | |
6543 | case Opt_nouid32: | |
6544 | - set_opt (sbi->s_mount_opt, NO_UID32); | |
6545 | + set_opt(sbi->s_mount_opt, NO_UID32); | |
6546 | break; | |
6547 | case Opt_nocheck: | |
6548 | - clear_opt (sbi->s_mount_opt, CHECK); | |
6549 | + clear_opt(sbi->s_mount_opt, CHECK); | |
6550 | break; | |
6551 | case Opt_debug: | |
6552 | - set_opt (sbi->s_mount_opt, DEBUG); | |
6553 | + set_opt(sbi->s_mount_opt, DEBUG); | |
6554 | break; | |
6555 | case Opt_oldalloc: | |
6556 | - set_opt (sbi->s_mount_opt, OLDALLOC); | |
6557 | + set_opt(sbi->s_mount_opt, OLDALLOC); | |
6558 | break; | |
6559 | case Opt_orlov: | |
6560 | - clear_opt (sbi->s_mount_opt, OLDALLOC); | |
6561 | + clear_opt(sbi->s_mount_opt, OLDALLOC); | |
6562 | break; | |
6563 | #ifdef CONFIG_EXT4DEV_FS_XATTR | |
6564 | case Opt_user_xattr: | |
6565 | - set_opt (sbi->s_mount_opt, XATTR_USER); | |
6566 | + set_opt(sbi->s_mount_opt, XATTR_USER); | |
6567 | break; | |
6568 | case Opt_nouser_xattr: | |
6569 | - clear_opt (sbi->s_mount_opt, XATTR_USER); | |
6570 | + clear_opt(sbi->s_mount_opt, XATTR_USER); | |
6571 | break; | |
6572 | #else | |
6573 | case Opt_user_xattr: | |
6574 | @@ -1100,7 +1115,7 @@ static int parse_options (char *options, struct super_block *sb, | |
6575 | "journal on remount\n"); | |
6576 | return 0; | |
6577 | } | |
6578 | - set_opt (sbi->s_mount_opt, UPDATE_JOURNAL); | |
6579 | + set_opt(sbi->s_mount_opt, UPDATE_JOURNAL); | |
6580 | break; | |
6581 | case Opt_journal_inum: | |
6582 | if (is_remount) { | |
6583 | @@ -1130,7 +1145,7 @@ static int parse_options (char *options, struct super_block *sb, | |
6584 | set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM); | |
6585 | break; | |
6586 | case Opt_noload: | |
6587 | - set_opt (sbi->s_mount_opt, NOLOAD); | |
6588 | + set_opt(sbi->s_mount_opt, NOLOAD); | |
6589 | break; | |
6590 | case Opt_commit: | |
6591 | if (match_int(&args[0], &option)) | |
6592 | @@ -1309,15 +1324,39 @@ set_qf_format: | |
6593 | clear_opt(sbi->s_mount_opt, NOBH); | |
6594 | break; | |
6595 | case Opt_extents: | |
6596 | - set_opt (sbi->s_mount_opt, EXTENTS); | |
6597 | + if (!EXT4_HAS_INCOMPAT_FEATURE(sb, | |
6598 | + EXT4_FEATURE_INCOMPAT_EXTENTS)) { | |
6599 | + ext4_warning(sb, __func__, | |
6600 | + "extents feature not enabled " | |
6601 | + "on this filesystem, use tune2fs\n"); | |
6602 | + return 0; | |
6603 | + } | |
6604 | + set_opt(sbi->s_mount_opt, EXTENTS); | |
6605 | break; | |
6606 | case Opt_noextents: | |
6607 | - clear_opt (sbi->s_mount_opt, EXTENTS); | |
6608 | + /* | |
6609 | + * When e2fsprogs support resizing an already existing | |
6610 | + * ext3 file system to greater than 2**32 we need to | |
6611 | + * add support to block allocator to handle growing | |
6612 | + * already existing block mapped inode so that blocks | |
6613 | + * allocated for them fall within 2**32 | |
6614 | + */ | |
6615 | + last_block = ext4_blocks_count(sbi->s_es) - 1; | |
6616 | + if (last_block > 0xffffffffULL) { | |
6617 | + printk(KERN_ERR "EXT4-fs: Filesystem too " | |
6618 | + "large to mount with " | |
6619 | + "-o noextents options\n"); | |
6620 | + return 0; | |
6621 | + } | |
6622 | + clear_opt(sbi->s_mount_opt, EXTENTS); | |
6623 | break; | |
6624 | case Opt_i_version: | |
6625 | set_opt(sbi->s_mount_opt, I_VERSION); | |
6626 | sb->s_flags |= MS_I_VERSION; | |
6627 | break; | |
6628 | + case Opt_nodelalloc: | |
6629 | + clear_opt(sbi->s_mount_opt, DELALLOC); | |
6630 | + break; | |
6631 | case Opt_mballoc: | |
6632 | set_opt(sbi->s_mount_opt, MBALLOC); | |
6633 | break; | |
6634 | @@ -1331,10 +1370,13 @@ set_qf_format: | |
6635 | return 0; | |
6636 | sbi->s_stripe = option; | |
6637 | break; | |
6638 | + case Opt_delalloc: | |
6639 | + set_opt(sbi->s_mount_opt, DELALLOC); | |
6640 | + break; | |
6641 | default: | |
6642 | - printk (KERN_ERR | |
6643 | - "EXT4-fs: Unrecognized mount option \"%s\" " | |
6644 | - "or missing value\n", p); | |
6645 | + printk(KERN_ERR | |
6646 | + "EXT4-fs: Unrecognized mount option \"%s\" " | |
6647 | + "or missing value\n", p); | |
6648 | return 0; | |
6649 | } | |
6650 | } | |
6651 | @@ -1381,31 +1423,31 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, | |
6652 | int res = 0; | |
6653 | ||
6654 | if (le32_to_cpu(es->s_rev_level) > EXT4_MAX_SUPP_REV) { | |
6655 | - printk (KERN_ERR "EXT4-fs warning: revision level too high, " | |
6656 | - "forcing read-only mode\n"); | |
6657 | + printk(KERN_ERR "EXT4-fs warning: revision level too high, " | |
6658 | + "forcing read-only mode\n"); | |
6659 | res = MS_RDONLY; | |
6660 | } | |
6661 | if (read_only) | |
6662 | return res; | |
6663 | if (!(sbi->s_mount_state & EXT4_VALID_FS)) | |
6664 | - printk (KERN_WARNING "EXT4-fs warning: mounting unchecked fs, " | |
6665 | - "running e2fsck is recommended\n"); | |
6666 | + printk(KERN_WARNING "EXT4-fs warning: mounting unchecked fs, " | |
6667 | + "running e2fsck is recommended\n"); | |
6668 | else if ((sbi->s_mount_state & EXT4_ERROR_FS)) | |
6669 | - printk (KERN_WARNING | |
6670 | - "EXT4-fs warning: mounting fs with errors, " | |
6671 | - "running e2fsck is recommended\n"); | |
6672 | + printk(KERN_WARNING | |
6673 | + "EXT4-fs warning: mounting fs with errors, " | |
6674 | + "running e2fsck is recommended\n"); | |
6675 | else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 && | |
6676 | le16_to_cpu(es->s_mnt_count) >= | |
6677 | (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count)) | |
6678 | - printk (KERN_WARNING | |
6679 | - "EXT4-fs warning: maximal mount count reached, " | |
6680 | - "running e2fsck is recommended\n"); | |
6681 | + printk(KERN_WARNING | |
6682 | + "EXT4-fs warning: maximal mount count reached, " | |
6683 | + "running e2fsck is recommended\n"); | |
6684 | else if (le32_to_cpu(es->s_checkinterval) && | |
6685 | (le32_to_cpu(es->s_lastcheck) + | |
6686 | le32_to_cpu(es->s_checkinterval) <= get_seconds())) | |
6687 | - printk (KERN_WARNING | |
6688 | - "EXT4-fs warning: checktime reached, " | |
6689 | - "running e2fsck is recommended\n"); | |
6690 | + printk(KERN_WARNING | |
6691 | + "EXT4-fs warning: checktime reached, " | |
6692 | + "running e2fsck is recommended\n"); | |
6693 | #if 0 | |
6694 | /* @@@ We _will_ want to clear the valid bit if we find | |
6695 | * inconsistencies, to force a fsck at reboot. But for | |
6696 | @@ -1443,6 +1485,53 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, | |
6697 | return res; | |
6698 | } | |
6699 | ||
6700 | +static int ext4_fill_flex_info(struct super_block *sb) | |
6701 | +{ | |
6702 | + struct ext4_sb_info *sbi = EXT4_SB(sb); | |
6703 | + struct ext4_group_desc *gdp = NULL; | |
6704 | + struct buffer_head *bh; | |
6705 | + ext4_group_t flex_group_count; | |
6706 | + ext4_group_t flex_group; | |
6707 | + int groups_per_flex = 0; | |
6708 | + __u64 block_bitmap = 0; | |
6709 | + int i; | |
6710 | + | |
6711 | + if (!sbi->s_es->s_log_groups_per_flex) { | |
6712 | + sbi->s_log_groups_per_flex = 0; | |
6713 | + return 1; | |
6714 | + } | |
6715 | + | |
6716 | + sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex; | |
6717 | + groups_per_flex = 1 << sbi->s_log_groups_per_flex; | |
6718 | + | |
6719 | + flex_group_count = (sbi->s_groups_count + groups_per_flex - 1) / | |
6720 | + groups_per_flex; | |
6721 | + sbi->s_flex_groups = kzalloc(flex_group_count * | |
6722 | + sizeof(struct flex_groups), GFP_KERNEL); | |
6723 | + if (sbi->s_flex_groups == NULL) { | |
6724 | + printk(KERN_ERR "EXT4-fs: not enough memory for " | |
6725 | + "%lu flex groups\n", flex_group_count); | |
6726 | + goto failed; | |
6727 | + } | |
6728 | + | |
6729 | + gdp = ext4_get_group_desc(sb, 1, &bh); | |
6730 | + block_bitmap = ext4_block_bitmap(sb, gdp) - 1; | |
6731 | + | |
6732 | + for (i = 0; i < sbi->s_groups_count; i++) { | |
6733 | + gdp = ext4_get_group_desc(sb, i, &bh); | |
6734 | + | |
6735 | + flex_group = ext4_flex_group(sbi, i); | |
6736 | + sbi->s_flex_groups[flex_group].free_inodes += | |
6737 | + le16_to_cpu(gdp->bg_free_inodes_count); | |
6738 | + sbi->s_flex_groups[flex_group].free_blocks += | |
6739 | + le16_to_cpu(gdp->bg_free_blocks_count); | |
6740 | + } | |
6741 | + | |
6742 | + return 1; | |
6743 | +failed: | |
6744 | + return 0; | |
6745 | +} | |
6746 | + | |
6747 | __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group, | |
6748 | struct ext4_group_desc *gdp) | |
6749 | { | |
6750 | @@ -1507,16 +1596,14 @@ static int ext4_check_descriptors(struct super_block *sb) | |
6751 | (EXT4_BLOCKS_PER_GROUP(sb) - 1); | |
6752 | ||
6753 | block_bitmap = ext4_block_bitmap(sb, gdp); | |
6754 | - if (block_bitmap < first_block || block_bitmap > last_block) | |
6755 | - { | |
6756 | + if (block_bitmap < first_block || block_bitmap > last_block) { | |
6757 | printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " | |
6758 | "Block bitmap for group %lu not in group " | |
6759 | "(block %llu)!", i, block_bitmap); | |
6760 | return 0; | |
6761 | } | |
6762 | inode_bitmap = ext4_inode_bitmap(sb, gdp); | |
6763 | - if (inode_bitmap < first_block || inode_bitmap > last_block) | |
6764 | - { | |
6765 | + if (inode_bitmap < first_block || inode_bitmap > last_block) { | |
6766 | printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " | |
6767 | "Inode bitmap for group %lu not in group " | |
6768 | "(block %llu)!", i, inode_bitmap); | |
6769 | @@ -1524,26 +1611,28 @@ static int ext4_check_descriptors(struct super_block *sb) | |
6770 | } | |
6771 | inode_table = ext4_inode_table(sb, gdp); | |
6772 | if (inode_table < first_block || | |
6773 | - inode_table + sbi->s_itb_per_group - 1 > last_block) | |
6774 | - { | |
6775 | + inode_table + sbi->s_itb_per_group - 1 > last_block) { | |
6776 | printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " | |
6777 | "Inode table for group %lu not in group " | |
6778 | "(block %llu)!", i, inode_table); | |
6779 | return 0; | |
6780 | } | |
6781 | + spin_lock(sb_bgl_lock(sbi, i)); | |
6782 | if (!ext4_group_desc_csum_verify(sbi, i, gdp)) { | |
6783 | printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " | |
6784 | "Checksum for group %lu failed (%u!=%u)\n", | |
6785 | i, le16_to_cpu(ext4_group_desc_csum(sbi, i, | |
6786 | gdp)), le16_to_cpu(gdp->bg_checksum)); | |
6787 | - return 0; | |
6788 | + if (!(sb->s_flags & MS_RDONLY)) | |
6789 | + return 0; | |
6790 | } | |
6791 | + spin_unlock(sb_bgl_lock(sbi, i)); | |
6792 | if (!flexbg_flag) | |
6793 | first_block += EXT4_BLOCKS_PER_GROUP(sb); | |
6794 | } | |
6795 | ||
6796 | ext4_free_blocks_count_set(sbi->s_es, ext4_count_free_blocks(sb)); | |
6797 | - sbi->s_es->s_free_inodes_count=cpu_to_le32(ext4_count_free_inodes(sb)); | |
6798 | + sbi->s_es->s_free_inodes_count = cpu_to_le32(ext4_count_free_inodes(sb)); | |
6799 | return 1; | |
6800 | } | |
6801 | ||
6802 | @@ -1564,8 +1653,8 @@ static int ext4_check_descriptors(struct super_block *sb) | |
6803 | * e2fsck was run on this filesystem, and it must have already done the orphan | |
6804 | * inode cleanup for us, so we can safely abort without any further action. | |
6805 | */ | |
6806 | -static void ext4_orphan_cleanup (struct super_block * sb, | |
6807 | - struct ext4_super_block * es) | |
6808 | +static void ext4_orphan_cleanup(struct super_block *sb, | |
6809 | + struct ext4_super_block *es) | |
6810 | { | |
6811 | unsigned int s_flags = sb->s_flags; | |
6812 | int nr_orphans = 0, nr_truncates = 0; | |
6813 | @@ -1642,7 +1731,7 @@ static void ext4_orphan_cleanup (struct super_block * sb, | |
6814 | iput(inode); /* The delete magic happens here! */ | |
6815 | } | |
6816 | ||
6817 | -#define PLURAL(x) (x), ((x)==1) ? "" : "s" | |
6818 | +#define PLURAL(x) (x), ((x) == 1) ? "" : "s" | |
6819 | ||
6820 | if (nr_orphans) | |
6821 | printk(KERN_INFO "EXT4-fs: %s: %d orphan inode%s deleted\n", | |
6822 | @@ -1809,12 +1898,12 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi) | |
6823 | return 0; | |
6824 | } | |
6825 | ||
6826 | -static int ext4_fill_super (struct super_block *sb, void *data, int silent) | |
6827 | - __releases(kernel_sem) | |
6828 | - __acquires(kernel_sem) | |
6829 | +static int ext4_fill_super(struct super_block *sb, void *data, int silent) | |
6830 | + __releases(kernel_lock) | |
6831 | + __acquires(kernel_lock) | |
6832 | ||
6833 | { | |
6834 | - struct buffer_head * bh; | |
6835 | + struct buffer_head *bh; | |
6836 | struct ext4_super_block *es = NULL; | |
6837 | struct ext4_sb_info *sbi; | |
6838 | ext4_fsblk_t block; | |
6839 | @@ -1851,11 +1940,6 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) | |
6840 | goto out_fail; | |
6841 | } | |
6842 | ||
6843 | - if (!sb_set_blocksize(sb, blocksize)) { | |
6844 | - printk(KERN_ERR "EXT4-fs: bad blocksize %d.\n", blocksize); | |
6845 | - goto out_fail; | |
6846 | - } | |
6847 | - | |
6848 | /* | |
6849 | * The ext4 superblock will not be buffer aligned for other than 1kB | |
6850 | * block sizes. We need to calculate the offset from buffer start. | |
6851 | @@ -1868,7 +1952,7 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) | |
6852 | } | |
6853 | ||
6854 | if (!(bh = sb_bread(sb, logical_sb_block))) { | |
6855 | - printk (KERN_ERR "EXT4-fs: unable to read superblock\n"); | |
6856 | + printk(KERN_ERR "EXT4-fs: unable to read superblock\n"); | |
6857 | goto out_fail; | |
6858 | } | |
6859 | /* | |
6860 | @@ -1919,17 +2003,30 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) | |
6861 | ||
6862 | /* | |
6863 | * turn on extents feature by default in ext4 filesystem | |
6864 | - * User -o noextents to turn it off | |
6865 | + * only if feature flag already set by mkfs or tune2fs. | |
6866 | + * Use -o noextents to turn it off | |
6867 | */ | |
6868 | - set_opt(sbi->s_mount_opt, EXTENTS); | |
6869 | + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) | |
6870 | + set_opt(sbi->s_mount_opt, EXTENTS); | |
6871 | + else | |
6872 | + ext4_warning(sb, __func__, | |
6873 | + "extents feature not enabled on this filesystem, " | |
6874 | + "use tune2fs.\n"); | |
6875 | /* | |
6876 | - * turn on mballoc feature by default in ext4 filesystem | |
6877 | - * User -o nomballoc to turn it off | |
6878 | + * turn on mballoc code by default in ext4 filesystem | |
6879 | + * Use -o nomballoc to turn it off | |
6880 | */ | |
6881 | set_opt(sbi->s_mount_opt, MBALLOC); | |
6882 | ||
6883 | - if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum, | |
6884 | - NULL, 0)) | |
6885 | + /* | |
6886 | + * enable delayed allocation by default | |
6887 | + * Use -o nodelalloc to turn it off | |
6888 | + */ | |
6889 | + set_opt(sbi->s_mount_opt, DELALLOC); | |
6890 | + | |
6891 | + | |
6892 | + if (!parse_options((char *) data, sb, &journal_inum, &journal_devnum, | |
6893 | + NULL, 0)) | |
6894 | goto failed_mount; | |
6895 | ||
6896 | sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | | |
6897 | @@ -2004,7 +2101,7 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) | |
6898 | goto failed_mount; | |
6899 | } | |
6900 | ||
6901 | - brelse (bh); | |
6902 | + brelse(bh); | |
6903 | logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE; | |
6904 | offset = do_div(logical_sb_block, blocksize); | |
6905 | bh = sb_bread(sb, logical_sb_block); | |
6906 | @@ -2016,8 +2113,8 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) | |
6907 | es = (struct ext4_super_block *)(((char *)bh->b_data) + offset); | |
6908 | sbi->s_es = es; | |
6909 | if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) { | |
6910 | - printk (KERN_ERR | |
6911 | - "EXT4-fs: Magic mismatch, very weird !\n"); | |
6912 | + printk(KERN_ERR | |
6913 | + "EXT4-fs: Magic mismatch, very weird !\n"); | |
6914 | goto failed_mount; | |
6915 | } | |
6916 | } | |
6917 | @@ -2034,9 +2131,9 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) | |
6918 | if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) || | |
6919 | (!is_power_of_2(sbi->s_inode_size)) || | |
6920 | (sbi->s_inode_size > blocksize)) { | |
6921 | - printk (KERN_ERR | |
6922 | - "EXT4-fs: unsupported inode size: %d\n", | |
6923 | - sbi->s_inode_size); | |
6924 | + printk(KERN_ERR | |
6925 | + "EXT4-fs: unsupported inode size: %d\n", | |
6926 | + sbi->s_inode_size); | |
6927 | goto failed_mount; | |
6928 | } | |
6929 | if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) | |
6930 | @@ -2068,20 +2165,20 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) | |
6931 | sbi->s_mount_state = le16_to_cpu(es->s_state); | |
6932 | sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb)); | |
6933 | sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb)); | |
6934 | - for (i=0; i < 4; i++) | |
6935 | + for (i = 0; i < 4; i++) | |
6936 | sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); | |
6937 | sbi->s_def_hash_version = es->s_def_hash_version; | |
6938 | ||
6939 | if (sbi->s_blocks_per_group > blocksize * 8) { | |
6940 | - printk (KERN_ERR | |
6941 | - "EXT4-fs: #blocks per group too big: %lu\n", | |
6942 | - sbi->s_blocks_per_group); | |
6943 | + printk(KERN_ERR | |
6944 | + "EXT4-fs: #blocks per group too big: %lu\n", | |
6945 | + sbi->s_blocks_per_group); | |
6946 | goto failed_mount; | |
6947 | } | |
6948 | if (sbi->s_inodes_per_group > blocksize * 8) { | |
6949 | - printk (KERN_ERR | |
6950 | - "EXT4-fs: #inodes per group too big: %lu\n", | |
6951 | - sbi->s_inodes_per_group); | |
6952 | + printk(KERN_ERR | |
6953 | + "EXT4-fs: #inodes per group too big: %lu\n", | |
6954 | + sbi->s_inodes_per_group); | |
6955 | goto failed_mount; | |
6956 | } | |
6957 | ||
6958 | @@ -2115,10 +2212,10 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) | |
6959 | sbi->s_groups_count = blocks_count; | |
6960 | db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / | |
6961 | EXT4_DESC_PER_BLOCK(sb); | |
6962 | - sbi->s_group_desc = kmalloc(db_count * sizeof (struct buffer_head *), | |
6963 | + sbi->s_group_desc = kmalloc(db_count * sizeof(struct buffer_head *), | |
6964 | GFP_KERNEL); | |
6965 | if (sbi->s_group_desc == NULL) { | |
6966 | - printk (KERN_ERR "EXT4-fs: not enough memory\n"); | |
6967 | + printk(KERN_ERR "EXT4-fs: not enough memory\n"); | |
6968 | goto failed_mount; | |
6969 | } | |
6970 | ||
6971 | @@ -2128,16 +2225,24 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) | |
6972 | block = descriptor_loc(sb, logical_sb_block, i); | |
6973 | sbi->s_group_desc[i] = sb_bread(sb, block); | |
6974 | if (!sbi->s_group_desc[i]) { | |
6975 | - printk (KERN_ERR "EXT4-fs: " | |
6976 | - "can't read group descriptor %d\n", i); | |
6977 | + printk(KERN_ERR "EXT4-fs: " | |
6978 | + "can't read group descriptor %d\n", i); | |
6979 | db_count = i; | |
6980 | goto failed_mount2; | |
6981 | } | |
6982 | } | |
6983 | - if (!ext4_check_descriptors (sb)) { | |
6984 | + if (!ext4_check_descriptors(sb)) { | |
6985 | printk(KERN_ERR "EXT4-fs: group descriptors corrupted!\n"); | |
6986 | goto failed_mount2; | |
6987 | } | |
6988 | + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) | |
6989 | + if (!ext4_fill_flex_info(sb)) { | |
6990 | + printk(KERN_ERR | |
6991 | + "EXT4-fs: unable to initialize " | |
6992 | + "flex_bg meta info!\n"); | |
6993 | + goto failed_mount2; | |
6994 | + } | |
6995 | + | |
6996 | sbi->s_gdb_count = db_count; | |
6997 | get_random_bytes(&sbi->s_next_generation, sizeof(u32)); | |
6998 | spin_lock_init(&sbi->s_next_gen_lock); | |
6999 | @@ -2202,11 +2307,11 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) | |
7000 | EXT4_SB(sb)->s_journal->j_failed_commit) { | |
7001 | printk(KERN_CRIT "EXT4-fs error (device %s): " | |
7002 | "ext4_fill_super: Journal transaction " | |
7003 | - "%u is corrupt\n", sb->s_id, | |
7004 | + "%u is corrupt\n", sb->s_id, | |
7005 | EXT4_SB(sb)->s_journal->j_failed_commit); | |
7006 | - if (test_opt (sb, ERRORS_RO)) { | |
7007 | - printk (KERN_CRIT | |
7008 | - "Mounting filesystem read-only\n"); | |
7009 | + if (test_opt(sb, ERRORS_RO)) { | |
7010 | + printk(KERN_CRIT | |
7011 | + "Mounting filesystem read-only\n"); | |
7012 | sb->s_flags |= MS_RDONLY; | |
7013 | EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; | |
7014 | es->s_state |= cpu_to_le16(EXT4_ERROR_FS); | |
7015 | @@ -2226,9 +2331,9 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) | |
7016 | goto failed_mount3; | |
7017 | } else { | |
7018 | if (!silent) | |
7019 | - printk (KERN_ERR | |
7020 | - "ext4: No journal on filesystem on %s\n", | |
7021 | - sb->s_id); | |
7022 | + printk(KERN_ERR | |
7023 | + "ext4: No journal on filesystem on %s\n", | |
7024 | + sb->s_id); | |
7025 | goto failed_mount3; | |
7026 | } | |
7027 | ||
7028 | @@ -2312,7 +2417,7 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) | |
7029 | goto failed_mount4; | |
7030 | } | |
7031 | ||
7032 | - ext4_setup_super (sb, es, sb->s_flags & MS_RDONLY); | |
7033 | + ext4_setup_super(sb, es, sb->s_flags & MS_RDONLY); | |
7034 | ||
7035 | /* determine the minimum size of new large inodes, if present */ | |
7036 | if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) { | |
7037 | @@ -2351,12 +2456,19 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) | |
7038 | ext4_orphan_cleanup(sb, es); | |
7039 | EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS; | |
7040 | if (needs_recovery) | |
7041 | - printk (KERN_INFO "EXT4-fs: recovery complete.\n"); | |
7042 | + printk(KERN_INFO "EXT4-fs: recovery complete.\n"); | |
7043 | ext4_mark_recovery_complete(sb, es); | |
7044 | - printk (KERN_INFO "EXT4-fs: mounted filesystem with %s data mode.\n", | |
7045 | - test_opt(sb,DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ? "journal": | |
7046 | - test_opt(sb,DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered": | |
7047 | - "writeback"); | |
7048 | + printk(KERN_INFO "EXT4-fs: mounted filesystem with %s data mode.\n", | |
7049 | + test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ? "journal": | |
7050 | + test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered": | |
7051 | + "writeback"); | |
7052 | + | |
7053 | + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { | |
7054 | + printk(KERN_WARNING "EXT4-fs: Ignoring delalloc option - " | |
7055 | + "requested data journaling mode\n"); | |
7056 | + clear_opt(sbi->s_mount_opt, DELALLOC); | |
7057 | + } else if (test_opt(sb, DELALLOC)) | |
7058 | + printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n"); | |
7059 | ||
7060 | ext4_ext_init(sb); | |
7061 | ext4_mb_init(sb, needs_recovery); | |
7062 | @@ -2372,6 +2484,7 @@ cantfind_ext4: | |
7063 | ||
7064 | failed_mount4: | |
7065 | jbd2_journal_destroy(sbi->s_journal); | |
7066 | + sbi->s_journal = NULL; | |
7067 | failed_mount3: | |
7068 | percpu_counter_destroy(&sbi->s_freeblocks_counter); | |
7069 | percpu_counter_destroy(&sbi->s_freeinodes_counter); | |
7070 | @@ -2461,14 +2574,14 @@ static journal_t *ext4_get_journal(struct super_block *sb, | |
7071 | static journal_t *ext4_get_dev_journal(struct super_block *sb, | |
7072 | dev_t j_dev) | |
7073 | { | |
7074 | - struct buffer_head * bh; | |
7075 | + struct buffer_head *bh; | |
7076 | journal_t *journal; | |
7077 | ext4_fsblk_t start; | |
7078 | ext4_fsblk_t len; | |
7079 | int hblock, blocksize; | |
7080 | ext4_fsblk_t sb_block; | |
7081 | unsigned long offset; | |
7082 | - struct ext4_super_block * es; | |
7083 | + struct ext4_super_block *es; | |
7084 | struct block_device *bdev; | |
7085 | ||
7086 | bdev = ext4_blkdev_get(j_dev); | |
7087 | @@ -2583,8 +2696,8 @@ static int ext4_load_journal(struct super_block *sb, | |
7088 | "unavailable, cannot proceed.\n"); | |
7089 | return -EROFS; | |
7090 | } | |
7091 | - printk (KERN_INFO "EXT4-fs: write access will " | |
7092 | - "be enabled during recovery.\n"); | |
7093 | + printk(KERN_INFO "EXT4-fs: write access will " | |
7094 | + "be enabled during recovery.\n"); | |
7095 | } | |
7096 | } | |
7097 | ||
7098 | @@ -2637,8 +2750,8 @@ static int ext4_load_journal(struct super_block *sb, | |
7099 | return 0; | |
7100 | } | |
7101 | ||
7102 | -static int ext4_create_journal(struct super_block * sb, | |
7103 | - struct ext4_super_block * es, | |
7104 | +static int ext4_create_journal(struct super_block *sb, | |
7105 | + struct ext4_super_block *es, | |
7106 | unsigned int journal_inum) | |
7107 | { | |
7108 | journal_t *journal; | |
7109 | @@ -2679,9 +2792,8 @@ static int ext4_create_journal(struct super_block * sb, | |
7110 | return 0; | |
7111 | } | |
7112 | ||
7113 | -static void ext4_commit_super (struct super_block * sb, | |
7114 | - struct ext4_super_block * es, | |
7115 | - int sync) | |
7116 | +static void ext4_commit_super(struct super_block *sb, | |
7117 | + struct ext4_super_block *es, int sync) | |
7118 | { | |
7119 | struct buffer_head *sbh = EXT4_SB(sb)->s_sbh; | |
7120 | ||
7121 | @@ -2702,8 +2814,8 @@ static void ext4_commit_super (struct super_block * sb, | |
7122 | * remounting) the filesystem readonly, then we will end up with a | |
7123 | * consistent fs on disk. Record that fact. | |
7124 | */ | |
7125 | -static void ext4_mark_recovery_complete(struct super_block * sb, | |
7126 | - struct ext4_super_block * es) | |
7127 | +static void ext4_mark_recovery_complete(struct super_block *sb, | |
7128 | + struct ext4_super_block *es) | |
7129 | { | |
7130 | journal_t *journal = EXT4_SB(sb)->s_journal; | |
7131 | ||
7132 | @@ -2725,8 +2837,8 @@ static void ext4_mark_recovery_complete(struct super_block * sb, | |
7133 | * has recorded an error from a previous lifetime, move that error to the | |
7134 | * main filesystem now. | |
7135 | */ | |
7136 | -static void ext4_clear_journal_err(struct super_block * sb, | |
7137 | - struct ext4_super_block * es) | |
7138 | +static void ext4_clear_journal_err(struct super_block *sb, | |
7139 | + struct ext4_super_block *es) | |
7140 | { | |
7141 | journal_t *journal; | |
7142 | int j_errno; | |
7143 | @@ -2751,7 +2863,7 @@ static void ext4_clear_journal_err(struct super_block * sb, | |
7144 | ||
7145 | EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; | |
7146 | es->s_state |= cpu_to_le16(EXT4_ERROR_FS); | |
7147 | - ext4_commit_super (sb, es, 1); | |
7148 | + ext4_commit_super(sb, es, 1); | |
7149 | ||
7150 | jbd2_journal_clear_err(journal); | |
7151 | } | |
7152 | @@ -2784,7 +2896,7 @@ int ext4_force_commit(struct super_block *sb) | |
7153 | * This implicitly triggers the writebehind on sync(). | |
7154 | */ | |
7155 | ||
7156 | -static void ext4_write_super (struct super_block * sb) | |
7157 | +static void ext4_write_super(struct super_block *sb) | |
7158 | { | |
7159 | if (mutex_trylock(&sb->s_lock) != 0) | |
7160 | BUG(); | |
7161 | @@ -2840,13 +2952,14 @@ static void ext4_unlockfs(struct super_block *sb) | |
7162 | } | |
7163 | } | |
7164 | ||
7165 | -static int ext4_remount (struct super_block * sb, int * flags, char * data) | |
7166 | +static int ext4_remount(struct super_block *sb, int *flags, char *data) | |
7167 | { | |
7168 | - struct ext4_super_block * es; | |
7169 | + struct ext4_super_block *es; | |
7170 | struct ext4_sb_info *sbi = EXT4_SB(sb); | |
7171 | ext4_fsblk_t n_blocks_count = 0; | |
7172 | unsigned long old_sb_flags; | |
7173 | struct ext4_mount_options old_opts; | |
7174 | + ext4_group_t g; | |
7175 | int err; | |
7176 | #ifdef CONFIG_QUOTA | |
7177 | int i; | |
7178 | @@ -2925,6 +3038,26 @@ static int ext4_remount (struct super_block * sb, int * flags, char * data) | |
7179 | } | |
7180 | ||
7181 | /* | |
7182 | + * Make sure the group descriptor checksums | |
7183 | + * are sane. If they aren't, refuse to | |
7184 | + * remount r/w. | |
7185 | + */ | |
7186 | + for (g = 0; g < sbi->s_groups_count; g++) { | |
7187 | + struct ext4_group_desc *gdp = | |
7188 | + ext4_get_group_desc(sb, g, NULL); | |
7189 | + | |
7190 | + if (!ext4_group_desc_csum_verify(sbi, g, gdp)) { | |
7191 | + printk(KERN_ERR | |
7192 | + "EXT4-fs: ext4_remount: " | |
7193 | + "Checksum for group %lu failed (%u!=%u)\n", | |
7194 | + g, le16_to_cpu(ext4_group_desc_csum(sbi, g, gdp)), | |
7195 | + le16_to_cpu(gdp->bg_checksum)); | |
7196 | + err = -EINVAL; | |
7197 | + goto restore_opts; | |
7198 | + } | |
7199 | + } | |
7200 | + | |
7201 | + /* | |
7202 | * If we have an unprocessed orphan list hanging | |
7203 | * around from a previously readonly bdev mount, | |
7204 | * require a full umount/remount for now. | |
7205 | @@ -2949,7 +3082,7 @@ static int ext4_remount (struct super_block * sb, int * flags, char * data) | |
7206 | sbi->s_mount_state = le16_to_cpu(es->s_state); | |
7207 | if ((err = ext4_group_extend(sb, es, n_blocks_count))) | |
7208 | goto restore_opts; | |
7209 | - if (!ext4_setup_super (sb, es, 0)) | |
7210 | + if (!ext4_setup_super(sb, es, 0)) | |
7211 | sb->s_flags &= ~MS_RDONLY; | |
7212 | } | |
7213 | } | |
7214 | @@ -2979,7 +3112,7 @@ restore_opts: | |
7215 | return err; | |
7216 | } | |
7217 | ||
7218 | -static int ext4_statfs (struct dentry * dentry, struct kstatfs * buf) | |
7219 | +static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) | |
7220 | { | |
7221 | struct super_block *sb = dentry->d_sb; | |
7222 | struct ext4_sb_info *sbi = EXT4_SB(sb); | |
7223 | @@ -3217,12 +3350,12 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id, | |
7224 | } | |
7225 | /* Journaling quota? */ | |
7226 | if (EXT4_SB(sb)->s_qf_names[type]) { | |
7227 | - /* Quotafile not of fs root? */ | |
7228 | + /* Quotafile not in fs root? */ | |
7229 | if (nd.path.dentry->d_parent->d_inode != sb->s_root->d_inode) | |
7230 | printk(KERN_WARNING | |
7231 | "EXT4-fs: Quota file not on filesystem root. " | |
7232 | "Journaled quota will not work.\n"); | |
7233 | - } | |
7234 | + } | |
7235 | ||
7236 | /* | |
7237 | * When we journal data on quota file, we have to flush journal to see | |
7238 | @@ -3325,7 +3458,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, | |
7239 | err = ext4_journal_dirty_metadata(handle, bh); | |
7240 | else { | |
7241 | /* Always do at least ordered writes for quotas */ | |
7242 | - err = ext4_journal_dirty_data(handle, bh); | |
7243 | + err = ext4_jbd2_file_inode(handle, inode); | |
7244 | mark_buffer_dirty(bh); | |
7245 | } | |
7246 | brelse(bh); | |
7247 | diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c | |
7248 | index ff08633..8954208 100644 | |
7249 | --- a/fs/ext4/xattr.c | |
7250 | +++ b/fs/ext4/xattr.c | |
7251 | @@ -810,7 +810,7 @@ inserted: | |
7252 | /* We need to allocate a new block */ | |
7253 | ext4_fsblk_t goal = ext4_group_first_block_no(sb, | |
7254 | EXT4_I(inode)->i_block_group); | |
7255 | - ext4_fsblk_t block = ext4_new_block(handle, inode, | |
7256 | + ext4_fsblk_t block = ext4_new_meta_block(handle, inode, | |
7257 | goal, &error); | |
7258 | if (error) | |
7259 | goto cleanup; | |
7260 | @@ -1512,7 +1512,7 @@ static inline void ext4_xattr_hash_entry(struct ext4_xattr_header *header, | |
7261 | char *name = entry->e_name; | |
7262 | int n; | |
7263 | ||
7264 | - for (n=0; n < entry->e_name_len; n++) { | |
7265 | + for (n = 0; n < entry->e_name_len; n++) { | |
7266 | hash = (hash << NAME_HASH_SHIFT) ^ | |
7267 | (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^ | |
7268 | *name++; | |
7269 | diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c | |
7270 | index fff3338..ac1a52c 100644 | |
7271 | --- a/fs/ext4/xattr_trusted.c | |
7272 | +++ b/fs/ext4/xattr_trusted.c | |
7273 | @@ -13,13 +13,11 @@ | |
7274 | #include "ext4.h" | |
7275 | #include "xattr.h" | |
7276 | ||
7277 | -#define XATTR_TRUSTED_PREFIX "trusted." | |
7278 | - | |
7279 | static size_t | |
7280 | ext4_xattr_trusted_list(struct inode *inode, char *list, size_t list_size, | |
7281 | const char *name, size_t name_len) | |
7282 | { | |
7283 | - const size_t prefix_len = sizeof(XATTR_TRUSTED_PREFIX)-1; | |
7284 | + const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN; | |
7285 | const size_t total_len = prefix_len + name_len + 1; | |
7286 | ||
7287 | if (!capable(CAP_SYS_ADMIN)) | |
7288 | diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c | |
7289 | index 67be723..d91aa61 100644 | |
7290 | --- a/fs/ext4/xattr_user.c | |
7291 | +++ b/fs/ext4/xattr_user.c | |
7292 | @@ -12,13 +12,11 @@ | |
7293 | #include "ext4.h" | |
7294 | #include "xattr.h" | |
7295 | ||
7296 | -#define XATTR_USER_PREFIX "user." | |
7297 | - | |
7298 | static size_t | |
7299 | ext4_xattr_user_list(struct inode *inode, char *list, size_t list_size, | |
7300 | const char *name, size_t name_len) | |
7301 | { | |
7302 | - const size_t prefix_len = sizeof(XATTR_USER_PREFIX)-1; | |
7303 | + const size_t prefix_len = XATTR_USER_PREFIX_LEN; | |
7304 | const size_t total_len = prefix_len + name_len + 1; | |
7305 | ||
7306 | if (!test_opt(inode->i_sb, XATTR_USER)) | |
7307 | diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c | |
7308 | index 6914598..91389c8 100644 | |
7309 | --- a/fs/jbd2/checkpoint.c | |
7310 | +++ b/fs/jbd2/checkpoint.c | |
7311 | @@ -688,7 +688,6 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact | |
7312 | ||
7313 | J_ASSERT(transaction->t_state == T_FINISHED); | |
7314 | J_ASSERT(transaction->t_buffers == NULL); | |
7315 | - J_ASSERT(transaction->t_sync_datalist == NULL); | |
7316 | J_ASSERT(transaction->t_forget == NULL); | |
7317 | J_ASSERT(transaction->t_iobuf_list == NULL); | |
7318 | J_ASSERT(transaction->t_shadow_list == NULL); | |
7319 | diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c | |
7320 | index a2ed72f..adf0395 100644 | |
7321 | --- a/fs/jbd2/commit.c | |
7322 | +++ b/fs/jbd2/commit.c | |
7323 | @@ -22,6 +22,8 @@ | |
7324 | #include <linux/pagemap.h> | |
7325 | #include <linux/jiffies.h> | |
7326 | #include <linux/crc32.h> | |
7327 | +#include <linux/writeback.h> | |
7328 | +#include <linux/backing-dev.h> | |
7329 | ||
7330 | /* | |
7331 | * Default IO end handler for temporary BJ_IO buffer_heads. | |
7332 | @@ -37,8 +39,8 @@ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate) | |
7333 | } | |
7334 | ||
7335 | /* | |
7336 | - * When an ext3-ordered file is truncated, it is possible that many pages are | |
7337 | - * not sucessfully freed, because they are attached to a committing transaction. | |
7338 | + * When an ext4 file is truncated, it is possible that some pages are not | |
7339 | + * successfully freed, because they are attached to a committing transaction. | |
7340 | * After the transaction commits, these pages are left on the LRU, with no | |
7341 | * ->mapping, and with attached buffers. These pages are trivially reclaimable | |
7342 | * by the VM, but their apparent absence upsets the VM accounting, and it makes | |
7343 | @@ -80,21 +82,6 @@ nope: | |
7344 | } | |
7345 | ||
7346 | /* | |
7347 | - * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is | |
7348 | - * held. For ranking reasons we must trylock. If we lose, schedule away and | |
7349 | - * return 0. j_list_lock is dropped in this case. | |
7350 | - */ | |
7351 | -static int inverted_lock(journal_t *journal, struct buffer_head *bh) | |
7352 | -{ | |
7353 | - if (!jbd_trylock_bh_state(bh)) { | |
7354 | - spin_unlock(&journal->j_list_lock); | |
7355 | - schedule(); | |
7356 | - return 0; | |
7357 | - } | |
7358 | - return 1; | |
7359 | -} | |
7360 | - | |
7361 | -/* | |
7362 | * Done it all: now submit the commit record. We should have | |
7363 | * cleaned up our previous buffers by now, so if we are in abort | |
7364 | * mode we can now just skip the rest of the journal write | |
7365 | @@ -112,6 +99,7 @@ static int journal_submit_commit_record(journal_t *journal, | |
7366 | struct buffer_head *bh; | |
7367 | int ret; | |
7368 | int barrier_done = 0; | |
7369 | + struct timespec now = current_kernel_time(); | |
7370 | ||
7371 | if (is_journal_aborted(journal)) | |
7372 | return 0; | |
7373 | @@ -126,6 +114,8 @@ static int journal_submit_commit_record(journal_t *journal, | |
7374 | tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); | |
7375 | tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK); | |
7376 | tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid); | |
7377 | + tmp->h_commit_sec = cpu_to_be64(now.tv_sec); | |
7378 | + tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec); | |
7379 | ||
7380 | if (JBD2_HAS_COMPAT_FEATURE(journal, | |
7381 | JBD2_FEATURE_COMPAT_CHECKSUM)) { | |
7382 | @@ -197,159 +187,114 @@ static int journal_wait_on_commit_record(struct buffer_head *bh) | |
7383 | } | |
7384 | ||
7385 | /* | |
7386 | - * Wait for all submitted IO to complete. | |
7387 | + * write the filemap data using writepage() address_space_operations. | |
7388 | + * We don't do block allocation here even for delalloc. We don't | |
7389 | + * use writepages() because with delayed allocation we may be doing | |
7390 | + * block allocation in writepages(). | |
7391 | */ | |
7392 | -static int journal_wait_on_locked_list(journal_t *journal, | |
7393 | - transaction_t *commit_transaction) | |
7394 | +static int journal_submit_inode_data_buffers(struct address_space *mapping) | |
7395 | { | |
7396 | - int ret = 0; | |
7397 | - struct journal_head *jh; | |
7398 | - | |
7399 | - while (commit_transaction->t_locked_list) { | |
7400 | - struct buffer_head *bh; | |
7401 | - | |
7402 | - jh = commit_transaction->t_locked_list->b_tprev; | |
7403 | - bh = jh2bh(jh); | |
7404 | - get_bh(bh); | |
7405 | - if (buffer_locked(bh)) { | |
7406 | - spin_unlock(&journal->j_list_lock); | |
7407 | - wait_on_buffer(bh); | |
7408 | - if (unlikely(!buffer_uptodate(bh))) | |
7409 | - ret = -EIO; | |
7410 | - spin_lock(&journal->j_list_lock); | |
7411 | - } | |
7412 | - if (!inverted_lock(journal, bh)) { | |
7413 | - put_bh(bh); | |
7414 | - spin_lock(&journal->j_list_lock); | |
7415 | - continue; | |
7416 | - } | |
7417 | - if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) { | |
7418 | - __jbd2_journal_unfile_buffer(jh); | |
7419 | - jbd_unlock_bh_state(bh); | |
7420 | - jbd2_journal_remove_journal_head(bh); | |
7421 | - put_bh(bh); | |
7422 | - } else { | |
7423 | - jbd_unlock_bh_state(bh); | |
7424 | - } | |
7425 | - put_bh(bh); | |
7426 | - cond_resched_lock(&journal->j_list_lock); | |
7427 | - } | |
7428 | + int ret; | |
7429 | + struct writeback_control wbc = { | |
7430 | + .sync_mode = WB_SYNC_ALL, | |
7431 | + .nr_to_write = mapping->nrpages * 2, | |
7432 | + .range_start = 0, | |
7433 | + .range_end = i_size_read(mapping->host), | |
7434 | + .for_writepages = 1, | |
7435 | + }; | |
7436 | + | |
7437 | + ret = generic_writepages(mapping, &wbc); | |
7438 | return ret; | |
7439 | - } | |
7440 | +} | |
7441 | ||
7442 | -static void journal_do_submit_data(struct buffer_head **wbuf, int bufs) | |
7443 | +/* | |
7444 | + * Submit all the data buffers of inode associated with the transaction to | |
7445 | + * disk. | |
7446 | + * | |
7447 | + * We are in a committing transaction. Therefore no new inode can be added to | |
7448 | + * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently | |
7449 | + * operate on from being released while we write out pages. | |
7450 | + */ | |
7451 | +static int journal_submit_data_buffers(journal_t *journal, | |
7452 | + transaction_t *commit_transaction) | |
7453 | { | |
7454 | - int i; | |
7455 | + struct jbd2_inode *jinode; | |
7456 | + int err, ret = 0; | |
7457 | + struct address_space *mapping; | |
7458 | ||
7459 | - for (i = 0; i < bufs; i++) { | |
7460 | - wbuf[i]->b_end_io = end_buffer_write_sync; | |
7461 | - /* We use-up our safety reference in submit_bh() */ | |
7462 | - submit_bh(WRITE, wbuf[i]); | |
7463 | + spin_lock(&journal->j_list_lock); | |
7464 | + list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { | |
7465 | + mapping = jinode->i_vfs_inode->i_mapping; | |
7466 | + jinode->i_flags |= JI_COMMIT_RUNNING; | |
7467 | + spin_unlock(&journal->j_list_lock); | |
7468 | + /* | |
7469 | + * submit the inode data buffers. We use writepage | |
7470 | + * instead of writepages. Because writepages can do | |
7471 | + * block allocation with delalloc. We need to write | |
7472 | + * only allocated blocks here. | |
7473 | + */ | |
7474 | + err = journal_submit_inode_data_buffers(mapping); | |
7475 | + if (!ret) | |
7476 | + ret = err; | |
7477 | + spin_lock(&journal->j_list_lock); | |
7478 | + J_ASSERT(jinode->i_transaction == commit_transaction); | |
7479 | + jinode->i_flags &= ~JI_COMMIT_RUNNING; | |
7480 | + wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); | |
7481 | } | |
7482 | + spin_unlock(&journal->j_list_lock); | |
7483 | + return ret; | |
7484 | } | |
7485 | ||
7486 | /* | |
7487 | - * Submit all the data buffers to disk | |
7488 | + * Wait for data submitted for writeout, refile inodes to proper | |
7489 | + * transaction if needed. | |
7490 | + * | |
7491 | */ | |
7492 | -static void journal_submit_data_buffers(journal_t *journal, | |
7493 | - transaction_t *commit_transaction) | |
7494 | +static int journal_finish_inode_data_buffers(journal_t *journal, | |
7495 | + transaction_t *commit_transaction) | |
7496 | { | |
7497 | - struct journal_head *jh; | |
7498 | - struct buffer_head *bh; | |
7499 | - int locked; | |
7500 | - int bufs = 0; | |
7501 | - struct buffer_head **wbuf = journal->j_wbuf; | |
7502 | + struct jbd2_inode *jinode, *next_i; | |
7503 | + int err, ret = 0; | |
7504 | ||
7505 | - /* | |
7506 | - * Whenever we unlock the journal and sleep, things can get added | |
7507 | - * onto ->t_sync_datalist, so we have to keep looping back to | |
7508 | - * write_out_data until we *know* that the list is empty. | |
7509 | - * | |
7510 | - * Cleanup any flushed data buffers from the data list. Even in | |
7511 | - * abort mode, we want to flush this out as soon as possible. | |
7512 | - */ | |
7513 | -write_out_data: | |
7514 | - cond_resched(); | |
7515 | + /* For locking, see the comment in journal_submit_data_buffers() */ | |
7516 | spin_lock(&journal->j_list_lock); | |
7517 | - | |
7518 | - while (commit_transaction->t_sync_datalist) { | |
7519 | - jh = commit_transaction->t_sync_datalist; | |
7520 | - bh = jh2bh(jh); | |
7521 | - locked = 0; | |
7522 | - | |
7523 | - /* Get reference just to make sure buffer does not disappear | |
7524 | - * when we are forced to drop various locks */ | |
7525 | - get_bh(bh); | |
7526 | - /* If the buffer is dirty, we need to submit IO and hence | |
7527 | - * we need the buffer lock. We try to lock the buffer without | |
7528 | - * blocking. If we fail, we need to drop j_list_lock and do | |
7529 | - * blocking lock_buffer(). | |
7530 | - */ | |
7531 | - if (buffer_dirty(bh)) { | |
7532 | - if (test_set_buffer_locked(bh)) { | |
7533 | - BUFFER_TRACE(bh, "needs blocking lock"); | |
7534 | - spin_unlock(&journal->j_list_lock); | |
7535 | - /* Write out all data to prevent deadlocks */ | |
7536 | - journal_do_submit_data(wbuf, bufs); | |
7537 | - bufs = 0; | |
7538 | - lock_buffer(bh); | |
7539 | - spin_lock(&journal->j_list_lock); | |
7540 | - } | |
7541 | - locked = 1; | |
7542 | - } | |
7543 | - /* We have to get bh_state lock. Again out of order, sigh. */ | |
7544 | - if (!inverted_lock(journal, bh)) { | |
7545 | - jbd_lock_bh_state(bh); | |
7546 | - spin_lock(&journal->j_list_lock); | |
7547 | - } | |
7548 | - /* Someone already cleaned up the buffer? */ | |
7549 | - if (!buffer_jbd(bh) | |
7550 | - || jh->b_transaction != commit_transaction | |
7551 | - || jh->b_jlist != BJ_SyncData) { | |
7552 | - jbd_unlock_bh_state(bh); | |
7553 | - if (locked) | |
7554 | - unlock_buffer(bh); | |
7555 | - BUFFER_TRACE(bh, "already cleaned up"); | |
7556 | - put_bh(bh); | |
7557 | - continue; | |
7558 | - } | |
7559 | - if (locked && test_clear_buffer_dirty(bh)) { | |
7560 | - BUFFER_TRACE(bh, "needs writeout, adding to array"); | |
7561 | - wbuf[bufs++] = bh; | |
7562 | - __jbd2_journal_file_buffer(jh, commit_transaction, | |
7563 | - BJ_Locked); | |
7564 | - jbd_unlock_bh_state(bh); | |
7565 | - if (bufs == journal->j_wbufsize) { | |
7566 | - spin_unlock(&journal->j_list_lock); | |
7567 | - journal_do_submit_data(wbuf, bufs); | |
7568 | - bufs = 0; | |
7569 | - goto write_out_data; | |
7570 | - } | |
7571 | - } else if (!locked && buffer_locked(bh)) { | |
7572 | - __jbd2_journal_file_buffer(jh, commit_transaction, | |
7573 | - BJ_Locked); | |
7574 | - jbd_unlock_bh_state(bh); | |
7575 | - put_bh(bh); | |
7576 | - } else { | |
7577 | - BUFFER_TRACE(bh, "writeout complete: unfile"); | |
7578 | - __jbd2_journal_unfile_buffer(jh); | |
7579 | - jbd_unlock_bh_state(bh); | |
7580 | - if (locked) | |
7581 | - unlock_buffer(bh); | |
7582 | - jbd2_journal_remove_journal_head(bh); | |
7583 | - /* Once for our safety reference, once for | |
7584 | - * jbd2_journal_remove_journal_head() */ | |
7585 | - put_bh(bh); | |
7586 | - put_bh(bh); | |
7587 | + list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { | |
7588 | + jinode->i_flags |= JI_COMMIT_RUNNING; | |
7589 | + spin_unlock(&journal->j_list_lock); | |
7590 | + err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping); | |
7591 | + if (err) { | |
7592 | + /* | |
7593 | + * Because AS_EIO is cleared by | |
7594 | + * wait_on_page_writeback_range(), set it again so | |
7595 | + * that user process can get -EIO from fsync(). | |
7596 | + */ | |
7597 | + set_bit(AS_EIO, | |
7598 | + &jinode->i_vfs_inode->i_mapping->flags); | |
7599 | + | |
7600 | + if (!ret) | |
7601 | + ret = err; | |
7602 | } | |
7603 | + spin_lock(&journal->j_list_lock); | |
7604 | + jinode->i_flags &= ~JI_COMMIT_RUNNING; | |
7605 | + wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); | |
7606 | + } | |
7607 | ||
7608 | - if (need_resched() || spin_needbreak(&journal->j_list_lock)) { | |
7609 | - spin_unlock(&journal->j_list_lock); | |
7610 | - goto write_out_data; | |
7611 | + /* Now refile inode to proper lists */ | |
7612 | + list_for_each_entry_safe(jinode, next_i, | |
7613 | + &commit_transaction->t_inode_list, i_list) { | |
7614 | + list_del(&jinode->i_list); | |
7615 | + if (jinode->i_next_transaction) { | |
7616 | + jinode->i_transaction = jinode->i_next_transaction; | |
7617 | + jinode->i_next_transaction = NULL; | |
7618 | + list_add(&jinode->i_list, | |
7619 | + &jinode->i_transaction->t_inode_list); | |
7620 | + } else { | |
7621 | + jinode->i_transaction = NULL; | |
7622 | } | |
7623 | } | |
7624 | spin_unlock(&journal->j_list_lock); | |
7625 | - journal_do_submit_data(wbuf, bufs); | |
7626 | + | |
7627 | + return ret; | |
7628 | } | |
7629 | ||
7630 | static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh) | |
7631 | @@ -524,21 +469,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) | |
7632 | * Now start flushing things to disk, in the order they appear | |
7633 | * on the transaction lists. Data blocks go first. | |
7634 | */ | |
7635 | - err = 0; | |
7636 | - journal_submit_data_buffers(journal, commit_transaction); | |
7637 | - | |
7638 | - /* | |
7639 | - * Wait for all previously submitted IO to complete if commit | |
7640 | - * record is to be written synchronously. | |
7641 | - */ | |
7642 | - spin_lock(&journal->j_list_lock); | |
7643 | - if (!JBD2_HAS_INCOMPAT_FEATURE(journal, | |
7644 | - JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) | |
7645 | - err = journal_wait_on_locked_list(journal, | |
7646 | - commit_transaction); | |
7647 | - | |
7648 | - spin_unlock(&journal->j_list_lock); | |
7649 | - | |
7650 | + err = journal_submit_data_buffers(journal, commit_transaction); | |
7651 | if (err) | |
7652 | jbd2_journal_abort(journal, err); | |
7653 | ||
7654 | @@ -547,16 +478,6 @@ void jbd2_journal_commit_transaction(journal_t *journal) | |
7655 | jbd_debug(3, "JBD: commit phase 2\n"); | |
7656 | ||
7657 | /* | |
7658 | - * If we found any dirty or locked buffers, then we should have | |
7659 | - * looped back up to the write_out_data label. If there weren't | |
7660 | - * any then journal_clean_data_list should have wiped the list | |
7661 | - * clean by now, so check that it is in fact empty. | |
7662 | - */ | |
7663 | - J_ASSERT (commit_transaction->t_sync_datalist == NULL); | |
7664 | - | |
7665 | - jbd_debug (3, "JBD: commit phase 3\n"); | |
7666 | - | |
7667 | - /* | |
7668 | * Way to go: we have now written out all of the data for a | |
7669 | * transaction! Now comes the tricky part: we need to write out | |
7670 | * metadata. Loop over the transaction's entire buffer list: | |
7671 | @@ -574,6 +495,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) | |
7672 | J_ASSERT(commit_transaction->t_nr_buffers <= | |
7673 | commit_transaction->t_outstanding_credits); | |
7674 | ||
7675 | + err = 0; | |
7676 | descriptor = NULL; | |
7677 | bufs = 0; | |
7678 | while (commit_transaction->t_buffers) { | |
7679 | @@ -748,13 +670,23 @@ start_journal_io: | |
7680 | &cbh, crc32_sum); | |
7681 | if (err) | |
7682 | __jbd2_journal_abort_hard(journal); | |
7683 | + } | |
7684 | ||
7685 | - spin_lock(&journal->j_list_lock); | |
7686 | - err = journal_wait_on_locked_list(journal, | |
7687 | - commit_transaction); | |
7688 | - spin_unlock(&journal->j_list_lock); | |
7689 | - if (err) | |
7690 | - __jbd2_journal_abort_hard(journal); | |
7691 | + /* | |
7692 | + * This is the right place to wait for data buffers both for ASYNC | |
7693 | + * and !ASYNC commit. If commit is ASYNC, we need to wait only after | |
7694 | + * the commit block went to disk (which happens above). If commit is | |
7695 | + * SYNC, we need to wait for data buffers before we start writing | |
7696 | + * commit block, which happens below in such setting. | |
7697 | + */ | |
7698 | + err = journal_finish_inode_data_buffers(journal, commit_transaction); | |
7699 | + if (err) { | |
7700 | + char b[BDEVNAME_SIZE]; | |
7701 | + | |
7702 | + printk(KERN_WARNING | |
7703 | + "JBD2: Detected IO errors while flushing file data " | |
7704 | + "on %s\n", bdevname(journal->j_fs_dev, b)); | |
7705 | + err = 0; | |
7706 | } | |
7707 | ||
7708 | /* Lo and behold: we have just managed to send a transaction to | |
7709 | @@ -768,7 +700,7 @@ start_journal_io: | |
7710 | so we incur less scheduling load. | |
7711 | */ | |
7712 | ||
7713 | - jbd_debug(3, "JBD: commit phase 4\n"); | |
7714 | + jbd_debug(3, "JBD: commit phase 3\n"); | |
7715 | ||
7716 | /* | |
7717 | * akpm: these are BJ_IO, and j_list_lock is not needed. | |
7718 | @@ -827,7 +759,7 @@ wait_for_iobuf: | |
7719 | ||
7720 | J_ASSERT (commit_transaction->t_shadow_list == NULL); | |
7721 | ||
7722 | - jbd_debug(3, "JBD: commit phase 5\n"); | |
7723 | + jbd_debug(3, "JBD: commit phase 4\n"); | |
7724 | ||
7725 | /* Here we wait for the revoke record and descriptor record buffers */ | |
7726 | wait_for_ctlbuf: | |
7727 | @@ -854,7 +786,7 @@ wait_for_iobuf: | |
7728 | /* AKPM: bforget here */ | |
7729 | } | |
7730 | ||
7731 | - jbd_debug(3, "JBD: commit phase 6\n"); | |
7732 | + jbd_debug(3, "JBD: commit phase 5\n"); | |
7733 | ||
7734 | if (!JBD2_HAS_INCOMPAT_FEATURE(journal, | |
7735 | JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { | |
7736 | @@ -874,9 +806,9 @@ wait_for_iobuf: | |
7737 | transaction can be removed from any checkpoint list it was on | |
7738 | before. */ | |
7739 | ||
7740 | - jbd_debug(3, "JBD: commit phase 7\n"); | |
7741 | + jbd_debug(3, "JBD: commit phase 6\n"); | |
7742 | ||
7743 | - J_ASSERT(commit_transaction->t_sync_datalist == NULL); | |
7744 | + J_ASSERT(list_empty(&commit_transaction->t_inode_list)); | |
7745 | J_ASSERT(commit_transaction->t_buffers == NULL); | |
7746 | J_ASSERT(commit_transaction->t_checkpoint_list == NULL); | |
7747 | J_ASSERT(commit_transaction->t_iobuf_list == NULL); | |
7748 | @@ -997,7 +929,7 @@ restart_loop: | |
7749 | ||
7750 | /* Done with this transaction! */ | |
7751 | ||
7752 | - jbd_debug(3, "JBD: commit phase 8\n"); | |
7753 | + jbd_debug(3, "JBD: commit phase 7\n"); | |
7754 | ||
7755 | J_ASSERT(commit_transaction->t_state == T_COMMIT); | |
7756 | ||
7757 | diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c | |
7758 | index 2e24567..8207a01 100644 | |
7759 | --- a/fs/jbd2/journal.c | |
7760 | +++ b/fs/jbd2/journal.c | |
7761 | @@ -50,7 +50,6 @@ EXPORT_SYMBOL(jbd2_journal_unlock_updates); | |
7762 | EXPORT_SYMBOL(jbd2_journal_get_write_access); | |
7763 | EXPORT_SYMBOL(jbd2_journal_get_create_access); | |
7764 | EXPORT_SYMBOL(jbd2_journal_get_undo_access); | |
7765 | -EXPORT_SYMBOL(jbd2_journal_dirty_data); | |
7766 | EXPORT_SYMBOL(jbd2_journal_dirty_metadata); | |
7767 | EXPORT_SYMBOL(jbd2_journal_release_buffer); | |
7768 | EXPORT_SYMBOL(jbd2_journal_forget); | |
7769 | @@ -69,7 +68,6 @@ EXPORT_SYMBOL(jbd2_journal_set_features); | |
7770 | EXPORT_SYMBOL(jbd2_journal_create); | |
7771 | EXPORT_SYMBOL(jbd2_journal_load); | |
7772 | EXPORT_SYMBOL(jbd2_journal_destroy); | |
7773 | -EXPORT_SYMBOL(jbd2_journal_update_superblock); | |
7774 | EXPORT_SYMBOL(jbd2_journal_abort); | |
7775 | EXPORT_SYMBOL(jbd2_journal_errno); | |
7776 | EXPORT_SYMBOL(jbd2_journal_ack_err); | |
7777 | @@ -82,6 +80,10 @@ EXPORT_SYMBOL(jbd2_journal_blocks_per_page); | |
7778 | EXPORT_SYMBOL(jbd2_journal_invalidatepage); | |
7779 | EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers); | |
7780 | EXPORT_SYMBOL(jbd2_journal_force_commit); | |
7781 | +EXPORT_SYMBOL(jbd2_journal_file_inode); | |
7782 | +EXPORT_SYMBOL(jbd2_journal_init_jbd_inode); | |
7783 | +EXPORT_SYMBOL(jbd2_journal_release_jbd_inode); | |
7784 | +EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate); | |
7785 | ||
7786 | static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *); | |
7787 | static void __journal_abort_soft (journal_t *journal, int errno); | |
7788 | @@ -2195,6 +2197,54 @@ void jbd2_journal_put_journal_head(struct journal_head *jh) | |
7789 | } | |
7790 | ||
7791 | /* | |
7792 | + * Initialize jbd inode head | |
7793 | + */ | |
7794 | +void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode) | |
7795 | +{ | |
7796 | + jinode->i_transaction = NULL; | |
7797 | + jinode->i_next_transaction = NULL; | |
7798 | + jinode->i_vfs_inode = inode; | |
7799 | + jinode->i_flags = 0; | |
7800 | + INIT_LIST_HEAD(&jinode->i_list); | |
7801 | +} | |
7802 | + | |
7803 | +/* | |
7804 | + * Function to be called before we start removing inode from memory (i.e., | |
7805 | + * clear_inode() is a fine place to be called from). It removes inode from | |
7806 | + * transaction's lists. | |
7807 | + */ | |
7808 | +void jbd2_journal_release_jbd_inode(journal_t *journal, | |
7809 | + struct jbd2_inode *jinode) | |
7810 | +{ | |
7811 | + int writeout = 0; | |
7812 | + | |
7813 | + if (!journal) | |
7814 | + return; | |
7815 | +restart: | |
7816 | + spin_lock(&journal->j_list_lock); | |
7817 | + /* Is commit writing out inode - we have to wait */ | |
7818 | + if (jinode->i_flags & JI_COMMIT_RUNNING) { | |
7819 | + wait_queue_head_t *wq; | |
7820 | + DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING); | |
7821 | + wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING); | |
7822 | + prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); | |
7823 | + spin_unlock(&journal->j_list_lock); | |
7824 | + schedule(); | |
7825 | + finish_wait(wq, &wait.wait); | |
7826 | + goto restart; | |
7827 | + } | |
7828 | + | |
7829 | + /* Do we need to wait for data writeback? */ | |
7830 | + if (journal->j_committing_transaction == jinode->i_transaction) | |
7831 | + writeout = 1; | |
7832 | + if (jinode->i_transaction) { | |
7833 | + list_del(&jinode->i_list); | |
7834 | + jinode->i_transaction = NULL; | |
7835 | + } | |
7836 | + spin_unlock(&journal->j_list_lock); | |
7837 | +} | |
7838 | + | |
7839 | +/* | |
7840 | * debugfs tunables | |
7841 | */ | |
7842 | #ifdef CONFIG_JBD2_DEBUG | |
7843 | diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c | |
7844 | index d6e006e..4f7cadb 100644 | |
7845 | --- a/fs/jbd2/transaction.c | |
7846 | +++ b/fs/jbd2/transaction.c | |
7847 | @@ -41,7 +41,6 @@ static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh); | |
7848 | * new transaction and we can't block without protecting against other | |
7849 | * processes trying to touch the journal while it is in transition. | |
7850 | * | |
7851 | - * Called under j_state_lock | |
7852 | */ | |
7853 | ||
7854 | static transaction_t * | |
7855 | @@ -52,6 +51,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction) | |
7856 | transaction->t_tid = journal->j_transaction_sequence++; | |
7857 | transaction->t_expires = jiffies + journal->j_commit_interval; | |
7858 | spin_lock_init(&transaction->t_handle_lock); | |
7859 | + INIT_LIST_HEAD(&transaction->t_inode_list); | |
7860 | ||
7861 | /* Set up the commit timer for the new transaction. */ | |
7862 | journal->j_commit_timer.expires = round_jiffies(transaction->t_expires); | |
7863 | @@ -943,183 +943,6 @@ out: | |
7864 | } | |
7865 | ||
7866 | /** | |
7867 | - * int jbd2_journal_dirty_data() - mark a buffer as containing dirty data which | |
7868 | - * needs to be flushed before we can commit the | |
7869 | - * current transaction. | |
7870 | - * @handle: transaction | |
7871 | - * @bh: bufferhead to mark | |
7872 | - * | |
7873 | - * The buffer is placed on the transaction's data list and is marked as | |
7874 | - * belonging to the transaction. | |
7875 | - * | |
7876 | - * Returns error number or 0 on success. | |
7877 | - * | |
7878 | - * jbd2_journal_dirty_data() can be called via page_launder->ext3_writepage | |
7879 | - * by kswapd. | |
7880 | - */ | |
7881 | -int jbd2_journal_dirty_data(handle_t *handle, struct buffer_head *bh) | |
7882 | -{ | |
7883 | - journal_t *journal = handle->h_transaction->t_journal; | |
7884 | - int need_brelse = 0; | |
7885 | - struct journal_head *jh; | |
7886 | - | |
7887 | - if (is_handle_aborted(handle)) | |
7888 | - return 0; | |
7889 | - | |
7890 | - jh = jbd2_journal_add_journal_head(bh); | |
7891 | - JBUFFER_TRACE(jh, "entry"); | |
7892 | - | |
7893 | - /* | |
7894 | - * The buffer could *already* be dirty. Writeout can start | |
7895 | - * at any time. | |
7896 | - */ | |
7897 | - jbd_debug(4, "jh: %p, tid:%d\n", jh, handle->h_transaction->t_tid); | |
7898 | - | |
7899 | - /* | |
7900 | - * What if the buffer is already part of a running transaction? | |
7901 | - * | |
7902 | - * There are two cases: | |
7903 | - * 1) It is part of the current running transaction. Refile it, | |
7904 | - * just in case we have allocated it as metadata, deallocated | |
7905 | - * it, then reallocated it as data. | |
7906 | - * 2) It is part of the previous, still-committing transaction. | |
7907 | - * If all we want to do is to guarantee that the buffer will be | |
7908 | - * written to disk before this new transaction commits, then | |
7909 | - * being sure that the *previous* transaction has this same | |
7910 | - * property is sufficient for us! Just leave it on its old | |
7911 | - * transaction. | |
7912 | - * | |
7913 | - * In case (2), the buffer must not already exist as metadata | |
7914 | - * --- that would violate write ordering (a transaction is free | |
7915 | - * to write its data at any point, even before the previous | |
7916 | - * committing transaction has committed). The caller must | |
7917 | - * never, ever allow this to happen: there's nothing we can do | |
7918 | - * about it in this layer. | |
7919 | - */ | |
7920 | - jbd_lock_bh_state(bh); | |
7921 | - spin_lock(&journal->j_list_lock); | |
7922 | - | |
7923 | - /* Now that we have bh_state locked, are we really still mapped? */ | |
7924 | - if (!buffer_mapped(bh)) { | |
7925 | - JBUFFER_TRACE(jh, "unmapped buffer, bailing out"); | |
7926 | - goto no_journal; | |
7927 | - } | |
7928 | - | |
7929 | - if (jh->b_transaction) { | |
7930 | - JBUFFER_TRACE(jh, "has transaction"); | |
7931 | - if (jh->b_transaction != handle->h_transaction) { | |
7932 | - JBUFFER_TRACE(jh, "belongs to older transaction"); | |
7933 | - J_ASSERT_JH(jh, jh->b_transaction == | |
7934 | - journal->j_committing_transaction); | |
7935 | - | |
7936 | - /* @@@ IS THIS TRUE ? */ | |
7937 | - /* | |
7938 | - * Not any more. Scenario: someone does a write() | |
7939 | - * in data=journal mode. The buffer's transaction has | |
7940 | - * moved into commit. Then someone does another | |
7941 | - * write() to the file. We do the frozen data copyout | |
7942 | - * and set b_next_transaction to point to j_running_t. | |
7943 | - * And while we're in that state, someone does a | |
7944 | - * writepage() in an attempt to pageout the same area | |
7945 | - * of the file via a shared mapping. At present that | |
7946 | - * calls jbd2_journal_dirty_data(), and we get right here. | |
7947 | - * It may be too late to journal the data. Simply | |
7948 | - * falling through to the next test will suffice: the | |
7949 | - * data will be dirty and wil be checkpointed. The | |
7950 | - * ordering comments in the next comment block still | |
7951 | - * apply. | |
7952 | - */ | |
7953 | - //J_ASSERT_JH(jh, jh->b_next_transaction == NULL); | |
7954 | - | |
7955 | - /* | |
7956 | - * If we're journalling data, and this buffer was | |
7957 | - * subject to a write(), it could be metadata, forget | |
7958 | - * or shadow against the committing transaction. Now, | |
7959 | - * someone has dirtied the same darn page via a mapping | |
7960 | - * and it is being writepage()'d. | |
7961 | - * We *could* just steal the page from commit, with some | |
7962 | - * fancy locking there. Instead, we just skip it - | |
7963 | - * don't tie the page's buffers to the new transaction | |
7964 | - * at all. | |
7965 | - * Implication: if we crash before the writepage() data | |
7966 | - * is written into the filesystem, recovery will replay | |
7967 | - * the write() data. | |
7968 | - */ | |
7969 | - if (jh->b_jlist != BJ_None && | |
7970 | - jh->b_jlist != BJ_SyncData && | |
7971 | - jh->b_jlist != BJ_Locked) { | |
7972 | - JBUFFER_TRACE(jh, "Not stealing"); | |
7973 | - goto no_journal; | |
7974 | - } | |
7975 | - | |
7976 | - /* | |
7977 | - * This buffer may be undergoing writeout in commit. We | |
7978 | - * can't return from here and let the caller dirty it | |
7979 | - * again because that can cause the write-out loop in | |
7980 | - * commit to never terminate. | |
7981 | - */ | |
7982 | - if (buffer_dirty(bh)) { | |
7983 | - get_bh(bh); | |
7984 | - spin_unlock(&journal->j_list_lock); | |
7985 | - jbd_unlock_bh_state(bh); | |
7986 | - need_brelse = 1; | |
7987 | - sync_dirty_buffer(bh); | |
7988 | - jbd_lock_bh_state(bh); | |
7989 | - spin_lock(&journal->j_list_lock); | |
7990 | - /* Since we dropped the lock... */ | |
7991 | - if (!buffer_mapped(bh)) { | |
7992 | - JBUFFER_TRACE(jh, "buffer got unmapped"); | |
7993 | - goto no_journal; | |
7994 | - } | |
7995 | - /* The buffer may become locked again at any | |
7996 | - time if it is redirtied */ | |
7997 | - } | |
7998 | - | |
7999 | - /* journal_clean_data_list() may have got there first */ | |
8000 | - if (jh->b_transaction != NULL) { | |
8001 | - JBUFFER_TRACE(jh, "unfile from commit"); | |
8002 | - __jbd2_journal_temp_unlink_buffer(jh); | |
8003 | - /* It still points to the committing | |
8004 | - * transaction; move it to this one so | |
8005 | - * that the refile assert checks are | |
8006 | - * happy. */ | |
8007 | - jh->b_transaction = handle->h_transaction; | |
8008 | - } | |
8009 | - /* The buffer will be refiled below */ | |
8010 | - | |
8011 | - } | |
8012 | - /* | |
8013 | - * Special case --- the buffer might actually have been | |
8014 | - * allocated and then immediately deallocated in the previous, | |
8015 | - * committing transaction, so might still be left on that | |
8016 | - * transaction's metadata lists. | |
8017 | - */ | |
8018 | - if (jh->b_jlist != BJ_SyncData && jh->b_jlist != BJ_Locked) { | |
8019 | - JBUFFER_TRACE(jh, "not on correct data list: unfile"); | |
8020 | - J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow); | |
8021 | - __jbd2_journal_temp_unlink_buffer(jh); | |
8022 | - jh->b_transaction = handle->h_transaction; | |
8023 | - JBUFFER_TRACE(jh, "file as data"); | |
8024 | - __jbd2_journal_file_buffer(jh, handle->h_transaction, | |
8025 | - BJ_SyncData); | |
8026 | - } | |
8027 | - } else { | |
8028 | - JBUFFER_TRACE(jh, "not on a transaction"); | |
8029 | - __jbd2_journal_file_buffer(jh, handle->h_transaction, BJ_SyncData); | |
8030 | - } | |
8031 | -no_journal: | |
8032 | - spin_unlock(&journal->j_list_lock); | |
8033 | - jbd_unlock_bh_state(bh); | |
8034 | - if (need_brelse) { | |
8035 | - BUFFER_TRACE(bh, "brelse"); | |
8036 | - __brelse(bh); | |
8037 | - } | |
8038 | - JBUFFER_TRACE(jh, "exit"); | |
8039 | - jbd2_journal_put_journal_head(jh); | |
8040 | - return 0; | |
8041 | -} | |
8042 | - | |
8043 | -/** | |
8044 | * int jbd2_journal_dirty_metadata() - mark a buffer as containing dirty metadata | |
8045 | * @handle: transaction to add buffer to. | |
8046 | * @bh: buffer to mark | |
8047 | @@ -1541,10 +1364,10 @@ __blist_del_buffer(struct journal_head **list, struct journal_head *jh) | |
8048 | * Remove a buffer from the appropriate transaction list. | |
8049 | * | |
8050 | * Note that this function can *change* the value of | |
8051 | - * bh->b_transaction->t_sync_datalist, t_buffers, t_forget, | |
8052 | - * t_iobuf_list, t_shadow_list, t_log_list or t_reserved_list. If the caller | |
8053 | - * is holding onto a copy of one of thee pointers, it could go bad. | |
8054 | - * Generally the caller needs to re-read the pointer from the transaction_t. | |
8055 | + * bh->b_transaction->t_buffers, t_forget, t_iobuf_list, t_shadow_list, | |
8056 | + * t_log_list or t_reserved_list. If the caller is holding onto a copy of one | |
8057 | + * of these pointers, it could go bad. Generally the caller needs to re-read | |
8058 | + * the pointer from the transaction_t. | |
8059 | * | |
8060 | * Called under j_list_lock. The journal may not be locked. | |
8061 | */ | |
8062 | @@ -1566,9 +1389,6 @@ void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh) | |
8063 | switch (jh->b_jlist) { | |
8064 | case BJ_None: | |
8065 | return; | |
8066 | - case BJ_SyncData: | |
8067 | - list = &transaction->t_sync_datalist; | |
8068 | - break; | |
8069 | case BJ_Metadata: | |
8070 | transaction->t_nr_buffers--; | |
8071 | J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0); | |
8072 | @@ -1589,9 +1409,6 @@ void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh) | |
8073 | case BJ_Reserved: | |
8074 | list = &transaction->t_reserved_list; | |
8075 | break; | |
8076 | - case BJ_Locked: | |
8077 | - list = &transaction->t_locked_list; | |
8078 | - break; | |
8079 | } | |
8080 | ||
8081 | __blist_del_buffer(list, jh); | |
8082 | @@ -1634,15 +1451,7 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh) | |
8083 | goto out; | |
8084 | ||
8085 | spin_lock(&journal->j_list_lock); | |
8086 | - if (jh->b_transaction != NULL && jh->b_cp_transaction == NULL) { | |
8087 | - if (jh->b_jlist == BJ_SyncData || jh->b_jlist == BJ_Locked) { | |
8088 | - /* A written-back ordered data buffer */ | |
8089 | - JBUFFER_TRACE(jh, "release data"); | |
8090 | - __jbd2_journal_unfile_buffer(jh); | |
8091 | - jbd2_journal_remove_journal_head(bh); | |
8092 | - __brelse(bh); | |
8093 | - } | |
8094 | - } else if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) { | |
8095 | + if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) { | |
8096 | /* written-back checkpointed metadata buffer */ | |
8097 | if (jh->b_jlist == BJ_None) { | |
8098 | JBUFFER_TRACE(jh, "remove from checkpoint list"); | |
8099 | @@ -1656,12 +1465,43 @@ out: | |
8100 | return; | |
8101 | } | |
8102 | ||
8103 | +/* | |
8104 | + * jbd2_journal_try_to_free_buffers() could race with | |
8105 | + * jbd2_journal_commit_transaction(). The later might still hold the | |
8106 | + * reference count to the buffers when inspecting them on | |
8107 | + * t_syncdata_list or t_locked_list. | |
8108 | + * | |
8109 | + * jbd2_journal_try_to_free_buffers() will call this function to | |
8110 | + * wait for the current transaction to finish syncing data buffers, before | |
8111 | + * try to free that buffer. | |
8112 | + * | |
8113 | + * Called with journal->j_state_lock hold. | |
8114 | + */ | |
8115 | +static void jbd2_journal_wait_for_transaction_sync_data(journal_t *journal) | |
8116 | +{ | |
8117 | + transaction_t *transaction; | |
8118 | + tid_t tid; | |
8119 | + | |
8120 | + spin_lock(&journal->j_state_lock); | |
8121 | + transaction = journal->j_committing_transaction; | |
8122 | + | |
8123 | + if (!transaction) { | |
8124 | + spin_unlock(&journal->j_state_lock); | |
8125 | + return; | |
8126 | + } | |
8127 | + | |
8128 | + tid = transaction->t_tid; | |
8129 | + spin_unlock(&journal->j_state_lock); | |
8130 | + jbd2_log_wait_commit(journal, tid); | |
8131 | +} | |
8132 | ||
8133 | /** | |
8134 | * int jbd2_journal_try_to_free_buffers() - try to free page buffers. | |
8135 | * @journal: journal for operation | |
8136 | * @page: to try and free | |
8137 | - * @unused_gfp_mask: unused | |
8138 | + * @gfp_mask: we use the mask to detect how hard should we try to release | |
8139 | + * buffers. If __GFP_WAIT and __GFP_FS is set, we wait for commit code to | |
8140 | + * release the buffers. | |
8141 | * | |
8142 | * | |
8143 | * For all the buffers on this page, | |
8144 | @@ -1690,9 +1530,11 @@ out: | |
8145 | * journal_try_to_free_buffer() is changing its state. But that | |
8146 | * cannot happen because we never reallocate freed data as metadata | |
8147 | * while the data is part of a transaction. Yes? | |
8148 | + * | |
8149 | + * Return 0 on failure, 1 on success | |
8150 | */ | |
8151 | int jbd2_journal_try_to_free_buffers(journal_t *journal, | |
8152 | - struct page *page, gfp_t unused_gfp_mask) | |
8153 | + struct page *page, gfp_t gfp_mask) | |
8154 | { | |
8155 | struct buffer_head *head; | |
8156 | struct buffer_head *bh; | |
8157 | @@ -1708,7 +1550,8 @@ int jbd2_journal_try_to_free_buffers(journal_t *journal, | |
8158 | /* | |
8159 | * We take our own ref against the journal_head here to avoid | |
8160 | * having to add tons of locking around each instance of | |
8161 | - * jbd2_journal_remove_journal_head() and jbd2_journal_put_journal_head(). | |
8162 | + * jbd2_journal_remove_journal_head() and | |
8163 | + * jbd2_journal_put_journal_head(). | |
8164 | */ | |
8165 | jh = jbd2_journal_grab_journal_head(bh); | |
8166 | if (!jh) | |
8167 | @@ -1721,7 +1564,28 @@ int jbd2_journal_try_to_free_buffers(journal_t *journal, | |
8168 | if (buffer_jbd(bh)) | |
8169 | goto busy; | |
8170 | } while ((bh = bh->b_this_page) != head); | |
8171 | + | |
8172 | ret = try_to_free_buffers(page); | |
8173 | + | |
8174 | + /* | |
8175 | + * There are a number of places where jbd2_journal_try_to_free_buffers() | |
8176 | + * could race with jbd2_journal_commit_transaction(), the later still | |
8177 | + * holds the reference to the buffers to free while processing them. | |
8178 | + * try_to_free_buffers() failed to free those buffers. Some of the | |
8179 | + * caller of releasepage() request page buffers to be dropped, otherwise | |
8180 | + * treat the fail-to-free as errors (such as generic_file_direct_IO()) | |
8181 | + * | |
8182 | + * So, if the caller of try_to_release_page() wants the synchronous | |
8183 | + * behaviour(i.e make sure buffers are dropped upon return), | |
8184 | + * let's wait for the current transaction to finish flush of | |
8185 | + * dirty data buffers, then try to free those buffers again, | |
8186 | + * with the journal locked. | |
8187 | + */ | |
8188 | + if (ret == 0 && (gfp_mask & __GFP_WAIT) && (gfp_mask & __GFP_FS)) { | |
8189 | + jbd2_journal_wait_for_transaction_sync_data(journal); | |
8190 | + ret = try_to_free_buffers(page); | |
8191 | + } | |
8192 | + | |
8193 | busy: | |
8194 | return ret; | |
8195 | } | |
8196 | @@ -1823,6 +1687,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) | |
8197 | if (!buffer_jbd(bh)) | |
8198 | goto zap_buffer_unlocked; | |
8199 | ||
8200 | + /* OK, we have data buffer in journaled mode */ | |
8201 | spin_lock(&journal->j_state_lock); | |
8202 | jbd_lock_bh_state(bh); | |
8203 | spin_lock(&journal->j_list_lock); | |
8204 | @@ -1886,15 +1751,6 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) | |
8205 | } | |
8206 | } else if (transaction == journal->j_committing_transaction) { | |
8207 | JBUFFER_TRACE(jh, "on committing transaction"); | |
8208 | - if (jh->b_jlist == BJ_Locked) { | |
8209 | - /* | |
8210 | - * The buffer is on the committing transaction's locked | |
8211 | - * list. We have the buffer locked, so I/O has | |
8212 | - * completed. So we can nail the buffer now. | |
8213 | - */ | |
8214 | - may_free = __dispose_buffer(jh, transaction); | |
8215 | - goto zap_buffer; | |
8216 | - } | |
8217 | /* | |
8218 | * If it is committing, we simply cannot touch it. We | |
8219 | * can remove it's next_transaction pointer from the | |
8220 | @@ -2027,9 +1883,6 @@ void __jbd2_journal_file_buffer(struct journal_head *jh, | |
8221 | J_ASSERT_JH(jh, !jh->b_committed_data); | |
8222 | J_ASSERT_JH(jh, !jh->b_frozen_data); | |
8223 | return; | |
8224 | - case BJ_SyncData: | |
8225 | - list = &transaction->t_sync_datalist; | |
8226 | - break; | |
8227 | case BJ_Metadata: | |
8228 | transaction->t_nr_buffers++; | |
8229 | list = &transaction->t_buffers; | |
8230 | @@ -2049,9 +1902,6 @@ void __jbd2_journal_file_buffer(struct journal_head *jh, | |
8231 | case BJ_Reserved: | |
8232 | list = &transaction->t_reserved_list; | |
8233 | break; | |
8234 | - case BJ_Locked: | |
8235 | - list = &transaction->t_locked_list; | |
8236 | - break; | |
8237 | } | |
8238 | ||
8239 | __blist_add_buffer(list, jh); | |
8240 | @@ -2141,3 +1991,88 @@ void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh) | |
8241 | spin_unlock(&journal->j_list_lock); | |
8242 | __brelse(bh); | |
8243 | } | |
8244 | + | |
8245 | +/* | |
8246 | + * File inode in the inode list of the handle's transaction | |
8247 | + */ | |
8248 | +int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode) | |
8249 | +{ | |
8250 | + transaction_t *transaction = handle->h_transaction; | |
8251 | + journal_t *journal = transaction->t_journal; | |
8252 | + | |
8253 | + if (is_handle_aborted(handle)) | |
8254 | + return -EIO; | |
8255 | + | |
8256 | + jbd_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino, | |
8257 | + transaction->t_tid); | |
8258 | + | |
8259 | + /* | |
8260 | + * First check whether inode isn't already on the transaction's | |
8261 | + * lists without taking the lock. Note that this check is safe | |
8262 | + * without the lock as we cannot race with somebody removing inode | |
8263 | + * from the transaction. The reason is that we remove inode from the | |
8264 | + * transaction only in journal_release_jbd_inode() and when we commit | |
8265 | + * the transaction. We are guarded from the first case by holding | |
8266 | + * a reference to the inode. We are safe against the second case | |
8267 | + * because if jinode->i_transaction == transaction, commit code | |
8268 | + * cannot touch the transaction because we hold reference to it, | |
8269 | + * and if jinode->i_next_transaction == transaction, commit code | |
8270 | + * will only file the inode where we want it. | |
8271 | + */ | |
8272 | + if (jinode->i_transaction == transaction || | |
8273 | + jinode->i_next_transaction == transaction) | |
8274 | + return 0; | |
8275 | + | |
8276 | + spin_lock(&journal->j_list_lock); | |
8277 | + | |
8278 | + if (jinode->i_transaction == transaction || | |
8279 | + jinode->i_next_transaction == transaction) | |
8280 | + goto done; | |
8281 | + | |
8282 | + /* On some different transaction's list - should be | |
8283 | + * the committing one */ | |
8284 | + if (jinode->i_transaction) { | |
8285 | + J_ASSERT(jinode->i_next_transaction == NULL); | |
8286 | + J_ASSERT(jinode->i_transaction == | |
8287 | + journal->j_committing_transaction); | |
8288 | + jinode->i_next_transaction = transaction; | |
8289 | + goto done; | |
8290 | + } | |
8291 | + /* Not on any transaction list... */ | |
8292 | + J_ASSERT(!jinode->i_next_transaction); | |
8293 | + jinode->i_transaction = transaction; | |
8294 | + list_add(&jinode->i_list, &transaction->t_inode_list); | |
8295 | +done: | |
8296 | + spin_unlock(&journal->j_list_lock); | |
8297 | + | |
8298 | + return 0; | |
8299 | +} | |
8300 | + | |
8301 | +/* | |
8302 | + * This function must be called when inode is journaled in ordered mode | |
8303 | + * before truncation happens. It starts writeout of truncated part in | |
8304 | + * case it is in the committing transaction so that we stand to ordered | |
8305 | + * mode consistency guarantees. | |
8306 | + */ | |
8307 | +int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode, | |
8308 | + loff_t new_size) | |
8309 | +{ | |
8310 | + journal_t *journal; | |
8311 | + transaction_t *commit_trans; | |
8312 | + int ret = 0; | |
8313 | + | |
8314 | + if (!inode->i_transaction && !inode->i_next_transaction) | |
8315 | + goto out; | |
8316 | + journal = inode->i_transaction->t_journal; | |
8317 | + spin_lock(&journal->j_state_lock); | |
8318 | + commit_trans = journal->j_committing_transaction; | |
8319 | + spin_unlock(&journal->j_state_lock); | |
8320 | + if (inode->i_transaction == commit_trans) { | |
8321 | + ret = filemap_fdatawrite_range(inode->i_vfs_inode->i_mapping, | |
8322 | + new_size, LLONG_MAX); | |
8323 | + if (ret) | |
8324 | + jbd2_journal_abort(journal, ret); | |
8325 | + } | |
8326 | +out: | |
8327 | + return ret; | |
8328 | +} | |
8329 | diff --git a/fs/mpage.c b/fs/mpage.c | |
8330 | index 235e4d3..dbcc7af 100644 | |
8331 | --- a/fs/mpage.c | |
8332 | +++ b/fs/mpage.c | |
8333 | @@ -82,7 +82,7 @@ static void mpage_end_io_write(struct bio *bio, int err) | |
8334 | bio_put(bio); | |
8335 | } | |
8336 | ||
8337 | -static struct bio *mpage_bio_submit(int rw, struct bio *bio) | |
8338 | +struct bio *mpage_bio_submit(int rw, struct bio *bio) | |
8339 | { | |
8340 | bio->bi_end_io = mpage_end_io_read; | |
8341 | if (rw == WRITE) | |
8342 | @@ -90,6 +90,7 @@ static struct bio *mpage_bio_submit(int rw, struct bio *bio) | |
8343 | submit_bio(rw, bio); | |
8344 | return NULL; | |
8345 | } | |
8346 | +EXPORT_SYMBOL(mpage_bio_submit); | |
8347 | ||
8348 | static struct bio * | |
8349 | mpage_alloc(struct block_device *bdev, | |
8350 | @@ -435,15 +436,9 @@ EXPORT_SYMBOL(mpage_readpage); | |
8351 | * written, so it can intelligently allocate a suitably-sized BIO. For now, | |
8352 | * just allocate full-size (16-page) BIOs. | |
8353 | */ | |
8354 | -struct mpage_data { | |
8355 | - struct bio *bio; | |
8356 | - sector_t last_block_in_bio; | |
8357 | - get_block_t *get_block; | |
8358 | - unsigned use_writepage; | |
8359 | -}; | |
8360 | ||
8361 | -static int __mpage_writepage(struct page *page, struct writeback_control *wbc, | |
8362 | - void *data) | |
8363 | +int __mpage_writepage(struct page *page, struct writeback_control *wbc, | |
8364 | + void *data) | |
8365 | { | |
8366 | struct mpage_data *mpd = data; | |
8367 | struct bio *bio = mpd->bio; | |
8368 | @@ -651,6 +646,7 @@ out: | |
8369 | mpd->bio = bio; | |
8370 | return ret; | |
8371 | } | |
8372 | +EXPORT_SYMBOL(__mpage_writepage); | |
8373 | ||
8374 | /** | |
8375 | * mpage_writepages - walk the list of dirty pages of the given address space & writepage() all of them | |
8376 | diff --git a/include/linux/fs.h b/include/linux/fs.h | |
8377 | index d8e2762..97f992a 100644 | |
8378 | --- a/include/linux/fs.h | |
8379 | +++ b/include/linux/fs.h | |
8380 | @@ -1740,6 +1740,8 @@ extern int wait_on_page_writeback_range(struct address_space *mapping, | |
8381 | pgoff_t start, pgoff_t end); | |
8382 | extern int __filemap_fdatawrite_range(struct address_space *mapping, | |
8383 | loff_t start, loff_t end, int sync_mode); | |
8384 | +extern int filemap_fdatawrite_range(struct address_space *mapping, | |
8385 | + loff_t start, loff_t end); | |
8386 | ||
8387 | extern long do_fsync(struct file *file, int datasync); | |
8388 | extern void sync_supers(void); | |
8389 | diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h | |
8390 | index d147f0f..3dd2090 100644 | |
8391 | --- a/include/linux/jbd2.h | |
8392 | +++ b/include/linux/jbd2.h | |
8393 | @@ -168,6 +168,8 @@ struct commit_header { | |
8394 | unsigned char h_chksum_size; | |
8395 | unsigned char h_padding[2]; | |
8396 | __be32 h_chksum[JBD2_CHECKSUM_BYTES]; | |
8397 | + __be64 h_commit_sec; | |
8398 | + __be32 h_commit_nsec; | |
8399 | }; | |
8400 | ||
8401 | /* | |
8402 | @@ -379,6 +381,38 @@ static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh) | |
8403 | bit_spin_unlock(BH_JournalHead, &bh->b_state); | |
8404 | } | |
8405 | ||
8406 | +/* Flags in jbd_inode->i_flags */ | |
8407 | +#define __JI_COMMIT_RUNNING 0 | |
8408 | +/* Commit of the inode data in progress. We use this flag to protect us from | |
8409 | + * concurrent deletion of inode. We cannot use reference to inode for this | |
8410 | + * since we cannot afford doing last iput() on behalf of kjournald | |
8411 | + */ | |
8412 | +#define JI_COMMIT_RUNNING (1 << __JI_COMMIT_RUNNING) | |
8413 | + | |
8414 | +/** | |
8415 | + * struct jbd_inode is the structure linking inodes in ordered mode | |
8416 | + * present in a transaction so that we can sync them during commit. | |
8417 | + */ | |
8418 | +struct jbd2_inode { | |
8419 | + /* Which transaction does this inode belong to? Either the running | |
8420 | + * transaction or the committing one. [j_list_lock] */ | |
8421 | + transaction_t *i_transaction; | |
8422 | + | |
8423 | + /* Pointer to the running transaction modifying inode's data in case | |
8424 | + * there is already a committing transaction touching it. [j_list_lock] */ | |
8425 | + transaction_t *i_next_transaction; | |
8426 | + | |
8427 | + /* List of inodes in the i_transaction [j_list_lock] */ | |
8428 | + struct list_head i_list; | |
8429 | + | |
8430 | + /* VFS inode this inode belongs to [constant during the lifetime | |
8431 | + * of the structure] */ | |
8432 | + struct inode *i_vfs_inode; | |
8433 | + | |
8434 | + /* Flags of inode [j_list_lock] */ | |
8435 | + unsigned int i_flags; | |
8436 | +}; | |
8437 | + | |
8438 | struct jbd2_revoke_table_s; | |
8439 | ||
8440 | /** | |
8441 | @@ -509,24 +543,12 @@ struct transaction_s | |
8442 | struct journal_head *t_reserved_list; | |
8443 | ||
8444 | /* | |
8445 | - * Doubly-linked circular list of all buffers under writeout during | |
8446 | - * commit [j_list_lock] | |
8447 | - */ | |
8448 | - struct journal_head *t_locked_list; | |
8449 | - | |
8450 | - /* | |
8451 | * Doubly-linked circular list of all metadata buffers owned by this | |
8452 | * transaction [j_list_lock] | |
8453 | */ | |
8454 | struct journal_head *t_buffers; | |
8455 | ||
8456 | /* | |
8457 | - * Doubly-linked circular list of all data buffers still to be | |
8458 | - * flushed before this transaction can be committed [j_list_lock] | |
8459 | - */ | |
8460 | - struct journal_head *t_sync_datalist; | |
8461 | - | |
8462 | - /* | |
8463 | * Doubly-linked circular list of all forget buffers (superseded | |
8464 | * buffers which we can un-checkpoint once this transaction commits) | |
8465 | * [j_list_lock] | |
8466 | @@ -565,6 +587,12 @@ struct transaction_s | |
8467 | struct journal_head *t_log_list; | |
8468 | ||
8469 | /* | |
8470 | + * List of inodes whose data we've modified in data=ordered mode. | |
8471 | + * [j_list_lock] | |
8472 | + */ | |
8473 | + struct list_head t_inode_list; | |
8474 | + | |
8475 | + /* | |
8476 | * Protects info related to handles | |
8477 | */ | |
8478 | spinlock_t t_handle_lock; | |
8479 | @@ -1004,7 +1032,6 @@ extern int jbd2_journal_extend (handle_t *, int nblocks); | |
8480 | extern int jbd2_journal_get_write_access(handle_t *, struct buffer_head *); | |
8481 | extern int jbd2_journal_get_create_access (handle_t *, struct buffer_head *); | |
8482 | extern int jbd2_journal_get_undo_access(handle_t *, struct buffer_head *); | |
8483 | -extern int jbd2_journal_dirty_data (handle_t *, struct buffer_head *); | |
8484 | extern int jbd2_journal_dirty_metadata (handle_t *, struct buffer_head *); | |
8485 | extern void jbd2_journal_release_buffer (handle_t *, struct buffer_head *); | |
8486 | extern int jbd2_journal_forget (handle_t *, struct buffer_head *); | |
8487 | @@ -1044,6 +1071,10 @@ extern void jbd2_journal_ack_err (journal_t *); | |
8488 | extern int jbd2_journal_clear_err (journal_t *); | |
8489 | extern int jbd2_journal_bmap(journal_t *, unsigned long, unsigned long long *); | |
8490 | extern int jbd2_journal_force_commit(journal_t *); | |
8491 | +extern int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *inode); | |
8492 | +extern int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode, loff_t new_size); | |
8493 | +extern void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode); | |
8494 | +extern void jbd2_journal_release_jbd_inode(journal_t *journal, struct jbd2_inode *jinode); | |
8495 | ||
8496 | /* | |
8497 | * journal_head management | |
8498 | @@ -1179,15 +1210,13 @@ static inline int jbd_space_needed(journal_t *journal) | |
8499 | ||
8500 | /* journaling buffer types */ | |
8501 | #define BJ_None 0 /* Not journaled */ | |
8502 | -#define BJ_SyncData 1 /* Normal data: flush before commit */ | |
8503 | -#define BJ_Metadata 2 /* Normal journaled metadata */ | |
8504 | -#define BJ_Forget 3 /* Buffer superseded by this transaction */ | |
8505 | -#define BJ_IO 4 /* Buffer is for temporary IO use */ | |
8506 | -#define BJ_Shadow 5 /* Buffer contents being shadowed to the log */ | |
8507 | -#define BJ_LogCtl 6 /* Buffer contains log descriptors */ | |
8508 | -#define BJ_Reserved 7 /* Buffer is reserved for access by journal */ | |
8509 | -#define BJ_Locked 8 /* Locked for I/O during commit */ | |
8510 | -#define BJ_Types 9 | |
8511 | +#define BJ_Metadata 1 /* Normal journaled metadata */ | |
8512 | +#define BJ_Forget 2 /* Buffer superseded by this transaction */ | |
8513 | +#define BJ_IO 3 /* Buffer is for temporary IO use */ | |
8514 | +#define BJ_Shadow 4 /* Buffer contents being shadowed to the log */ | |
8515 | +#define BJ_LogCtl 5 /* Buffer contains log descriptors */ | |
8516 | +#define BJ_Reserved 6 /* Buffer is reserved for access by journal */ | |
8517 | +#define BJ_Types 7 | |
8518 | ||
8519 | extern int jbd_blocks_per_page(struct inode *inode); | |
8520 | ||
8521 | diff --git a/include/linux/mpage.h b/include/linux/mpage.h | |
8522 | index 068a0c9..5c42821 100644 | |
8523 | --- a/include/linux/mpage.h | |
8524 | +++ b/include/linux/mpage.h | |
8525 | @@ -11,11 +11,21 @@ | |
8526 | */ | |
8527 | #ifdef CONFIG_BLOCK | |
8528 | ||
8529 | +struct mpage_data { | |
8530 | + struct bio *bio; | |
8531 | + sector_t last_block_in_bio; | |
8532 | + get_block_t *get_block; | |
8533 | + unsigned use_writepage; | |
8534 | +}; | |
8535 | + | |
8536 | struct writeback_control; | |
8537 | ||
8538 | +struct bio *mpage_bio_submit(int rw, struct bio *bio); | |
8539 | int mpage_readpages(struct address_space *mapping, struct list_head *pages, | |
8540 | unsigned nr_pages, get_block_t get_block); | |
8541 | int mpage_readpage(struct page *page, get_block_t get_block); | |
8542 | +int __mpage_writepage(struct page *page, struct writeback_control *wbc, | |
8543 | + void *data); | |
8544 | int mpage_writepages(struct address_space *mapping, | |
8545 | struct writeback_control *wbc, get_block_t get_block); | |
8546 | int mpage_writepage(struct page *page, get_block_t *get_block, | |
8547 | diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h | |
8548 | index 9007ccd..2083888 100644 | |
8549 | --- a/include/linux/percpu_counter.h | |
8550 | +++ b/include/linux/percpu_counter.h | |
8551 | @@ -35,7 +35,7 @@ int percpu_counter_init_irq(struct percpu_counter *fbc, s64 amount); | |
8552 | void percpu_counter_destroy(struct percpu_counter *fbc); | |
8553 | void percpu_counter_set(struct percpu_counter *fbc, s64 amount); | |
8554 | void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch); | |
8555 | -s64 __percpu_counter_sum(struct percpu_counter *fbc); | |
8556 | +s64 __percpu_counter_sum(struct percpu_counter *fbc, int set); | |
8557 | ||
8558 | static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount) | |
8559 | { | |
8560 | @@ -44,13 +44,19 @@ static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount) | |
8561 | ||
8562 | static inline s64 percpu_counter_sum_positive(struct percpu_counter *fbc) | |
8563 | { | |
8564 | - s64 ret = __percpu_counter_sum(fbc); | |
8565 | + s64 ret = __percpu_counter_sum(fbc, 0); | |
8566 | return ret < 0 ? 0 : ret; | |
8567 | } | |
8568 | ||
8569 | +static inline s64 percpu_counter_sum_and_set(struct percpu_counter *fbc) | |
8570 | +{ | |
8571 | + return __percpu_counter_sum(fbc, 1); | |
8572 | +} | |
8573 | + | |
8574 | + | |
8575 | static inline s64 percpu_counter_sum(struct percpu_counter *fbc) | |
8576 | { | |
8577 | - return __percpu_counter_sum(fbc); | |
8578 | + return __percpu_counter_sum(fbc, 0); | |
8579 | } | |
8580 | ||
8581 | static inline s64 percpu_counter_read(struct percpu_counter *fbc) | |
8582 | diff --git a/include/linux/writeback.h b/include/linux/writeback.h | |
8583 | index f462439..0d8573e 100644 | |
8584 | --- a/include/linux/writeback.h | |
8585 | +++ b/include/linux/writeback.h | |
8586 | @@ -63,6 +63,7 @@ struct writeback_control { | |
8587 | unsigned for_writepages:1; /* This is a writepages() call */ | |
8588 | unsigned range_cyclic:1; /* range_start is cyclic */ | |
8589 | unsigned more_io:1; /* more io to be dispatched */ | |
8590 | + unsigned range_cont:1; | |
8591 | }; | |
8592 | ||
8593 | /* | |
8594 | diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c | |
8595 | index 1191744..4a8ba4b 100644 | |
8596 | --- a/lib/percpu_counter.c | |
8597 | +++ b/lib/percpu_counter.c | |
8598 | @@ -52,7 +52,7 @@ EXPORT_SYMBOL(__percpu_counter_add); | |
8599 | * Add up all the per-cpu counts, return the result. This is a more accurate | |
8600 | * but much slower version of percpu_counter_read_positive() | |
8601 | */ | |
8602 | -s64 __percpu_counter_sum(struct percpu_counter *fbc) | |
8603 | +s64 __percpu_counter_sum(struct percpu_counter *fbc, int set) | |
8604 | { | |
8605 | s64 ret; | |
8606 | int cpu; | |
8607 | @@ -62,7 +62,12 @@ s64 __percpu_counter_sum(struct percpu_counter *fbc) | |
8608 | for_each_online_cpu(cpu) { | |
8609 | s32 *pcount = per_cpu_ptr(fbc->counters, cpu); | |
8610 | ret += *pcount; | |
8611 | + if (set) | |
8612 | + *pcount = 0; | |
8613 | } | |
8614 | + if (set) | |
8615 | + fbc->count = ret; | |
8616 | + | |
8617 | spin_unlock(&fbc->lock); | |
8618 | return ret; | |
8619 | } | |
8620 | diff --git a/mm/filemap.c b/mm/filemap.c | |
8621 | index 1e6a7d3..65d9d9e 100644 | |
8622 | --- a/mm/filemap.c | |
8623 | +++ b/mm/filemap.c | |
8624 | @@ -236,11 +236,12 @@ int filemap_fdatawrite(struct address_space *mapping) | |
8625 | } | |
8626 | EXPORT_SYMBOL(filemap_fdatawrite); | |
8627 | ||
8628 | -static int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, | |
8629 | +int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, | |
8630 | loff_t end) | |
8631 | { | |
8632 | return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL); | |
8633 | } | |
8634 | +EXPORT_SYMBOL(filemap_fdatawrite_range); | |
8635 | ||
8636 | /** | |
8637 | * filemap_flush - mostly a non-blocking flush | |
8638 | diff --git a/mm/page-writeback.c b/mm/page-writeback.c | |
8639 | index 789b6ad..ded57d5 100644 | |
8640 | --- a/mm/page-writeback.c | |
8641 | +++ b/mm/page-writeback.c | |
8642 | @@ -956,6 +956,9 @@ retry: | |
8643 | } | |
8644 | if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) | |
8645 | mapping->writeback_index = index; | |
8646 | + | |
8647 | + if (wbc->range_cont) | |
8648 | + wbc->range_start = index << PAGE_CACHE_SHIFT; | |
8649 | return ret; | |
8650 | } | |
8651 | EXPORT_SYMBOL(write_cache_pages); |