]>
Commit | Line | Data |
---|---|---|
08aa9d92 | 1 | --- linux-2.6.33/scripts/mod/modpost.c~ 2010-02-24 19:52:17.000000000 +0100 |
2 | +++ linux-2.6.33/scripts/mod/modpost.c 2010-03-07 14:26:47.242168558 +0100 | |
3 | @@ -15,7 +15,8 @@ | |
4 | #include <stdio.h> | |
5 | #include <ctype.h> | |
6 | #include "modpost.h" | |
7 | -#include "../../include/generated/autoconf.h" | |
8 | +// PLD architectures don't use CONFIG_SYMBOL_PREFIX | |
9 | +//#include "../../include/generated/autoconf.h" | |
10 | #include "../../include/linux/license.h" | |
11 | ||
12 | /* Some toolchains use a `_' prefix for all user symbols. */ | |
13 | ||
2136e199 AM |
14 | --- linux-3.0/scripts/kconfig/lxdialog/check-lxdialog.sh~ 2011-07-22 04:17:23.000000000 +0200 |
15 | +++ linux-3.0/scripts/kconfig/lxdialog/check-lxdialog.sh 2011-08-25 21:26:04.799150642 +0200 | |
16 | @@ -9,6 +9,12 @@ | |
17 | $cc -print-file-name=lib${lib}.${ext} | grep -q / | |
18 | if [ $? -eq 0 ]; then | |
19 | echo "-l${lib}" | |
20 | + for libt in tinfow tinfo ; do | |
21 | + $cc -print-file-name=lib${libt}.${ext} | grep -q / | |
22 | + if [ $? -eq 0 ]; then | |
23 | + echo "-l${libt}" | |
24 | + fi | |
25 | + done | |
26 | exit | |
27 | fi | |
28 | done | |
7e7bde06 | 29 | |
99992ee3 AM |
30 | From 7a29ac474a47eb8cf212b45917683ae89d6fa13b Mon Sep 17 00:00:00 2001 |
31 | From: Chris Mason <clm@fb.com> | |
32 | Date: Tue, 10 Nov 2015 10:10:34 +1100 | |
33 | Subject: xfs: give all workqueues rescuer threads | |
34 | ||
35 | We're consistently hitting deadlocks here with XFS on recent kernels. | |
36 | After some digging through the crash files, it looks like everyone in | |
37 | the system is waiting for XFS to reclaim memory. | |
38 | ||
39 | Something like this: | |
40 | ||
41 | PID: 2733434 TASK: ffff8808cd242800 CPU: 19 COMMAND: "java" | |
42 | #0 [ffff880019c53588] __schedule at ffffffff818c4df2 | |
43 | #1 [ffff880019c535d8] schedule at ffffffff818c5517 | |
44 | #2 [ffff880019c535f8] _xfs_log_force_lsn at ffffffff81316348 | |
45 | #3 [ffff880019c53688] xfs_log_force_lsn at ffffffff813164fb | |
46 | #4 [ffff880019c536b8] xfs_iunpin_wait at ffffffff8130835e | |
47 | #5 [ffff880019c53728] xfs_reclaim_inode at ffffffff812fd453 | |
48 | #6 [ffff880019c53778] xfs_reclaim_inodes_ag at ffffffff812fd8c7 | |
49 | #7 [ffff880019c53928] xfs_reclaim_inodes_nr at ffffffff812fe433 | |
50 | #8 [ffff880019c53958] xfs_fs_free_cached_objects at ffffffff8130d3b9 | |
51 | #9 [ffff880019c53968] super_cache_scan at ffffffff811a6f73 | |
52 | #10 [ffff880019c539c8] shrink_slab at ffffffff811460e6 | |
53 | #11 [ffff880019c53aa8] shrink_zone at ffffffff8114a53f | |
54 | #12 [ffff880019c53b48] do_try_to_free_pages at ffffffff8114a8ba | |
55 | #13 [ffff880019c53be8] try_to_free_pages at ffffffff8114ad5a | |
56 | #14 [ffff880019c53c78] __alloc_pages_nodemask at ffffffff8113e1b8 | |
57 | #15 [ffff880019c53d88] alloc_kmem_pages_node at ffffffff8113e671 | |
58 | #16 [ffff880019c53dd8] copy_process at ffffffff8104f781 | |
59 | #17 [ffff880019c53ec8] do_fork at ffffffff8105129c | |
60 | #18 [ffff880019c53f38] sys_clone at ffffffff810515b6 | |
61 | #19 [ffff880019c53f48] stub_clone at ffffffff818c8e4d | |
62 | ||
63 | xfs_log_force_lsn is waiting for logs to get cleaned, which is waiting | |
64 | for IO, which is waiting for workers to complete the IO, which is waiting | |
65 | for worker threads that don't exist yet: | |
66 | ||
67 | PID: 2752451 TASK: ffff880bd6bdda00 CPU: 37 COMMAND: "kworker/37:1" | |
68 | #0 [ffff8808d20abbb0] __schedule at ffffffff818c4df2 | |
69 | #1 [ffff8808d20abc00] schedule at ffffffff818c5517 | |
70 | #2 [ffff8808d20abc20] schedule_timeout at ffffffff818c7c6c | |
71 | #3 [ffff8808d20abcc0] wait_for_completion_killable at ffffffff818c6495 | |
72 | #4 [ffff8808d20abd30] kthread_create_on_node at ffffffff8106ec82 | |
73 | #5 [ffff8808d20abdf0] create_worker at ffffffff8106752f | |
74 | #6 [ffff8808d20abe40] worker_thread at ffffffff810699be | |
75 | #7 [ffff8808d20abec0] kthread at ffffffff8106ef59 | |
76 | #8 [ffff8808d20abf50] ret_from_fork at ffffffff818c8ac8 | |
77 | ||
78 | I think we should be using WQ_MEM_RECLAIM to make sure this thread | |
79 | pool makes progress when we're not able to allocate new workers. | |
80 | ||
81 | [dchinner: make all workqueues WQ_MEM_RECLAIM] | |
82 | ||
83 | Signed-off-by: Chris Mason <clm@fb.com> | |
84 | Reviewed-by: Dave Chinner <dchinner@redhat.com> | |
85 | Signed-off-by: Dave Chinner <david@fromorbit.com> | |
86 | --- | |
87 | fs/xfs/xfs_super.c | 7 ++++--- | |
88 | 1 file changed, 4 insertions(+), 3 deletions(-) | |
89 | ||
90 | diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c | |
91 | index 29531ec..65fbfb7 100644 | |
92 | --- a/fs/xfs/xfs_super.c | |
93 | +++ b/fs/xfs/xfs_super.c | |
94 | @@ -838,17 +838,18 @@ xfs_init_mount_workqueues( | |
95 | goto out_destroy_unwritten; | |
96 | ||
97 | mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s", | |
98 | - WQ_FREEZABLE, 0, mp->m_fsname); | |
99 | + WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname); | |
100 | if (!mp->m_reclaim_workqueue) | |
101 | goto out_destroy_cil; | |
102 | ||
103 | mp->m_log_workqueue = alloc_workqueue("xfs-log/%s", | |
104 | - WQ_FREEZABLE|WQ_HIGHPRI, 0, mp->m_fsname); | |
105 | + WQ_MEM_RECLAIM|WQ_FREEZABLE|WQ_HIGHPRI, 0, | |
106 | + mp->m_fsname); | |
107 | if (!mp->m_log_workqueue) | |
108 | goto out_destroy_reclaim; | |
109 | ||
110 | mp->m_eofblocks_workqueue = alloc_workqueue("xfs-eofblocks/%s", | |
111 | - WQ_FREEZABLE, 0, mp->m_fsname); | |
112 | + WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname); | |
113 | if (!mp->m_eofblocks_workqueue) | |
114 | goto out_destroy_log; | |
115 | ||
116 | -- | |
117 | cgit v0.11.2 | |
118 | ||
e1bba219 AM |
119 | commit c2d42c16ad83006a706d83e51a7268db04af733a |
120 | Author: Andrew Morton <akpm@linux-foundation.org> | |
121 | Date: Thu Nov 5 18:48:43 2015 -0800 | |
122 | ||
123 | mm/vmstat.c: uninline node_page_state() | |
124 | ||
125 | With x86_64 (config http://ozlabs.org/~akpm/config-akpm2.txt) and old gcc | |
126 | (4.4.4), drivers/base/node.c:node_read_meminfo() is using 2344 bytes of | |
127 | stack. Uninlining node_page_state() reduces this to 440 bytes. | |
128 | ||
129 | The stack consumption issue is fixed by newer gcc (4.8.4); however, with | |
130 | that compiler this patch reduces the node.o text size from 7314 bytes to | |
131 | 4578 bytes. | |
132 | ||
133 | Signed-off-by: Andrew Morton <akpm@linux-foundation.org> | |
134 | Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> | |
135 | ||
136 | diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h | |
137 | index 82e7db7..49dfe40 100644 | |
138 | --- a/include/linux/vmstat.h | |
139 | +++ b/include/linux/vmstat.h | |
140 | @@ -161,30 +161,8 @@ static inline unsigned long zone_page_state_snapshot(struct zone *zone, | |
141 | } | |
142 | ||
143 | #ifdef CONFIG_NUMA | |
144 | -/* | |
145 | - * Determine the per node value of a stat item. This function | |
146 | - * is called frequently in a NUMA machine, so try to be as | |
147 | - * frugal as possible. | |
148 | - */ | |
149 | -static inline unsigned long node_page_state(int node, | |
150 | - enum zone_stat_item item) | |
151 | -{ | |
152 | - struct zone *zones = NODE_DATA(node)->node_zones; | |
153 | - | |
154 | - return | |
155 | -#ifdef CONFIG_ZONE_DMA | |
156 | - zone_page_state(&zones[ZONE_DMA], item) + | |
157 | -#endif | |
158 | -#ifdef CONFIG_ZONE_DMA32 | |
159 | - zone_page_state(&zones[ZONE_DMA32], item) + | |
160 | -#endif | |
161 | -#ifdef CONFIG_HIGHMEM | |
162 | - zone_page_state(&zones[ZONE_HIGHMEM], item) + | |
163 | -#endif | |
164 | - zone_page_state(&zones[ZONE_NORMAL], item) + | |
165 | - zone_page_state(&zones[ZONE_MOVABLE], item); | |
166 | -} | |
167 | ||
168 | +extern unsigned long node_page_state(int node, enum zone_stat_item item); | |
169 | extern void zone_statistics(struct zone *, struct zone *, gfp_t gfp); | |
170 | ||
171 | #else | |
172 | diff --git a/mm/vmstat.c b/mm/vmstat.c | |
173 | index fbf1448..ffcb4f5 100644 | |
174 | --- a/mm/vmstat.c | |
175 | +++ b/mm/vmstat.c | |
176 | @@ -591,6 +591,28 @@ void zone_statistics(struct zone *preferred_zone, struct zone *z, gfp_t flags) | |
177 | else | |
178 | __inc_zone_state(z, NUMA_OTHER); | |
179 | } | |
180 | + | |
181 | +/* | |
182 | + * Determine the per node value of a stat item. | |
183 | + */ | |
184 | +unsigned long node_page_state(int node, enum zone_stat_item item) | |
185 | +{ | |
186 | + struct zone *zones = NODE_DATA(node)->node_zones; | |
187 | + | |
188 | + return | |
189 | +#ifdef CONFIG_ZONE_DMA | |
190 | + zone_page_state(&zones[ZONE_DMA], item) + | |
191 | +#endif | |
192 | +#ifdef CONFIG_ZONE_DMA32 | |
193 | + zone_page_state(&zones[ZONE_DMA32], item) + | |
194 | +#endif | |
195 | +#ifdef CONFIG_HIGHMEM | |
196 | + zone_page_state(&zones[ZONE_HIGHMEM], item) + | |
197 | +#endif | |
198 | + zone_page_state(&zones[ZONE_NORMAL], item) + | |
199 | + zone_page_state(&zones[ZONE_MOVABLE], item); | |
200 | +} | |
201 | + | |
202 | #endif | |
203 | ||
204 | #ifdef CONFIG_COMPACTION | |
205 | commit 016c13daa5c9e4827eca703e2f0621c131f2cca3 | |
206 | Author: Mel Gorman <mgorman@techsingularity.net> | |
207 | Date: Fri Nov 6 16:28:18 2015 -0800 | |
208 | ||
209 | mm, page_alloc: use masks and shifts when converting GFP flags to migrate types | |
210 | ||
211 | This patch redefines which GFP bits are used for specifying mobility and | |
212 | the order of the migrate types. Once redefined it's possible to convert | |
213 | GFP flags to a migrate type with a simple mask and shift. The only | |
214 | downside is that readers of OOM kill messages and allocation failures may | |
215 | be used to the existing values, but scripts/gfp-translate will help. | |
216 | ||
217 | Signed-off-by: Mel Gorman <mgorman@techsingularity.net> | |
218 | Acked-by: Vlastimil Babka <vbabka@suse.cz> | |
219 | Cc: Christoph Lameter <cl@linux.com> | |
220 | Cc: David Rientjes <rientjes@google.com> | |
221 | Cc: Johannes Weiner <hannes@cmpxchg.org> | |
222 | Cc: Michal Hocko <mhocko@suse.com> | |
223 | Cc: Vitaly Wool <vitalywool@gmail.com> | |
224 | Cc: Rik van Riel <riel@redhat.com> | |
225 | Signed-off-by: Andrew Morton <akpm@linux-foundation.org> | |
226 | Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> | |
227 | ||
228 | diff --git a/include/linux/gfp.h b/include/linux/gfp.h | |
229 | index f92cbd2..440fca3 100644 | |
230 | --- a/include/linux/gfp.h | |
231 | +++ b/include/linux/gfp.h | |
232 | @@ -14,7 +14,7 @@ struct vm_area_struct; | |
233 | #define ___GFP_HIGHMEM 0x02u | |
234 | #define ___GFP_DMA32 0x04u | |
235 | #define ___GFP_MOVABLE 0x08u | |
236 | -#define ___GFP_WAIT 0x10u | |
237 | +#define ___GFP_RECLAIMABLE 0x10u | |
238 | #define ___GFP_HIGH 0x20u | |
239 | #define ___GFP_IO 0x40u | |
240 | #define ___GFP_FS 0x80u | |
241 | @@ -29,7 +29,7 @@ struct vm_area_struct; | |
242 | #define ___GFP_NOMEMALLOC 0x10000u | |
243 | #define ___GFP_HARDWALL 0x20000u | |
244 | #define ___GFP_THISNODE 0x40000u | |
245 | -#define ___GFP_RECLAIMABLE 0x80000u | |
246 | +#define ___GFP_WAIT 0x80000u | |
247 | #define ___GFP_NOACCOUNT 0x100000u | |
248 | #define ___GFP_NOTRACK 0x200000u | |
249 | #define ___GFP_NO_KSWAPD 0x400000u | |
250 | @@ -126,6 +126,7 @@ struct vm_area_struct; | |
251 | ||
252 | /* This mask makes up all the page movable related flags */ | |
253 | #define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE) | |
254 | +#define GFP_MOVABLE_SHIFT 3 | |
255 | ||
256 | /* Control page allocator reclaim behavior */ | |
257 | #define GFP_RECLAIM_MASK (__GFP_WAIT|__GFP_HIGH|__GFP_IO|__GFP_FS|\ | |
258 | @@ -152,14 +153,15 @@ struct vm_area_struct; | |
259 | /* Convert GFP flags to their corresponding migrate type */ | |
260 | static inline int gfpflags_to_migratetype(const gfp_t gfp_flags) | |
261 | { | |
262 | - WARN_ON((gfp_flags & GFP_MOVABLE_MASK) == GFP_MOVABLE_MASK); | |
263 | + VM_WARN_ON((gfp_flags & GFP_MOVABLE_MASK) == GFP_MOVABLE_MASK); | |
264 | + BUILD_BUG_ON((1UL << GFP_MOVABLE_SHIFT) != ___GFP_MOVABLE); | |
265 | + BUILD_BUG_ON((___GFP_MOVABLE >> GFP_MOVABLE_SHIFT) != MIGRATE_MOVABLE); | |
266 | ||
267 | if (unlikely(page_group_by_mobility_disabled)) | |
268 | return MIGRATE_UNMOVABLE; | |
269 | ||
270 | /* Group based on mobility */ | |
271 | - return (((gfp_flags & __GFP_MOVABLE) != 0) << 1) | | |
272 | - ((gfp_flags & __GFP_RECLAIMABLE) != 0); | |
273 | + return (gfp_flags & GFP_MOVABLE_MASK) >> GFP_MOVABLE_SHIFT; | |
274 | } | |
275 | ||
276 | #ifdef CONFIG_HIGHMEM | |
277 | diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h | |
278 | index e326843..38bed71 100644 | |
279 | --- a/include/linux/mmzone.h | |
280 | +++ b/include/linux/mmzone.h | |
281 | @@ -37,8 +37,8 @@ | |
282 | ||
283 | enum { | |
284 | MIGRATE_UNMOVABLE, | |
285 | - MIGRATE_RECLAIMABLE, | |
286 | MIGRATE_MOVABLE, | |
287 | + MIGRATE_RECLAIMABLE, | |
288 | MIGRATE_PCPTYPES, /* the number of types on the pcp lists */ | |
289 | MIGRATE_RESERVE = MIGRATE_PCPTYPES, | |
290 | #ifdef CONFIG_CMA | |
291 | commit 974a786e63c96a2401a78ddba926f34c128474f1 | |
292 | Author: Mel Gorman <mgorman@techsingularity.net> | |
293 | Date: Fri Nov 6 16:28:34 2015 -0800 | |
294 | ||
295 | mm, page_alloc: remove MIGRATE_RESERVE | |
296 | ||
297 | MIGRATE_RESERVE preserves an old property of the buddy allocator that | |
298 | existed prior to fragmentation avoidance -- min_free_kbytes worth of pages | |
299 | tended to remain contiguous until the only alternative was to fail the | |
300 | allocation. At the time it was discovered that high-order atomic | |
301 | allocations relied on this property so MIGRATE_RESERVE was introduced. A | |
302 | later patch will introduce an alternative MIGRATE_HIGHATOMIC so this patch | |
303 | deletes MIGRATE_RESERVE and supporting code so it'll be easier to review. | |
304 | Note that this patch in isolation may look like a false regression if | |
305 | someone was bisecting high-order atomic allocation failures. | |
306 | ||
307 | Signed-off-by: Mel Gorman <mgorman@techsingularity.net> | |
308 | Acked-by: Vlastimil Babka <vbabka@suse.cz> | |
309 | Cc: Christoph Lameter <cl@linux.com> | |
310 | Cc: David Rientjes <rientjes@google.com> | |
311 | Cc: Johannes Weiner <hannes@cmpxchg.org> | |
312 | Cc: Michal Hocko <mhocko@suse.com> | |
313 | Cc: Vitaly Wool <vitalywool@gmail.com> | |
314 | Cc: Rik van Riel <riel@redhat.com> | |
315 | Signed-off-by: Andrew Morton <akpm@linux-foundation.org> | |
316 | Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> | |
317 | ||
318 | diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h | |
319 | index 1e88aae..b86cfa3 100644 | |
320 | --- a/include/linux/mmzone.h | |
321 | +++ b/include/linux/mmzone.h | |
322 | @@ -39,8 +39,6 @@ enum { | |
323 | MIGRATE_UNMOVABLE, | |
324 | MIGRATE_MOVABLE, | |
325 | MIGRATE_RECLAIMABLE, | |
326 | - MIGRATE_PCPTYPES, /* the number of types on the pcp lists */ | |
327 | - MIGRATE_RESERVE = MIGRATE_PCPTYPES, | |
328 | #ifdef CONFIG_CMA | |
329 | /* | |
330 | * MIGRATE_CMA migration type is designed to mimic the way | |
331 | @@ -63,6 +61,8 @@ enum { | |
332 | MIGRATE_TYPES | |
333 | }; | |
334 | ||
335 | +#define MIGRATE_PCPTYPES (MIGRATE_RECLAIMABLE+1) | |
336 | + | |
337 | #ifdef CONFIG_CMA | |
338 | # define is_migrate_cma(migratetype) unlikely((migratetype) == MIGRATE_CMA) | |
339 | #else | |
340 | @@ -429,12 +429,6 @@ struct zone { | |
341 | ||
342 | const char *name; | |
343 | ||
344 | - /* | |
345 | - * Number of MIGRATE_RESERVE page block. To maintain for just | |
346 | - * optimization. Protected by zone->lock. | |
347 | - */ | |
348 | - int nr_migrate_reserve_block; | |
349 | - | |
350 | #ifdef CONFIG_MEMORY_ISOLATION | |
351 | /* | |
352 | * Number of isolated pageblock. It is used to solve incorrect | |
353 | diff --git a/mm/huge_memory.c b/mm/huge_memory.c | |
354 | index 9812d46..dabd247 100644 | |
355 | --- a/mm/huge_memory.c | |
356 | +++ b/mm/huge_memory.c | |
357 | @@ -116,7 +116,7 @@ static void set_recommended_min_free_kbytes(void) | |
358 | for_each_populated_zone(zone) | |
359 | nr_zones++; | |
360 | ||
361 | - /* Make sure at least 2 hugepages are free for MIGRATE_RESERVE */ | |
362 | + /* Ensure 2 pageblocks are free to assist fragmentation avoidance */ | |
363 | recommended_min = pageblock_nr_pages * nr_zones * 2; | |
364 | ||
365 | /* | |
366 | diff --git a/mm/page_alloc.c b/mm/page_alloc.c | |
367 | index 8dc6e3c..5888126 100644 | |
368 | --- a/mm/page_alloc.c | |
369 | +++ b/mm/page_alloc.c | |
370 | @@ -817,7 +817,6 @@ static void free_pcppages_bulk(struct zone *zone, int count, | |
371 | if (unlikely(has_isolate_pageblock(zone))) | |
372 | mt = get_pageblock_migratetype(page); | |
373 | ||
374 | - /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ | |
375 | __free_one_page(page, page_to_pfn(page), zone, 0, mt); | |
376 | trace_mm_page_pcpu_drain(page, 0, mt); | |
377 | } while (--to_free && --batch_free && !list_empty(list)); | |
378 | @@ -1417,15 +1416,14 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, | |
379 | * the free lists for the desirable migrate type are depleted | |
380 | */ | |
381 | static int fallbacks[MIGRATE_TYPES][4] = { | |
382 | - [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, | |
383 | - [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, | |
384 | - [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, | |
385 | + [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_TYPES }, | |
386 | + [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_TYPES }, | |
387 | + [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES }, | |
388 | #ifdef CONFIG_CMA | |
389 | - [MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */ | |
390 | + [MIGRATE_CMA] = { MIGRATE_TYPES }, /* Never used */ | |
391 | #endif | |
392 | - [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */ | |
393 | #ifdef CONFIG_MEMORY_ISOLATION | |
394 | - [MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */ | |
395 | + [MIGRATE_ISOLATE] = { MIGRATE_TYPES }, /* Never used */ | |
396 | #endif | |
397 | }; | |
398 | ||
399 | @@ -1598,7 +1596,7 @@ int find_suitable_fallback(struct free_area *area, unsigned int order, | |
400 | *can_steal = false; | |
401 | for (i = 0;; i++) { | |
402 | fallback_mt = fallbacks[migratetype][i]; | |
403 | - if (fallback_mt == MIGRATE_RESERVE) | |
404 | + if (fallback_mt == MIGRATE_TYPES) | |
405 | break; | |
406 | ||
407 | if (list_empty(&area->free_list[fallback_mt])) | |
408 | @@ -1676,25 +1674,13 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order, | |
409 | { | |
410 | struct page *page; | |
411 | ||
412 | -retry_reserve: | |
413 | page = __rmqueue_smallest(zone, order, migratetype); | |
414 | - | |
415 | - if (unlikely(!page) && migratetype != MIGRATE_RESERVE) { | |
416 | + if (unlikely(!page)) { | |
417 | if (migratetype == MIGRATE_MOVABLE) | |
418 | page = __rmqueue_cma_fallback(zone, order); | |
419 | ||
420 | if (!page) | |
421 | page = __rmqueue_fallback(zone, order, migratetype); | |
422 | - | |
423 | - /* | |
424 | - * Use MIGRATE_RESERVE rather than fail an allocation. goto | |
425 | - * is used because __rmqueue_smallest is an inline function | |
426 | - * and we want just one call site | |
427 | - */ | |
428 | - if (!page) { | |
429 | - migratetype = MIGRATE_RESERVE; | |
430 | - goto retry_reserve; | |
431 | - } | |
432 | } | |
433 | ||
434 | trace_mm_page_alloc_zone_locked(page, order, migratetype); | |
435 | @@ -3492,7 +3478,6 @@ static void show_migration_types(unsigned char type) | |
436 | [MIGRATE_UNMOVABLE] = 'U', | |
437 | [MIGRATE_RECLAIMABLE] = 'E', | |
438 | [MIGRATE_MOVABLE] = 'M', | |
439 | - [MIGRATE_RESERVE] = 'R', | |
440 | #ifdef CONFIG_CMA | |
441 | [MIGRATE_CMA] = 'C', | |
442 | #endif | |
443 | @@ -4303,120 +4288,6 @@ static inline unsigned long wait_table_bits(unsigned long size) | |
444 | } | |
445 | ||
446 | /* | |
447 | - * Check if a pageblock contains reserved pages | |
448 | - */ | |
449 | -static int pageblock_is_reserved(unsigned long start_pfn, unsigned long end_pfn) | |
450 | -{ | |
451 | - unsigned long pfn; | |
452 | - | |
453 | - for (pfn = start_pfn; pfn < end_pfn; pfn++) { | |
454 | - if (!pfn_valid_within(pfn) || PageReserved(pfn_to_page(pfn))) | |
455 | - return 1; | |
456 | - } | |
457 | - return 0; | |
458 | -} | |
459 | - | |
460 | -/* | |
461 | - * Mark a number of pageblocks as MIGRATE_RESERVE. The number | |
462 | - * of blocks reserved is based on min_wmark_pages(zone). The memory within | |
463 | - * the reserve will tend to store contiguous free pages. Setting min_free_kbytes | |
464 | - * higher will lead to a bigger reserve which will get freed as contiguous | |
465 | - * blocks as reclaim kicks in | |
466 | - */ | |
467 | -static void setup_zone_migrate_reserve(struct zone *zone) | |
468 | -{ | |
469 | - unsigned long start_pfn, pfn, end_pfn, block_end_pfn; | |
470 | - struct page *page; | |
471 | - unsigned long block_migratetype; | |
472 | - int reserve; | |
473 | - int old_reserve; | |
474 | - | |
475 | - /* | |
476 | - * Get the start pfn, end pfn and the number of blocks to reserve | |
477 | - * We have to be careful to be aligned to pageblock_nr_pages to | |
478 | - * make sure that we always check pfn_valid for the first page in | |
479 | - * the block. | |
480 | - */ | |
481 | - start_pfn = zone->zone_start_pfn; | |
482 | - end_pfn = zone_end_pfn(zone); | |
483 | - start_pfn = roundup(start_pfn, pageblock_nr_pages); | |
484 | - reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >> | |
485 | - pageblock_order; | |
486 | - | |
487 | - /* | |
488 | - * Reserve blocks are generally in place to help high-order atomic | |
489 | - * allocations that are short-lived. A min_free_kbytes value that | |
490 | - * would result in more than 2 reserve blocks for atomic allocations | |
491 | - * is assumed to be in place to help anti-fragmentation for the | |
492 | - * future allocation of hugepages at runtime. | |
493 | - */ | |
494 | - reserve = min(2, reserve); | |
495 | - old_reserve = zone->nr_migrate_reserve_block; | |
496 | - | |
497 | - /* When memory hot-add, we almost always need to do nothing */ | |
498 | - if (reserve == old_reserve) | |
499 | - return; | |
500 | - zone->nr_migrate_reserve_block = reserve; | |
501 | - | |
502 | - for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { | |
503 | - if (!early_page_nid_uninitialised(pfn, zone_to_nid(zone))) | |
504 | - return; | |
505 | - | |
506 | - if (!pfn_valid(pfn)) | |
507 | - continue; | |
508 | - page = pfn_to_page(pfn); | |
509 | - | |
510 | - /* Watch out for overlapping nodes */ | |
511 | - if (page_to_nid(page) != zone_to_nid(zone)) | |
512 | - continue; | |
513 | - | |
514 | - block_migratetype = get_pageblock_migratetype(page); | |
515 | - | |
516 | - /* Only test what is necessary when the reserves are not met */ | |
517 | - if (reserve > 0) { | |
518 | - /* | |
519 | - * Blocks with reserved pages will never free, skip | |
520 | - * them. | |
521 | - */ | |
522 | - block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn); | |
523 | - if (pageblock_is_reserved(pfn, block_end_pfn)) | |
524 | - continue; | |
525 | - | |
526 | - /* If this block is reserved, account for it */ | |
527 | - if (block_migratetype == MIGRATE_RESERVE) { | |
528 | - reserve--; | |
529 | - continue; | |
530 | - } | |
531 | - | |
532 | - /* Suitable for reserving if this block is movable */ | |
533 | - if (block_migratetype == MIGRATE_MOVABLE) { | |
534 | - set_pageblock_migratetype(page, | |
535 | - MIGRATE_RESERVE); | |
536 | - move_freepages_block(zone, page, | |
537 | - MIGRATE_RESERVE); | |
538 | - reserve--; | |
539 | - continue; | |
540 | - } | |
541 | - } else if (!old_reserve) { | |
542 | - /* | |
543 | - * At boot time we don't need to scan the whole zone | |
544 | - * for turning off MIGRATE_RESERVE. | |
545 | - */ | |
546 | - break; | |
547 | - } | |
548 | - | |
549 | - /* | |
550 | - * If the reserve is met and this is a previous reserved block, | |
551 | - * take it back | |
552 | - */ | |
553 | - if (block_migratetype == MIGRATE_RESERVE) { | |
554 | - set_pageblock_migratetype(page, MIGRATE_MOVABLE); | |
555 | - move_freepages_block(zone, page, MIGRATE_MOVABLE); | |
556 | - } | |
557 | - } | |
558 | -} | |
559 | - | |
560 | -/* | |
561 | * Initially all pages are reserved - free ones are freed | |
562 | * up by free_all_bootmem() once the early boot process is | |
563 | * done. Non-atomic initialization, single-pass. | |
564 | @@ -4455,9 +4326,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, | |
565 | * movable at startup. This will force kernel allocations | |
566 | * to reserve their blocks rather than leaking throughout | |
567 | * the address space during boot when many long-lived | |
568 | - * kernel allocations are made. Later some blocks near | |
569 | - * the start are marked MIGRATE_RESERVE by | |
570 | - * setup_zone_migrate_reserve() | |
571 | + * kernel allocations are made. | |
572 | * | |
573 | * bitmap is created for zone's valid pfn range. but memmap | |
574 | * can be created for invalid pages (for alignment) | |
575 | @@ -6018,7 +5887,6 @@ static void __setup_per_zone_wmarks(void) | |
576 | high_wmark_pages(zone) - low_wmark_pages(zone) - | |
577 | atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); | |
578 | ||
579 | - setup_zone_migrate_reserve(zone); | |
580 | spin_unlock_irqrestore(&zone->lock, flags); | |
581 | } | |
582 | ||
583 | diff --git a/mm/vmstat.c b/mm/vmstat.c | |
584 | index ffcb4f5..5b289dc 100644 | |
585 | --- a/mm/vmstat.c | |
586 | +++ b/mm/vmstat.c | |
587 | @@ -923,7 +923,6 @@ static char * const migratetype_names[MIGRATE_TYPES] = { | |
588 | "Unmovable", | |
589 | "Reclaimable", | |
590 | "Movable", | |
591 | - "Reserve", | |
592 | #ifdef CONFIG_CMA | |
593 | "CMA", | |
594 | #endif | |
595 | diff --git a/mm/backing-dev.c b/mm/backing-dev.c | |
596 | index 8ed2ffd963c5..7340353f8aea 100644 | |
597 | --- a/mm/backing-dev.c | |
598 | +++ b/mm/backing-dev.c | |
599 | @@ -957,8 +957,9 @@ EXPORT_SYMBOL(congestion_wait); | |
600 | * jiffies for either a BDI to exit congestion of the given @sync queue | |
601 | * or a write to complete. | |
602 | * | |
603 | - * In the absence of zone congestion, cond_resched() is called to yield | |
604 | - * the processor if necessary but otherwise does not sleep. | |
605 | + * In the absence of zone congestion, a short sleep or a cond_resched is | |
606 | + * performed to yield the processor and to allow other subsystems to make | |
607 | + * a forward progress. | |
608 | * | |
609 | * The return value is 0 if the sleep is for the full timeout. Otherwise, | |
610 | * it is the number of jiffies that were still remaining when the function | |
611 | @@ -978,7 +979,19 @@ long wait_iff_congested(struct zone *zone, int sync, long timeout) | |
612 | */ | |
613 | if (atomic_read(&nr_wb_congested[sync]) == 0 || | |
614 | !test_bit(ZONE_CONGESTED, &zone->flags)) { | |
615 | - cond_resched(); | |
616 | + | |
617 | + /* | |
618 | + * Memory allocation/reclaim might be called from a WQ | |
619 | + * context and the current implementation of the WQ | |
620 | + * concurrency control doesn't recognize that a particular | |
621 | + * WQ is congested if the worker thread is looping without | |
622 | + * ever sleeping. Therefore we have to do a short sleep | |
623 | + * here rather than calling cond_resched(). | |
624 | + */ | |
625 | + if (current->flags & PF_WQ_WORKER) | |
626 | + schedule_timeout(1); | |
627 | + else | |
628 | + cond_resched(); | |
629 | ||
630 | /* In case we scheduled, work out time remaining */ | |
631 | ret = timeout - (jiffies - start); | |
632 | diff --git a/mm/vmstat.c b/mm/vmstat.c | |
633 | index 45dcbcb5c594..0975da8e3432 100644 | |
634 | --- a/mm/vmstat.c | |
635 | +++ b/mm/vmstat.c | |
636 | @@ -1381,6 +1381,7 @@ static const struct file_operations proc_vmstat_file_operations = { | |
637 | #endif /* CONFIG_PROC_FS */ | |
638 | ||
639 | #ifdef CONFIG_SMP | |
640 | +static struct workqueue_struct *vmstat_wq; | |
641 | static DEFINE_PER_CPU(struct delayed_work, vmstat_work); | |
642 | int sysctl_stat_interval __read_mostly = HZ; | |
643 | static cpumask_var_t cpu_stat_off; | |
644 | @@ -1393,7 +1394,7 @@ static void vmstat_update(struct work_struct *w) | |
645 | * to occur in the future. Keep on running the | |
646 | * update worker thread. | |
647 | */ | |
648 | - schedule_delayed_work_on(smp_processor_id(), | |
649 | + queue_delayed_work_on(smp_processor_id(), vmstat_wq, | |
650 | this_cpu_ptr(&vmstat_work), | |
651 | round_jiffies_relative(sysctl_stat_interval)); | |
652 | } else { | |
653 | @@ -1462,7 +1463,7 @@ static void vmstat_shepherd(struct work_struct *w) | |
654 | if (need_update(cpu) && | |
655 | cpumask_test_and_clear_cpu(cpu, cpu_stat_off)) | |
656 | ||
657 | - schedule_delayed_work_on(cpu, | |
658 | + queue_delayed_work_on(cpu, vmstat_wq, | |
659 | &per_cpu(vmstat_work, cpu), 0); | |
660 | ||
661 | put_online_cpus(); | |
662 | @@ -1551,6 +1552,7 @@ static int __init setup_vmstat(void) | |
663 | ||
664 | start_shepherd_timer(); | |
665 | cpu_notifier_register_done(); | |
666 | + vmstat_wq = alloc_workqueue("vmstat", WQ_FREEZABLE|WQ_MEM_RECLAIM, 0); | |
667 | #endif | |
668 | #ifdef CONFIG_PROC_FS | |
669 | proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations); | |
670 | -- | |
671 | 2.6.2 | |
672 | ||
8194c377 AM |
673 | From 09ccfd238e5a0e670d8178cf50180ea81ae09ae1 Mon Sep 17 00:00:00 2001 |
674 | From: WANG Cong <xiyou.wangcong@gmail.com> | |
675 | Date: Mon, 14 Dec 2015 13:48:36 -0800 | |
676 | Subject: pptp: verify sockaddr_len in pptp_bind() and pptp_connect() | |
677 | ||
678 | Reported-by: Dmitry Vyukov <dvyukov@gmail.com> | |
679 | Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com> | |
680 | Signed-off-by: David S. Miller <davem@davemloft.net> | |
681 | --- | |
682 | drivers/net/ppp/pptp.c | 6 ++++++ | |
683 | 1 file changed, 6 insertions(+) | |
684 | ||
685 | diff --git a/drivers/net/ppp/pptp.c b/drivers/net/ppp/pptp.c | |
686 | index fc69e41..597c53e 100644 | |
687 | --- a/drivers/net/ppp/pptp.c | |
688 | +++ b/drivers/net/ppp/pptp.c | |
689 | @@ -419,6 +419,9 @@ static int pptp_bind(struct socket *sock, struct sockaddr *uservaddr, | |
690 | struct pptp_opt *opt = &po->proto.pptp; | |
691 | int error = 0; | |
692 | ||
693 | + if (sockaddr_len < sizeof(struct sockaddr_pppox)) | |
694 | + return -EINVAL; | |
695 | + | |
696 | lock_sock(sk); | |
697 | ||
698 | opt->src_addr = sp->sa_addr.pptp; | |
699 | @@ -440,6 +443,9 @@ static int pptp_connect(struct socket *sock, struct sockaddr *uservaddr, | |
700 | struct flowi4 fl4; | |
701 | int error = 0; | |
702 | ||
703 | + if (sockaddr_len < sizeof(struct sockaddr_pppox)) | |
704 | + return -EINVAL; | |
705 | + | |
706 | if (sp->sa_protocol != PX_PROTO_PPTP) | |
707 | return -EINVAL; | |
708 | ||
709 | -- | |
710 | cgit v0.11.2 | |
711 | ||
552e066f AM |
712 | commit cc57858831e3e9678291de730c4b4d2e52a19f59 |
713 | Author: Artur Paszkiewicz <artur.paszkiewicz@intel.com> | |
714 | Date: Fri Dec 18 15:19:16 2015 +1100 | |
715 | ||
716 | md/raid10: fix data corruption and crash during resync | |
717 | ||
718 | The commit c31df25f20e3 ("md/raid10: make sync_request_write() call | |
719 | bio_copy_data()") replaced manual data copying with bio_copy_data() but | |
720 | it doesn't work as intended. The source bio (fbio) is already processed, | |
721 | so its bvec_iter has bi_size == 0 and bi_idx == bi_vcnt. Because of | |
722 | this, bio_copy_data() either does not copy anything, or worse, copies | |
723 | data from the ->bi_next bio if it is set. This causes wrong data to be | |
724 | written to drives during resync and sometimes lockups/crashes in | |
725 | bio_copy_data(): | |
726 | ||
727 | [ 517.338478] NMI watchdog: BUG: soft lockup - CPU#0 stuck for 22s! [md126_raid10:3319] | |
728 | [ 517.347324] Modules linked in: raid10 xt_CHECKSUM ipt_MASQUERADE nf_nat_masquerade_ipv4 tun ip6t_rpfilter ip6t_REJECT nf_reject_ipv6 ipt_REJECT nf_reject_ipv4 xt_conntrack ebtable_nat ebtable_broute bridge stp llc ebtable_filter ebtables ip6table_nat nf_conntrack_ipv6 nf_defrag_ipv6 nf_nat_ipv6 ip6table_mangle ip6table_security ip6table_raw ip6table_filter ip6_tables iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 nf_nat nf_conntrack iptable_mangle iptable_security iptable_raw iptable_filter ip_tables x86_pkg_temp_thermal coretemp kvm_intel kvm crct10dif_pclmul crc32_pclmul cryptd shpchp pcspkr ipmi_si ipmi_msghandler tpm_crb acpi_power_meter acpi_cpufreq ext4 mbcache jbd2 sr_mod cdrom sd_mod e1000e ax88179_178a usbnet mii ahci ata_generic crc32c_intel libahci ptp pata_acpi libata pps_core wmi sunrpc dm_mirror dm_region_hash dm_log dm_mod | |
729 | [ 517.440555] CPU: 0 PID: 3319 Comm: md126_raid10 Not tainted 4.3.0-rc6+ #1 | |
730 | [ 517.448384] Hardware name: Intel Corporation PURLEY/PURLEY, BIOS PLYDCRB1.86B.0055.D14.1509221924 09/22/2015 | |
731 | [ 517.459768] task: ffff880153773980 ti: ffff880150df8000 task.ti: ffff880150df8000 | |
732 | [ 517.468529] RIP: 0010:[<ffffffff812e1888>] [<ffffffff812e1888>] bio_copy_data+0xc8/0x3c0 | |
733 | [ 517.478164] RSP: 0018:ffff880150dfbc98 EFLAGS: 00000246 | |
734 | [ 517.484341] RAX: ffff880169356688 RBX: 0000000000001000 RCX: 0000000000000000 | |
735 | [ 517.492558] RDX: 0000000000000000 RSI: ffffea0001ac2980 RDI: ffffea0000d835c0 | |
736 | [ 517.500773] RBP: ffff880150dfbd08 R08: 0000000000000001 R09: ffff880153773980 | |
737 | [ 517.508987] R10: ffff880169356600 R11: 0000000000001000 R12: 0000000000010000 | |
738 | [ 517.517199] R13: 000000000000e000 R14: 0000000000000000 R15: 0000000000001000 | |
739 | [ 517.525412] FS: 0000000000000000(0000) GS:ffff880174a00000(0000) knlGS:0000000000000000 | |
740 | [ 517.534844] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 | |
741 | [ 517.541507] CR2: 00007f8a044d5fed CR3: 0000000169504000 CR4: 00000000001406f0 | |
742 | [ 517.549722] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 | |
743 | [ 517.557929] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 | |
744 | [ 517.566144] Stack: | |
745 | [ 517.568626] ffff880174a16bc0 ffff880153773980 ffff880169356600 0000000000000000 | |
746 | [ 517.577659] 0000000000000001 0000000000000001 ffff880153773980 ffff88016a61a800 | |
747 | [ 517.586715] ffff880150dfbcf8 0000000000000001 ffff88016dd209e0 0000000000001000 | |
748 | [ 517.595773] Call Trace: | |
749 | [ 517.598747] [<ffffffffa043ef95>] raid10d+0xfc5/0x1690 [raid10] | |
750 | [ 517.605610] [<ffffffff816697ae>] ? __schedule+0x29e/0x8e2 | |
751 | [ 517.611987] [<ffffffff814ff206>] md_thread+0x106/0x140 | |
752 | [ 517.618072] [<ffffffff810c1d80>] ? wait_woken+0x80/0x80 | |
753 | [ 517.624252] [<ffffffff814ff100>] ? super_1_load+0x520/0x520 | |
754 | [ 517.630817] [<ffffffff8109ef89>] kthread+0xc9/0xe0 | |
755 | [ 517.636506] [<ffffffff8109eec0>] ? flush_kthread_worker+0x70/0x70 | |
756 | [ 517.643653] [<ffffffff8166d99f>] ret_from_fork+0x3f/0x70 | |
757 | [ 517.649929] [<ffffffff8109eec0>] ? flush_kthread_worker+0x70/0x70 | |
758 | ||
759 | Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com> | |
760 | Reviewed-by: Shaohua Li <shli@kernel.org> | |
761 | Cc: stable@vger.kernel.org (v4.2+) | |
762 | Fixes: c31df25f20e3 ("md/raid10: make sync_request_write() call bio_copy_data()") | |
763 | Signed-off-by: NeilBrown <neilb@suse.com> | |
764 | ||
765 | diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c | |
766 | index 41d70bc..84e597e 100644 | |
767 | --- a/drivers/md/raid10.c | |
768 | +++ b/drivers/md/raid10.c | |
769 | @@ -1946,6 +1946,8 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) | |
770 | ||
771 | first = i; | |
772 | fbio = r10_bio->devs[i].bio; | |
773 | + fbio->bi_iter.bi_size = r10_bio->sectors << 9; | |
774 | + fbio->bi_iter.bi_idx = 0; | |
775 | ||
776 | vcnt = (r10_bio->sectors + (PAGE_SIZE >> 9) - 1) >> (PAGE_SHIFT - 9); | |
777 | /* now find blocks with errors */ | |
778 | @@ -1989,7 +1991,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) | |
779 | bio_reset(tbio); | |
780 | ||
781 | tbio->bi_vcnt = vcnt; | |
782 | - tbio->bi_iter.bi_size = r10_bio->sectors << 9; | |
783 | + tbio->bi_iter.bi_size = fbio->bi_iter.bi_size; | |
784 | tbio->bi_rw = WRITE; | |
785 | tbio->bi_private = r10_bio; | |
786 | tbio->bi_iter.bi_sector = r10_bio->devs[i].addr; | |
ea1e61a2 AM |
787 | From: Dave Chinner <dchinner@redhat.com> |
788 | ||
789 | When we do dquot readahead in log recovery, we do not use a verifier | |
790 | as the underlying buffer may not have dquots in it, e.g. because the | |
791 | allocation operation hasn't yet been replayed. Hence we do not want | |
792 | to fail recovery because we detect an operation to be replayed has | |
793 | not been run yet. This problem was addressed for inodes in commit | |
794 | d891400 ("xfs: inode buffers may not be valid during recovery | |
795 | readahead") but the problem was not recognised to exist for dquots | |
796 | and their buffers as the dquot readahead did not have a verifier. | |
797 | ||
798 | The result of not using a verifier is that when the buffer is then | |
799 | next read to replay a dquot modification, the dquot buffer verifier | |
800 | will only be attached to the buffer if *readahead is not complete*. | |
801 | Hence we can read the buffer, replay the dquot changes and then add | |
802 | it to the delwri submission list without it having a verifier | |
803 | attached to it. This then generates warnings in xfs_buf_ioapply(), | |
804 | which catches and warns about this case. | |
805 | ||
806 | Fix this and make it handle the same readahead verifier error cases | |
807 | as for inode buffers by adding a new readahead verifier that has a | |
808 | write operation as well as a read operation that marks the buffer as | |
809 | not done if any corruption is detected. Also make sure we don't run | |
810 | readahead if the dquot buffer has been marked as cancelled by | |
811 | recovery. | |
812 | ||
813 | This will result in readahead either succeeding and the buffer | |
814 | having a valid write verifier, or readahead failing and the buffer | |
815 | state requiring the subsequent read to resubmit the IO with the new | |
816 | verifier. In either case, this will result in the buffer always | |
817 | ending up with a valid write verifier on it. | |
818 | ||
819 | Note: we also need to fix the inode buffer readahead error handling | |
820 | to mark the buffer with EIO. Brian noticed the code I copied from | |
821 | there wrong during review, so fix it at the same time. Add comments | |
822 | linking the two functions that handle readahead verifier errors | |
823 | together so we don't forget this behavioural link in future. | |
824 | ||
825 | cc: <stable@vger.kernel.org> # 3.12 - current | |
826 | Signed-off-by: Dave Chinner <dchinner@redhat.com> | |
827 | --- | |
828 | ||
829 | Version 2 | |
830 | - fix logic error in determining if verify failed | |
831 | - set error on buffer when verifier fails | |
832 | - fix inode buffer readahead verifier to set error when it fails | |
833 | - better comments, link dquot and inode buffer ra verifiers in the | |
834 | comments | |
835 | ||
836 | fs/xfs/libxfs/xfs_dquot_buf.c | 36 ++++++++++++++++++++++++++++++------ | |
837 | fs/xfs/libxfs/xfs_inode_buf.c | 14 +++++++++----- | |
838 | fs/xfs/libxfs/xfs_quota_defs.h | 2 +- | |
839 | fs/xfs/libxfs/xfs_shared.h | 1 + | |
840 | fs/xfs/xfs_log_recover.c | 9 +++++++-- | |
841 | 5 files changed, 48 insertions(+), 14 deletions(-) | |
842 | ||
843 | diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c | |
844 | index 11cefb2..3cc3cf7 100644 | |
845 | --- a/fs/xfs/libxfs/xfs_dquot_buf.c | |
846 | +++ b/fs/xfs/libxfs/xfs_dquot_buf.c | |
847 | @@ -54,7 +54,7 @@ xfs_dqcheck( | |
848 | xfs_dqid_t id, | |
849 | uint type, /* used only when IO_dorepair is true */ | |
850 | uint flags, | |
851 | - char *str) | |
852 | + const char *str) | |
853 | { | |
854 | xfs_dqblk_t *d = (xfs_dqblk_t *)ddq; | |
855 | int errs = 0; | |
856 | @@ -207,7 +207,8 @@ xfs_dquot_buf_verify_crc( | |
857 | STATIC bool | |
858 | xfs_dquot_buf_verify( | |
859 | struct xfs_mount *mp, | |
860 | - struct xfs_buf *bp) | |
861 | + struct xfs_buf *bp, | |
862 | + int warn) | |
863 | { | |
864 | struct xfs_dqblk *d = (struct xfs_dqblk *)bp->b_addr; | |
865 | xfs_dqid_t id = 0; | |
866 | @@ -240,8 +241,7 @@ xfs_dquot_buf_verify( | |
867 | if (i == 0) | |
868 | id = be32_to_cpu(ddq->d_id); | |
869 | ||
870 | - error = xfs_dqcheck(mp, ddq, id + i, 0, XFS_QMOPT_DOWARN, | |
871 | - "xfs_dquot_buf_verify"); | |
872 | + error = xfs_dqcheck(mp, ddq, id + i, 0, warn, __func__); | |
873 | if (error) | |
874 | return false; | |
875 | } | |
876 | @@ -256,7 +256,7 @@ xfs_dquot_buf_read_verify( | |
877 | ||
878 | if (!xfs_dquot_buf_verify_crc(mp, bp)) | |
879 | xfs_buf_ioerror(bp, -EFSBADCRC); | |
880 | - else if (!xfs_dquot_buf_verify(mp, bp)) | |
881 | + else if (!xfs_dquot_buf_verify(mp, bp, XFS_QMOPT_DOWARN)) | |
882 | xfs_buf_ioerror(bp, -EFSCORRUPTED); | |
883 | ||
884 | if (bp->b_error) | |
885 | @@ -264,6 +264,25 @@ xfs_dquot_buf_read_verify( | |
886 | } | |
887 | ||
888 | /* | |
889 | + * readahead errors are silent and simply leave the buffer as !done so a real | |
890 | + * read will then be run with the xfs_dquot_buf_ops verifier. See | |
891 | + * xfs_inode_buf_verify() for why we use EIO and ~XBF_DONE here rather than | |
892 | + * reporting the failure. | |
893 | + */ | |
894 | +static void | |
895 | +xfs_dquot_buf_readahead_verify( | |
896 | + struct xfs_buf *bp) | |
897 | +{ | |
898 | + struct xfs_mount *mp = bp->b_target->bt_mount; | |
899 | + | |
900 | + if (!xfs_dquot_buf_verify_crc(mp, bp) || | |
901 | + !xfs_dquot_buf_verify(mp, bp, 0)) { | |
902 | + xfs_buf_ioerror(bp, -EIO); | |
903 | + bp->b_flags &= ~XBF_DONE; | |
904 | + } | |
905 | +} | |
906 | + | |
907 | +/* | |
908 | * we don't calculate the CRC here as that is done when the dquot is flushed to | |
909 | * the buffer after the update is done. This ensures that the dquot in the | |
910 | * buffer always has an up-to-date CRC value. | |
911 | @@ -274,7 +293,7 @@ xfs_dquot_buf_write_verify( | |
912 | { | |
913 | struct xfs_mount *mp = bp->b_target->bt_mount; | |
914 | ||
915 | - if (!xfs_dquot_buf_verify(mp, bp)) { | |
916 | + if (!xfs_dquot_buf_verify(mp, bp, XFS_QMOPT_DOWARN)) { | |
917 | xfs_buf_ioerror(bp, -EFSCORRUPTED); | |
918 | xfs_verifier_error(bp); | |
919 | return; | |
920 | @@ -287,3 +306,8 @@ const struct xfs_buf_ops xfs_dquot_buf_ops = { | |
921 | .verify_write = xfs_dquot_buf_write_verify, | |
922 | }; | |
923 | ||
924 | +const struct xfs_buf_ops xfs_dquot_buf_ra_ops = { | |
925 | + | |
926 | + .verify_read = xfs_dquot_buf_readahead_verify, | |
927 | + .verify_write = xfs_dquot_buf_write_verify, | |
928 | +}; | |
929 | diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c | |
930 | index 1b8d98a..4816209 100644 | |
931 | --- a/fs/xfs/libxfs/xfs_inode_buf.c | |
932 | +++ b/fs/xfs/libxfs/xfs_inode_buf.c | |
933 | @@ -62,11 +62,14 @@ xfs_inobp_check( | |
934 | * has not had the inode cores stamped into it. Hence for readahead, the buffer | |
935 | * may be potentially invalid. | |
936 | * | |
937 | - * If the readahead buffer is invalid, we don't want to mark it with an error, | |
938 | - * but we do want to clear the DONE status of the buffer so that a followup read | |
939 | - * will re-read it from disk. This will ensure that we don't get an unnecessary | |
940 | - * warnings during log recovery and we don't get unnecssary panics on debug | |
941 | - * kernels. | |
942 | + * If the readahead buffer is invalid, we need to mark it with an error and | |
943 | + * clear the DONE status of the buffer so that a followup read will re-read it | |
944 | + * from disk. We don't report the error otherwise to avoid warnings during log | |
945 | + * recovery and we don't get unnecessary panics on debug kernels. We use EIO here | |
946 | + * because all we want to do is say readahead failed; there is no-one to report | |
947 | + * the error to, so this will distinguish it from a non-ra verifier failure. | |
948 | + * Changes to this readahead error behaviour also need to be reflected in | |
949 | + * xfs_dquot_buf_readahead_verify(). | |
950 | */ | |
951 | static void | |
952 | xfs_inode_buf_verify( | |
953 | @@ -92,6 +95,7 @@ xfs_inode_buf_verify( | |
954 | XFS_ERRTAG_ITOBP_INOTOBP, | |
955 | XFS_RANDOM_ITOBP_INOTOBP))) { | |
956 | if (readahead) { | |
957 | + xfs_buf_ioerror(bp, -EIO); | |
958 | bp->b_flags &= ~XBF_DONE; | |
959 | return; | |
960 | } | |
961 | diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h | |
962 | index 1b0a083..f51078f 100644 | |
963 | --- a/fs/xfs/libxfs/xfs_quota_defs.h | |
964 | +++ b/fs/xfs/libxfs/xfs_quota_defs.h | |
965 | @@ -153,7 +153,7 @@ typedef __uint16_t xfs_qwarncnt_t; | |
966 | #define XFS_QMOPT_RESBLK_MASK (XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_RES_RTBLKS) | |
967 | ||
968 | extern int xfs_dqcheck(struct xfs_mount *mp, xfs_disk_dquot_t *ddq, | |
969 | - xfs_dqid_t id, uint type, uint flags, char *str); | |
970 | + xfs_dqid_t id, uint type, uint flags, const char *str); | |
971 | extern int xfs_calc_dquots_per_chunk(unsigned int nbblks); | |
972 | ||
973 | #endif /* __XFS_QUOTA_H__ */ | |
974 | diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h | |
975 | index 5be5297..15c3ceb 100644 | |
976 | --- a/fs/xfs/libxfs/xfs_shared.h | |
977 | +++ b/fs/xfs/libxfs/xfs_shared.h | |
978 | @@ -49,6 +49,7 @@ extern const struct xfs_buf_ops xfs_inobt_buf_ops; | |
979 | extern const struct xfs_buf_ops xfs_inode_buf_ops; | |
980 | extern const struct xfs_buf_ops xfs_inode_buf_ra_ops; | |
981 | extern const struct xfs_buf_ops xfs_dquot_buf_ops; | |
982 | +extern const struct xfs_buf_ops xfs_dquot_buf_ra_ops; | |
983 | extern const struct xfs_buf_ops xfs_sb_buf_ops; | |
984 | extern const struct xfs_buf_ops xfs_sb_quiet_buf_ops; | |
985 | extern const struct xfs_buf_ops xfs_symlink_buf_ops; | |
986 | diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c | |
987 | index 26e67b4..da37beb 100644 | |
988 | --- a/fs/xfs/xfs_log_recover.c | |
989 | +++ b/fs/xfs/xfs_log_recover.c | |
990 | @@ -3521,6 +3521,7 @@ xlog_recover_dquot_ra_pass2( | |
991 | struct xfs_disk_dquot *recddq; | |
992 | struct xfs_dq_logformat *dq_f; | |
993 | uint type; | |
994 | + int len; | |
995 | ||
996 | ||
997 | if (mp->m_qflags == 0) | |
998 | @@ -3541,8 +3542,12 @@ xlog_recover_dquot_ra_pass2( | |
999 | ASSERT(dq_f); | |
1000 | ASSERT(dq_f->qlf_len == 1); | |
1001 | ||
1002 | - xfs_buf_readahead(mp->m_ddev_targp, dq_f->qlf_blkno, | |
1003 | - XFS_FSB_TO_BB(mp, dq_f->qlf_len), NULL); | |
1004 | + len = XFS_FSB_TO_BB(mp, dq_f->qlf_len); | |
1005 | + if (xlog_peek_buffer_cancelled(log, dq_f->qlf_blkno, len, 0)) | |
1006 | + return; | |
1007 | + | |
1008 | + xfs_buf_readahead(mp->m_ddev_targp, dq_f->qlf_blkno, len, | |
1009 | + &xfs_dquot_buf_ra_ops); | |
1010 | } | |
1011 | ||
1012 | STATIC void | |
1013 | ||
1014 | _______________________________________________ | |
1015 | xfs mailing list | |
1016 | xfs@oss.sgi.com | |
1017 | http://oss.sgi.com/mailman/listinfo/xfs |