1--- linux-2.6.33/scripts/mod/modpost.c~ 2010-02-24 19:52:17.000000000 +0100
2+++ linux-2.6.33/scripts/mod/modpost.c 2010-03-07 14:26:47.242168558 +0100
3@@ -15,7 +15,8 @@
4 #include <stdio.h>
5 #include <ctype.h>
6 #include "modpost.h"
7-#include "../../include/generated/autoconf.h"
8+// PLD architectures don't use CONFIG_SYMBOL_PREFIX
9+//#include "../../include/generated/autoconf.h"
10 #include "../../include/linux/license.h"
11
12 /* Some toolchains use a `_' prefix for all user symbols. */
13
14--- linux-3.0/scripts/kconfig/lxdialog/check-lxdialog.sh~ 2011-07-22 04:17:23.000000000 +0200
15+++ linux-3.0/scripts/kconfig/lxdialog/check-lxdialog.sh 2011-08-25 21:26:04.799150642 +0200
16@@ -9,6 +9,12 @@
17 $cc -print-file-name=lib${lib}.${ext} | grep -q /
18 if [ $? -eq 0 ]; then
19 echo "-l${lib}"
20+ for libt in tinfow tinfo ; do
21+ $cc -print-file-name=lib${libt}.${ext} | grep -q /
22+ if [ $? -eq 0 ]; then
23+ echo "-l${libt}"
24+ fi
25+ done
26 exit
27 fi
28 done
29
30From 7a29ac474a47eb8cf212b45917683ae89d6fa13b Mon Sep 17 00:00:00 2001
31From: Chris Mason <clm@fb.com>
32Date: Tue, 10 Nov 2015 10:10:34 +1100
33Subject: xfs: give all workqueues rescuer threads
34
35We're consistently hitting deadlocks here with XFS on recent kernels.
36After some digging through the crash files, it looks like everyone in
37the system is waiting for XFS to reclaim memory.
38
39Something like this:
40
41PID: 2733434 TASK: ffff8808cd242800 CPU: 19 COMMAND: "java"
42 #0 [ffff880019c53588] __schedule at ffffffff818c4df2
43 #1 [ffff880019c535d8] schedule at ffffffff818c5517
44 #2 [ffff880019c535f8] _xfs_log_force_lsn at ffffffff81316348
45 #3 [ffff880019c53688] xfs_log_force_lsn at ffffffff813164fb
46 #4 [ffff880019c536b8] xfs_iunpin_wait at ffffffff8130835e
47 #5 [ffff880019c53728] xfs_reclaim_inode at ffffffff812fd453
48 #6 [ffff880019c53778] xfs_reclaim_inodes_ag at ffffffff812fd8c7
49 #7 [ffff880019c53928] xfs_reclaim_inodes_nr at ffffffff812fe433
50 #8 [ffff880019c53958] xfs_fs_free_cached_objects at ffffffff8130d3b9
51 #9 [ffff880019c53968] super_cache_scan at ffffffff811a6f73
52#10 [ffff880019c539c8] shrink_slab at ffffffff811460e6
53#11 [ffff880019c53aa8] shrink_zone at ffffffff8114a53f
54#12 [ffff880019c53b48] do_try_to_free_pages at ffffffff8114a8ba
55#13 [ffff880019c53be8] try_to_free_pages at ffffffff8114ad5a
56#14 [ffff880019c53c78] __alloc_pages_nodemask at ffffffff8113e1b8
57#15 [ffff880019c53d88] alloc_kmem_pages_node at ffffffff8113e671
58#16 [ffff880019c53dd8] copy_process at ffffffff8104f781
59#17 [ffff880019c53ec8] do_fork at ffffffff8105129c
60#18 [ffff880019c53f38] sys_clone at ffffffff810515b6
61#19 [ffff880019c53f48] stub_clone at ffffffff818c8e4d
62
63xfs_log_force_lsn is waiting for logs to get cleaned, which is waiting
64for IO, which is waiting for workers to complete the IO which is waiting
65for worker threads that don't exist yet:
66
67PID: 2752451 TASK: ffff880bd6bdda00 CPU: 37 COMMAND: "kworker/37:1"
68 #0 [ffff8808d20abbb0] __schedule at ffffffff818c4df2
69 #1 [ffff8808d20abc00] schedule at ffffffff818c5517
70 #2 [ffff8808d20abc20] schedule_timeout at ffffffff818c7c6c
71 #3 [ffff8808d20abcc0] wait_for_completion_killable at ffffffff818c6495
72 #4 [ffff8808d20abd30] kthread_create_on_node at ffffffff8106ec82
73 #5 [ffff8808d20abdf0] create_worker at ffffffff8106752f
74 #6 [ffff8808d20abe40] worker_thread at ffffffff810699be
75 #7 [ffff8808d20abec0] kthread at ffffffff8106ef59
76 #8 [ffff8808d20abf50] ret_from_fork at ffffffff818c8ac8
77
78I think we should be using WQ_MEM_RECLAIM to make sure this thread
79pool makes progress when we're not able to allocate new workers.
80
81[dchinner: make all workqueues WQ_MEM_RECLAIM]
82
83Signed-off-by: Chris Mason <clm@fb.com>
84Reviewed-by: Dave Chinner <dchinner@redhat.com>
85Signed-off-by: Dave Chinner <david@fromorbit.com>
86---
87 fs/xfs/xfs_super.c | 7 ++++---
88 1 file changed, 4 insertions(+), 3 deletions(-)
89
90diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
91index 29531ec..65fbfb7 100644
92--- a/fs/xfs/xfs_super.c
93+++ b/fs/xfs/xfs_super.c
94@@ -838,17 +838,18 @@ xfs_init_mount_workqueues(
95 goto out_destroy_unwritten;
96
97 mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s",
98- WQ_FREEZABLE, 0, mp->m_fsname);
99+ WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname);
100 if (!mp->m_reclaim_workqueue)
101 goto out_destroy_cil;
102
103 mp->m_log_workqueue = alloc_workqueue("xfs-log/%s",
104- WQ_FREEZABLE|WQ_HIGHPRI, 0, mp->m_fsname);
105+ WQ_MEM_RECLAIM|WQ_FREEZABLE|WQ_HIGHPRI, 0,
106+ mp->m_fsname);
107 if (!mp->m_log_workqueue)
108 goto out_destroy_reclaim;
109
110 mp->m_eofblocks_workqueue = alloc_workqueue("xfs-eofblocks/%s",
111- WQ_FREEZABLE, 0, mp->m_fsname);
112+ WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname);
113 if (!mp->m_eofblocks_workqueue)
114 goto out_destroy_log;
115
116--
117cgit v0.11.2
118
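The rationale above is that WQ_MEM_RECLAIM gives a workqueue a pre-allocated rescuer thread, so queued work can still make progress when the kernel is too short on memory to fork new workers. As a rough illustration only (not part of the patch; the demo_* names are invented), a minimal module allocating such a workqueue might look like:

	#include <linux/module.h>
	#include <linux/workqueue.h>

	static struct workqueue_struct *demo_wq;

	static void demo_fn(struct work_struct *w)
	{
		pr_info("demo work ran even under memory pressure\n");
	}
	static DECLARE_WORK(demo_work, demo_fn);

	static int __init demo_init(void)
	{
		/* WQ_MEM_RECLAIM pre-allocates a rescuer thread for this queue */
		demo_wq = alloc_workqueue("demo", WQ_MEM_RECLAIM | WQ_FREEZABLE, 0);
		if (!demo_wq)
			return -ENOMEM;
		queue_work(demo_wq, &demo_work);
		return 0;
	}

	static void __exit demo_exit(void)
	{
		/* destroy_workqueue() drains any work still queued */
		destroy_workqueue(demo_wq);
	}

	module_init(demo_init);
	module_exit(demo_exit);
	MODULE_LICENSE("GPL");
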
119commit c2d42c16ad83006a706d83e51a7268db04af733a
120Author: Andrew Morton <akpm@linux-foundation.org>
121Date: Thu Nov 5 18:48:43 2015 -0800
122
123 mm/vmstat.c: uninline node_page_state()
124
125 With x86_64 (config http://ozlabs.org/~akpm/config-akpm2.txt) and old gcc
126 (4.4.4), drivers/base/node.c:node_read_meminfo() is using 2344 bytes of
127 stack. Uninlining node_page_state() reduces this to 440 bytes.
128
129 The stack consumption issue is fixed by newer gcc (4.8.4) however with
130 that compiler this patch reduces the node.o text size from 7314 bytes to
131 4578.
132
133 Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
134 Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
135
136diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
137index 82e7db7..49dfe40 100644
138--- a/include/linux/vmstat.h
139+++ b/include/linux/vmstat.h
140@@ -161,30 +161,8 @@ static inline unsigned long zone_page_state_snapshot(struct zone *zone,
141 }
142
143 #ifdef CONFIG_NUMA
144-/*
145- * Determine the per node value of a stat item. This function
146- * is called frequently in a NUMA machine, so try to be as
147- * frugal as possible.
148- */
149-static inline unsigned long node_page_state(int node,
150- enum zone_stat_item item)
151-{
152- struct zone *zones = NODE_DATA(node)->node_zones;
153-
154- return
155-#ifdef CONFIG_ZONE_DMA
156- zone_page_state(&zones[ZONE_DMA], item) +
157-#endif
158-#ifdef CONFIG_ZONE_DMA32
159- zone_page_state(&zones[ZONE_DMA32], item) +
160-#endif
161-#ifdef CONFIG_HIGHMEM
162- zone_page_state(&zones[ZONE_HIGHMEM], item) +
163-#endif
164- zone_page_state(&zones[ZONE_NORMAL], item) +
165- zone_page_state(&zones[ZONE_MOVABLE], item);
166-}
167
168+extern unsigned long node_page_state(int node, enum zone_stat_item item);
169 extern void zone_statistics(struct zone *, struct zone *, gfp_t gfp);
170
171 #else
172diff --git a/mm/vmstat.c b/mm/vmstat.c
173index fbf1448..ffcb4f5 100644
174--- a/mm/vmstat.c
175+++ b/mm/vmstat.c
176@@ -591,6 +591,28 @@ void zone_statistics(struct zone *preferred_zone, struct zone *z, gfp_t flags)
177 else
178 __inc_zone_state(z, NUMA_OTHER);
179 }
180+
181+/*
182+ * Determine the per node value of a stat item.
183+ */
184+unsigned long node_page_state(int node, enum zone_stat_item item)
185+{
186+ struct zone *zones = NODE_DATA(node)->node_zones;
187+
188+ return
189+#ifdef CONFIG_ZONE_DMA
190+ zone_page_state(&zones[ZONE_DMA], item) +
191+#endif
192+#ifdef CONFIG_ZONE_DMA32
193+ zone_page_state(&zones[ZONE_DMA32], item) +
194+#endif
195+#ifdef CONFIG_HIGHMEM
196+ zone_page_state(&zones[ZONE_HIGHMEM], item) +
197+#endif
198+ zone_page_state(&zones[ZONE_NORMAL], item) +
199+ zone_page_state(&zones[ZONE_MOVABLE], item);
200+}
201+
202 #endif
203
204 #ifdef CONFIG_COMPACTION
205commit 016c13daa5c9e4827eca703e2f0621c131f2cca3
206Author: Mel Gorman <mgorman@techsingularity.net>
207Date: Fri Nov 6 16:28:18 2015 -0800
208
209 mm, page_alloc: use masks and shifts when converting GFP flags to migrate types
210
211 This patch redefines which GFP bits are used for specifying mobility and
212 the order of the migrate types. Once redefined it's possible to convert
213 GFP flags to a migrate type with a simple mask and shift. The only
214 downside is that readers of OOM kill messages and allocation failures may
215 have been used to the existing values but scripts/gfp-translate will help.
216
217 Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
218 Acked-by: Vlastimil Babka <vbabka@suse.cz>
219 Cc: Christoph Lameter <cl@linux.com>
220 Cc: David Rientjes <rientjes@google.com>
221 Cc: Johannes Weiner <hannes@cmpxchg.org>
222 Cc: Michal Hocko <mhocko@suse.com>
223 Cc: Vitaly Wool <vitalywool@gmail.com>
224 Cc: Rik van Riel <riel@redhat.com>
225 Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
226 Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
227
228diff --git a/include/linux/gfp.h b/include/linux/gfp.h
229index f92cbd2..440fca3 100644
230--- a/include/linux/gfp.h
231+++ b/include/linux/gfp.h
232@@ -14,7 +14,7 @@ struct vm_area_struct;
233 #define ___GFP_HIGHMEM 0x02u
234 #define ___GFP_DMA32 0x04u
235 #define ___GFP_MOVABLE 0x08u
236-#define ___GFP_WAIT 0x10u
237+#define ___GFP_RECLAIMABLE 0x10u
238 #define ___GFP_HIGH 0x20u
239 #define ___GFP_IO 0x40u
240 #define ___GFP_FS 0x80u
241@@ -29,7 +29,7 @@ struct vm_area_struct;
242 #define ___GFP_NOMEMALLOC 0x10000u
243 #define ___GFP_HARDWALL 0x20000u
244 #define ___GFP_THISNODE 0x40000u
245-#define ___GFP_RECLAIMABLE 0x80000u
246+#define ___GFP_WAIT 0x80000u
247 #define ___GFP_NOACCOUNT 0x100000u
248 #define ___GFP_NOTRACK 0x200000u
249 #define ___GFP_NO_KSWAPD 0x400000u
250@@ -126,6 +126,7 @@ struct vm_area_struct;
251
252 /* This mask makes up all the page movable related flags */
253 #define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE)
254+#define GFP_MOVABLE_SHIFT 3
255
256 /* Control page allocator reclaim behavior */
257 #define GFP_RECLAIM_MASK (__GFP_WAIT|__GFP_HIGH|__GFP_IO|__GFP_FS|\
258@@ -152,14 +153,15 @@ struct vm_area_struct;
259 /* Convert GFP flags to their corresponding migrate type */
260 static inline int gfpflags_to_migratetype(const gfp_t gfp_flags)
261 {
262- WARN_ON((gfp_flags & GFP_MOVABLE_MASK) == GFP_MOVABLE_MASK);
263+ VM_WARN_ON((gfp_flags & GFP_MOVABLE_MASK) == GFP_MOVABLE_MASK);
264+ BUILD_BUG_ON((1UL << GFP_MOVABLE_SHIFT) != ___GFP_MOVABLE);
265+ BUILD_BUG_ON((___GFP_MOVABLE >> GFP_MOVABLE_SHIFT) != MIGRATE_MOVABLE);
266
267 if (unlikely(page_group_by_mobility_disabled))
268 return MIGRATE_UNMOVABLE;
269
270 /* Group based on mobility */
271- return (((gfp_flags & __GFP_MOVABLE) != 0) << 1) |
272- ((gfp_flags & __GFP_RECLAIMABLE) != 0);
273+ return (gfp_flags & GFP_MOVABLE_MASK) >> GFP_MOVABLE_SHIFT;
274 }
275
276 #ifdef CONFIG_HIGHMEM
277diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
278index e326843..38bed71 100644
279--- a/include/linux/mmzone.h
280+++ b/include/linux/mmzone.h
281@@ -37,8 +37,8 @@
282
283 enum {
284 MIGRATE_UNMOVABLE,
285- MIGRATE_RECLAIMABLE,
286 MIGRATE_MOVABLE,
287+ MIGRATE_RECLAIMABLE,
288 MIGRATE_PCPTYPES, /* the number of types on the pcp lists */
289 MIGRATE_RESERVE = MIGRATE_PCPTYPES,
290 #ifdef CONFIG_CMA
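With the bits laid out as above (___GFP_MOVABLE at bit 3, ___GFP_RECLAIMABLE at bit 4, and MIGRATE_MOVABLE/MIGRATE_RECLAIMABLE reordered to 1 and 2), the migrate type really is just a mask and a 3-bit shift. A small stand-alone demo of that arithmetic, using the constants from this patch (illustrative only, not kernel code):

	#include <stdio.h>

	#define ___GFP_MOVABLE		0x08u	/* bit 3 */
	#define ___GFP_RECLAIMABLE	0x10u	/* bit 4 */
	#define GFP_MOVABLE_MASK	(___GFP_RECLAIMABLE | ___GFP_MOVABLE)
	#define GFP_MOVABLE_SHIFT	3

	enum { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RECLAIMABLE };

	static int gfpflags_to_migratetype(unsigned int gfp_flags)
	{
		/* group based on mobility: no branches, just mask and shift */
		return (gfp_flags & GFP_MOVABLE_MASK) >> GFP_MOVABLE_SHIFT;
	}

	int main(void)
	{
		printf("plain             -> %d\n", gfpflags_to_migratetype(0));
		printf("__GFP_MOVABLE     -> %d\n", gfpflags_to_migratetype(___GFP_MOVABLE));
		printf("__GFP_RECLAIMABLE -> %d\n", gfpflags_to_migratetype(___GFP_RECLAIMABLE));
		return 0;
	}

Run, it prints 0, 1 and 2 for the plain, movable and reclaimable cases, matching the reordered enum, which is exactly what the BUILD_BUG_ONs in the patch assert.
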
291commit 974a786e63c96a2401a78ddba926f34c128474f1
292Author: Mel Gorman <mgorman@techsingularity.net>
293Date: Fri Nov 6 16:28:34 2015 -0800
294
295 mm, page_alloc: remove MIGRATE_RESERVE
296
297 MIGRATE_RESERVE preserves an old property of the buddy allocator that
298 existed prior to fragmentation avoidance -- min_free_kbytes worth of pages
299 tended to remain contiguous until the only alternative was to fail the
300 allocation. At the time it was discovered that high-order atomic
301 allocations relied on this property so MIGRATE_RESERVE was introduced. A
302 later patch will introduce an alternative MIGRATE_HIGHATOMIC so this patch
303 deletes MIGRATE_RESERVE and supporting code so it'll be easier to review.
304 Note that this patch in isolation may look like a false regression if
305 someone was bisecting high-order atomic allocation failures.
306
307 Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
308 Acked-by: Vlastimil Babka <vbabka@suse.cz>
309 Cc: Christoph Lameter <cl@linux.com>
310 Cc: David Rientjes <rientjes@google.com>
311 Cc: Johannes Weiner <hannes@cmpxchg.org>
312 Cc: Michal Hocko <mhocko@suse.com>
313 Cc: Vitaly Wool <vitalywool@gmail.com>
314 Cc: Rik van Riel <riel@redhat.com>
315 Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
316 Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
317
318diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
319index 1e88aae..b86cfa3 100644
320--- a/include/linux/mmzone.h
321+++ b/include/linux/mmzone.h
322@@ -39,8 +39,6 @@ enum {
323 MIGRATE_UNMOVABLE,
324 MIGRATE_MOVABLE,
325 MIGRATE_RECLAIMABLE,
326- MIGRATE_PCPTYPES, /* the number of types on the pcp lists */
327- MIGRATE_RESERVE = MIGRATE_PCPTYPES,
328 #ifdef CONFIG_CMA
329 /*
330 * MIGRATE_CMA migration type is designed to mimic the way
331@@ -63,6 +61,8 @@ enum {
332 MIGRATE_TYPES
333 };
334
335+#define MIGRATE_PCPTYPES (MIGRATE_RECLAIMABLE+1)
336+
337 #ifdef CONFIG_CMA
338 # define is_migrate_cma(migratetype) unlikely((migratetype) == MIGRATE_CMA)
339 #else
340@@ -429,12 +429,6 @@ struct zone {
341
342 const char *name;
343
344- /*
345- * Number of MIGRATE_RESERVE page block. To maintain for just
346- * optimization. Protected by zone->lock.
347- */
348- int nr_migrate_reserve_block;
349-
350 #ifdef CONFIG_MEMORY_ISOLATION
351 /*
352 * Number of isolated pageblock. It is used to solve incorrect
353diff --git a/mm/huge_memory.c b/mm/huge_memory.c
354index 9812d46..dabd247 100644
355--- a/mm/huge_memory.c
356+++ b/mm/huge_memory.c
357@@ -116,7 +116,7 @@ static void set_recommended_min_free_kbytes(void)
358 for_each_populated_zone(zone)
359 nr_zones++;
360
361- /* Make sure at least 2 hugepages are free for MIGRATE_RESERVE */
362+ /* Ensure 2 pageblocks are free to assist fragmentation avoidance */
363 recommended_min = pageblock_nr_pages * nr_zones * 2;
364
365 /*
366diff --git a/mm/page_alloc.c b/mm/page_alloc.c
367index 8dc6e3c..5888126 100644
368--- a/mm/page_alloc.c
369+++ b/mm/page_alloc.c
370@@ -817,7 +817,6 @@ static void free_pcppages_bulk(struct zone *zone, int count,
371 if (unlikely(has_isolate_pageblock(zone)))
372 mt = get_pageblock_migratetype(page);
373
374- /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
375 __free_one_page(page, page_to_pfn(page), zone, 0, mt);
376 trace_mm_page_pcpu_drain(page, 0, mt);
377 } while (--to_free && --batch_free && !list_empty(list));
378@@ -1417,15 +1416,14 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
379 * the free lists for the desirable migrate type are depleted
380 */
381 static int fallbacks[MIGRATE_TYPES][4] = {
382- [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
383- [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
384- [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
385+ [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_TYPES },
386+ [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_TYPES },
387+ [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES },
388 #ifdef CONFIG_CMA
389- [MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */
390+ [MIGRATE_CMA] = { MIGRATE_TYPES }, /* Never used */
391 #endif
392- [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */
393 #ifdef CONFIG_MEMORY_ISOLATION
394- [MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */
395+ [MIGRATE_ISOLATE] = { MIGRATE_TYPES }, /* Never used */
396 #endif
397 };
398
399@@ -1598,7 +1596,7 @@ int find_suitable_fallback(struct free_area *area, unsigned int order,
400 *can_steal = false;
401 for (i = 0;; i++) {
402 fallback_mt = fallbacks[migratetype][i];
403- if (fallback_mt == MIGRATE_RESERVE)
404+ if (fallback_mt == MIGRATE_TYPES)
405 break;
406
407 if (list_empty(&area->free_list[fallback_mt]))
408@@ -1676,25 +1674,13 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order,
409 {
410 struct page *page;
411
412-retry_reserve:
413 page = __rmqueue_smallest(zone, order, migratetype);
414-
415- if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {
416+ if (unlikely(!page)) {
417 if (migratetype == MIGRATE_MOVABLE)
418 page = __rmqueue_cma_fallback(zone, order);
419
420 if (!page)
421 page = __rmqueue_fallback(zone, order, migratetype);
422-
423- /*
424- * Use MIGRATE_RESERVE rather than fail an allocation. goto
425- * is used because __rmqueue_smallest is an inline function
426- * and we want just one call site
427- */
428- if (!page) {
429- migratetype = MIGRATE_RESERVE;
430- goto retry_reserve;
431- }
432 }
433
434 trace_mm_page_alloc_zone_locked(page, order, migratetype);
435@@ -3492,7 +3478,6 @@ static void show_migration_types(unsigned char type)
436 [MIGRATE_UNMOVABLE] = 'U',
437 [MIGRATE_RECLAIMABLE] = 'E',
438 [MIGRATE_MOVABLE] = 'M',
439- [MIGRATE_RESERVE] = 'R',
440 #ifdef CONFIG_CMA
441 [MIGRATE_CMA] = 'C',
442 #endif
443@@ -4303,120 +4288,6 @@ static inline unsigned long wait_table_bits(unsigned long size)
444 }
445
446 /*
447- * Check if a pageblock contains reserved pages
448- */
449-static int pageblock_is_reserved(unsigned long start_pfn, unsigned long end_pfn)
450-{
451- unsigned long pfn;
452-
453- for (pfn = start_pfn; pfn < end_pfn; pfn++) {
454- if (!pfn_valid_within(pfn) || PageReserved(pfn_to_page(pfn)))
455- return 1;
456- }
457- return 0;
458-}
459-
460-/*
461- * Mark a number of pageblocks as MIGRATE_RESERVE. The number
462- * of blocks reserved is based on min_wmark_pages(zone). The memory within
463- * the reserve will tend to store contiguous free pages. Setting min_free_kbytes
464- * higher will lead to a bigger reserve which will get freed as contiguous
465- * blocks as reclaim kicks in
466- */
467-static void setup_zone_migrate_reserve(struct zone *zone)
468-{
469- unsigned long start_pfn, pfn, end_pfn, block_end_pfn;
470- struct page *page;
471- unsigned long block_migratetype;
472- int reserve;
473- int old_reserve;
474-
475- /*
476- * Get the start pfn, end pfn and the number of blocks to reserve
477- * We have to be careful to be aligned to pageblock_nr_pages to
478- * make sure that we always check pfn_valid for the first page in
479- * the block.
480- */
481- start_pfn = zone->zone_start_pfn;
482- end_pfn = zone_end_pfn(zone);
483- start_pfn = roundup(start_pfn, pageblock_nr_pages);
484- reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >>
485- pageblock_order;
486-
487- /*
488- * Reserve blocks are generally in place to help high-order atomic
489- * allocations that are short-lived. A min_free_kbytes value that
490- * would result in more than 2 reserve blocks for atomic allocations
491- * is assumed to be in place to help anti-fragmentation for the
492- * future allocation of hugepages at runtime.
493- */
494- reserve = min(2, reserve);
495- old_reserve = zone->nr_migrate_reserve_block;
496-
497- /* When memory hot-add, we almost always need to do nothing */
498- if (reserve == old_reserve)
499- return;
500- zone->nr_migrate_reserve_block = reserve;
501-
502- for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
503- if (!early_page_nid_uninitialised(pfn, zone_to_nid(zone)))
504- return;
505-
506- if (!pfn_valid(pfn))
507- continue;
508- page = pfn_to_page(pfn);
509-
510- /* Watch out for overlapping nodes */
511- if (page_to_nid(page) != zone_to_nid(zone))
512- continue;
513-
514- block_migratetype = get_pageblock_migratetype(page);
515-
516- /* Only test what is necessary when the reserves are not met */
517- if (reserve > 0) {
518- /*
519- * Blocks with reserved pages will never free, skip
520- * them.
521- */
522- block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
523- if (pageblock_is_reserved(pfn, block_end_pfn))
524- continue;
525-
526- /* If this block is reserved, account for it */
527- if (block_migratetype == MIGRATE_RESERVE) {
528- reserve--;
529- continue;
530- }
531-
532- /* Suitable for reserving if this block is movable */
533- if (block_migratetype == MIGRATE_MOVABLE) {
534- set_pageblock_migratetype(page,
535- MIGRATE_RESERVE);
536- move_freepages_block(zone, page,
537- MIGRATE_RESERVE);
538- reserve--;
539- continue;
540- }
541- } else if (!old_reserve) {
542- /*
543- * At boot time we don't need to scan the whole zone
544- * for turning off MIGRATE_RESERVE.
545- */
546- break;
547- }
548-
549- /*
550- * If the reserve is met and this is a previous reserved block,
551- * take it back
552- */
553- if (block_migratetype == MIGRATE_RESERVE) {
554- set_pageblock_migratetype(page, MIGRATE_MOVABLE);
555- move_freepages_block(zone, page, MIGRATE_MOVABLE);
556- }
557- }
558-}
559-
560-/*
561 * Initially all pages are reserved - free ones are freed
562 * up by free_all_bootmem() once the early boot process is
563 * done. Non-atomic initialization, single-pass.
564@@ -4455,9 +4326,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
565 * movable at startup. This will force kernel allocations
566 * to reserve their blocks rather than leaking throughout
567 * the address space during boot when many long-lived
568- * kernel allocations are made. Later some blocks near
569- * the start are marked MIGRATE_RESERVE by
570- * setup_zone_migrate_reserve()
571+ * kernel allocations are made.
572 *
573 * bitmap is created for zone's valid pfn range. but memmap
574 * can be created for invalid pages (for alignment)
575@@ -6018,7 +5887,6 @@ static void __setup_per_zone_wmarks(void)
576 high_wmark_pages(zone) - low_wmark_pages(zone) -
577 atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
578
579- setup_zone_migrate_reserve(zone);
580 spin_unlock_irqrestore(&zone->lock, flags);
581 }
582
583diff --git a/mm/vmstat.c b/mm/vmstat.c
584index ffcb4f5..5b289dc 100644
585--- a/mm/vmstat.c
586+++ b/mm/vmstat.c
587@@ -923,7 +923,6 @@ static char * const migratetype_names[MIGRATE_TYPES] = {
588 "Unmovable",
589 "Reclaimable",
590 "Movable",
591- "Reserve",
592 #ifdef CONFIG_CMA
593 "CMA",
594 #endif
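With MIGRATE_RESERVE gone, MIGRATE_TYPES takes over as the end-of-list sentinel in the fallbacks[] table. A stand-alone sketch of how such a sentinel-terminated fallback walk works (simplified; the real find_suitable_fallback() also checks the free lists and whether a pageblock can be stolen):

	#include <stdio.h>

	enum {
		MIGRATE_UNMOVABLE,
		MIGRATE_MOVABLE,
		MIGRATE_RECLAIMABLE,
		MIGRATE_TYPES		/* sentinel: terminates each fallback list */
	};

	static const int fallbacks[MIGRATE_TYPES][3] = {
		[MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE,   MIGRATE_TYPES },
		[MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE,   MIGRATE_MOVABLE,   MIGRATE_TYPES },
		[MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES },
	};

	int main(void)
	{
		int mt = MIGRATE_MOVABLE;

		/* walk the candidates until the sentinel is reached */
		for (int i = 0; fallbacks[mt][i] != MIGRATE_TYPES; i++)
			printf("try fallback migratetype %d\n", fallbacks[mt][i]);
		return 0;
	}

Using an out-of-range enumerator as the terminator keeps the table compact and avoids keeping a dedicated "never used" migrate type around just to mark the end.
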
595diff --git a/mm/backing-dev.c b/mm/backing-dev.c
596index 8ed2ffd963c5..7340353f8aea 100644
597--- a/mm/backing-dev.c
598+++ b/mm/backing-dev.c
599@@ -957,8 +957,9 @@ EXPORT_SYMBOL(congestion_wait);
600 * jiffies for either a BDI to exit congestion of the given @sync queue
601 * or a write to complete.
602 *
603- * In the absence of zone congestion, cond_resched() is called to yield
604- * the processor if necessary but otherwise does not sleep.
605+ * In the absence of zone congestion, a short sleep or a cond_resched is
606+ * performed to yield the processor and to allow other subsystems to make
607+ * a forward progress.
608 *
609 * The return value is 0 if the sleep is for the full timeout. Otherwise,
610 * it is the number of jiffies that were still remaining when the function
611@@ -978,7 +979,19 @@ long wait_iff_congested(struct zone *zone, int sync, long timeout)
612 */
613 if (atomic_read(&nr_wb_congested[sync]) == 0 ||
614 !test_bit(ZONE_CONGESTED, &zone->flags)) {
615- cond_resched();
616+
617+ /*
618+ * Memory allocation/reclaim might be called from a WQ
619+ * context and the current implementation of the WQ
620+ * concurrency control doesn't recognize that a particular
621+ * WQ is congested if the worker thread is looping without
622+ * ever sleeping. Therefore we have to do a short sleep
623+ * here rather than calling cond_resched().
624+ */
625+ if (current->flags & PF_WQ_WORKER)
626+ schedule_timeout(1);
627+ else
628+ cond_resched();
629
630 /* In case we scheduled, work out time remaining */
631 ret = timeout - (jiffies - start);
632diff --git a/mm/vmstat.c b/mm/vmstat.c
633index 45dcbcb5c594..0975da8e3432 100644
634--- a/mm/vmstat.c
635+++ b/mm/vmstat.c
636@@ -1381,6 +1381,7 @@ static const struct file_operations proc_vmstat_file_operations = {
637 #endif /* CONFIG_PROC_FS */
638
639 #ifdef CONFIG_SMP
640+static struct workqueue_struct *vmstat_wq;
641 static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
642 int sysctl_stat_interval __read_mostly = HZ;
643 static cpumask_var_t cpu_stat_off;
644@@ -1393,7 +1394,7 @@ static void vmstat_update(struct work_struct *w)
645 * to occur in the future. Keep on running the
646 * update worker thread.
647 */
648- schedule_delayed_work_on(smp_processor_id(),
649+ queue_delayed_work_on(smp_processor_id(), vmstat_wq,
650 this_cpu_ptr(&vmstat_work),
651 round_jiffies_relative(sysctl_stat_interval));
652 } else {
653@@ -1462,7 +1463,7 @@ static void vmstat_shepherd(struct work_struct *w)
654 if (need_update(cpu) &&
655 cpumask_test_and_clear_cpu(cpu, cpu_stat_off))
656
657- schedule_delayed_work_on(cpu,
658+ queue_delayed_work_on(cpu, vmstat_wq,
659 &per_cpu(vmstat_work, cpu), 0);
660
661 put_online_cpus();
662@@ -1551,6 +1552,7 @@ static int __init setup_vmstat(void)
663
664 start_shepherd_timer();
665 cpu_notifier_register_done();
666+ vmstat_wq = alloc_workqueue("vmstat", WQ_FREEZABLE|WQ_MEM_RECLAIM, 0);
667 #endif
668 #ifdef CONFIG_PROC_FS
669 proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations);
670--
6712.6.2
672
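The backing-dev.c hunk above encodes one idea: a worker that only ever calls cond_resched() never sleeps, so the workqueue concurrency code never sees it block and never starts another worker for that pool; a genuine short sleep is needed. A sketch of that pattern in isolation (kernel context assumed; the helper name throttle_briefly is invented, and schedule_timeout_uninterruptible() is used here because it sets the task state itself, while the hunk above calls schedule_timeout(1) directly):

	#include <linux/sched.h>
	#include <linux/workqueue.h>

	/*
	 * Yield the CPU from a reclaim-ish code path.  When running as a
	 * workqueue worker (PF_WQ_WORKER), sleep for a jiffy so the workqueue
	 * code observes the worker blocking and can spawn another one for the
	 * pool; otherwise a plain cond_resched() is enough.
	 */
	static void throttle_briefly(void)
	{
		if (current->flags & PF_WQ_WORKER)
			schedule_timeout_uninterruptible(1);
		else
			cond_resched();
	}
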
673From 09ccfd238e5a0e670d8178cf50180ea81ae09ae1 Mon Sep 17 00:00:00 2001
674From: WANG Cong <xiyou.wangcong@gmail.com>
675Date: Mon, 14 Dec 2015 13:48:36 -0800
676Subject: pptp: verify sockaddr_len in pptp_bind() and pptp_connect()
677
678Reported-by: Dmitry Vyukov <dvyukov@gmail.com>
679Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
680Signed-off-by: David S. Miller <davem@davemloft.net>
681---
682 drivers/net/ppp/pptp.c | 6 ++++++
683 1 file changed, 6 insertions(+)
684
685diff --git a/drivers/net/ppp/pptp.c b/drivers/net/ppp/pptp.c
686index fc69e41..597c53e 100644
687--- a/drivers/net/ppp/pptp.c
688+++ b/drivers/net/ppp/pptp.c
689@@ -419,6 +419,9 @@ static int pptp_bind(struct socket *sock, struct sockaddr *uservaddr,
690 struct pptp_opt *opt = &po->proto.pptp;
691 int error = 0;
692
693+ if (sockaddr_len < sizeof(struct sockaddr_pppox))
694+ return -EINVAL;
695+
696 lock_sock(sk);
697
698 opt->src_addr = sp->sa_addr.pptp;
699@@ -440,6 +443,9 @@ static int pptp_connect(struct socket *sock, struct sockaddr *uservaddr,
700 struct flowi4 fl4;
701 int error = 0;
702
703+ if (sockaddr_len < sizeof(struct sockaddr_pppox))
704+ return -EINVAL;
705+
706 if (sp->sa_protocol != PX_PROTO_PPTP)
707 return -EINVAL;
708
709--
710cgit v0.11.2
711
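The fix is a plain bounds check: refuse any bind/connect whose sockaddr_len cannot cover a full struct sockaddr_pppox before any of its fields are read. A stand-alone illustration of the same pattern (userspace, with an invented demo_sockaddr layout):

	#include <errno.h>
	#include <stdio.h>

	/* stand-in for struct sockaddr_pppox; the layout here is illustrative only */
	struct demo_sockaddr {
		unsigned short sa_family;
		unsigned short sa_protocol;
		unsigned int   call_id;
	};

	static int demo_bind(const void *uservaddr, size_t sockaddr_len)
	{
		const struct demo_sockaddr *sp = uservaddr;

		/*
		 * Never trust the caller-supplied length: if it is too short,
		 * the field accesses below would read past the user buffer.
		 */
		if (sockaddr_len < sizeof(*sp))
			return -EINVAL;

		printf("binding call_id %u\n", sp->call_id);
		return 0;
	}

	int main(void)
	{
		struct demo_sockaddr sa = { .call_id = 42 };

		printf("full size: %d\n", demo_bind(&sa, sizeof(sa)));	/* ok */
		printf("truncated: %d\n", demo_bind(&sa, 2));		/* -EINVAL */
		return 0;
	}
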
712commit cc57858831e3e9678291de730c4b4d2e52a19f59
713Author: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
714Date: Fri Dec 18 15:19:16 2015 +1100
715
716 md/raid10: fix data corruption and crash during resync
717
718 The commit c31df25f20e3 ("md/raid10: make sync_request_write() call
719 bio_copy_data()") replaced manual data copying with bio_copy_data() but
720 it doesn't work as intended. The source bio (fbio) is already processed,
721 so its bvec_iter has bi_size == 0 and bi_idx == bi_vcnt. Because of
722 this, bio_copy_data() either does not copy anything, or worse, copies
723 data from the ->bi_next bio if it is set. This causes wrong data to be
724 written to drives during resync and sometimes lockups/crashes in
725 bio_copy_data():
726
727 [ 517.338478] NMI watchdog: BUG: soft lockup - CPU#0 stuck for 22s! [md126_raid10:3319]
728 [ 517.347324] Modules linked in: raid10 xt_CHECKSUM ipt_MASQUERADE nf_nat_masquerade_ipv4 tun ip6t_rpfilter ip6t_REJECT nf_reject_ipv6 ipt_REJECT nf_reject_ipv4 xt_conntrack ebtable_nat ebtable_broute bridge stp llc ebtable_filter ebtables ip6table_nat nf_conntrack_ipv6 nf_defrag_ipv6 nf_nat_ipv6 ip6table_mangle ip6table_security ip6table_raw ip6table_filter ip6_tables iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 nf_nat nf_conntrack iptable_mangle iptable_security iptable_raw iptable_filter ip_tables x86_pkg_temp_thermal coretemp kvm_intel kvm crct10dif_pclmul crc32_pclmul cryptd shpchp pcspkr ipmi_si ipmi_msghandler tpm_crb acpi_power_meter acpi_cpufreq ext4 mbcache jbd2 sr_mod cdrom sd_mod e1000e ax88179_178a usbnet mii ahci ata_generic crc32c_intel libahci ptp pata_acpi libata pps_core wmi sunrpc dm_mirror dm_region_hash dm_log dm_mod
729 [ 517.440555] CPU: 0 PID: 3319 Comm: md126_raid10 Not tainted 4.3.0-rc6+ #1
730 [ 517.448384] Hardware name: Intel Corporation PURLEY/PURLEY, BIOS PLYDCRB1.86B.0055.D14.1509221924 09/22/2015
731 [ 517.459768] task: ffff880153773980 ti: ffff880150df8000 task.ti: ffff880150df8000
732 [ 517.468529] RIP: 0010:[<ffffffff812e1888>] [<ffffffff812e1888>] bio_copy_data+0xc8/0x3c0
733 [ 517.478164] RSP: 0018:ffff880150dfbc98 EFLAGS: 00000246
734 [ 517.484341] RAX: ffff880169356688 RBX: 0000000000001000 RCX: 0000000000000000
735 [ 517.492558] RDX: 0000000000000000 RSI: ffffea0001ac2980 RDI: ffffea0000d835c0
736 [ 517.500773] RBP: ffff880150dfbd08 R08: 0000000000000001 R09: ffff880153773980
737 [ 517.508987] R10: ffff880169356600 R11: 0000000000001000 R12: 0000000000010000
738 [ 517.517199] R13: 000000000000e000 R14: 0000000000000000 R15: 0000000000001000
739 [ 517.525412] FS: 0000000000000000(0000) GS:ffff880174a00000(0000) knlGS:0000000000000000
740 [ 517.534844] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
741 [ 517.541507] CR2: 00007f8a044d5fed CR3: 0000000169504000 CR4: 00000000001406f0
742 [ 517.549722] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
743 [ 517.557929] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
744 [ 517.566144] Stack:
745 [ 517.568626] ffff880174a16bc0 ffff880153773980 ffff880169356600 0000000000000000
746 [ 517.577659] 0000000000000001 0000000000000001 ffff880153773980 ffff88016a61a800
747 [ 517.586715] ffff880150dfbcf8 0000000000000001 ffff88016dd209e0 0000000000001000
748 [ 517.595773] Call Trace:
749 [ 517.598747] [<ffffffffa043ef95>] raid10d+0xfc5/0x1690 [raid10]
750 [ 517.605610] [<ffffffff816697ae>] ? __schedule+0x29e/0x8e2
751 [ 517.611987] [<ffffffff814ff206>] md_thread+0x106/0x140
752 [ 517.618072] [<ffffffff810c1d80>] ? wait_woken+0x80/0x80
753 [ 517.624252] [<ffffffff814ff100>] ? super_1_load+0x520/0x520
754 [ 517.630817] [<ffffffff8109ef89>] kthread+0xc9/0xe0
755 [ 517.636506] [<ffffffff8109eec0>] ? flush_kthread_worker+0x70/0x70
756 [ 517.643653] [<ffffffff8166d99f>] ret_from_fork+0x3f/0x70
757 [ 517.649929] [<ffffffff8109eec0>] ? flush_kthread_worker+0x70/0x70
758
759 Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
760 Reviewed-by: Shaohua Li <shli@kernel.org>
761 Cc: stable@vger.kernel.org (v4.2+)
762 Fixes: c31df25f20e3 ("md/raid10: make sync_request_write() call bio_copy_data()")
763 Signed-off-by: NeilBrown <neilb@suse.com>
764
765diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
766index 41d70bc..84e597e 100644
767--- a/drivers/md/raid10.c
768+++ b/drivers/md/raid10.c
769@@ -1946,6 +1946,8 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
770
771 first = i;
772 fbio = r10_bio->devs[i].bio;
773+ fbio->bi_iter.bi_size = r10_bio->sectors << 9;
774+ fbio->bi_iter.bi_idx = 0;
775
776 vcnt = (r10_bio->sectors + (PAGE_SIZE >> 9) - 1) >> (PAGE_SHIFT - 9);
777 /* now find blocks with errors */
778@@ -1989,7 +1991,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
779 bio_reset(tbio);
780
781 tbio->bi_vcnt = vcnt;
782- tbio->bi_iter.bi_size = r10_bio->sectors << 9;
783+ tbio->bi_iter.bi_size = fbio->bi_iter.bi_size;
784 tbio->bi_rw = WRITE;
785 tbio->bi_private = r10_bio;
786 tbio->bi_iter.bi_sector = r10_bio->devs[i].addr;
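The root cause is generic: fbio has already been walked once, so its bvec_iter sits at the end (bi_size == 0, bi_idx == bi_vcnt) and has to be rewound before it can drive another copy, which is exactly what the two added lines do. A simplified userspace analogy with an explicit iterator (not the bio API; purely illustrative):

	#include <stdio.h>
	#include <string.h>

	struct iter {
		size_t size;	/* bytes remaining, 0 once fully consumed */
		size_t idx;	/* current position */
	};

	static void copy_data(char *dst, const char *src, struct iter *it)
	{
		/* copies only what the iterator says is left, nothing if size == 0 */
		memcpy(dst, src + it->idx, it->size);
		it->idx += it->size;
		it->size = 0;
	}

	int main(void)
	{
		const char src[8] = "resync!";
		char dst1[8] = "", dst2[8] = "";
		struct iter it = { .size = sizeof(src), .idx = 0 };

		copy_data(dst1, src, &it);	/* first pass consumes the iterator */
		copy_data(dst2, src, &it);	/* bug: copies nothing, it.size is 0 */

		it.size = sizeof(src);		/* the fix: rewind before reuse */
		it.idx = 0;
		copy_data(dst2, src, &it);
		printf("dst1=%s dst2=%s\n", dst1, dst2);
		return 0;
	}
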
787From: Michal Hocko <mhocko@suse.com>
788
789kernel test robot has reported the following crash:
790[ 3.870718] BUG: unable to handle kernel NULL pointer dereferenceNULL pointer dereference at 00000100
791 at 00000100
792[ 3.872615] IP: [<c1074df6>] __queue_work+0x26/0x390 [<c1074df6>] __queue_work+0x26/0x390
793[ 3.873758] *pdpt = 0000000000000000 *pde = f000ff53f000ff53 *pde = f000ff53f000ff53
794[ 3.875096] Oops: 0000 [#1] PREEMPT PREEMPT SMP SMP
795[ 3.876130] CPU: 0 PID: 24 Comm: kworker/0:1 Not tainted 4.4.0-rc4-00139-g373ccbe #1
796[ 3.878135] Workqueue: events vmstat_shepherd
797[ 3.879207] task: cb684600 ti: cb7ba000 task.ti: cb7ba000
798[ 3.880445] EIP: 0060:[<c1074df6>] EFLAGS: 00010046 CPU: 0
799[ 3.881704] EIP is at __queue_work+0x26/0x390
800[ 3.882823] EAX: 00000046 EBX: cbb37800 ECX: cbb37800 EDX: 00000000
801[ 3.884457] ESI: 00000000 EDI: 00000000 EBP: cb7bbe68 ESP: cb7bbe38
802[ 3.886005] DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068
803[ 3.887229] CR0: 8005003b CR2: 00000100 CR3: 01fd5000 CR4: 000006b0
804[ 3.888663] Stack:
805[ 3.895204] Call Trace:
806[ 3.895854] [<c1a381dd>] ? mutex_unlock+0xd/0x10
807[ 3.897120] [<c1075221>] __queue_delayed_work+0xa1/0x160
808[ 3.898530] [<c10764c6>] queue_delayed_work_on+0x36/0x60
809[ 3.899790] [<c11494bd>] vmstat_shepherd+0xad/0xf0
810[ 3.900899] [<c1075a7a>] process_one_work+0x1aa/0x4c0
811[ 3.902093] [<c10759e2>] ? process_one_work+0x112/0x4c0
812[ 3.903520] [<c10ac31e>] ? do_raw_spin_lock+0xe/0x150
813[ 3.904853] [<c1075dd1>] worker_thread+0x41/0x440
814[ 3.906023] [<c1075d90>] ? process_one_work+0x4c0/0x4c0
815[ 3.907242] [<c107b7c0>] kthread+0xb0/0xd0
816[ 3.908188] [<c1a3c651>] ret_from_kernel_thread+0x21/0x40
817[ 3.909601] [<c107b710>] ? __kthread_parkme+0x80/0x80
818
819The reason is that start_shepherd_timer schedules the shepherd work item
820which uses vmstat_wq (vmstat_shepherd) before setup_vmstat allocates
821that workqueue so if the further initialization takes more than HZ
822we might end up scheduling on a NULL vmstat_wq. This is really unlikely
823but not impossible.
824
825Fixes: 373ccbe59270 ("mm, vmstat: allow WQ concurrency to discover memory reclaim doesn't make any progress")
826Reported-by: kernel test robot <ying.huang@linux.intel.com>
827Signed-off-by: Michal Hocko <mhocko@suse.com>
828---
829Hi Linus,
830I am not marking this for stable because I hope we can sneak it into 4.4.
831The patch is trivial and obvious. I am sorry about the breakage. If you prefer
832to postpone it to 4.5-rc1 because this is not really that critical and shouldn't
833happen most of the time then I will repost with stable tag added.
834
835Thanks!
836
837 mm/vmstat.c | 2 +-
838 1 file changed, 1 insertion(+), 1 deletion(-)
839
840diff --git a/mm/vmstat.c b/mm/vmstat.c
841index 4ebc17d948cb..c54fd2924f25 100644
842--- a/mm/vmstat.c
843+++ b/mm/vmstat.c
844@@ -1483,6 +1483,7 @@ static void __init start_shepherd_timer(void)
845 BUG();
846 cpumask_copy(cpu_stat_off, cpu_online_mask);
847
848+ vmstat_wq = alloc_workqueue("vmstat", WQ_FREEZABLE|WQ_MEM_RECLAIM, 0);
849 schedule_delayed_work(&shepherd,
850 round_jiffies_relative(sysctl_stat_interval));
851 }
852@@ -1550,7 +1551,6 @@ static int __init setup_vmstat(void)
853
854 start_shepherd_timer();
855 cpu_notifier_register_done();
856- vmstat_wq = alloc_workqueue("vmstat", WQ_FREEZABLE|WQ_MEM_RECLAIM, 0);
857 #endif
858 #ifdef CONFIG_PROC_FS
859 proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations);
860--
8612.6.4
862
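The fix is purely an ordering change: vmstat_wq has to exist before anything that can queue work on it (the shepherd) is armed. A condensed module-style sketch of the corrected ordering (names shortened, error paths trimmed; not the actual mm/vmstat.c code):

	#include <linux/module.h>
	#include <linux/workqueue.h>
	#include <linux/jiffies.h>

	static struct workqueue_struct *vmstat_wq;
	static void shepherd_fn(struct work_struct *w);
	static DECLARE_DELAYED_WORK(shepherd, shepherd_fn);

	static void shepherd_fn(struct work_struct *w)
	{
		/* would crash in __queue_work() if vmstat_wq were still NULL */
		queue_delayed_work(vmstat_wq, &shepherd, round_jiffies_relative(HZ));
	}

	static int __init demo_setup(void)
	{
		/* allocate the workqueue BEFORE arming anything that uses it */
		vmstat_wq = alloc_workqueue("vmstat", WQ_FREEZABLE | WQ_MEM_RECLAIM, 0);
		if (!vmstat_wq)
			return -ENOMEM;
		schedule_delayed_work(&shepherd, round_jiffies_relative(HZ));
		return 0;
	}

	static void __exit demo_teardown(void)
	{
		cancel_delayed_work_sync(&shepherd);
		destroy_workqueue(vmstat_wq);
	}

	module_init(demo_setup);
	module_exit(demo_teardown);
	MODULE_LICENSE("GPL");
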
863From: Dave Chinner <dchinner@redhat.com>
864
865When we do inode readahead in log recovery, we can do the
866readahead before we've replayed the icreate transaction that stamps
867the buffer with inode cores. The inode readahead verifier catches
868this and marks the buffer as !done to indicate that it doesn't yet
869contain valid inodes.
870
871In adding buffer error notification (i.e. setting b_error = -EIO at
872the same time as we clear the done flag) to such a readahead
873verifier failure, we can then get subsequent inode recovery failing
874with this error:
875
876XFS (dm-0): metadata I/O error: block 0xa00060 ("xlog_recover_do..(read#2)") error 5 numblks 32
877
878This occurs when readahead completion races with icreate item replay
879such as:
880
881 inode readahead
882 find buffer
883 lock buffer
884 submit RA io
885 ....
886 icreate recovery
887 xfs_trans_get_buffer
888 find buffer
889 lock buffer
890 <blocks on RA completion>
891 .....
892 <ra completion>
893 fails verifier
894 clear XBF_DONE
895 set bp->b_error = -EIO
896 release and unlock buffer
897 <icreate gains lock>
898 icreate initialises buffer
899 marks buffer as done
900 adds buffer to delayed write queue
901 releases buffer
902
903At this point, we have an initialised inode buffer that is up to
904date but has an -EIO state registered against it. When we finally
905get to recovering an inode in that buffer:
906
907 inode item recovery
908 xfs_trans_read_buffer
909 find buffer
910 lock buffer
911 sees XBF_DONE is set, returns buffer
912 sees bp->b_error is set
913 fail log recovery!
914
915Essentially, we need xfs_trans_get_buf_map() to clear the error status of
916the buffer when doing a lookup. This function returns uninitialised
917buffers, so the buffer returned can not be in an error state and
918none of the code that uses this function expects b_error to be set
919on return. Indeed, there is an ASSERT(!bp->b_error); in the
920transaction case in xfs_trans_get_buf_map() that would have caught
921this if log recovery used transactions....
922
923This patch firstly changes the inode readahead failure to set -EIO
924on the buffer, and secondly changes xfs_buf_get_map() to never
925return a buffer with an error state set so this first change doesn't
926cause unexpected log recovery failures.
927
928Signed-off-by: Dave Chinner <dchinner@redhat.com>
929---
930 fs/xfs/libxfs/xfs_inode_buf.c | 12 +++++++-----
931 fs/xfs/xfs_buf.c | 7 +++++++
932 2 files changed, 14 insertions(+), 5 deletions(-)
933
934diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
935index 1b8d98a..ff17c48 100644
936--- a/fs/xfs/libxfs/xfs_inode_buf.c
937+++ b/fs/xfs/libxfs/xfs_inode_buf.c
938@@ -62,11 +62,12 @@ xfs_inobp_check(
939 * has not had the inode cores stamped into it. Hence for readahead, the buffer
940 * may be potentially invalid.
941 *
942- * If the readahead buffer is invalid, we don't want to mark it with an error,
943- * but we do want to clear the DONE status of the buffer so that a followup read
944- * will re-read it from disk. This will ensure that we don't get an unnecessary
945- * warnings during log recovery and we don't get unnecssary panics on debug
946- * kernels.
947+ * If the readahead buffer is invalid, we need to mark it with an error and
948+ * clear the DONE status of the buffer so that a followup read will re-read it
949+ * from disk. We don't report the error otherwise to avoid warnings during log
950+ * recovery and we don't get unnecssary panics on debug kernels. We use EIO here
951+ * because all we want to do is say readahead failed; there is no-one to report
952+ * the error to, so this will distinguish it from a non-ra verifier failure.
953 */
954 static void
955 xfs_inode_buf_verify(
956@@ -93,6 +94,7 @@ xfs_inode_buf_verify(
957 XFS_RANDOM_ITOBP_INOTOBP))) {
958 if (readahead) {
959 bp->b_flags &= ~XBF_DONE;
960+ xfs_buf_ioerror(bp, -EIO);
961 return;
962 }
963
964diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
965index 45a8ea7..ae86b16 100644
966--- a/fs/xfs/xfs_buf.c
967+++ b/fs/xfs/xfs_buf.c
968@@ -604,6 +604,13 @@ found:
969 }
970 }
971
972+ /*
973+ * Clear b_error if this is a lookup from a caller that doesn't expect
974+ * valid data to be found in the buffer.
975+ */
976+ if (!(flags & XBF_READ))
977+ xfs_buf_ioerror(bp, 0);
978+
979 XFS_STATS_INC(xb_get);
980 trace_xfs_buf_get(bp, flags, _RET_IP_);
981 return bp;
982--
9832.5.0
984
989From: Dave Chinner <dchinner@redhat.com>
990
991When we do dquot readahead in log recovery, we do not use a verifier
992as the underlying buffer may not have dquots in it. e.g. the
993allocation operation hasn't yet been replayed. Hence we do not want
994to fail recovery because we detect an operation to be replayed has
995not been run yet. This problem was addressed for inodes in commit
996d891400 ("xfs: inode buffers may not be valid during recovery
997readahead") but the problem was not recognised to exist for dquots
998and their buffers as the dquot readahead did not have a verifier.
999
1000The result of not using a verifier is that when the buffer is then
1001next read to replay a dquot modification, the dquot buffer verifier
1002will only be attached to the buffer if *readahead is not complete*.
1003Hence we can read the buffer, replay the dquot changes and then add
1004it to the delwri submission list without it having a verifier
1005attached to it. This then generates warnings in xfs_buf_ioapply(),
1006which catches and warns about this case.
1007
1008Fix this and make it handle the same readahead verifier error cases
1009as for inode buffers by adding a new readahead verifier that has a
1010write operation as well as a read operation that marks the buffer as
1011not done if any corruption is detected. Also make sure we don't run
1012readahead if the dquot buffer has been marked as cancelled by
1013recovery.
1014
1015This will result in readahead either succeeding and the buffer
1016having a valid write verifier, or readahead failing and the buffer
1017state requiring the subsequent read to resubmit the IO with the new
1018verifier. In either case, this will result in the buffer always
1019ending up with a valid write verifier on it.
1020
1021Note: we also need to fix the inode buffer readahead error handling
1022to mark the buffer with EIO. Brian noticed during review that the code
1023I copied from there was wrong, so fix it at the same time. Add comments
1024linking the two functions that handle readahead verifier errors
1025together so we don't forget this behavioural link in future.
1026
1027cc: <stable@vger.kernel.org> # 3.12 - current
1028Signed-off-by: Dave Chinner <dchinner@redhat.com>
1029Reviewed-by: Brian Foster <bfoster@redhat.com>
1030Signed-off-by: Dave Chinner <david@fromorbit.com>
1031---
1032 fs/xfs/libxfs/xfs_dquot_buf.c | 36 ++++++++++++++++++++++++++++++------
1033 fs/xfs/libxfs/xfs_inode_buf.c | 2 ++
1034 fs/xfs/libxfs/xfs_quota_defs.h | 2 +-
1035 fs/xfs/libxfs/xfs_shared.h | 1 +
1036 fs/xfs/xfs_log_recover.c | 9 +++++++--
1037 5 files changed, 41 insertions(+), 9 deletions(-)
1038
1039diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c
1040index 11cefb2..3cc3cf7 100644
1041--- a/fs/xfs/libxfs/xfs_dquot_buf.c
1042+++ b/fs/xfs/libxfs/xfs_dquot_buf.c
1043@@ -54,7 +54,7 @@ xfs_dqcheck(
1044 xfs_dqid_t id,
1045 uint type, /* used only when IO_dorepair is true */
1046 uint flags,
1047- char *str)
1048+ const char *str)
1049 {
1050 xfs_dqblk_t *d = (xfs_dqblk_t *)ddq;
1051 int errs = 0;
1052@@ -207,7 +207,8 @@ xfs_dquot_buf_verify_crc(
1053 STATIC bool
1054 xfs_dquot_buf_verify(
1055 struct xfs_mount *mp,
1056- struct xfs_buf *bp)
1057+ struct xfs_buf *bp,
1058+ int warn)
1059 {
1060 struct xfs_dqblk *d = (struct xfs_dqblk *)bp->b_addr;
1061 xfs_dqid_t id = 0;
1062@@ -240,8 +241,7 @@ xfs_dquot_buf_verify(
1063 if (i == 0)
1064 id = be32_to_cpu(ddq->d_id);
1065
1066- error = xfs_dqcheck(mp, ddq, id + i, 0, XFS_QMOPT_DOWARN,
1067- "xfs_dquot_buf_verify");
1068+ error = xfs_dqcheck(mp, ddq, id + i, 0, warn, __func__);
1069 if (error)
1070 return false;
1071 }
1072@@ -256,7 +256,7 @@ xfs_dquot_buf_read_verify(
1073
1074 if (!xfs_dquot_buf_verify_crc(mp, bp))
1075 xfs_buf_ioerror(bp, -EFSBADCRC);
1076- else if (!xfs_dquot_buf_verify(mp, bp))
1077+ else if (!xfs_dquot_buf_verify(mp, bp, XFS_QMOPT_DOWARN))
1078 xfs_buf_ioerror(bp, -EFSCORRUPTED);
1079
1080 if (bp->b_error)
1081@@ -264,6 +264,25 @@ xfs_dquot_buf_read_verify(
1082 }
1083
1084 /*
1085+ * readahead errors are silent and simply leave the buffer as !done so a real
1086+ * read will then be run with the xfs_dquot_buf_ops verifier. See
1087+ * xfs_inode_buf_verify() for why we use EIO and ~XBF_DONE here rather than
1088+ * reporting the failure.
1089+ */
1090+static void
1091+xfs_dquot_buf_readahead_verify(
1092+ struct xfs_buf *bp)
1093+{
1094+ struct xfs_mount *mp = bp->b_target->bt_mount;
1095+
1096+ if (!xfs_dquot_buf_verify_crc(mp, bp) ||
1097+ !xfs_dquot_buf_verify(mp, bp, 0)) {
1098+ xfs_buf_ioerror(bp, -EIO);
1099+ bp->b_flags &= ~XBF_DONE;
1100+ }
1101+}
1102+
1103+/*
1104 * we don't calculate the CRC here as that is done when the dquot is flushed to
1105 * the buffer after the update is done. This ensures that the dquot in the
1106 * buffer always has an up-to-date CRC value.
1107@@ -274,7 +293,7 @@ xfs_dquot_buf_write_verify(
1108 {
1109 struct xfs_mount *mp = bp->b_target->bt_mount;
1110
1111- if (!xfs_dquot_buf_verify(mp, bp)) {
1112+ if (!xfs_dquot_buf_verify(mp, bp, XFS_QMOPT_DOWARN)) {
1113 xfs_buf_ioerror(bp, -EFSCORRUPTED);
1114 xfs_verifier_error(bp);
1115 return;
1116@@ -287,3 +306,8 @@ const struct xfs_buf_ops xfs_dquot_buf_ops = {
1117 .verify_write = xfs_dquot_buf_write_verify,
1118 };
1119
1120+const struct xfs_buf_ops xfs_dquot_buf_ra_ops = {
1121+
1122+ .verify_read = xfs_dquot_buf_readahead_verify,
1123+ .verify_write = xfs_dquot_buf_write_verify,
1124+};
1125diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
1126index ff17c48..1aabfda 100644
1127--- a/fs/xfs/libxfs/xfs_inode_buf.c
1128+++ b/fs/xfs/libxfs/xfs_inode_buf.c
1129@@ -68,6 +68,8 @@ xfs_inobp_check(
1130 * recovery and we don't get unnecssary panics on debug kernels. We use EIO here
1131 * because all we want to do is say readahead failed; there is no-one to report
1132 * the error to, so this will distinguish it from a non-ra verifier failure.
1133+ * Changes to this readahead error behavour also need to be reflected in
1134+ * xfs_dquot_buf_readahead_verify().
1135 */
1136 static void
1137 xfs_inode_buf_verify(
1138diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h
1139index 1b0a083..f51078f 100644
1140--- a/fs/xfs/libxfs/xfs_quota_defs.h
1141+++ b/fs/xfs/libxfs/xfs_quota_defs.h
1142@@ -153,7 +153,7 @@ typedef __uint16_t xfs_qwarncnt_t;
1143 #define XFS_QMOPT_RESBLK_MASK (XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_RES_RTBLKS)
1144
1145 extern int xfs_dqcheck(struct xfs_mount *mp, xfs_disk_dquot_t *ddq,
1146- xfs_dqid_t id, uint type, uint flags, char *str);
1147+ xfs_dqid_t id, uint type, uint flags, const char *str);
1148 extern int xfs_calc_dquots_per_chunk(unsigned int nbblks);
1149
1150 #endif /* __XFS_QUOTA_H__ */
1151diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
1152index 5be5297..15c3ceb 100644
1153--- a/fs/xfs/libxfs/xfs_shared.h
1154+++ b/fs/xfs/libxfs/xfs_shared.h
1155@@ -49,6 +49,7 @@ extern const struct xfs_buf_ops xfs_inobt_buf_ops;
1156 extern const struct xfs_buf_ops xfs_inode_buf_ops;
1157 extern const struct xfs_buf_ops xfs_inode_buf_ra_ops;
1158 extern const struct xfs_buf_ops xfs_dquot_buf_ops;
1159+extern const struct xfs_buf_ops xfs_dquot_buf_ra_ops;
1160 extern const struct xfs_buf_ops xfs_sb_buf_ops;
1161 extern const struct xfs_buf_ops xfs_sb_quiet_buf_ops;
1162 extern const struct xfs_buf_ops xfs_symlink_buf_ops;
1163diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
1164index c5ecaac..5991cdc 100644
1165--- a/fs/xfs/xfs_log_recover.c
1166+++ b/fs/xfs/xfs_log_recover.c
1167@@ -3204,6 +3204,7 @@ xlog_recover_dquot_ra_pass2(
1168 struct xfs_disk_dquot *recddq;
1169 struct xfs_dq_logformat *dq_f;
1170 uint type;
1171+ int len;
1172
1173
1174 if (mp->m_qflags == 0)
1175@@ -3224,8 +3225,12 @@ xlog_recover_dquot_ra_pass2(
1176 ASSERT(dq_f);
1177 ASSERT(dq_f->qlf_len == 1);
1178
1179- xfs_buf_readahead(mp->m_ddev_targp, dq_f->qlf_blkno,
1180- XFS_FSB_TO_BB(mp, dq_f->qlf_len), NULL);
1181+ len = XFS_FSB_TO_BB(mp, dq_f->qlf_len);
1182+ if (xlog_peek_buffer_cancelled(log, dq_f->qlf_blkno, len, 0))
1183+ return;
1184+
1185+ xfs_buf_readahead(mp->m_ddev_targp, dq_f->qlf_blkno, len,
1186+ &xfs_dquot_buf_ra_ops);
1187 }
1188
1189 STATIC void
1190--
11912.5.0
1192