From e1bba219cb52f8e69e8d832ef6a255cefefed95c Mon Sep 17 00:00:00 2001 From: =?utf8?q?Arkadiusz=20Mi=C5=9Bkiewicz?= Date: Wed, 18 Nov 2015 11:04:52 +0100 Subject: [PATCH] - backport mm improvements/fixes from git and one from mailing list (http://www.spinics.net/lists/linux-mm/msg96871.html) --- kernel-small_fixes.patch | 555 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 555 insertions(+) diff --git a/kernel-small_fixes.patch b/kernel-small_fixes.patch index 68d235cd..20d43776 100644 --- a/kernel-small_fixes.patch +++ b/kernel-small_fixes.patch @@ -116,3 +116,558 @@ index 29531ec..65fbfb7 100644 -- cgit v0.11.2 +commit c2d42c16ad83006a706d83e51a7268db04af733a +Author: Andrew Morton +Date: Thu Nov 5 18:48:43 2015 -0800 + + mm/vmstat.c: uninline node_page_state() + + With x86_64 (config http://ozlabs.org/~akpm/config-akpm2.txt) and old gcc + (4.4.4), drivers/base/node.c:node_read_meminfo() is using 2344 bytes of + stack. Uninlining node_page_state() reduces this to 440 bytes. + + The stack consumption issue is fixed by newer gcc (4.8.4) however with + that compiler this patch reduces the node.o text size from 7314 bytes to + 4578. + + Signed-off-by: Andrew Morton + Signed-off-by: Linus Torvalds + +diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h +index 82e7db7..49dfe40 100644 +--- a/include/linux/vmstat.h ++++ b/include/linux/vmstat.h +@@ -161,30 +161,8 @@ static inline unsigned long zone_page_state_snapshot(struct zone *zone, + } + + #ifdef CONFIG_NUMA +-/* +- * Determine the per node value of a stat item. This function +- * is called frequently in a NUMA machine, so try to be as +- * frugal as possible. +- */ +-static inline unsigned long node_page_state(int node, +- enum zone_stat_item item) +-{ +- struct zone *zones = NODE_DATA(node)->node_zones; +- +- return +-#ifdef CONFIG_ZONE_DMA +- zone_page_state(&zones[ZONE_DMA], item) + +-#endif +-#ifdef CONFIG_ZONE_DMA32 +- zone_page_state(&zones[ZONE_DMA32], item) + +-#endif +-#ifdef CONFIG_HIGHMEM +- zone_page_state(&zones[ZONE_HIGHMEM], item) + +-#endif +- zone_page_state(&zones[ZONE_NORMAL], item) + +- zone_page_state(&zones[ZONE_MOVABLE], item); +-} + ++extern unsigned long node_page_state(int node, enum zone_stat_item item); + extern void zone_statistics(struct zone *, struct zone *, gfp_t gfp); + + #else +diff --git a/mm/vmstat.c b/mm/vmstat.c +index fbf1448..ffcb4f5 100644 +--- a/mm/vmstat.c ++++ b/mm/vmstat.c +@@ -591,6 +591,28 @@ void zone_statistics(struct zone *preferred_zone, struct zone *z, gfp_t flags) + else + __inc_zone_state(z, NUMA_OTHER); + } ++ ++/* ++ * Determine the per node value of a stat item. ++ */ ++unsigned long node_page_state(int node, enum zone_stat_item item) ++{ ++ struct zone *zones = NODE_DATA(node)->node_zones; ++ ++ return ++#ifdef CONFIG_ZONE_DMA ++ zone_page_state(&zones[ZONE_DMA], item) + ++#endif ++#ifdef CONFIG_ZONE_DMA32 ++ zone_page_state(&zones[ZONE_DMA32], item) + ++#endif ++#ifdef CONFIG_HIGHMEM ++ zone_page_state(&zones[ZONE_HIGHMEM], item) + ++#endif ++ zone_page_state(&zones[ZONE_NORMAL], item) + ++ zone_page_state(&zones[ZONE_MOVABLE], item); ++} ++ + #endif + + #ifdef CONFIG_COMPACTION +commit 016c13daa5c9e4827eca703e2f0621c131f2cca3 +Author: Mel Gorman +Date: Fri Nov 6 16:28:18 2015 -0800 + + mm, page_alloc: use masks and shifts when converting GFP flags to migrate types + + This patch redefines which GFP bits are used for specifying mobility and + the order of the migrate types. 
Once redefined it's possible to convert + GFP flags to a migrate type with a simple mask and shift. The only + downside is that readers of OOM kill messages and allocation failures may + have been used to the existing values but scripts/gfp-translate will help. + + Signed-off-by: Mel Gorman + Acked-by: Vlastimil Babka + Cc: Christoph Lameter + Cc: David Rientjes + Cc: Johannes Weiner + Cc: Michal Hocko + Cc: Vitaly Wool + Cc: Rik van Riel + Signed-off-by: Andrew Morton + Signed-off-by: Linus Torvalds + +diff --git a/include/linux/gfp.h b/include/linux/gfp.h +index f92cbd2..440fca3 100644 +--- a/include/linux/gfp.h ++++ b/include/linux/gfp.h +@@ -14,7 +14,7 @@ struct vm_area_struct; + #define ___GFP_HIGHMEM 0x02u + #define ___GFP_DMA32 0x04u + #define ___GFP_MOVABLE 0x08u +-#define ___GFP_WAIT 0x10u ++#define ___GFP_RECLAIMABLE 0x10u + #define ___GFP_HIGH 0x20u + #define ___GFP_IO 0x40u + #define ___GFP_FS 0x80u +@@ -29,7 +29,7 @@ struct vm_area_struct; + #define ___GFP_NOMEMALLOC 0x10000u + #define ___GFP_HARDWALL 0x20000u + #define ___GFP_THISNODE 0x40000u +-#define ___GFP_RECLAIMABLE 0x80000u ++#define ___GFP_WAIT 0x80000u + #define ___GFP_NOACCOUNT 0x100000u + #define ___GFP_NOTRACK 0x200000u + #define ___GFP_NO_KSWAPD 0x400000u +@@ -126,6 +126,7 @@ struct vm_area_struct; + + /* This mask makes up all the page movable related flags */ + #define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE) ++#define GFP_MOVABLE_SHIFT 3 + + /* Control page allocator reclaim behavior */ + #define GFP_RECLAIM_MASK (__GFP_WAIT|__GFP_HIGH|__GFP_IO|__GFP_FS|\ +@@ -152,14 +153,15 @@ struct vm_area_struct; + /* Convert GFP flags to their corresponding migrate type */ + static inline int gfpflags_to_migratetype(const gfp_t gfp_flags) + { +- WARN_ON((gfp_flags & GFP_MOVABLE_MASK) == GFP_MOVABLE_MASK); ++ VM_WARN_ON((gfp_flags & GFP_MOVABLE_MASK) == GFP_MOVABLE_MASK); ++ BUILD_BUG_ON((1UL << GFP_MOVABLE_SHIFT) != ___GFP_MOVABLE); ++ BUILD_BUG_ON((___GFP_MOVABLE >> GFP_MOVABLE_SHIFT) != MIGRATE_MOVABLE); + + if (unlikely(page_group_by_mobility_disabled)) + return MIGRATE_UNMOVABLE; + + /* Group based on mobility */ +- return (((gfp_flags & __GFP_MOVABLE) != 0) << 1) | +- ((gfp_flags & __GFP_RECLAIMABLE) != 0); ++ return (gfp_flags & GFP_MOVABLE_MASK) >> GFP_MOVABLE_SHIFT; + } + + #ifdef CONFIG_HIGHMEM +diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h +index e326843..38bed71 100644 +--- a/include/linux/mmzone.h ++++ b/include/linux/mmzone.h +@@ -37,8 +37,8 @@ + + enum { + MIGRATE_UNMOVABLE, +- MIGRATE_RECLAIMABLE, + MIGRATE_MOVABLE, ++ MIGRATE_RECLAIMABLE, + MIGRATE_PCPTYPES, /* the number of types on the pcp lists */ + MIGRATE_RESERVE = MIGRATE_PCPTYPES, + #ifdef CONFIG_CMA +commit 974a786e63c96a2401a78ddba926f34c128474f1 +Author: Mel Gorman +Date: Fri Nov 6 16:28:34 2015 -0800 + + mm, page_alloc: remove MIGRATE_RESERVE + + MIGRATE_RESERVE preserves an old property of the buddy allocator that + existed prior to fragmentation avoidance -- min_free_kbytes worth of pages + tended to remain contiguous until the only alternative was to fail the + allocation. At the time it was discovered that high-order atomic + allocations relied on this property so MIGRATE_RESERVE was introduced. A + later patch will introduce an alternative MIGRATE_HIGHATOMIC so this patch + deletes MIGRATE_RESERVE and supporting code so it'll be easier to review. + Note that this patch in isolation may look like a false regression if + someone was bisecting high-order atomic allocation failures. 
+ + Signed-off-by: Mel Gorman + Acked-by: Vlastimil Babka + Cc: Christoph Lameter + Cc: David Rientjes + Cc: Johannes Weiner + Cc: Michal Hocko + Cc: Vitaly Wool + Cc: Rik van Riel + Signed-off-by: Andrew Morton + Signed-off-by: Linus Torvalds + +diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h +index 1e88aae..b86cfa3 100644 +--- a/include/linux/mmzone.h ++++ b/include/linux/mmzone.h +@@ -39,8 +39,6 @@ enum { + MIGRATE_UNMOVABLE, + MIGRATE_MOVABLE, + MIGRATE_RECLAIMABLE, +- MIGRATE_PCPTYPES, /* the number of types on the pcp lists */ +- MIGRATE_RESERVE = MIGRATE_PCPTYPES, + #ifdef CONFIG_CMA + /* + * MIGRATE_CMA migration type is designed to mimic the way +@@ -63,6 +61,8 @@ enum { + MIGRATE_TYPES + }; + ++#define MIGRATE_PCPTYPES (MIGRATE_RECLAIMABLE+1) ++ + #ifdef CONFIG_CMA + # define is_migrate_cma(migratetype) unlikely((migratetype) == MIGRATE_CMA) + #else +@@ -429,12 +429,6 @@ struct zone { + + const char *name; + +- /* +- * Number of MIGRATE_RESERVE page block. To maintain for just +- * optimization. Protected by zone->lock. +- */ +- int nr_migrate_reserve_block; +- + #ifdef CONFIG_MEMORY_ISOLATION + /* + * Number of isolated pageblock. It is used to solve incorrect +diff --git a/mm/huge_memory.c b/mm/huge_memory.c +index 9812d46..dabd247 100644 +--- a/mm/huge_memory.c ++++ b/mm/huge_memory.c +@@ -116,7 +116,7 @@ static void set_recommended_min_free_kbytes(void) + for_each_populated_zone(zone) + nr_zones++; + +- /* Make sure at least 2 hugepages are free for MIGRATE_RESERVE */ ++ /* Ensure 2 pageblocks are free to assist fragmentation avoidance */ + recommended_min = pageblock_nr_pages * nr_zones * 2; + + /* +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index 8dc6e3c..5888126 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -817,7 +817,6 @@ static void free_pcppages_bulk(struct zone *zone, int count, + if (unlikely(has_isolate_pageblock(zone))) + mt = get_pageblock_migratetype(page); + +- /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ + __free_one_page(page, page_to_pfn(page), zone, 0, mt); + trace_mm_page_pcpu_drain(page, 0, mt); + } while (--to_free && --batch_free && !list_empty(list)); +@@ -1417,15 +1416,14 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, + * the free lists for the desirable migrate type are depleted + */ + static int fallbacks[MIGRATE_TYPES][4] = { +- [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, +- [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, +- [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, ++ [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_TYPES }, ++ [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_TYPES }, ++ [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES }, + #ifdef CONFIG_CMA +- [MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */ ++ [MIGRATE_CMA] = { MIGRATE_TYPES }, /* Never used */ + #endif +- [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */ + #ifdef CONFIG_MEMORY_ISOLATION +- [MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */ ++ [MIGRATE_ISOLATE] = { MIGRATE_TYPES }, /* Never used */ + #endif + }; + +@@ -1598,7 +1596,7 @@ int find_suitable_fallback(struct free_area *area, unsigned int order, + *can_steal = false; + for (i = 0;; i++) { + fallback_mt = fallbacks[migratetype][i]; +- if (fallback_mt == MIGRATE_RESERVE) ++ if (fallback_mt == MIGRATE_TYPES) + break; + + if 
(list_empty(&area->free_list[fallback_mt])) +@@ -1676,25 +1674,13 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order, + { + struct page *page; + +-retry_reserve: + page = __rmqueue_smallest(zone, order, migratetype); +- +- if (unlikely(!page) && migratetype != MIGRATE_RESERVE) { ++ if (unlikely(!page)) { + if (migratetype == MIGRATE_MOVABLE) + page = __rmqueue_cma_fallback(zone, order); + + if (!page) + page = __rmqueue_fallback(zone, order, migratetype); +- +- /* +- * Use MIGRATE_RESERVE rather than fail an allocation. goto +- * is used because __rmqueue_smallest is an inline function +- * and we want just one call site +- */ +- if (!page) { +- migratetype = MIGRATE_RESERVE; +- goto retry_reserve; +- } + } + + trace_mm_page_alloc_zone_locked(page, order, migratetype); +@@ -3492,7 +3478,6 @@ static void show_migration_types(unsigned char type) + [MIGRATE_UNMOVABLE] = 'U', + [MIGRATE_RECLAIMABLE] = 'E', + [MIGRATE_MOVABLE] = 'M', +- [MIGRATE_RESERVE] = 'R', + #ifdef CONFIG_CMA + [MIGRATE_CMA] = 'C', + #endif +@@ -4303,120 +4288,6 @@ static inline unsigned long wait_table_bits(unsigned long size) + } + + /* +- * Check if a pageblock contains reserved pages +- */ +-static int pageblock_is_reserved(unsigned long start_pfn, unsigned long end_pfn) +-{ +- unsigned long pfn; +- +- for (pfn = start_pfn; pfn < end_pfn; pfn++) { +- if (!pfn_valid_within(pfn) || PageReserved(pfn_to_page(pfn))) +- return 1; +- } +- return 0; +-} +- +-/* +- * Mark a number of pageblocks as MIGRATE_RESERVE. The number +- * of blocks reserved is based on min_wmark_pages(zone). The memory within +- * the reserve will tend to store contiguous free pages. Setting min_free_kbytes +- * higher will lead to a bigger reserve which will get freed as contiguous +- * blocks as reclaim kicks in +- */ +-static void setup_zone_migrate_reserve(struct zone *zone) +-{ +- unsigned long start_pfn, pfn, end_pfn, block_end_pfn; +- struct page *page; +- unsigned long block_migratetype; +- int reserve; +- int old_reserve; +- +- /* +- * Get the start pfn, end pfn and the number of blocks to reserve +- * We have to be careful to be aligned to pageblock_nr_pages to +- * make sure that we always check pfn_valid for the first page in +- * the block. +- */ +- start_pfn = zone->zone_start_pfn; +- end_pfn = zone_end_pfn(zone); +- start_pfn = roundup(start_pfn, pageblock_nr_pages); +- reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >> +- pageblock_order; +- +- /* +- * Reserve blocks are generally in place to help high-order atomic +- * allocations that are short-lived. A min_free_kbytes value that +- * would result in more than 2 reserve blocks for atomic allocations +- * is assumed to be in place to help anti-fragmentation for the +- * future allocation of hugepages at runtime. 
+- */ +- reserve = min(2, reserve); +- old_reserve = zone->nr_migrate_reserve_block; +- +- /* When memory hot-add, we almost always need to do nothing */ +- if (reserve == old_reserve) +- return; +- zone->nr_migrate_reserve_block = reserve; +- +- for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { +- if (!early_page_nid_uninitialised(pfn, zone_to_nid(zone))) +- return; +- +- if (!pfn_valid(pfn)) +- continue; +- page = pfn_to_page(pfn); +- +- /* Watch out for overlapping nodes */ +- if (page_to_nid(page) != zone_to_nid(zone)) +- continue; +- +- block_migratetype = get_pageblock_migratetype(page); +- +- /* Only test what is necessary when the reserves are not met */ +- if (reserve > 0) { +- /* +- * Blocks with reserved pages will never free, skip +- * them. +- */ +- block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn); +- if (pageblock_is_reserved(pfn, block_end_pfn)) +- continue; +- +- /* If this block is reserved, account for it */ +- if (block_migratetype == MIGRATE_RESERVE) { +- reserve--; +- continue; +- } +- +- /* Suitable for reserving if this block is movable */ +- if (block_migratetype == MIGRATE_MOVABLE) { +- set_pageblock_migratetype(page, +- MIGRATE_RESERVE); +- move_freepages_block(zone, page, +- MIGRATE_RESERVE); +- reserve--; +- continue; +- } +- } else if (!old_reserve) { +- /* +- * At boot time we don't need to scan the whole zone +- * for turning off MIGRATE_RESERVE. +- */ +- break; +- } +- +- /* +- * If the reserve is met and this is a previous reserved block, +- * take it back +- */ +- if (block_migratetype == MIGRATE_RESERVE) { +- set_pageblock_migratetype(page, MIGRATE_MOVABLE); +- move_freepages_block(zone, page, MIGRATE_MOVABLE); +- } +- } +-} +- +-/* + * Initially all pages are reserved - free ones are freed + * up by free_all_bootmem() once the early boot process is + * done. Non-atomic initialization, single-pass. +@@ -4455,9 +4326,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, + * movable at startup. This will force kernel allocations + * to reserve their blocks rather than leaking throughout + * the address space during boot when many long-lived +- * kernel allocations are made. Later some blocks near +- * the start are marked MIGRATE_RESERVE by +- * setup_zone_migrate_reserve() ++ * kernel allocations are made. + * + * bitmap is created for zone's valid pfn range. but memmap + * can be created for invalid pages (for alignment) +@@ -6018,7 +5887,6 @@ static void __setup_per_zone_wmarks(void) + high_wmark_pages(zone) - low_wmark_pages(zone) - + atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); + +- setup_zone_migrate_reserve(zone); + spin_unlock_irqrestore(&zone->lock, flags); + } + +diff --git a/mm/vmstat.c b/mm/vmstat.c +index ffcb4f5..5b289dc 100644 +--- a/mm/vmstat.c ++++ b/mm/vmstat.c +@@ -923,7 +923,6 @@ static char * const migratetype_names[MIGRATE_TYPES] = { + "Unmovable", + "Reclaimable", + "Movable", +- "Reserve", + #ifdef CONFIG_CMA + "CMA", + #endif +diff --git a/mm/backing-dev.c b/mm/backing-dev.c +index 8ed2ffd963c5..7340353f8aea 100644 +--- a/mm/backing-dev.c ++++ b/mm/backing-dev.c +@@ -957,8 +957,9 @@ EXPORT_SYMBOL(congestion_wait); + * jiffies for either a BDI to exit congestion of the given @sync queue + * or a write to complete. + * +- * In the absence of zone congestion, cond_resched() is called to yield +- * the processor if necessary but otherwise does not sleep. 
++ * In the absence of zone congestion, a short sleep or a cond_resched is ++ * performed to yield the processor and to allow other subsystems to make ++ * a forward progress. + * + * The return value is 0 if the sleep is for the full timeout. Otherwise, + * it is the number of jiffies that were still remaining when the function +@@ -978,7 +979,19 @@ long wait_iff_congested(struct zone *zone, int sync, long timeout) + */ + if (atomic_read(&nr_wb_congested[sync]) == 0 || + !test_bit(ZONE_CONGESTED, &zone->flags)) { +- cond_resched(); ++ ++ /* ++ * Memory allocation/reclaim might be called from a WQ ++ * context and the current implementation of the WQ ++ * concurrency control doesn't recognize that a particular ++ * WQ is congested if the worker thread is looping without ++ * ever sleeping. Therefore we have to do a short sleep ++ * here rather than calling cond_resched(). ++ */ ++ if (current->flags & PF_WQ_WORKER) ++ schedule_timeout(1); ++ else ++ cond_resched(); + + /* In case we scheduled, work out time remaining */ + ret = timeout - (jiffies - start); +diff --git a/mm/vmstat.c b/mm/vmstat.c +index 45dcbcb5c594..0975da8e3432 100644 +--- a/mm/vmstat.c ++++ b/mm/vmstat.c +@@ -1381,6 +1381,7 @@ static const struct file_operations proc_vmstat_file_operations = { + #endif /* CONFIG_PROC_FS */ + + #ifdef CONFIG_SMP ++static struct workqueue_struct *vmstat_wq; + static DEFINE_PER_CPU(struct delayed_work, vmstat_work); + int sysctl_stat_interval __read_mostly = HZ; + static cpumask_var_t cpu_stat_off; +@@ -1393,7 +1394,7 @@ static void vmstat_update(struct work_struct *w) + * to occur in the future. Keep on running the + * update worker thread. + */ +- schedule_delayed_work_on(smp_processor_id(), ++ queue_delayed_work_on(smp_processor_id(), vmstat_wq, + this_cpu_ptr(&vmstat_work), + round_jiffies_relative(sysctl_stat_interval)); + } else { +@@ -1462,7 +1463,7 @@ static void vmstat_shepherd(struct work_struct *w) + if (need_update(cpu) && + cpumask_test_and_clear_cpu(cpu, cpu_stat_off)) + +- schedule_delayed_work_on(cpu, ++ queue_delayed_work_on(cpu, vmstat_wq, + &per_cpu(vmstat_work, cpu), 0); + + put_online_cpus(); +@@ -1551,6 +1552,7 @@ static int __init setup_vmstat(void) + + start_shepherd_timer(); + cpu_notifier_register_done(); ++ vmstat_wq = alloc_workqueue("vmstat", WQ_FREEZABLE|WQ_MEM_RECLAIM, 0); + #endif + #ifdef CONFIG_PROC_FS + proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations); +-- +2.6.2 + + -- 2.44.0
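
For readers skimming the gfpflags_to_migratetype() change in commit 016c13daa5c9 above, the following is a minimal userspace sketch of the mask-and-shift conversion it introduces. The flag values, mask, shift and migrate-type order are taken from the patched gfp.h/mmzone.h hunks; everything else here (the standalone program, the assert standing in for VM_WARN_ON) is illustrative only and is not part of the kernel change.

/*
 * Illustrative, userspace-only sketch of the mask-and-shift conversion
 * from commit 016c13daa5c9.  The macro and enum names mirror the kernel
 * patch for readability; the program itself is hypothetical and only
 * demonstrates the arithmetic.
 */
#include <assert.h>
#include <stdio.h>

#define ___GFP_MOVABLE      0x08u   /* bit 3, as in the patched gfp.h */
#define ___GFP_RECLAIMABLE  0x10u   /* bit 4, as in the patched gfp.h */
#define GFP_MOVABLE_MASK    (___GFP_RECLAIMABLE | ___GFP_MOVABLE)
#define GFP_MOVABLE_SHIFT   3

enum { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RECLAIMABLE };

static int gfpflags_to_migratetype(unsigned int gfp_flags)
{
	/* Both bits set at once is a caller bug, as the VM_WARN_ON in the patch notes. */
	assert((gfp_flags & GFP_MOVABLE_MASK) != GFP_MOVABLE_MASK);
	return (gfp_flags & GFP_MOVABLE_MASK) >> GFP_MOVABLE_SHIFT;
}

int main(void)
{
	printf("%d %d %d\n",
	       gfpflags_to_migratetype(0),                   /* 0: MIGRATE_UNMOVABLE   */
	       gfpflags_to_migratetype(___GFP_MOVABLE),      /* 1: MIGRATE_MOVABLE     */
	       gfpflags_to_migratetype(___GFP_RECLAIMABLE)); /* 2: MIGRATE_RECLAIMABLE */
	return 0;
}

Reordering MIGRATE_MOVABLE ahead of MIGRATE_RECLAIMABLE in mmzone.h is what makes the shift line up: bit 3 (___GFP_MOVABLE) maps to 1 and bit 4 (___GFP_RECLAIMABLE) maps to 2, so the conversion needs no branching, which is exactly what the BUILD_BUG_ON checks in the patch enforce.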