- vmstat fix

[packages/kernel.git] / kernel-small_fixes.patch
diff --git a/kernel-small_fixes.patch b/kernel-small_fixes.patch

index 77e162d0df7b8fdc4f4000caa1c2a20a6daea9a1..e4cb3a85e6eea833f9b9de084cb8566add02973c 100644 (file)
--- a/kernel-small_fixes.patch
+++ b/kernel-small_fixes.patch
@@ -11,103 +11,6 @@
   
   /* Some toolchains use a `_' prefix for all user symbols. */
  
-From: Stephen Hemminger <shemminger@vyatta.com>
-
-Some BIOS's don't setup power management correctly (what else is
-new) and don't allow use of PCI Express power control. Add a special
-exception module parameter to allow working around this issue.
-Based on slightly different patch by Knut Petersen.
-
-Reported-by: Arkadiusz Miskiewicz <arekm@maven.pl>
-Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
----
-Patch against -net (ie. 3.3.0)
-
---- a/drivers/net/ethernet/marvell/sky2.c      2012-01-10 10:56:56.855156017 -0800
-+++ b/drivers/net/ethernet/marvell/sky2.c      2012-03-21 08:25:52.400929532 -0700
-@@ -95,6 +95,10 @@ static int disable_msi = 0;
- module_param(disable_msi, int, 0);
- MODULE_PARM_DESC(disable_msi, "Disable Message Signaled Interrupt (MSI)");
- 
-+static int legacy_pme = 0;
-+module_param(legacy_pme, int, 0);
-+MODULE_PARM_DESC(legacy_pme, "Legacy power management");
-+
- static DEFINE_PCI_DEVICE_TABLE(sky2_id_table) = {
-       { PCI_DEVICE(PCI_VENDOR_ID_SYSKONNECT, 0x9000) }, /* SK-9Sxx */
-       { PCI_DEVICE(PCI_VENDOR_ID_SYSKONNECT, 0x9E00) }, /* SK-9Exx */
-@@ -867,6 +871,13 @@ static void sky2_wol_init(struct sky2_po
-       /* Disable PiG firmware */
-       sky2_write16(hw, B0_CTST, Y2_HW_WOL_OFF);
- 
-+      /* Needed by some broken BIOSes, use PCI rather than PCI-e for WOL */
-+      if (legacy_pme) {
-+              u32 reg1 = sky2_pci_read32(hw, PCI_DEV_REG1);
-+              reg1 |= PCI_Y2_PME_LEGACY;
-+              sky2_pci_write32(hw, PCI_DEV_REG1, reg1);
-+      }
-+
-       /* block receiver */
-       sky2_write8(hw, SK_REG(port, RX_GMF_CTRL_T), GMF_RST_SET);
-       sky2_read32(hw, B0_CTST);
-
-
-The RCU problem is likely to be a separate issue.  It might even be a 
-result of the use-after-free problem with the elevator.
-
-At any rate, it's clear that the crash in the refcounting log you
-posted occurred because scsi_setup_blk_pc_cmnd() called
-scsi_prep_state_check(), which tried to dereference the NULL pointer.
-
-Would you like to try this patch to see if it fixes the problem?  As I 
-said before, I'm not certain it's the best thing to do, but it worked 
-on my system.
-
-Alan Stern
-
-
-Index: usb-3.0/drivers/scsi/scsi_lib.c
-===================================================================
---- usb-3.0.orig/drivers/scsi/scsi_lib.c
-+++ usb-3.0/drivers/scsi/scsi_lib.c
-@@ -1247,6 +1247,8 @@ int scsi_prep_fn(struct request_queue *q
-       struct scsi_device *sdev = q->queuedata;
-       int ret = BLKPREP_KILL;
- 
-+      if (!sdev)
-+              return ret;
-       if (req->cmd_type == REQ_TYPE_BLOCK_PC)
-               ret = scsi_setup_blk_pc_cmnd(sdev, req);
-       return scsi_prep_return(q, req, ret);
-Index: usb-3.0/drivers/scsi/scsi_sysfs.c
-===================================================================
---- usb-3.0.orig/drivers/scsi/scsi_sysfs.c
-+++ usb-3.0/drivers/scsi/scsi_sysfs.c
-@@ -322,6 +322,8 @@ static void scsi_device_dev_release_user
-               kfree(evt);
-       }
- 
-+      /* Freeing the queue signals to block that we're done */
-+      scsi_free_queue(sdev->request_queue);
-       blk_put_queue(sdev->request_queue);
-       /* NULL queue means the device can't be used */
-       sdev->request_queue = NULL;
-@@ -936,8 +938,6 @@ void __scsi_remove_device(struct scsi_de
-       /* cause the request function to reject all I/O requests */
-       sdev->request_queue->queuedata = NULL;
- 
--      /* Freeing the queue signals to block that we're done */
--      scsi_free_queue(sdev->request_queue);
-       put_device(dev);
- }
- 
-
-
---
-To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
-the body of a message to majordomo@vger.kernel.org
-More majordomo info at  http://vger.kernel.org/majordomo-info.html
-Please read the FAQ at  http://www.tux.org/lkml/
  --- linux-3.0/scripts/kconfig/lxdialog/check-lxdialog.sh~      2011-07-22 04:17:23.000000000 +0200
  +++ linux-3.0/scripts/kconfig/lxdialog/check-lxdialog.sh       2011-08-25 21:26:04.799150642 +0200
  @@ -9,6 +9,12 @@
@@ -124,6 +27,1067 @@ Please read the FAQ at  http://www.tux.org/lkml/
                         fi
                 done
  
+From 7a29ac474a47eb8cf212b45917683ae89d6fa13b Mon Sep 17 00:00:00 2001
+From: Chris Mason <clm@fb.com>
+Date: Tue, 10 Nov 2015 10:10:34 +1100
+Subject: xfs: give all workqueues rescuer threads
+
+We're consistently hitting deadlocks here with XFS on recent kernels.
+After some digging through the crash files, it looks like everyone in
+the system is waiting for XFS to reclaim memory.
+
+Something like this:
+
+PID: 2733434  TASK: ffff8808cd242800  CPU: 19  COMMAND: "java"
+ #0 [ffff880019c53588] __schedule at ffffffff818c4df2
+ #1 [ffff880019c535d8] schedule at ffffffff818c5517
+ #2 [ffff880019c535f8] _xfs_log_force_lsn at ffffffff81316348
+ #3 [ffff880019c53688] xfs_log_force_lsn at ffffffff813164fb
+ #4 [ffff880019c536b8] xfs_iunpin_wait at ffffffff8130835e
+ #5 [ffff880019c53728] xfs_reclaim_inode at ffffffff812fd453
+ #6 [ffff880019c53778] xfs_reclaim_inodes_ag at ffffffff812fd8c7
+ #7 [ffff880019c53928] xfs_reclaim_inodes_nr at ffffffff812fe433
+ #8 [ffff880019c53958] xfs_fs_free_cached_objects at ffffffff8130d3b9
+ #9 [ffff880019c53968] super_cache_scan at ffffffff811a6f73
+#10 [ffff880019c539c8] shrink_slab at ffffffff811460e6
+#11 [ffff880019c53aa8] shrink_zone at ffffffff8114a53f
+#12 [ffff880019c53b48] do_try_to_free_pages at ffffffff8114a8ba
+#13 [ffff880019c53be8] try_to_free_pages at ffffffff8114ad5a
+#14 [ffff880019c53c78] __alloc_pages_nodemask at ffffffff8113e1b8
+#15 [ffff880019c53d88] alloc_kmem_pages_node at ffffffff8113e671
+#16 [ffff880019c53dd8] copy_process at ffffffff8104f781
+#17 [ffff880019c53ec8] do_fork at ffffffff8105129c
+#18 [ffff880019c53f38] sys_clone at ffffffff810515b6
+#19 [ffff880019c53f48] stub_clone at ffffffff818c8e4d
+
+xfs_log_force_lsn is waiting for logs to get cleaned, which is waiting
+for IO, which is waiting for workers to complete the IO which is waiting
+for worker threads that don't exist yet:
+
+PID: 2752451  TASK: ffff880bd6bdda00  CPU: 37  COMMAND: "kworker/37:1"
+ #0 [ffff8808d20abbb0] __schedule at ffffffff818c4df2
+ #1 [ffff8808d20abc00] schedule at ffffffff818c5517
+ #2 [ffff8808d20abc20] schedule_timeout at ffffffff818c7c6c
+ #3 [ffff8808d20abcc0] wait_for_completion_killable at ffffffff818c6495
+ #4 [ffff8808d20abd30] kthread_create_on_node at ffffffff8106ec82
+ #5 [ffff8808d20abdf0] create_worker at ffffffff8106752f
+ #6 [ffff8808d20abe40] worker_thread at ffffffff810699be
+ #7 [ffff8808d20abec0] kthread at ffffffff8106ef59
+ #8 [ffff8808d20abf50] ret_from_fork at ffffffff818c8ac8
+
+I think we should be using WQ_MEM_RECLAIM to make sure this thread
+pool makes progress when we're not able to allocate new workers.
+
+[dchinner: make all workqueues WQ_MEM_RECLAIM]
+
+Signed-off-by: Chris Mason <clm@fb.com>
+Reviewed-by: Dave Chinner <dchinner@redhat.com>
+Signed-off-by: Dave Chinner <david@fromorbit.com>
+---
+ fs/xfs/xfs_super.c | 7 ++++---
+ 1 file changed, 4 insertions(+), 3 deletions(-)
+
+diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
+index 29531ec..65fbfb7 100644
+--- a/fs/xfs/xfs_super.c
++++ b/fs/xfs/xfs_super.c
+@@ -838,17 +838,18 @@ xfs_init_mount_workqueues(
+               goto out_destroy_unwritten;
+ 
+       mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s",
+-                      WQ_FREEZABLE, 0, mp->m_fsname);
++                      WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname);
+       if (!mp->m_reclaim_workqueue)
+               goto out_destroy_cil;
+ 
+       mp->m_log_workqueue = alloc_workqueue("xfs-log/%s",
+-                      WQ_FREEZABLE|WQ_HIGHPRI, 0, mp->m_fsname);
++                      WQ_MEM_RECLAIM|WQ_FREEZABLE|WQ_HIGHPRI, 0,
++                      mp->m_fsname);
+       if (!mp->m_log_workqueue)
+               goto out_destroy_reclaim;
+ 
+       mp->m_eofblocks_workqueue = alloc_workqueue("xfs-eofblocks/%s",
+-                      WQ_FREEZABLE, 0, mp->m_fsname);
++                      WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname);
+       if (!mp->m_eofblocks_workqueue)
+               goto out_destroy_log;
+ 
+-- 
+cgit v0.11.2
+
+commit c2d42c16ad83006a706d83e51a7268db04af733a
+Author: Andrew Morton <akpm@linux-foundation.org>
+Date:   Thu Nov 5 18:48:43 2015 -0800
+
+    mm/vmstat.c: uninline node_page_state()
+    
+    With x86_64 (config http://ozlabs.org/~akpm/config-akpm2.txt) and old gcc
+    (4.4.4), drivers/base/node.c:node_read_meminfo() is using 2344 bytes of
+    stack.  Uninlining node_page_state() reduces this to 440 bytes.
+    
+    The stack consumption issue is fixed by newer gcc (4.8.4) however with
+    that compiler this patch reduces the node.o text size from 7314 bytes to
+    4578.
+    
+    Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+    Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+
+diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
+index 82e7db7..49dfe40 100644
+--- a/include/linux/vmstat.h
++++ b/include/linux/vmstat.h
+@@ -161,30 +161,8 @@ static inline unsigned long zone_page_state_snapshot(struct zone *zone,
+ }
+ 
+ #ifdef CONFIG_NUMA
+-/*
+- * Determine the per node value of a stat item. This function
+- * is called frequently in a NUMA machine, so try to be as
+- * frugal as possible.
+- */
+-static inline unsigned long node_page_state(int node,
+-                               enum zone_stat_item item)
+-{
+-      struct zone *zones = NODE_DATA(node)->node_zones;
+-
+-      return
+-#ifdef CONFIG_ZONE_DMA
+-              zone_page_state(&zones[ZONE_DMA], item) +
+-#endif
+-#ifdef CONFIG_ZONE_DMA32
+-              zone_page_state(&zones[ZONE_DMA32], item) +
+-#endif
+-#ifdef CONFIG_HIGHMEM
+-              zone_page_state(&zones[ZONE_HIGHMEM], item) +
+-#endif
+-              zone_page_state(&zones[ZONE_NORMAL], item) +
+-              zone_page_state(&zones[ZONE_MOVABLE], item);
+-}
+ 
++extern unsigned long node_page_state(int node, enum zone_stat_item item);
+ extern void zone_statistics(struct zone *, struct zone *, gfp_t gfp);
+ 
+ #else
+diff --git a/mm/vmstat.c b/mm/vmstat.c
+index fbf1448..ffcb4f5 100644
+--- a/mm/vmstat.c
++++ b/mm/vmstat.c
+@@ -591,6 +591,28 @@ void zone_statistics(struct zone *preferred_zone, struct zone *z, gfp_t flags)
+       else
+               __inc_zone_state(z, NUMA_OTHER);
+ }
++
++/*
++ * Determine the per node value of a stat item.
++ */
++unsigned long node_page_state(int node, enum zone_stat_item item)
++{
++      struct zone *zones = NODE_DATA(node)->node_zones;
++
++      return
++#ifdef CONFIG_ZONE_DMA
++              zone_page_state(&zones[ZONE_DMA], item) +
++#endif
++#ifdef CONFIG_ZONE_DMA32
++              zone_page_state(&zones[ZONE_DMA32], item) +
++#endif
++#ifdef CONFIG_HIGHMEM
++              zone_page_state(&zones[ZONE_HIGHMEM], item) +
++#endif
++              zone_page_state(&zones[ZONE_NORMAL], item) +
++              zone_page_state(&zones[ZONE_MOVABLE], item);
++}
++
+ #endif
+ 
+ #ifdef CONFIG_COMPACTION
+commit 016c13daa5c9e4827eca703e2f0621c131f2cca3
+Author: Mel Gorman <mgorman@techsingularity.net>
+Date:   Fri Nov 6 16:28:18 2015 -0800
+
+    mm, page_alloc: use masks and shifts when converting GFP flags to migrate types
+    
+    This patch redefines which GFP bits are used for specifying mobility and
+    the order of the migrate types.  Once redefined it's possible to convert
+    GFP flags to a migrate type with a simple mask and shift.  The only
+    downside is that readers of OOM kill messages and allocation failures may
+    be used to the existing values but scripts/gfp-translate will help.
+    
+    Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
+    Acked-by: Vlastimil Babka <vbabka@suse.cz>
+    Cc: Christoph Lameter <cl@linux.com>
+    Cc: David Rientjes <rientjes@google.com>
+    Cc: Johannes Weiner <hannes@cmpxchg.org>
+    Cc: Michal Hocko <mhocko@suse.com>
+    Cc: Vitaly Wool <vitalywool@gmail.com>
+    Cc: Rik van Riel <riel@redhat.com>
+    Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+    Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+
+diff --git a/include/linux/gfp.h b/include/linux/gfp.h
+index f92cbd2..440fca3 100644
+--- a/include/linux/gfp.h
++++ b/include/linux/gfp.h
+@@ -14,7 +14,7 @@ struct vm_area_struct;
+ #define ___GFP_HIGHMEM                0x02u
+ #define ___GFP_DMA32          0x04u
+ #define ___GFP_MOVABLE                0x08u
+-#define ___GFP_WAIT           0x10u
++#define ___GFP_RECLAIMABLE    0x10u
+ #define ___GFP_HIGH           0x20u
+ #define ___GFP_IO             0x40u
+ #define ___GFP_FS             0x80u
+@@ -29,7 +29,7 @@ struct vm_area_struct;
+ #define ___GFP_NOMEMALLOC     0x10000u
+ #define ___GFP_HARDWALL               0x20000u
+ #define ___GFP_THISNODE               0x40000u
+-#define ___GFP_RECLAIMABLE    0x80000u
++#define ___GFP_WAIT           0x80000u
+ #define ___GFP_NOACCOUNT      0x100000u
+ #define ___GFP_NOTRACK                0x200000u
+ #define ___GFP_NO_KSWAPD      0x400000u
+@@ -126,6 +126,7 @@ struct vm_area_struct;
+ 
+ /* This mask makes up all the page movable related flags */
+ #define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE)
++#define GFP_MOVABLE_SHIFT 3
+ 
+ /* Control page allocator reclaim behavior */
+ #define GFP_RECLAIM_MASK (__GFP_WAIT|__GFP_HIGH|__GFP_IO|__GFP_FS|\
+@@ -152,14 +153,15 @@ struct vm_area_struct;
+ /* Convert GFP flags to their corresponding migrate type */
+ static inline int gfpflags_to_migratetype(const gfp_t gfp_flags)
+ {
+-      WARN_ON((gfp_flags & GFP_MOVABLE_MASK) == GFP_MOVABLE_MASK);
++      VM_WARN_ON((gfp_flags & GFP_MOVABLE_MASK) == GFP_MOVABLE_MASK);
++      BUILD_BUG_ON((1UL << GFP_MOVABLE_SHIFT) != ___GFP_MOVABLE);
++      BUILD_BUG_ON((___GFP_MOVABLE >> GFP_MOVABLE_SHIFT) != MIGRATE_MOVABLE);
+ 
+       if (unlikely(page_group_by_mobility_disabled))
+               return MIGRATE_UNMOVABLE;
+ 
+       /* Group based on mobility */
+-      return (((gfp_flags & __GFP_MOVABLE) != 0) << 1) |
+-              ((gfp_flags & __GFP_RECLAIMABLE) != 0);
++      return (gfp_flags & GFP_MOVABLE_MASK) >> GFP_MOVABLE_SHIFT;
+ }
+ 
+ #ifdef CONFIG_HIGHMEM
+diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
+index e326843..38bed71 100644
+--- a/include/linux/mmzone.h
++++ b/include/linux/mmzone.h
+@@ -37,8 +37,8 @@
+ 
+ enum {
+       MIGRATE_UNMOVABLE,
+-      MIGRATE_RECLAIMABLE,
+       MIGRATE_MOVABLE,
++      MIGRATE_RECLAIMABLE,
+       MIGRATE_PCPTYPES,       /* the number of types on the pcp lists */
+       MIGRATE_RESERVE = MIGRATE_PCPTYPES,
+ #ifdef CONFIG_CMA
+commit 974a786e63c96a2401a78ddba926f34c128474f1
+Author: Mel Gorman <mgorman@techsingularity.net>
+Date:   Fri Nov 6 16:28:34 2015 -0800
+
+    mm, page_alloc: remove MIGRATE_RESERVE
+    
+    MIGRATE_RESERVE preserves an old property of the buddy allocator that
+    existed prior to fragmentation avoidance -- min_free_kbytes worth of pages
+    tended to remain contiguous until the only alternative was to fail the
+    allocation.  At the time it was discovered that high-order atomic
+    allocations relied on this property so MIGRATE_RESERVE was introduced.  A
+    later patch will introduce an alternative MIGRATE_HIGHATOMIC so this patch
+    deletes MIGRATE_RESERVE and supporting code so it'll be easier to review.
+    Note that this patch in isolation may look like a false regression if
+    someone was bisecting high-order atomic allocation failures.
+    
+    Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
+    Acked-by: Vlastimil Babka <vbabka@suse.cz>
+    Cc: Christoph Lameter <cl@linux.com>
+    Cc: David Rientjes <rientjes@google.com>
+    Cc: Johannes Weiner <hannes@cmpxchg.org>
+    Cc: Michal Hocko <mhocko@suse.com>
+    Cc: Vitaly Wool <vitalywool@gmail.com>
+    Cc: Rik van Riel <riel@redhat.com>
+    Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+    Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+
+diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
+index 1e88aae..b86cfa3 100644
+--- a/include/linux/mmzone.h
++++ b/include/linux/mmzone.h
+@@ -39,8 +39,6 @@ enum {
+       MIGRATE_UNMOVABLE,
+       MIGRATE_MOVABLE,
+       MIGRATE_RECLAIMABLE,
+-      MIGRATE_PCPTYPES,       /* the number of types on the pcp lists */
+-      MIGRATE_RESERVE = MIGRATE_PCPTYPES,
+ #ifdef CONFIG_CMA
+       /*
+        * MIGRATE_CMA migration type is designed to mimic the way
+@@ -63,6 +61,8 @@ enum {
+       MIGRATE_TYPES
+ };
+ 
++#define MIGRATE_PCPTYPES (MIGRATE_RECLAIMABLE+1)
++
+ #ifdef CONFIG_CMA
+ #  define is_migrate_cma(migratetype) unlikely((migratetype) == MIGRATE_CMA)
+ #else
+@@ -429,12 +429,6 @@ struct zone {
+ 
+       const char              *name;
+ 
+-      /*
+-       * Number of MIGRATE_RESERVE page block. To maintain for just
+-       * optimization. Protected by zone->lock.
+-       */
+-      int                     nr_migrate_reserve_block;
+-
+ #ifdef CONFIG_MEMORY_ISOLATION
+       /*
+        * Number of isolated pageblock. It is used to solve incorrect
+diff --git a/mm/huge_memory.c b/mm/huge_memory.c
+index 9812d46..dabd247 100644
+--- a/mm/huge_memory.c
++++ b/mm/huge_memory.c
+@@ -116,7 +116,7 @@ static void set_recommended_min_free_kbytes(void)
+       for_each_populated_zone(zone)
+               nr_zones++;
+ 
+-      /* Make sure at least 2 hugepages are free for MIGRATE_RESERVE */
++      /* Ensure 2 pageblocks are free to assist fragmentation avoidance */
+       recommended_min = pageblock_nr_pages * nr_zones * 2;
+ 
+       /*
+diff --git a/mm/page_alloc.c b/mm/page_alloc.c
+index 8dc6e3c..5888126 100644
+--- a/mm/page_alloc.c
++++ b/mm/page_alloc.c
+@@ -817,7 +817,6 @@ static void free_pcppages_bulk(struct zone *zone, int count,
+                       if (unlikely(has_isolate_pageblock(zone)))
+                               mt = get_pageblock_migratetype(page);
+ 
+-                      /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
+                       __free_one_page(page, page_to_pfn(page), zone, 0, mt);
+                       trace_mm_page_pcpu_drain(page, 0, mt);
+               } while (--to_free && --batch_free && !list_empty(list));
+@@ -1417,15 +1416,14 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
+  * the free lists for the desirable migrate type are depleted
+  */
+ static int fallbacks[MIGRATE_TYPES][4] = {
+-      [MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE,     MIGRATE_RESERVE },
+-      [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE,   MIGRATE_MOVABLE,     MIGRATE_RESERVE },
+-      [MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE,   MIGRATE_RESERVE },
++      [MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE,   MIGRATE_TYPES },
++      [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE,   MIGRATE_MOVABLE,   MIGRATE_TYPES },
++      [MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES },
+ #ifdef CONFIG_CMA
+-      [MIGRATE_CMA]         = { MIGRATE_RESERVE }, /* Never used */
++      [MIGRATE_CMA]         = { MIGRATE_TYPES }, /* Never used */
+ #endif
+-      [MIGRATE_RESERVE]     = { MIGRATE_RESERVE }, /* Never used */
+ #ifdef CONFIG_MEMORY_ISOLATION
+-      [MIGRATE_ISOLATE]     = { MIGRATE_RESERVE }, /* Never used */
++      [MIGRATE_ISOLATE]     = { MIGRATE_TYPES }, /* Never used */
+ #endif
+ };
+ 
+@@ -1598,7 +1596,7 @@ int find_suitable_fallback(struct free_area *area, unsigned int order,
+       *can_steal = false;
+       for (i = 0;; i++) {
+               fallback_mt = fallbacks[migratetype][i];
+-              if (fallback_mt == MIGRATE_RESERVE)
++              if (fallback_mt == MIGRATE_TYPES)
+                       break;
+ 
+               if (list_empty(&area->free_list[fallback_mt]))
+@@ -1676,25 +1674,13 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order,
+ {
+       struct page *page;
+ 
+-retry_reserve:
+       page = __rmqueue_smallest(zone, order, migratetype);
+-
+-      if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {
++      if (unlikely(!page)) {
+               if (migratetype == MIGRATE_MOVABLE)
+                       page = __rmqueue_cma_fallback(zone, order);
+ 
+               if (!page)
+                       page = __rmqueue_fallback(zone, order, migratetype);
+-
+-              /*
+-               * Use MIGRATE_RESERVE rather than fail an allocation. goto
+-               * is used because __rmqueue_smallest is an inline function
+-               * and we want just one call site
+-               */
+-              if (!page) {
+-                      migratetype = MIGRATE_RESERVE;
+-                      goto retry_reserve;
+-              }
+       }
+ 
+       trace_mm_page_alloc_zone_locked(page, order, migratetype);
+@@ -3492,7 +3478,6 @@ static void show_migration_types(unsigned char type)
+               [MIGRATE_UNMOVABLE]     = 'U',
+               [MIGRATE_RECLAIMABLE]   = 'E',
+               [MIGRATE_MOVABLE]       = 'M',
+-              [MIGRATE_RESERVE]       = 'R',
+ #ifdef CONFIG_CMA
+               [MIGRATE_CMA]           = 'C',
+ #endif
+@@ -4303,120 +4288,6 @@ static inline unsigned long wait_table_bits(unsigned long size)
+ }
+ 
+ /*
+- * Check if a pageblock contains reserved pages
+- */
+-static int pageblock_is_reserved(unsigned long start_pfn, unsigned long end_pfn)
+-{
+-      unsigned long pfn;
+-
+-      for (pfn = start_pfn; pfn < end_pfn; pfn++) {
+-              if (!pfn_valid_within(pfn) || PageReserved(pfn_to_page(pfn)))
+-                      return 1;
+-      }
+-      return 0;
+-}
+-
+-/*
+- * Mark a number of pageblocks as MIGRATE_RESERVE. The number
+- * of blocks reserved is based on min_wmark_pages(zone). The memory within
+- * the reserve will tend to store contiguous free pages. Setting min_free_kbytes
+- * higher will lead to a bigger reserve which will get freed as contiguous
+- * blocks as reclaim kicks in
+- */
+-static void setup_zone_migrate_reserve(struct zone *zone)
+-{
+-      unsigned long start_pfn, pfn, end_pfn, block_end_pfn;
+-      struct page *page;
+-      unsigned long block_migratetype;
+-      int reserve;
+-      int old_reserve;
+-
+-      /*
+-       * Get the start pfn, end pfn and the number of blocks to reserve
+-       * We have to be careful to be aligned to pageblock_nr_pages to
+-       * make sure that we always check pfn_valid for the first page in
+-       * the block.
+-       */
+-      start_pfn = zone->zone_start_pfn;
+-      end_pfn = zone_end_pfn(zone);
+-      start_pfn = roundup(start_pfn, pageblock_nr_pages);
+-      reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >>
+-                                                      pageblock_order;
+-
+-      /*
+-       * Reserve blocks are generally in place to help high-order atomic
+-       * allocations that are short-lived. A min_free_kbytes value that
+-       * would result in more than 2 reserve blocks for atomic allocations
+-       * is assumed to be in place to help anti-fragmentation for the
+-       * future allocation of hugepages at runtime.
+-       */
+-      reserve = min(2, reserve);
+-      old_reserve = zone->nr_migrate_reserve_block;
+-
+-      /* When memory hot-add, we almost always need to do nothing */
+-      if (reserve == old_reserve)
+-              return;
+-      zone->nr_migrate_reserve_block = reserve;
+-
+-      for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
+-              if (!early_page_nid_uninitialised(pfn, zone_to_nid(zone)))
+-                      return;
+-
+-              if (!pfn_valid(pfn))
+-                      continue;
+-              page = pfn_to_page(pfn);
+-
+-              /* Watch out for overlapping nodes */
+-              if (page_to_nid(page) != zone_to_nid(zone))
+-                      continue;
+-
+-              block_migratetype = get_pageblock_migratetype(page);
+-
+-              /* Only test what is necessary when the reserves are not met */
+-              if (reserve > 0) {
+-                      /*
+-                       * Blocks with reserved pages will never free, skip
+-                       * them.
+-                       */
+-                      block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
+-                      if (pageblock_is_reserved(pfn, block_end_pfn))
+-                              continue;
+-
+-                      /* If this block is reserved, account for it */
+-                      if (block_migratetype == MIGRATE_RESERVE) {
+-                              reserve--;
+-                              continue;
+-                      }
+-
+-                      /* Suitable for reserving if this block is movable */
+-                      if (block_migratetype == MIGRATE_MOVABLE) {
+-                              set_pageblock_migratetype(page,
+-                                                      MIGRATE_RESERVE);
+-                              move_freepages_block(zone, page,
+-                                                      MIGRATE_RESERVE);
+-                              reserve--;
+-                              continue;
+-                      }
+-              } else if (!old_reserve) {
+-                      /*
+-                       * At boot time we don't need to scan the whole zone
+-                       * for turning off MIGRATE_RESERVE.
+-                       */
+-                      break;
+-              }
+-
+-              /*
+-               * If the reserve is met and this is a previous reserved block,
+-               * take it back
+-               */
+-              if (block_migratetype == MIGRATE_RESERVE) {
+-                      set_pageblock_migratetype(page, MIGRATE_MOVABLE);
+-                      move_freepages_block(zone, page, MIGRATE_MOVABLE);
+-              }
+-      }
+-}
+-
+-/*
+  * Initially all pages are reserved - free ones are freed
+  * up by free_all_bootmem() once the early boot process is
+  * done. Non-atomic initialization, single-pass.
+@@ -4455,9 +4326,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
+                * movable at startup. This will force kernel allocations
+                * to reserve their blocks rather than leaking throughout
+                * the address space during boot when many long-lived
+-               * kernel allocations are made. Later some blocks near
+-               * the start are marked MIGRATE_RESERVE by
+-               * setup_zone_migrate_reserve()
++               * kernel allocations are made.
+                *
+                * bitmap is created for zone's valid pfn range. but memmap
+                * can be created for invalid pages (for alignment)
+@@ -6018,7 +5887,6 @@ static void __setup_per_zone_wmarks(void)
+                       high_wmark_pages(zone) - low_wmark_pages(zone) -
+                       atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
+ 
+-              setup_zone_migrate_reserve(zone);
+               spin_unlock_irqrestore(&zone->lock, flags);
+       }
+ 
+diff --git a/mm/vmstat.c b/mm/vmstat.c
+index ffcb4f5..5b289dc 100644
+--- a/mm/vmstat.c
++++ b/mm/vmstat.c
+@@ -923,7 +923,6 @@ static char * const migratetype_names[MIGRATE_TYPES] = {
+       "Unmovable",
+       "Reclaimable",
+       "Movable",
+-      "Reserve",
+ #ifdef CONFIG_CMA
+       "CMA",
+ #endif
+diff --git a/mm/backing-dev.c b/mm/backing-dev.c
+index 8ed2ffd963c5..7340353f8aea 100644
+--- a/mm/backing-dev.c
++++ b/mm/backing-dev.c
+@@ -957,8 +957,9 @@ EXPORT_SYMBOL(congestion_wait);
+  * jiffies for either a BDI to exit congestion of the given @sync queue
+  * or a write to complete.
+  *
+- * In the absence of zone congestion, cond_resched() is called to yield
+- * the processor if necessary but otherwise does not sleep.
++ * In the absence of zone congestion, a short sleep or a cond_resched is
++ * performed to yield the processor and to allow other subsystems to make
++ * forward progress.
+  *
+  * The return value is 0 if the sleep is for the full timeout. Otherwise,
+  * it is the number of jiffies that were still remaining when the function
+@@ -978,7 +979,19 @@ long wait_iff_congested(struct zone *zone, int sync, long timeout)
+        */
+       if (atomic_read(&nr_wb_congested[sync]) == 0 ||
+           !test_bit(ZONE_CONGESTED, &zone->flags)) {
+-              cond_resched();
++
++              /*
++               * Memory allocation/reclaim might be called from a WQ
++               * context and the current implementation of the WQ
++               * concurrency control doesn't recognize that a particular
++               * WQ is congested if the worker thread is looping without
++               * ever sleeping. Therefore we have to do a short sleep
++               * here rather than calling cond_resched().
++               */
++              if (current->flags & PF_WQ_WORKER)
++                      schedule_timeout(1);
++              else
++                      cond_resched();
+ 
+               /* In case we scheduled, work out time remaining */
+               ret = timeout - (jiffies - start);
+diff --git a/mm/vmstat.c b/mm/vmstat.c
+index 45dcbcb5c594..0975da8e3432 100644
+--- a/mm/vmstat.c
++++ b/mm/vmstat.c
+@@ -1381,6 +1381,7 @@ static const struct file_operations proc_vmstat_file_operations = {
+ #endif /* CONFIG_PROC_FS */
+ 
+ #ifdef CONFIG_SMP
++static struct workqueue_struct *vmstat_wq;
+ static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
+ int sysctl_stat_interval __read_mostly = HZ;
+ static cpumask_var_t cpu_stat_off;
+@@ -1393,7 +1394,7 @@ static void vmstat_update(struct work_struct *w)
+                * to occur in the future. Keep on running the
+                * update worker thread.
+                */
+-              schedule_delayed_work_on(smp_processor_id(),
++              queue_delayed_work_on(smp_processor_id(), vmstat_wq,
+                       this_cpu_ptr(&vmstat_work),
+                       round_jiffies_relative(sysctl_stat_interval));
+       } else {
+@@ -1462,7 +1463,7 @@ static void vmstat_shepherd(struct work_struct *w)
+               if (need_update(cpu) &&
+                       cpumask_test_and_clear_cpu(cpu, cpu_stat_off))
+ 
+-                      schedule_delayed_work_on(cpu,
++                      queue_delayed_work_on(cpu, vmstat_wq,
+                               &per_cpu(vmstat_work, cpu), 0);
+ 
+       put_online_cpus();
+@@ -1551,6 +1552,7 @@ static int __init setup_vmstat(void)
+ 
++      vmstat_wq = alloc_workqueue("vmstat", WQ_FREEZABLE|WQ_MEM_RECLAIM, 0);
+       start_shepherd_timer();
+       cpu_notifier_register_done();
+ #endif
+ #ifdef CONFIG_PROC_FS
+       proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations);
+-- 
+2.6.2
+
+From 09ccfd238e5a0e670d8178cf50180ea81ae09ae1 Mon Sep 17 00:00:00 2001
+From: WANG Cong <xiyou.wangcong@gmail.com>
+Date: Mon, 14 Dec 2015 13:48:36 -0800
+Subject: pptp: verify sockaddr_len in pptp_bind() and pptp_connect()
+
+Reported-by: Dmitry Vyukov <dvyukov@gmail.com>
+Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+---
+ drivers/net/ppp/pptp.c | 6 ++++++
+ 1 file changed, 6 insertions(+)
+
+diff --git a/drivers/net/ppp/pptp.c b/drivers/net/ppp/pptp.c
+index fc69e41..597c53e 100644
+--- a/drivers/net/ppp/pptp.c
++++ b/drivers/net/ppp/pptp.c
+@@ -419,6 +419,9 @@ static int pptp_bind(struct socket *sock, struct sockaddr *uservaddr,
+       struct pptp_opt *opt = &po->proto.pptp;
+       int error = 0;
+ 
++      if (sockaddr_len < sizeof(struct sockaddr_pppox))
++              return -EINVAL;
++
+       lock_sock(sk);
+ 
+       opt->src_addr = sp->sa_addr.pptp;
+@@ -440,6 +443,9 @@ static int pptp_connect(struct socket *sock, struct sockaddr *uservaddr,
+       struct flowi4 fl4;
+       int error = 0;
+ 
++      if (sockaddr_len < sizeof(struct sockaddr_pppox))
++              return -EINVAL;
++
+       if (sp->sa_protocol != PX_PROTO_PPTP)
+               return -EINVAL;
+ 
+-- 
+cgit v0.11.2
+
+commit cc57858831e3e9678291de730c4b4d2e52a19f59
+Author: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
+Date:   Fri Dec 18 15:19:16 2015 +1100
+
+    md/raid10: fix data corruption and crash during resync
+    
+    The commit c31df25f20e3 ("md/raid10: make sync_request_write() call
+    bio_copy_data()") replaced manual data copying with bio_copy_data() but
+    it doesn't work as intended. The source bio (fbio) is already processed,
+    so its bvec_iter has bi_size == 0 and bi_idx == bi_vcnt.  Because of
+    this, bio_copy_data() either does not copy anything, or worse, copies
+    data from the ->bi_next bio if it is set.  This causes wrong data to be
+    written to drives during resync and sometimes lockups/crashes in
+    bio_copy_data():
+    
+    [  517.338478] NMI watchdog: BUG: soft lockup - CPU#0 stuck for 22s! [md126_raid10:3319]
+    [  517.347324] Modules linked in: raid10 xt_CHECKSUM ipt_MASQUERADE nf_nat_masquerade_ipv4 tun ip6t_rpfilter ip6t_REJECT nf_reject_ipv6 ipt_REJECT nf_reject_ipv4 xt_conntrack ebtable_nat ebtable_broute bridge stp llc ebtable_filter ebtables ip6table_nat nf_conntrack_ipv6 nf_defrag_ipv6 nf_nat_ipv6 ip6table_mangle ip6table_security ip6table_raw ip6table_filter ip6_tables iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 nf_nat nf_conntrack iptable_mangle iptable_security iptable_raw iptable_filter ip_tables x86_pkg_temp_thermal coretemp kvm_intel kvm crct10dif_pclmul crc32_pclmul cryptd shpchp pcspkr ipmi_si ipmi_msghandler tpm_crb acpi_power_meter acpi_cpufreq ext4 mbcache jbd2 sr_mod cdrom sd_mod e1000e ax88179_178a usbnet mii ahci ata_generic crc32c_intel libahci ptp pata_acpi libata pps_core wmi sunrpc dm_mirror dm_region_hash dm_log dm_mod
+    [  517.440555] CPU: 0 PID: 3319 Comm: md126_raid10 Not tainted 4.3.0-rc6+ #1
+    [  517.448384] Hardware name: Intel Corporation PURLEY/PURLEY, BIOS PLYDCRB1.86B.0055.D14.1509221924 09/22/2015
+    [  517.459768] task: ffff880153773980 ti: ffff880150df8000 task.ti: ffff880150df8000
+    [  517.468529] RIP: 0010:[<ffffffff812e1888>]  [<ffffffff812e1888>] bio_copy_data+0xc8/0x3c0
+    [  517.478164] RSP: 0018:ffff880150dfbc98  EFLAGS: 00000246
+    [  517.484341] RAX: ffff880169356688 RBX: 0000000000001000 RCX: 0000000000000000
+    [  517.492558] RDX: 0000000000000000 RSI: ffffea0001ac2980 RDI: ffffea0000d835c0
+    [  517.500773] RBP: ffff880150dfbd08 R08: 0000000000000001 R09: ffff880153773980
+    [  517.508987] R10: ffff880169356600 R11: 0000000000001000 R12: 0000000000010000
+    [  517.517199] R13: 000000000000e000 R14: 0000000000000000 R15: 0000000000001000
+    [  517.525412] FS:  0000000000000000(0000) GS:ffff880174a00000(0000) knlGS:0000000000000000
+    [  517.534844] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+    [  517.541507] CR2: 00007f8a044d5fed CR3: 0000000169504000 CR4: 00000000001406f0
+    [  517.549722] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
+    [  517.557929] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
+    [  517.566144] Stack:
+    [  517.568626]  ffff880174a16bc0 ffff880153773980 ffff880169356600 0000000000000000
+    [  517.577659]  0000000000000001 0000000000000001 ffff880153773980 ffff88016a61a800
+    [  517.586715]  ffff880150dfbcf8 0000000000000001 ffff88016dd209e0 0000000000001000
+    [  517.595773] Call Trace:
+    [  517.598747]  [<ffffffffa043ef95>] raid10d+0xfc5/0x1690 [raid10]
+    [  517.605610]  [<ffffffff816697ae>] ? __schedule+0x29e/0x8e2
+    [  517.611987]  [<ffffffff814ff206>] md_thread+0x106/0x140
+    [  517.618072]  [<ffffffff810c1d80>] ? wait_woken+0x80/0x80
+    [  517.624252]  [<ffffffff814ff100>] ? super_1_load+0x520/0x520
+    [  517.630817]  [<ffffffff8109ef89>] kthread+0xc9/0xe0
+    [  517.636506]  [<ffffffff8109eec0>] ? flush_kthread_worker+0x70/0x70
+    [  517.643653]  [<ffffffff8166d99f>] ret_from_fork+0x3f/0x70
+    [  517.649929]  [<ffffffff8109eec0>] ? flush_kthread_worker+0x70/0x70
+    
+    Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
+    Reviewed-by: Shaohua Li <shli@kernel.org>
+    Cc: stable@vger.kernel.org (v4.2+)
+    Fixes: c31df25f20e3 ("md/raid10: make sync_request_write() call bio_copy_data()")
+    Signed-off-by: NeilBrown <neilb@suse.com>
+
+diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
+index 41d70bc..84e597e 100644
+--- a/drivers/md/raid10.c
++++ b/drivers/md/raid10.c
+@@ -1946,6 +1946,8 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
+ 
+       first = i;
+       fbio = r10_bio->devs[i].bio;
++      fbio->bi_iter.bi_size = r10_bio->sectors << 9;
++      fbio->bi_iter.bi_idx = 0;
+ 
+       vcnt = (r10_bio->sectors + (PAGE_SIZE >> 9) - 1) >> (PAGE_SHIFT - 9);
+       /* now find blocks with errors */
+@@ -1989,7 +1991,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
+               bio_reset(tbio);
+ 
+               tbio->bi_vcnt = vcnt;
+-              tbio->bi_iter.bi_size = r10_bio->sectors << 9;
++              tbio->bi_iter.bi_size = fbio->bi_iter.bi_size;
+               tbio->bi_rw = WRITE;
+               tbio->bi_private = r10_bio;
+               tbio->bi_iter.bi_sector = r10_bio->devs[i].addr;
+From: Dave Chinner <dchinner@redhat.com>
+
+When we do dquot readahead in log recovery, we do not use a verifier
+as the underlying buffer may not have dquots in it, e.g. because the
+allocation operation hasn't yet been replayed. Hence we do not want
+to fail recovery because we detect an operation to be replayed has
+not been run yet. This problem was addressed for inodes in commit
+d891400 ("xfs: inode buffers may not be valid during recovery
+readahead") but the problem was not recognised to exist for dquots
+and their buffers as the dquot readahead did not have a verifier.
+
+The result of not using a verifier is that when the buffer is then
+next read to replay a dquot modification, the dquot buffer verifier
+will only be attached to the buffer if *readahead is not complete*.
+Hence we can read the buffer, replay the dquot changes and then add
+it to the delwri submission list without it having a verifier
+attached to it. This then generates warnings in xfs_buf_ioapply(),
+which catches and warns about this case.
+
+Fix this and make it handle the same readahead verifier error cases
+as for inode buffers by adding a new readahead verifier that has a
+write operation as well as a read operation that marks the buffer as
+not done if any corruption is detected.  Also make sure we don't run
+readahead if the dquot buffer has been marked as cancelled by
+recovery.
+
+This will result in readahead either succeeding and the buffer
+having a valid write verifier, or readahead failing and the buffer
+state requiring the subsequent read to resubmit the IO with the new
+verifier.  In either case, this will result in the buffer always
+ending up with a valid write verifier on it.
+
+Note: we also need to fix the inode buffer readahead error handling
+to mark the buffer with EIO. Brian noticed during review that the
+code I copied from there was wrong, so fix it at the same time. Add
+comments linking the two functions that handle readahead verifier
+errors together so we don't forget this behavioural link in future.
+
+cc: <stable@vger.kernel.org> # 3.12 - current
+Signed-off-by: Dave Chinner <dchinner@redhat.com>
+---
  
+Version 2
+- fix logic error in determining if verify failed
+- set error on buffer when verifier fails
+- fix inode buffer readahead verifier to set error when it fails
+- better comments, link dquot and inode buffer ra verifiers in the
+  comments
  
+ fs/xfs/libxfs/xfs_dquot_buf.c  | 36 ++++++++++++++++++++++++++++++------
+ fs/xfs/libxfs/xfs_inode_buf.c  | 14 +++++++++-----
+ fs/xfs/libxfs/xfs_quota_defs.h |  2 +-
+ fs/xfs/libxfs/xfs_shared.h     |  1 +
+ fs/xfs/xfs_log_recover.c       |  9 +++++++--
+ 5 files changed, 48 insertions(+), 14 deletions(-)
+
+diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c
+index 11cefb2..3cc3cf7 100644
+--- a/fs/xfs/libxfs/xfs_dquot_buf.c
++++ b/fs/xfs/libxfs/xfs_dquot_buf.c
+@@ -54,7 +54,7 @@ xfs_dqcheck(
+       xfs_dqid_t       id,
+       uint             type,    /* used only when IO_dorepair is true */
+       uint             flags,
+-      char             *str)
++      const char       *str)
+ {
+       xfs_dqblk_t      *d = (xfs_dqblk_t *)ddq;
+       int             errs = 0;
+@@ -207,7 +207,8 @@ xfs_dquot_buf_verify_crc(
+ STATIC bool
+ xfs_dquot_buf_verify(
+       struct xfs_mount        *mp,
+-      struct xfs_buf          *bp)
++      struct xfs_buf          *bp,
++      int                     warn)
+ {
+       struct xfs_dqblk        *d = (struct xfs_dqblk *)bp->b_addr;
+       xfs_dqid_t              id = 0;
+@@ -240,8 +241,7 @@ xfs_dquot_buf_verify(
+               if (i == 0)
+                       id = be32_to_cpu(ddq->d_id);
+ 
+-              error = xfs_dqcheck(mp, ddq, id + i, 0, XFS_QMOPT_DOWARN,
+-                                     "xfs_dquot_buf_verify");
++              error = xfs_dqcheck(mp, ddq, id + i, 0, warn, __func__);
+               if (error)
+                       return false;
+       }
+@@ -256,7 +256,7 @@ xfs_dquot_buf_read_verify(
+ 
+       if (!xfs_dquot_buf_verify_crc(mp, bp))
+               xfs_buf_ioerror(bp, -EFSBADCRC);
+-      else if (!xfs_dquot_buf_verify(mp, bp))
++      else if (!xfs_dquot_buf_verify(mp, bp, XFS_QMOPT_DOWARN))
+               xfs_buf_ioerror(bp, -EFSCORRUPTED);
+ 
+       if (bp->b_error)
+@@ -264,6 +264,25 @@ xfs_dquot_buf_read_verify(
+ }
+ 
+ /*
++ * readahead errors are silent and simply leave the buffer as !done so a real
++ * read will then be run with the xfs_dquot_buf_ops verifier. See
++ * xfs_inode_buf_verify() for why we use EIO and ~XBF_DONE here rather than
++ * reporting the failure.
++ */
++static void
++xfs_dquot_buf_readahead_verify(
++      struct xfs_buf  *bp)
++{
++      struct xfs_mount        *mp = bp->b_target->bt_mount;
++
++      if (!xfs_dquot_buf_verify_crc(mp, bp) ||
++          !xfs_dquot_buf_verify(mp, bp, 0)) {
++              xfs_buf_ioerror(bp, -EIO);
++              bp->b_flags &= ~XBF_DONE;
++      }
++}
++
++/*
+  * we don't calculate the CRC here as that is done when the dquot is flushed to
+  * the buffer after the update is done. This ensures that the dquot in the
+  * buffer always has an up-to-date CRC value.
+@@ -274,7 +293,7 @@ xfs_dquot_buf_write_verify(
+ {
+       struct xfs_mount        *mp = bp->b_target->bt_mount;
+ 
+-      if (!xfs_dquot_buf_verify(mp, bp)) {
++      if (!xfs_dquot_buf_verify(mp, bp, XFS_QMOPT_DOWARN)) {
+               xfs_buf_ioerror(bp, -EFSCORRUPTED);
+               xfs_verifier_error(bp);
+               return;
+@@ -287,3 +306,8 @@ const struct xfs_buf_ops xfs_dquot_buf_ops = {
+       .verify_write = xfs_dquot_buf_write_verify,
+ };
+ 
++const struct xfs_buf_ops xfs_dquot_buf_ra_ops = {
++
++      .verify_read = xfs_dquot_buf_readahead_verify,
++      .verify_write = xfs_dquot_buf_write_verify,
++};
+diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
+index 1b8d98a..4816209 100644
+--- a/fs/xfs/libxfs/xfs_inode_buf.c
++++ b/fs/xfs/libxfs/xfs_inode_buf.c
+@@ -62,11 +62,14 @@ xfs_inobp_check(
+  * has not had the inode cores stamped into it. Hence for readahead, the buffer
+  * may be potentially invalid.
+  *
+- * If the readahead buffer is invalid, we don't want to mark it with an error,
+- * but we do want to clear the DONE status of the buffer so that a followup read
+- * will re-read it from disk. This will ensure that we don't get an unnecessary
+- * warnings during log recovery and we don't get unnecssary panics on debug
+- * kernels.
++ * If the readahead buffer is invalid, we need to mark it with an error and
++ * clear the DONE status of the buffer so that a followup read will re-read it
++ * from disk. We don't report the error otherwise to avoid warnings during log
++ * recovery and we don't get unnecssary panics on debug kernels. We use EIO here
++ * because all we want to do is say readahead failed; there is no-one to report
++ * the error to, so this will distinguish it from a non-ra verifier failure.
++ * Changes to this readahead error behavour also need to be reflected in
++ * xfs_dquot_buf_readahead_verify().
+  */
+ static void
+ xfs_inode_buf_verify(
+@@ -92,6 +95,7 @@ xfs_inode_buf_verify(
+                                               XFS_ERRTAG_ITOBP_INOTOBP,
+                                               XFS_RANDOM_ITOBP_INOTOBP))) {
+                       if (readahead) {
++                              xfs_buf_ioerror(bp, -EIO);
+                               bp->b_flags &= ~XBF_DONE;
+                               return;
+                       }
+diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h
+index 1b0a083..f51078f 100644
+--- a/fs/xfs/libxfs/xfs_quota_defs.h
++++ b/fs/xfs/libxfs/xfs_quota_defs.h
+@@ -153,7 +153,7 @@ typedef __uint16_t xfs_qwarncnt_t;
+ #define XFS_QMOPT_RESBLK_MASK (XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_RES_RTBLKS)
+ 
+ extern int xfs_dqcheck(struct xfs_mount *mp, xfs_disk_dquot_t *ddq,
+-                     xfs_dqid_t id, uint type, uint flags, char *str);
++                     xfs_dqid_t id, uint type, uint flags, const char *str);
+ extern int xfs_calc_dquots_per_chunk(unsigned int nbblks);
+ 
+ #endif        /* __XFS_QUOTA_H__ */
+diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
+index 5be5297..15c3ceb 100644
+--- a/fs/xfs/libxfs/xfs_shared.h
++++ b/fs/xfs/libxfs/xfs_shared.h
+@@ -49,6 +49,7 @@ extern const struct xfs_buf_ops xfs_inobt_buf_ops;
+ extern const struct xfs_buf_ops xfs_inode_buf_ops;
+ extern const struct xfs_buf_ops xfs_inode_buf_ra_ops;
+ extern const struct xfs_buf_ops xfs_dquot_buf_ops;
++extern const struct xfs_buf_ops xfs_dquot_buf_ra_ops;
+ extern const struct xfs_buf_ops xfs_sb_buf_ops;
+ extern const struct xfs_buf_ops xfs_sb_quiet_buf_ops;
+ extern const struct xfs_buf_ops xfs_symlink_buf_ops;
+diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
+index 26e67b4..da37beb 100644
+--- a/fs/xfs/xfs_log_recover.c
++++ b/fs/xfs/xfs_log_recover.c
+@@ -3521,6 +3521,7 @@ xlog_recover_dquot_ra_pass2(
+       struct xfs_disk_dquot   *recddq;
+       struct xfs_dq_logformat *dq_f;
+       uint                    type;
++      int                     len;
+ 
+ 
+       if (mp->m_qflags == 0)
+@@ -3541,8 +3542,12 @@ xlog_recover_dquot_ra_pass2(
+       ASSERT(dq_f);
+       ASSERT(dq_f->qlf_len == 1);
+ 
+-      xfs_buf_readahead(mp->m_ddev_targp, dq_f->qlf_blkno,
+-                        XFS_FSB_TO_BB(mp, dq_f->qlf_len), NULL);
++      len = XFS_FSB_TO_BB(mp, dq_f->qlf_len);
++      if (xlog_peek_buffer_cancelled(log, dq_f->qlf_blkno, len, 0))
++              return;
++
++      xfs_buf_readahead(mp->m_ddev_targp, dq_f->qlf_blkno, len,
++                        &xfs_dquot_buf_ra_ops);
+ }
+ 
+ STATIC void
+
+_______________________________________________
+xfs mailing list
+xfs@oss.sgi.com
+http://oss.sgi.com/mailman/listinfo/xfs
+From: Michal Hocko <mhocko@suse.com>
+
+kernel test robot has reported the following crash:
+[    3.870718] BUG: unable to handle kernel NULL pointer dereference at 00000100
+[    3.872615] IP: [<c1074df6>] __queue_work+0x26/0x390
+[    3.873758] *pdpt = 0000000000000000 *pde = f000ff53f000ff53
+[    3.875096] Oops: 0000 [#1] PREEMPT SMP
+[    3.876130] CPU: 0 PID: 24 Comm: kworker/0:1 Not tainted 4.4.0-rc4-00139-g373ccbe #1
+[    3.878135] Workqueue: events vmstat_shepherd
+[    3.879207] task: cb684600 ti: cb7ba000 task.ti: cb7ba000
+[    3.880445] EIP: 0060:[<c1074df6>] EFLAGS: 00010046 CPU: 0
+[    3.881704] EIP is at __queue_work+0x26/0x390
+[    3.882823] EAX: 00000046 EBX: cbb37800 ECX: cbb37800 EDX: 00000000
+[    3.884457] ESI: 00000000 EDI: 00000000 EBP: cb7bbe68 ESP: cb7bbe38
+[    3.886005]  DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068
+[    3.887229] CR0: 8005003b CR2: 00000100 CR3: 01fd5000 CR4: 000006b0
+[    3.888663] Stack:
+[    3.895204] Call Trace:
+[    3.895854]  [<c1a381dd>] ? mutex_unlock+0xd/0x10
+[    3.897120]  [<c1075221>] __queue_delayed_work+0xa1/0x160
+[    3.898530]  [<c10764c6>] queue_delayed_work_on+0x36/0x60
+[    3.899790]  [<c11494bd>] vmstat_shepherd+0xad/0xf0
+[    3.900899]  [<c1075a7a>] process_one_work+0x1aa/0x4c0
+[    3.902093]  [<c10759e2>] ? process_one_work+0x112/0x4c0
+[    3.903520]  [<c10ac31e>] ? do_raw_spin_lock+0xe/0x150
+[    3.904853]  [<c1075dd1>] worker_thread+0x41/0x440
+[    3.906023]  [<c1075d90>] ? process_one_work+0x4c0/0x4c0
+[    3.907242]  [<c107b7c0>] kthread+0xb0/0xd0
+[    3.908188]  [<c1a3c651>] ret_from_kernel_thread+0x21/0x40
+[    3.909601]  [<c107b710>] ? __kthread_parkme+0x80/0x80
+
+The reason is that start_shepherd_timer() schedules the shepherd work
+item (vmstat_shepherd), which uses vmstat_wq, before setup_vmstat()
+allocates that workqueue. If the remaining initialization takes longer
+than the HZ-jiffy shepherd delay, we might end up queueing work on a
+NULL vmstat_wq. This is really unlikely but not impossible.
+
+Fixes: 373ccbe59270 ("mm, vmstat: allow WQ concurrency to discover memory reclaim doesn't make any progress")
+Reported-by: kernel test robot <ying.huang@linux.intel.com>
+Signed-off-by: Michal Hocko <mhocko@suse.com>
+---
+Hi Linus,
+I am not marking this for stable because I hope we can sneak it into 4.4.
+The patch is trivial and obvious. I am sorry about the breakage. If you prefer
+to postpone it to 4.5-rc1, because this is not really that critical and
+shouldn't happen most of the time, then I will repost with the stable tag added.
+
+Thanks!
+
+ mm/vmstat.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/mm/vmstat.c b/mm/vmstat.c
+index 4ebc17d948cb..c54fd2924f25 100644
+--- a/mm/vmstat.c
++++ b/mm/vmstat.c
+@@ -1483,6 +1483,7 @@ static void __init start_shepherd_timer(void)
+               BUG();
+       cpumask_copy(cpu_stat_off, cpu_online_mask);
+ 
++      vmstat_wq = alloc_workqueue("vmstat", WQ_FREEZABLE|WQ_MEM_RECLAIM, 0);
+       schedule_delayed_work(&shepherd,
+               round_jiffies_relative(sysctl_stat_interval));
+ }
+@@ -1550,7 +1551,6 @@ static int __init setup_vmstat(void)
+ 
+       start_shepherd_timer();
+       cpu_notifier_register_done();
+-      vmstat_wq = alloc_workqueue("vmstat", WQ_FREEZABLE|WQ_MEM_RECLAIM, 0);
+ #endif
+ #ifdef CONFIG_PROC_FS
+       proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations);
+-- 
+2.6.4