kernel-small_fixes.patch

   1 --- linux-2.6.33/scripts/mod/modpost.c~ 2010-02-24 19:52:17.000000000 +0100
   2 +++ linux-2.6.33/scripts/mod/modpost.c  2010-03-07 14:26:47.242168558 +0100
   3 @@ -15,7 +15,8 @@
   4  #include <stdio.h>
   5  #include <ctype.h>
   6  #include "modpost.h"
   7 -#include "../../include/generated/autoconf.h"
   8 +// PLD architectures don't use CONFIG_SYMBOL_PREFIX
   9 +//#include "../../include/generated/autoconf.h"
  10  #include "../../include/linux/license.h"
  11
  12  /* Some toolchains use a `_' prefix for all user symbols. */
  13
  14 --- linux-3.0/scripts/kconfig/lxdialog/check-lxdialog.sh~       2011-07-22 04:17:23.000000000 +0200
  15 +++ linux-3.0/scripts/kconfig/lxdialog/check-lxdialog.sh        2011-08-25 21:26:04.799150642 +0200
  16 @@ -9,6 +9,12 @@
  17                         $cc -print-file-name=lib${lib}.${ext} | grep -q /
  18                         if [ $? -eq 0 ]; then
  19                                 echo "-l${lib}"
  20 +                               for libt in tinfow tinfo ; do
  21 +                                       $cc -print-file-name=lib${libt}.${ext} | grep -q /
  22 +                                       if [ $? -eq 0 ]; then
  23 +                                               echo "-l${libt}"
  24 +                                       fi
  25 +                               done
  26                                 exit
  27                         fi
  28                 done
  29
  30 From 7a29ac474a47eb8cf212b45917683ae89d6fa13b Mon Sep 17 00:00:00 2001
  31 From: Chris Mason <clm@fb.com>
  32 Date: Tue, 10 Nov 2015 10:10:34 +1100
  33 Subject: xfs: give all workqueues rescuer threads
  34
  35 We're consistently hitting deadlocks here with XFS on recent kernels.
  36 After some digging through the crash files, it looks like everyone in
  37 the system is waiting for XFS to reclaim memory.
  38
  39 Something like this:
  40
  41 PID: 2733434  TASK: ffff8808cd242800  CPU: 19  COMMAND: "java"
  42  #0 [ffff880019c53588] __schedule at ffffffff818c4df2
  43  #1 [ffff880019c535d8] schedule at ffffffff818c5517
  44  #2 [ffff880019c535f8] _xfs_log_force_lsn at ffffffff81316348
  45  #3 [ffff880019c53688] xfs_log_force_lsn at ffffffff813164fb
  46  #4 [ffff880019c536b8] xfs_iunpin_wait at ffffffff8130835e
  47  #5 [ffff880019c53728] xfs_reclaim_inode at ffffffff812fd453
  48  #6 [ffff880019c53778] xfs_reclaim_inodes_ag at ffffffff812fd8c7
  49  #7 [ffff880019c53928] xfs_reclaim_inodes_nr at ffffffff812fe433
  50  #8 [ffff880019c53958] xfs_fs_free_cached_objects at ffffffff8130d3b9
  51  #9 [ffff880019c53968] super_cache_scan at ffffffff811a6f73
  52 #10 [ffff880019c539c8] shrink_slab at ffffffff811460e6
  53 #11 [ffff880019c53aa8] shrink_zone at ffffffff8114a53f
  54 #12 [ffff880019c53b48] do_try_to_free_pages at ffffffff8114a8ba
  55 #13 [ffff880019c53be8] try_to_free_pages at ffffffff8114ad5a
  56 #14 [ffff880019c53c78] __alloc_pages_nodemask at ffffffff8113e1b8
  57 #15 [ffff880019c53d88] alloc_kmem_pages_node at ffffffff8113e671
  58 #16 [ffff880019c53dd8] copy_process at ffffffff8104f781
  59 #17 [ffff880019c53ec8] do_fork at ffffffff8105129c
  60 #18 [ffff880019c53f38] sys_clone at ffffffff810515b6
  61 #19 [ffff880019c53f48] stub_clone at ffffffff818c8e4d
  62
  63 xfs_log_force_lsn is waiting for logs to get cleaned, which is waiting
  64 for IO, which is waiting for workers to complete the IO which is waiting
  65 for worker threads that don't exist yet:
  66
  67 PID: 2752451  TASK: ffff880bd6bdda00  CPU: 37  COMMAND: "kworker/37:1"
  68  #0 [ffff8808d20abbb0] __schedule at ffffffff818c4df2
  69  #1 [ffff8808d20abc00] schedule at ffffffff818c5517
  70  #2 [ffff8808d20abc20] schedule_timeout at ffffffff818c7c6c
  71  #3 [ffff8808d20abcc0] wait_for_completion_killable at ffffffff818c6495
  72  #4 [ffff8808d20abd30] kthread_create_on_node at ffffffff8106ec82
  73  #5 [ffff8808d20abdf0] create_worker at ffffffff8106752f
  74  #6 [ffff8808d20abe40] worker_thread at ffffffff810699be
  75  #7 [ffff8808d20abec0] kthread at ffffffff8106ef59
  76  #8 [ffff8808d20abf50] ret_from_fork at ffffffff818c8ac8
  77
  78 I think we should be using WQ_MEM_RECLAIM to make sure this thread
  79 pool makes progress when we're not able to allocate new workers.
  80
  81 [dchinner: make all workqueues WQ_MEM_RECLAIM]
  82
  83 Signed-off-by: Chris Mason <clm@fb.com>
  84 Reviewed-by: Dave Chinner <dchinner@redhat.com>
  85 Signed-off-by: Dave Chinner <david@fromorbit.com>
  86 ---
  87  fs/xfs/xfs_super.c | 7 ++++---
  88  1 file changed, 4 insertions(+), 3 deletions(-)
  89
  90 diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
  91 index 29531ec..65fbfb7 100644
  92 --- a/fs/xfs/xfs_super.c
  93 +++ b/fs/xfs/xfs_super.c
  94 @@ -838,17 +838,18 @@ xfs_init_mount_workqueues(
  95                 goto out_destroy_unwritten;
  96
  97         mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s",
  98 -                       WQ_FREEZABLE, 0, mp->m_fsname);
  99 +                       WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname);
 100         if (!mp->m_reclaim_workqueue)
 101                 goto out_destroy_cil;
 102
 103         mp->m_log_workqueue = alloc_workqueue("xfs-log/%s",
 104 -                       WQ_FREEZABLE|WQ_HIGHPRI, 0, mp->m_fsname);
 105 +                       WQ_MEM_RECLAIM|WQ_FREEZABLE|WQ_HIGHPRI, 0,
 106 +                       mp->m_fsname);
 107         if (!mp->m_log_workqueue)
 108                 goto out_destroy_reclaim;
 109
 110         mp->m_eofblocks_workqueue = alloc_workqueue("xfs-eofblocks/%s",
 111 -                       WQ_FREEZABLE, 0, mp->m_fsname);
 112 +                       WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname);
 113         if (!mp->m_eofblocks_workqueue)
 114                 goto out_destroy_log;
 115
 116 --
 117 cgit v0.11.2
 118
 119 commit c2d42c16ad83006a706d83e51a7268db04af733a
 120 Author: Andrew Morton <akpm@linux-foundation.org>
 121 Date:   Thu Nov 5 18:48:43 2015 -0800
 122
 123     mm/vmstat.c: uninline node_page_state()
 124
 125     With x86_64 (config http://ozlabs.org/~akpm/config-akpm2.txt) and old gcc
 126     (4.4.4), drivers/base/node.c:node_read_meminfo() is using 2344 bytes of
 127     stack.  Uninlining node_page_state() reduces this to 440 bytes.
 128
 129     The stack consumption issue is fixed by newer gcc (4.8.4) however with
 130     that compiler this patch reduces the node.o text size from 7314 bytes to
 131     4578.
 132
 133     Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
 134     Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
 135
 136 diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
 137 index 82e7db7..49dfe40 100644
 138 --- a/include/linux/vmstat.h
 139 +++ b/include/linux/vmstat.h
 140 @@ -161,30 +161,8 @@ static inline unsigned long zone_page_state_snapshot(struct zone *zone,
 141  }
 142
 143  #ifdef CONFIG_NUMA
 144 -/*
 145 - * Determine the per node value of a stat item. This function
 146 - * is called frequently in a NUMA machine, so try to be as
 147 - * frugal as possible.
 148 - */
 149 -static inline unsigned long node_page_state(int node,
 150 -                                enum zone_stat_item item)
 151 -{
 152 -       struct zone *zones = NODE_DATA(node)->node_zones;
 153 -
 154 -       return
 155 -#ifdef CONFIG_ZONE_DMA
 156 -               zone_page_state(&zones[ZONE_DMA], item) +
 157 -#endif
 158 -#ifdef CONFIG_ZONE_DMA32
 159 -               zone_page_state(&zones[ZONE_DMA32], item) +
 160 -#endif
 161 -#ifdef CONFIG_HIGHMEM
 162 -               zone_page_state(&zones[ZONE_HIGHMEM], item) +
 163 -#endif
 164 -               zone_page_state(&zones[ZONE_NORMAL], item) +
 165 -               zone_page_state(&zones[ZONE_MOVABLE], item);
 166 -}
 167
 168 +extern unsigned long node_page_state(int node, enum zone_stat_item item);
 169  extern void zone_statistics(struct zone *, struct zone *, gfp_t gfp);
 170
 171  #else
 172 diff --git a/mm/vmstat.c b/mm/vmstat.c
 173 index fbf1448..ffcb4f5 100644
 174 --- a/mm/vmstat.c
 175 +++ b/mm/vmstat.c
 176 @@ -591,6 +591,28 @@ void zone_statistics(struct zone *preferred_zone, struct zone *z, gfp_t flags)
 177         else
 178                 __inc_zone_state(z, NUMA_OTHER);
 179  }
 180 +
 181 +/*
 182 + * Determine the per node value of a stat item.
 183 + */
 184 +unsigned long node_page_state(int node, enum zone_stat_item item)
 185 +{
 186 +       struct zone *zones = NODE_DATA(node)->node_zones;
 187 +
 188 +       return
 189 +#ifdef CONFIG_ZONE_DMA
 190 +               zone_page_state(&zones[ZONE_DMA], item) +
 191 +#endif
 192 +#ifdef CONFIG_ZONE_DMA32
 193 +               zone_page_state(&zones[ZONE_DMA32], item) +
 194 +#endif
 195 +#ifdef CONFIG_HIGHMEM
 196 +               zone_page_state(&zones[ZONE_HIGHMEM], item) +
 197 +#endif
 198 +               zone_page_state(&zones[ZONE_NORMAL], item) +
 199 +               zone_page_state(&zones[ZONE_MOVABLE], item);
 200 +}
 201 +
 202  #endif
 203
 204  #ifdef CONFIG_COMPACTION
 205 commit 016c13daa5c9e4827eca703e2f0621c131f2cca3
 206 Author: Mel Gorman <mgorman@techsingularity.net>
 207 Date:   Fri Nov 6 16:28:18 2015 -0800
 208
 209     mm, page_alloc: use masks and shifts when converting GFP flags to migrate types
 210
 211     This patch redefines which GFP bits are used for specifying mobility and
 212     the order of the migrate types.  Once redefined it's possible to convert
 213     GFP flags to a migrate type with a simple mask and shift.  The only
 214     downside is that readers of OOM kill messages and allocation failures may
  215     have become used to the existing values, but scripts/gfp-translate will help.
 216
 217     Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
 218     Acked-by: Vlastimil Babka <vbabka@suse.cz>
 219     Cc: Christoph Lameter <cl@linux.com>
 220     Cc: David Rientjes <rientjes@google.com>
 221     Cc: Johannes Weiner <hannes@cmpxchg.org>
 222     Cc: Michal Hocko <mhocko@suse.com>
 223     Cc: Vitaly Wool <vitalywool@gmail.com>
 224     Cc: Rik van Riel <riel@redhat.com>
 225     Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
 226     Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
 227
 228 diff --git a/include/linux/gfp.h b/include/linux/gfp.h
 229 index f92cbd2..440fca3 100644
 230 --- a/include/linux/gfp.h
 231 +++ b/include/linux/gfp.h
 232 @@ -14,7 +14,7 @@ struct vm_area_struct;
 233  #define ___GFP_HIGHMEM         0x02u
 234  #define ___GFP_DMA32           0x04u
 235  #define ___GFP_MOVABLE         0x08u
 236 -#define ___GFP_WAIT            0x10u
 237 +#define ___GFP_RECLAIMABLE     0x10u
 238  #define ___GFP_HIGH            0x20u
 239  #define ___GFP_IO              0x40u
 240  #define ___GFP_FS              0x80u
 241 @@ -29,7 +29,7 @@ struct vm_area_struct;
 242  #define ___GFP_NOMEMALLOC      0x10000u
 243  #define ___GFP_HARDWALL                0x20000u
 244  #define ___GFP_THISNODE                0x40000u
 245 -#define ___GFP_RECLAIMABLE     0x80000u
 246 +#define ___GFP_WAIT            0x80000u
 247  #define ___GFP_NOACCOUNT       0x100000u
 248  #define ___GFP_NOTRACK         0x200000u
 249  #define ___GFP_NO_KSWAPD       0x400000u
 250 @@ -126,6 +126,7 @@ struct vm_area_struct;
 251
 252  /* This mask makes up all the page movable related flags */
 253  #define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE)
 254 +#define GFP_MOVABLE_SHIFT 3
 255
 256  /* Control page allocator reclaim behavior */
 257  #define GFP_RECLAIM_MASK (__GFP_WAIT|__GFP_HIGH|__GFP_IO|__GFP_FS|\
 258 @@ -152,14 +153,15 @@ struct vm_area_struct;
 259  /* Convert GFP flags to their corresponding migrate type */
 260  static inline int gfpflags_to_migratetype(const gfp_t gfp_flags)
 261  {
 262 -       WARN_ON((gfp_flags & GFP_MOVABLE_MASK) == GFP_MOVABLE_MASK);
 263 +       VM_WARN_ON((gfp_flags & GFP_MOVABLE_MASK) == GFP_MOVABLE_MASK);
 264 +       BUILD_BUG_ON((1UL << GFP_MOVABLE_SHIFT) != ___GFP_MOVABLE);
 265 +       BUILD_BUG_ON((___GFP_MOVABLE >> GFP_MOVABLE_SHIFT) != MIGRATE_MOVABLE);
 266
 267         if (unlikely(page_group_by_mobility_disabled))
 268                 return MIGRATE_UNMOVABLE;
 269
 270         /* Group based on mobility */
 271 -       return (((gfp_flags & __GFP_MOVABLE) != 0) << 1) |
 272 -               ((gfp_flags & __GFP_RECLAIMABLE) != 0);
 273 +       return (gfp_flags & GFP_MOVABLE_MASK) >> GFP_MOVABLE_SHIFT;
 274  }
 275
 276  #ifdef CONFIG_HIGHMEM
 277 diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
 278 index e326843..38bed71 100644
 279 --- a/include/linux/mmzone.h
 280 +++ b/include/linux/mmzone.h
 281 @@ -37,8 +37,8 @@
 282
 283  enum {
 284         MIGRATE_UNMOVABLE,
 285 -       MIGRATE_RECLAIMABLE,
 286         MIGRATE_MOVABLE,
 287 +       MIGRATE_RECLAIMABLE,
 288         MIGRATE_PCPTYPES,       /* the number of types on the pcp lists */
 289         MIGRATE_RESERVE = MIGRATE_PCPTYPES,
 290  #ifdef CONFIG_CMA
 291 commit 974a786e63c96a2401a78ddba926f34c128474f1
 292 Author: Mel Gorman <mgorman@techsingularity.net>
 293 Date:   Fri Nov 6 16:28:34 2015 -0800
 294
 295     mm, page_alloc: remove MIGRATE_RESERVE
 296
 297     MIGRATE_RESERVE preserves an old property of the buddy allocator that
 298     existed prior to fragmentation avoidance -- min_free_kbytes worth of pages
 299     tended to remain contiguous until the only alternative was to fail the
 300     allocation.  At the time it was discovered that high-order atomic
 301     allocations relied on this property so MIGRATE_RESERVE was introduced.  A
 302     later patch will introduce an alternative MIGRATE_HIGHATOMIC so this patch
 303     deletes MIGRATE_RESERVE and supporting code so it'll be easier to review.
 304     Note that this patch in isolation may look like a false regression if
 305     someone was bisecting high-order atomic allocation failures.
 306
 307     Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
 308     Acked-by: Vlastimil Babka <vbabka@suse.cz>
 309     Cc: Christoph Lameter <cl@linux.com>
 310     Cc: David Rientjes <rientjes@google.com>
 311     Cc: Johannes Weiner <hannes@cmpxchg.org>
 312     Cc: Michal Hocko <mhocko@suse.com>
 313     Cc: Vitaly Wool <vitalywool@gmail.com>
 314     Cc: Rik van Riel <riel@redhat.com>
 315     Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
 316     Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
 317
 318 diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
 319 index 1e88aae..b86cfa3 100644
 320 --- a/include/linux/mmzone.h
 321 +++ b/include/linux/mmzone.h
 322 @@ -39,8 +39,6 @@ enum {
 323         MIGRATE_UNMOVABLE,
 324         MIGRATE_MOVABLE,
 325         MIGRATE_RECLAIMABLE,
 326 -       MIGRATE_PCPTYPES,       /* the number of types on the pcp lists */
 327 -       MIGRATE_RESERVE = MIGRATE_PCPTYPES,
 328  #ifdef CONFIG_CMA
 329         /*
 330          * MIGRATE_CMA migration type is designed to mimic the way
 331 @@ -63,6 +61,8 @@ enum {
 332         MIGRATE_TYPES
 333  };
 334
 335 +#define MIGRATE_PCPTYPES (MIGRATE_RECLAIMABLE+1)
 336 +
 337  #ifdef CONFIG_CMA
 338  #  define is_migrate_cma(migratetype) unlikely((migratetype) == MIGRATE_CMA)
 339  #else
 340 @@ -429,12 +429,6 @@ struct zone {
 341
 342         const char              *name;
 343
 344 -       /*
 345 -        * Number of MIGRATE_RESERVE page block. To maintain for just
 346 -        * optimization. Protected by zone->lock.
 347 -        */
 348 -       int                     nr_migrate_reserve_block;
 349 -
 350  #ifdef CONFIG_MEMORY_ISOLATION
 351         /*
 352          * Number of isolated pageblock. It is used to solve incorrect
 353 diff --git a/mm/huge_memory.c b/mm/huge_memory.c
 354 index 9812d46..dabd247 100644
 355 --- a/mm/huge_memory.c
 356 +++ b/mm/huge_memory.c
 357 @@ -116,7 +116,7 @@ static void set_recommended_min_free_kbytes(void)
 358         for_each_populated_zone(zone)
 359                 nr_zones++;
 360
 361 -       /* Make sure at least 2 hugepages are free for MIGRATE_RESERVE */
 362 +       /* Ensure 2 pageblocks are free to assist fragmentation avoidance */
 363         recommended_min = pageblock_nr_pages * nr_zones * 2;
 364
 365         /*
 366 diff --git a/mm/page_alloc.c b/mm/page_alloc.c
 367 index 8dc6e3c..5888126 100644
 368 --- a/mm/page_alloc.c
 369 +++ b/mm/page_alloc.c
 370 @@ -817,7 +817,6 @@ static void free_pcppages_bulk(struct zone *zone, int count,
 371                         if (unlikely(has_isolate_pageblock(zone)))
 372                                 mt = get_pageblock_migratetype(page);
 373
 374 -                       /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
 375                         __free_one_page(page, page_to_pfn(page), zone, 0, mt);
 376                         trace_mm_page_pcpu_drain(page, 0, mt);
 377                 } while (--to_free && --batch_free && !list_empty(list));
 378 @@ -1417,15 +1416,14 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
 379   * the free lists for the desirable migrate type are depleted
 380   */
 381  static int fallbacks[MIGRATE_TYPES][4] = {
 382 -       [MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE,     MIGRATE_RESERVE },
 383 -       [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE,   MIGRATE_MOVABLE,     MIGRATE_RESERVE },
 384 -       [MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE,   MIGRATE_RESERVE },
 385 +       [MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE,   MIGRATE_TYPES },
 386 +       [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE,   MIGRATE_MOVABLE,   MIGRATE_TYPES },
 387 +       [MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES },
 388  #ifdef CONFIG_CMA
 389 -       [MIGRATE_CMA]         = { MIGRATE_RESERVE }, /* Never used */
 390 +       [MIGRATE_CMA]         = { MIGRATE_TYPES }, /* Never used */
 391  #endif
 392 -       [MIGRATE_RESERVE]     = { MIGRATE_RESERVE }, /* Never used */
 393  #ifdef CONFIG_MEMORY_ISOLATION
 394 -       [MIGRATE_ISOLATE]     = { MIGRATE_RESERVE }, /* Never used */
 395 +       [MIGRATE_ISOLATE]     = { MIGRATE_TYPES }, /* Never used */
 396  #endif
 397  };
 398
 399 @@ -1598,7 +1596,7 @@ int find_suitable_fallback(struct free_area *area, unsigned int order,
 400         *can_steal = false;
 401         for (i = 0;; i++) {
 402                 fallback_mt = fallbacks[migratetype][i];
 403 -               if (fallback_mt == MIGRATE_RESERVE)
 404 +               if (fallback_mt == MIGRATE_TYPES)
 405                         break;
 406
 407                 if (list_empty(&area->free_list[fallback_mt]))
 408 @@ -1676,25 +1674,13 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order,
 409  {
 410         struct page *page;
 411
 412 -retry_reserve:
 413         page = __rmqueue_smallest(zone, order, migratetype);
 414 -
 415 -       if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {
 416 +       if (unlikely(!page)) {
 417                 if (migratetype == MIGRATE_MOVABLE)
 418                         page = __rmqueue_cma_fallback(zone, order);
 419
 420                 if (!page)
 421                         page = __rmqueue_fallback(zone, order, migratetype);
 422 -
 423 -               /*
 424 -                * Use MIGRATE_RESERVE rather than fail an allocation. goto
 425 -                * is used because __rmqueue_smallest is an inline function
 426 -                * and we want just one call site
 427 -                */
 428 -               if (!page) {
 429 -                       migratetype = MIGRATE_RESERVE;
 430 -                       goto retry_reserve;
 431 -               }
 432         }
 433
 434         trace_mm_page_alloc_zone_locked(page, order, migratetype);
 435 @@ -3492,7 +3478,6 @@ static void show_migration_types(unsigned char type)
 436                 [MIGRATE_UNMOVABLE]     = 'U',
 437                 [MIGRATE_RECLAIMABLE]   = 'E',
 438                 [MIGRATE_MOVABLE]       = 'M',
 439 -               [MIGRATE_RESERVE]       = 'R',
 440  #ifdef CONFIG_CMA
 441                 [MIGRATE_CMA]           = 'C',
 442  #endif
 443 @@ -4303,120 +4288,6 @@ static inline unsigned long wait_table_bits(unsigned long size)
 444  }
 445
 446  /*
 447 - * Check if a pageblock contains reserved pages
 448 - */
 449 -static int pageblock_is_reserved(unsigned long start_pfn, unsigned long end_pfn)
 450 -{
 451 -       unsigned long pfn;
 452 -
 453 -       for (pfn = start_pfn; pfn < end_pfn; pfn++) {
 454 -               if (!pfn_valid_within(pfn) || PageReserved(pfn_to_page(pfn)))
 455 -                       return 1;
 456 -       }
 457 -       return 0;
 458 -}
 459 -
 460 -/*
 461 - * Mark a number of pageblocks as MIGRATE_RESERVE. The number
 462 - * of blocks reserved is based on min_wmark_pages(zone). The memory within
 463 - * the reserve will tend to store contiguous free pages. Setting min_free_kbytes
 464 - * higher will lead to a bigger reserve which will get freed as contiguous
 465 - * blocks as reclaim kicks in
 466 - */
 467 -static void setup_zone_migrate_reserve(struct zone *zone)
 468 -{
 469 -       unsigned long start_pfn, pfn, end_pfn, block_end_pfn;
 470 -       struct page *page;
 471 -       unsigned long block_migratetype;
 472 -       int reserve;
 473 -       int old_reserve;
 474 -
 475 -       /*
 476 -        * Get the start pfn, end pfn and the number of blocks to reserve
 477 -        * We have to be careful to be aligned to pageblock_nr_pages to
 478 -        * make sure that we always check pfn_valid for the first page in
 479 -        * the block.
 480 -        */
 481 -       start_pfn = zone->zone_start_pfn;
 482 -       end_pfn = zone_end_pfn(zone);
 483 -       start_pfn = roundup(start_pfn, pageblock_nr_pages);
 484 -       reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >>
 485 -                                                       pageblock_order;
 486 -
 487 -       /*
 488 -        * Reserve blocks are generally in place to help high-order atomic
 489 -        * allocations that are short-lived. A min_free_kbytes value that
 490 -        * would result in more than 2 reserve blocks for atomic allocations
 491 -        * is assumed to be in place to help anti-fragmentation for the
 492 -        * future allocation of hugepages at runtime.
 493 -        */
 494 -       reserve = min(2, reserve);
 495 -       old_reserve = zone->nr_migrate_reserve_block;
 496 -
 497 -       /* When memory hot-add, we almost always need to do nothing */
 498 -       if (reserve == old_reserve)
 499 -               return;
 500 -       zone->nr_migrate_reserve_block = reserve;
 501 -
 502 -       for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
 503 -               if (!early_page_nid_uninitialised(pfn, zone_to_nid(zone)))
 504 -                       return;
 505 -
 506 -               if (!pfn_valid(pfn))
 507 -                       continue;
 508 -               page = pfn_to_page(pfn);
 509 -
 510 -               /* Watch out for overlapping nodes */
 511 -               if (page_to_nid(page) != zone_to_nid(zone))
 512 -                       continue;
 513 -
 514 -               block_migratetype = get_pageblock_migratetype(page);
 515 -
 516 -               /* Only test what is necessary when the reserves are not met */
 517 -               if (reserve > 0) {
 518 -                       /*
 519 -                        * Blocks with reserved pages will never free, skip
 520 -                        * them.
 521 -                        */
 522 -                       block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
 523 -                       if (pageblock_is_reserved(pfn, block_end_pfn))
 524 -                               continue;
 525 -
 526 -                       /* If this block is reserved, account for it */
 527 -                       if (block_migratetype == MIGRATE_RESERVE) {
 528 -                               reserve--;
 529 -                               continue;
 530 -                       }
 531 -
 532 -                       /* Suitable for reserving if this block is movable */
 533 -                       if (block_migratetype == MIGRATE_MOVABLE) {
 534 -                               set_pageblock_migratetype(page,
 535 -                                                       MIGRATE_RESERVE);
 536 -                               move_freepages_block(zone, page,
 537 -                                                       MIGRATE_RESERVE);
 538 -                               reserve--;
 539 -                               continue;
 540 -                       }
 541 -               } else if (!old_reserve) {
 542 -                       /*
 543 -                        * At boot time we don't need to scan the whole zone
 544 -                        * for turning off MIGRATE_RESERVE.
 545 -                        */
 546 -                       break;
 547 -               }
 548 -
 549 -               /*
 550 -                * If the reserve is met and this is a previous reserved block,
 551 -                * take it back
 552 -                */
 553 -               if (block_migratetype == MIGRATE_RESERVE) {
 554 -                       set_pageblock_migratetype(page, MIGRATE_MOVABLE);
 555 -                       move_freepages_block(zone, page, MIGRATE_MOVABLE);
 556 -               }
 557 -       }
 558 -}
 559 -
 560 -/*
 561   * Initially all pages are reserved - free ones are freed
 562   * up by free_all_bootmem() once the early boot process is
 563   * done. Non-atomic initialization, single-pass.
 564 @@ -4455,9 +4326,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
 565                  * movable at startup. This will force kernel allocations
 566                  * to reserve their blocks rather than leaking throughout
 567                  * the address space during boot when many long-lived
 568 -                * kernel allocations are made. Later some blocks near
 569 -                * the start are marked MIGRATE_RESERVE by
 570 -                * setup_zone_migrate_reserve()
 571 +                * kernel allocations are made.
 572                  *
 573                  * bitmap is created for zone's valid pfn range. but memmap
 574                  * can be created for invalid pages (for alignment)
 575 @@ -6018,7 +5887,6 @@ static void __setup_per_zone_wmarks(void)
 576                         high_wmark_pages(zone) - low_wmark_pages(zone) -
 577                         atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]));
 578
 579 -               setup_zone_migrate_reserve(zone);
 580                 spin_unlock_irqrestore(&zone->lock, flags);
 581         }
 582
 583 diff --git a/mm/vmstat.c b/mm/vmstat.c
 584 index ffcb4f5..5b289dc 100644
 585 --- a/mm/vmstat.c
 586 +++ b/mm/vmstat.c
 587 @@ -923,7 +923,6 @@ static char * const migratetype_names[MIGRATE_TYPES] = {
 588         "Unmovable",
 589         "Reclaimable",
 590         "Movable",
 591 -       "Reserve",
 592  #ifdef CONFIG_CMA
 593         "CMA",
 594  #endif
 595 diff --git a/mm/backing-dev.c b/mm/backing-dev.c
 596 index 8ed2ffd963c5..7340353f8aea 100644
 597 --- a/mm/backing-dev.c
 598 +++ b/mm/backing-dev.c
 599 @@ -957,8 +957,9 @@ EXPORT_SYMBOL(congestion_wait);
 600   * jiffies for either a BDI to exit congestion of the given @sync queue
 601   * or a write to complete.
 602   *
 603 - * In the absence of zone congestion, cond_resched() is called to yield
 604 - * the processor if necessary but otherwise does not sleep.
 605 + * In the absence of zone congestion, a short sleep or a cond_resched is
 606 + * performed to yield the processor and to allow other subsystems to make
 607 + * a forward progress.
 608   *
 609   * The return value is 0 if the sleep is for the full timeout. Otherwise,
 610   * it is the number of jiffies that were still remaining when the function
 611 @@ -978,7 +979,19 @@ long wait_iff_congested(struct zone *zone, int sync, long timeout)
 612          */
 613         if (atomic_read(&nr_wb_congested[sync]) == 0 ||
 614             !test_bit(ZONE_CONGESTED, &zone->flags)) {
 615 -               cond_resched();
 616 +
 617 +               /*
 618 +                * Memory allocation/reclaim might be called from a WQ
 619 +                * context and the current implementation of the WQ
 620 +                * concurrency control doesn't recognize that a particular
 621 +                * WQ is congested if the worker thread is looping without
 622 +                * ever sleeping. Therefore we have to do a short sleep
 623 +                * here rather than calling cond_resched().
 624 +                */
 625 +               if (current->flags & PF_WQ_WORKER)
 626 +                       schedule_timeout(1);
 627 +               else
 628 +                       cond_resched();
 629
 630                 /* In case we scheduled, work out time remaining */
 631                 ret = timeout - (jiffies - start);
 632 diff --git a/mm/vmstat.c b/mm/vmstat.c
 633 index 45dcbcb5c594..0975da8e3432 100644
 634 --- a/mm/vmstat.c
 635 +++ b/mm/vmstat.c
 636 @@ -1381,6 +1381,7 @@ static const struct file_operations proc_vmstat_file_operations = {
 637  #endif /* CONFIG_PROC_FS */
 638
 639  #ifdef CONFIG_SMP
 640 +static struct workqueue_struct *vmstat_wq;
 641  static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
 642  int sysctl_stat_interval __read_mostly = HZ;
 643  static cpumask_var_t cpu_stat_off;
 644 @@ -1393,7 +1394,7 @@ static void vmstat_update(struct work_struct *w)
 645                  * to occur in the future. Keep on running the
 646                  * update worker thread.
 647                  */
 648 -               schedule_delayed_work_on(smp_processor_id(),
 649 +               queue_delayed_work_on(smp_processor_id(), vmstat_wq,
 650                         this_cpu_ptr(&vmstat_work),
 651                         round_jiffies_relative(sysctl_stat_interval));
 652         } else {
 653 @@ -1462,7 +1463,7 @@ static void vmstat_shepherd(struct work_struct *w)
 654                 if (need_update(cpu) &&
 655                         cpumask_test_and_clear_cpu(cpu, cpu_stat_off))
 656
 657 -                       schedule_delayed_work_on(cpu,
 658 +                       queue_delayed_work_on(cpu, vmstat_wq,
 659                                 &per_cpu(vmstat_work, cpu), 0);
 660
 661         put_online_cpus();
 662 @@ -1551,6 +1552,7 @@ static int __init setup_vmstat(void)
 663
 664         start_shepherd_timer();
 665         cpu_notifier_register_done();
 666 +       vmstat_wq = alloc_workqueue("vmstat", WQ_FREEZABLE|WQ_MEM_RECLAIM, 0);
 667  #endif
 668  #ifdef CONFIG_PROC_FS
 669         proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations);
 670 --
 671 2.6.2
 672
 673 From 09ccfd238e5a0e670d8178cf50180ea81ae09ae1 Mon Sep 17 00:00:00 2001
 674 From: WANG Cong <xiyou.wangcong@gmail.com>
 675 Date: Mon, 14 Dec 2015 13:48:36 -0800
 676 Subject: pptp: verify sockaddr_len in pptp_bind() and pptp_connect()
 677
 678 Reported-by: Dmitry Vyukov <dvyukov@gmail.com>
 679 Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
 680 Signed-off-by: David S. Miller <davem@davemloft.net>
 681 ---
 682  drivers/net/ppp/pptp.c | 6 ++++++
 683  1 file changed, 6 insertions(+)
 684
 685 diff --git a/drivers/net/ppp/pptp.c b/drivers/net/ppp/pptp.c
 686 index fc69e41..597c53e 100644
 687 --- a/drivers/net/ppp/pptp.c
 688 +++ b/drivers/net/ppp/pptp.c
 689 @@ -419,6 +419,9 @@ static int pptp_bind(struct socket *sock, struct sockaddr *uservaddr,
 690         struct pptp_opt *opt = &po->proto.pptp;
 691         int error = 0;
 692
 693 +       if (sockaddr_len < sizeof(struct sockaddr_pppox))
 694 +               return -EINVAL;
 695 +
 696         lock_sock(sk);
 697
 698         opt->src_addr = sp->sa_addr.pptp;
 699 @@ -440,6 +443,9 @@ static int pptp_connect(struct socket *sock, struct sockaddr *uservaddr,
 700         struct flowi4 fl4;
 701         int error = 0;
 702
 703 +       if (sockaddr_len < sizeof(struct sockaddr_pppox))
 704 +               return -EINVAL;
 705 +
 706         if (sp->sa_protocol != PX_PROTO_PPTP)
 707                 return -EINVAL;
 708
 709 --
 710 cgit v0.11.2
 711
 712 commit cc57858831e3e9678291de730c4b4d2e52a19f59
 713 Author: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
 714 Date:   Fri Dec 18 15:19:16 2015 +1100
 715
 716     md/raid10: fix data corruption and crash during resync
 717
 718     The commit c31df25f20e3 ("md/raid10: make sync_request_write() call
 719     bio_copy_data()") replaced manual data copying with bio_copy_data() but
 720     it doesn't work as intended. The source bio (fbio) is already processed,
 721     so its bvec_iter has bi_size == 0 and bi_idx == bi_vcnt.  Because of
 722     this, bio_copy_data() either does not copy anything, or worse, copies
 723     data from the ->bi_next bio if it is set.  This causes wrong data to be
 724     written to drives during resync and sometimes lockups/crashes in
 725     bio_copy_data():
 726
 727     [  517.338478] NMI watchdog: BUG: soft lockup - CPU#0 stuck for 22s! [md126_raid10:3319]
 728     [  517.347324] Modules linked in: raid10 xt_CHECKSUM ipt_MASQUERADE nf_nat_masquerade_ipv4 tun ip6t_rpfilter ip6t_REJECT nf_reject_ipv6 ipt_REJECT nf_reject_ipv4 xt_conntrack ebtable_nat ebtable_broute bridge stp llc ebtable_filter ebtables ip6table_nat nf_conntrack_ipv6 nf_defrag_ipv6 nf_nat_ipv6 ip6table_mangle ip6table_security ip6table_raw ip6table_filter ip6_tables iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 nf_nat nf_conntrack iptable_mangle iptable_security iptable_raw iptable_filter ip_tables x86_pkg_temp_thermal coretemp kvm_intel kvm crct10dif_pclmul crc32_pclmul cryptd shpchp pcspkr ipmi_si ipmi_msghandler tpm_crb acpi_power_meter acpi_cpufreq ext4 mbcache jbd2 sr_mod cdrom sd_mod e1000e ax88179_178a usbnet mii ahci ata_generic crc32c_intel libahci ptp pata_acpi libata pps_core wmi sunrpc dm_mirror dm_region_hash dm_log dm_mod
 729     [  517.440555] CPU: 0 PID: 3319 Comm: md126_raid10 Not tainted 4.3.0-rc6+ #1
 730     [  517.448384] Hardware name: Intel Corporation PURLEY/PURLEY, BIOS PLYDCRB1.86B.0055.D14.1509221924 09/22/2015
 731     [  517.459768] task: ffff880153773980 ti: ffff880150df8000 task.ti: ffff880150df8000
 732     [  517.468529] RIP: 0010:[<ffffffff812e1888>]  [<ffffffff812e1888>] bio_copy_data+0xc8/0x3c0
 733     [  517.478164] RSP: 0018:ffff880150dfbc98  EFLAGS: 00000246
 734     [  517.484341] RAX: ffff880169356688 RBX: 0000000000001000 RCX: 0000000000000000
 735     [  517.492558] RDX: 0000000000000000 RSI: ffffea0001ac2980 RDI: ffffea0000d835c0
 736     [  517.500773] RBP: ffff880150dfbd08 R08: 0000000000000001 R09: ffff880153773980
 737     [  517.508987] R10: ffff880169356600 R11: 0000000000001000 R12: 0000000000010000
 738     [  517.517199] R13: 000000000000e000 R14: 0000000000000000 R15: 0000000000001000
 739     [  517.525412] FS:  0000000000000000(0000) GS:ffff880174a00000(0000) knlGS:0000000000000000
 740     [  517.534844] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 741     [  517.541507] CR2: 00007f8a044d5fed CR3: 0000000169504000 CR4: 00000000001406f0
 742     [  517.549722] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
 743     [  517.557929] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
 744     [  517.566144] Stack:
 745     [  517.568626]  ffff880174a16bc0 ffff880153773980 ffff880169356600 0000000000000000
 746     [  517.577659]  0000000000000001 0000000000000001 ffff880153773980 ffff88016a61a800
 747     [  517.586715]  ffff880150dfbcf8 0000000000000001 ffff88016dd209e0 0000000000001000
 748     [  517.595773] Call Trace:
 749     [  517.598747]  [<ffffffffa043ef95>] raid10d+0xfc5/0x1690 [raid10]
 750     [  517.605610]  [<ffffffff816697ae>] ? __schedule+0x29e/0x8e2
 751     [  517.611987]  [<ffffffff814ff206>] md_thread+0x106/0x140
 752     [  517.618072]  [<ffffffff810c1d80>] ? wait_woken+0x80/0x80
 753     [  517.624252]  [<ffffffff814ff100>] ? super_1_load+0x520/0x520
 754     [  517.630817]  [<ffffffff8109ef89>] kthread+0xc9/0xe0
 755     [  517.636506]  [<ffffffff8109eec0>] ? flush_kthread_worker+0x70/0x70
 756     [  517.643653]  [<ffffffff8166d99f>] ret_from_fork+0x3f/0x70
 757     [  517.649929]  [<ffffffff8109eec0>] ? flush_kthread_worker+0x70/0x70
 758
 759     Signed-off-by: Artur Paszkiewicz <artur.paszkiewicz@intel.com>
 760     Reviewed-by: Shaohua Li <shli@kernel.org>
 761     Cc: stable@vger.kernel.org (v4.2+)
 762     Fixes: c31df25f20e3 ("md/raid10: make sync_request_write() call bio_copy_data()")
 763     Signed-off-by: NeilBrown <neilb@suse.com>
 764
 765 diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
 766 index 41d70bc..84e597e 100644
 767 --- a/drivers/md/raid10.c
 768 +++ b/drivers/md/raid10.c
 769 @@ -1946,6 +1946,8 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
 770
 771         first = i;
 772         fbio = r10_bio->devs[i].bio;
 773 +       fbio->bi_iter.bi_size = r10_bio->sectors << 9;
 774 +       fbio->bi_iter.bi_idx = 0;
 775
 776         vcnt = (r10_bio->sectors + (PAGE_SIZE >> 9) - 1) >> (PAGE_SHIFT - 9);
 777         /* now find blocks with errors */
 778 @@ -1989,7 +1991,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
 779                 bio_reset(tbio);
 780
 781                 tbio->bi_vcnt = vcnt;
 782 -               tbio->bi_iter.bi_size = r10_bio->sectors << 9;
 783 +               tbio->bi_iter.bi_size = fbio->bi_iter.bi_size;
 784                 tbio->bi_rw = WRITE;
 785                 tbio->bi_private = r10_bio;
 786                 tbio->bi_iter.bi_sector = r10_bio->devs[i].addr;
 787 From: Dave Chinner <dchinner@redhat.com>
 788
 789 When we do dquot readahead in log recovery, we do not use a verifier
 790 as the underlying buffer may not have dquots in it, e.g. because the
 791 allocation operation hasn't yet been replayed. Hence we do not want
 792 to fail recovery because we detect an operation to be replayed has
 793 not been run yet. This problem was addressed for inodes in commit
 794 d891400 ("xfs: inode buffers may not be valid during recovery
 795 readahead") but the problem was not recognised to exist for dquots
 796 and their buffers as the dquot readahead did not have a verifier.
 797
 798 The result of not using a verifier is that when the buffer is then
 799 next read to replay a dquot modification, the dquot buffer verifier
 800 will only be attached to the buffer if *readahead is not complete*.
 801 Hence we can read the buffer, replay the dquot changes and then add
 802 it to the delwri submission list without it having a verifier
 803 attached to it. This then generates warnings in xfs_buf_ioapply(),
 804 which catches and warns about this case.
 805
 806 Fix this and make it handle the same readahead verifier error cases
 807 as for inode buffers by adding a new readahead verifier that has a
 808 write operation as well as a read operation that marks the buffer as
 809 not done if any corruption is detected.  Also make sure we don't run
 810 readahead if the dquot buffer has been marked as cancelled by
 811 recovery.
 812
 813 This will result in readahead either succeeding and the buffer
 814 having a valid write verifier, or readahead failing and the buffer
 815 state requiring the subsequent read to resubmit the IO with the new
 816 verifier.  In either case, this will result in the buffer always
 817 ending up with a valid write verifier on it.
 818
 819 Note: we also need to fix the inode buffer readahead error handling
 820 to mark the buffer with EIO. Brian noticed during review that the code
 821 I copied from there was wrong, so fix it at the same time. Add comments
 822 linking the two functions that handle readahead verifier errors
 823 together so we don't forget this behavioural link in future.
 824
 825 cc: <stable@vger.kernel.org> # 3.12 - current
 826 Signed-off-by: Dave Chinner <dchinner@redhat.com>
 827 ---
 828
 829 Version 2
 830 - fix logic error in determining if verify failed
 831 - set error on buffer when verifier fails
 832 - fix inode buffer readahead verifier to set error when it fails
 833 - better comments, link dquot and inode buffer ra verifiers in the
 834   comments
 835
 836  fs/xfs/libxfs/xfs_dquot_buf.c  | 36 ++++++++++++++++++++++++++++++------
 837  fs/xfs/libxfs/xfs_inode_buf.c  | 14 +++++++++-----
 838  fs/xfs/libxfs/xfs_quota_defs.h |  2 +-
 839  fs/xfs/libxfs/xfs_shared.h     |  1 +
 840  fs/xfs/xfs_log_recover.c       |  9 +++++++--
 841  5 files changed, 48 insertions(+), 14 deletions(-)
 842
 843 diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c
 844 index 11cefb2..3cc3cf7 100644
 845 --- a/fs/xfs/libxfs/xfs_dquot_buf.c
 846 +++ b/fs/xfs/libxfs/xfs_dquot_buf.c
 847 @@ -54,7 +54,7 @@ xfs_dqcheck(
 848         xfs_dqid_t       id,
 849         uint             type,    /* used only when IO_dorepair is true */
 850         uint             flags,
 851 -       char             *str)
 852 +       const char       *str)
 853  {
 854         xfs_dqblk_t      *d = (xfs_dqblk_t *)ddq;
 855         int             errs = 0;
 856 @@ -207,7 +207,8 @@ xfs_dquot_buf_verify_crc(
 857  STATIC bool
 858  xfs_dquot_buf_verify(
 859         struct xfs_mount        *mp,
 860 -       struct xfs_buf          *bp)
 861 +       struct xfs_buf          *bp,
 862 +       int                     warn)
 863  {
 864         struct xfs_dqblk        *d = (struct xfs_dqblk *)bp->b_addr;
 865         xfs_dqid_t              id = 0;
 866 @@ -240,8 +241,7 @@ xfs_dquot_buf_verify(
 867                 if (i == 0)
 868                         id = be32_to_cpu(ddq->d_id);
 869
 870 -               error = xfs_dqcheck(mp, ddq, id + i, 0, XFS_QMOPT_DOWARN,
 871 -                                      "xfs_dquot_buf_verify");
 872 +               error = xfs_dqcheck(mp, ddq, id + i, 0, warn, __func__);
 873                 if (error)
 874                         return false;
 875         }
 876 @@ -256,7 +256,7 @@ xfs_dquot_buf_read_verify(
 877
 878         if (!xfs_dquot_buf_verify_crc(mp, bp))
 879                 xfs_buf_ioerror(bp, -EFSBADCRC);
 880 -       else if (!xfs_dquot_buf_verify(mp, bp))
 881 +       else if (!xfs_dquot_buf_verify(mp, bp, XFS_QMOPT_DOWARN))
 882                 xfs_buf_ioerror(bp, -EFSCORRUPTED);
 883
 884         if (bp->b_error)
 885 @@ -264,6 +264,25 @@ xfs_dquot_buf_read_verify(
 886  }
 887
 888  /*
 889 + * readahead errors are silent and simply leave the buffer as !done so a real
 890 + * read will then be run with the xfs_dquot_buf_ops verifier. See
 891 + * xfs_inode_buf_verify() for why we use EIO and ~XBF_DONE here rather than
 892 + * reporting the failure.
 893 + */
 894 +static void
 895 +xfs_dquot_buf_readahead_verify(
 896 +       struct xfs_buf  *bp)
 897 +{
 898 +       struct xfs_mount        *mp = bp->b_target->bt_mount;
 899 +
 900 +       if (!xfs_dquot_buf_verify_crc(mp, bp) ||
 901 +           !xfs_dquot_buf_verify(mp, bp, 0)) {
 902 +               xfs_buf_ioerror(bp, -EIO);
 903 +               bp->b_flags &= ~XBF_DONE;
 904 +       }
 905 +}
 906 +
 907 +/*
 908   * we don't calculate the CRC here as that is done when the dquot is flushed to
 909   * the buffer after the update is done. This ensures that the dquot in the
 910   * buffer always has an up-to-date CRC value.
 911 @@ -274,7 +293,7 @@ xfs_dquot_buf_write_verify(
 912  {
 913         struct xfs_mount        *mp = bp->b_target->bt_mount;
 914
 915 -       if (!xfs_dquot_buf_verify(mp, bp)) {
 916 +       if (!xfs_dquot_buf_verify(mp, bp, XFS_QMOPT_DOWARN)) {
 917                 xfs_buf_ioerror(bp, -EFSCORRUPTED);
 918                 xfs_verifier_error(bp);
 919                 return;
 920 @@ -287,3 +306,8 @@ const struct xfs_buf_ops xfs_dquot_buf_ops = {
 921         .verify_write = xfs_dquot_buf_write_verify,
 922  };
 923
 924 +const struct xfs_buf_ops xfs_dquot_buf_ra_ops = {
 925 +
 926 +       .verify_read = xfs_dquot_buf_readahead_verify,
 927 +       .verify_write = xfs_dquot_buf_write_verify,
 928 +};
 929 diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
 930 index 1b8d98a..4816209 100644
 931 --- a/fs/xfs/libxfs/xfs_inode_buf.c
 932 +++ b/fs/xfs/libxfs/xfs_inode_buf.c
 933 @@ -62,11 +62,14 @@ xfs_inobp_check(
 934   * has not had the inode cores stamped into it. Hence for readahead, the buffer
 935   * may be potentially invalid.
 936   *
 937 - * If the readahead buffer is invalid, we don't want to mark it with an error,
 938 - * but we do want to clear the DONE status of the buffer so that a followup read
 939 - * will re-read it from disk. This will ensure that we don't get an unnecessary
 940 - * warnings during log recovery and we don't get unnecssary panics on debug
 941 - * kernels.
 942 + * If the readahead buffer is invalid, we need to mark it with an error and
 943 + * clear the DONE status of the buffer so that a followup read will re-read it
 944 + * from disk. We don't report the error otherwise to avoid warnings during log
 945 + * recovery and we don't get unnecssary panics on debug kernels. We use EIO here
 946 + * because all we want to do is say readahead failed; there is no-one to report
 947 + * the error to, so this will distinguish it from a non-ra verifier failure.
 948 + * Changes to this readahead error behavour also need to be reflected in
 949 + * xfs_dquot_buf_readahead_verify().
 950   */
 951  static void
 952  xfs_inode_buf_verify(
 953 @@ -92,6 +95,7 @@ xfs_inode_buf_verify(
 954                                                 XFS_ERRTAG_ITOBP_INOTOBP,
 955                                                 XFS_RANDOM_ITOBP_INOTOBP))) {
 956                         if (readahead) {
 957 +                               xfs_buf_ioerror(bp, -EIO);
 958                                 bp->b_flags &= ~XBF_DONE;
 959                                 return;
 960                         }
 961 diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h
 962 index 1b0a083..f51078f 100644
 963 --- a/fs/xfs/libxfs/xfs_quota_defs.h
 964 +++ b/fs/xfs/libxfs/xfs_quota_defs.h
 965 @@ -153,7 +153,7 @@ typedef __uint16_t  xfs_qwarncnt_t;
 966  #define XFS_QMOPT_RESBLK_MASK  (XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_RES_RTBLKS)
 967
 968  extern int xfs_dqcheck(struct xfs_mount *mp, xfs_disk_dquot_t *ddq,
 969 -                      xfs_dqid_t id, uint type, uint flags, char *str);
 970 +                      xfs_dqid_t id, uint type, uint flags, const char *str);
 971  extern int xfs_calc_dquots_per_chunk(unsigned int nbblks);
 972
 973  #endif /* __XFS_QUOTA_H__ */
 974 diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
 975 index 5be5297..15c3ceb 100644
 976 --- a/fs/xfs/libxfs/xfs_shared.h
 977 +++ b/fs/xfs/libxfs/xfs_shared.h
 978 @@ -49,6 +49,7 @@ extern const struct xfs_buf_ops xfs_inobt_buf_ops;
 979  extern const struct xfs_buf_ops xfs_inode_buf_ops;
 980  extern const struct xfs_buf_ops xfs_inode_buf_ra_ops;
 981  extern const struct xfs_buf_ops xfs_dquot_buf_ops;
 982 +extern const struct xfs_buf_ops xfs_dquot_buf_ra_ops;
 983  extern const struct xfs_buf_ops xfs_sb_buf_ops;
 984  extern const struct xfs_buf_ops xfs_sb_quiet_buf_ops;
 985  extern const struct xfs_buf_ops xfs_symlink_buf_ops;
 986 diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
 987 index 26e67b4..da37beb 100644
 988 --- a/fs/xfs/xfs_log_recover.c
 989 +++ b/fs/xfs/xfs_log_recover.c
 990 @@ -3521,6 +3521,7 @@ xlog_recover_dquot_ra_pass2(
 991         struct xfs_disk_dquot   *recddq;
 992         struct xfs_dq_logformat *dq_f;
 993         uint                    type;
 994 +       int                     len;
 995
 996
 997         if (mp->m_qflags == 0)
 998 @@ -3541,8 +3542,12 @@ xlog_recover_dquot_ra_pass2(
 999         ASSERT(dq_f);
1000         ASSERT(dq_f->qlf_len == 1);
1001
1002 -       xfs_buf_readahead(mp->m_ddev_targp, dq_f->qlf_blkno,
1003 -                         XFS_FSB_TO_BB(mp, dq_f->qlf_len), NULL);
1004 +       len = XFS_FSB_TO_BB(mp, dq_f->qlf_len);
1005 +       if (xlog_peek_buffer_cancelled(log, dq_f->qlf_blkno, len, 0))
1006 +               return;
1007 +
1008 +       xfs_buf_readahead(mp->m_ddev_targp, dq_f->qlf_blkno, len,
1009 +                         &xfs_dquot_buf_ra_ops);
1010  }
1011
1012  STATIC void
1013
1014 _______________________________________________
1015 xfs mailing list
1016 xfs@oss.sgi.com
1017 http://oss.sgi.com/mailman/listinfo/xfs
1018 From: Michal Hocko <mhocko@suse.com>
1019
1020 kernel test robot has reported the following crash:
1021 [    3.870718] BUG: unable to handle kernel NULL pointer dereferenceNULL pointer dereference at 00000100
1022  at 00000100
1023 [    3.872615] IP: [<c1074df6>] __queue_work+0x26/0x390 [<c1074df6>] __queue_work+0x26/0x390
1024 [    3.873758] *pdpt = 0000000000000000 *pde = f000ff53f000ff53 *pde = f000ff53f000ff53
1025 [    3.875096] Oops: 0000 [#1] PREEMPT PREEMPT SMP SMP
1026 [    3.876130] CPU: 0 PID: 24 Comm: kworker/0:1 Not tainted 4.4.0-rc4-00139-g373ccbe #1
1027 [    3.878135] Workqueue: events vmstat_shepherd
1028 [    3.879207] task: cb684600 ti: cb7ba000 task.ti: cb7ba000
1029 [    3.880445] EIP: 0060:[<c1074df6>] EFLAGS: 00010046 CPU: 0
1030 [    3.881704] EIP is at __queue_work+0x26/0x390
1031 [    3.882823] EAX: 00000046 EBX: cbb37800 ECX: cbb37800 EDX: 00000000
1032 [    3.884457] ESI: 00000000 EDI: 00000000 EBP: cb7bbe68 ESP: cb7bbe38
1033 [    3.886005]  DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068
1034 [    3.887229] CR0: 8005003b CR2: 00000100 CR3: 01fd5000 CR4: 000006b0
1035 [    3.888663] Stack:
1036 [    3.895204] Call Trace:
1037 [    3.895854]  [<c1a381dd>] ? mutex_unlock+0xd/0x10
1038 [    3.897120]  [<c1075221>] __queue_delayed_work+0xa1/0x160
1039 [    3.898530]  [<c10764c6>] queue_delayed_work_on+0x36/0x60
1040 [    3.899790]  [<c11494bd>] vmstat_shepherd+0xad/0xf0
1041 [    3.900899]  [<c1075a7a>] process_one_work+0x1aa/0x4c0
1042 [    3.902093]  [<c10759e2>] ? process_one_work+0x112/0x4c0
1043 [    3.903520]  [<c10ac31e>] ? do_raw_spin_lock+0xe/0x150
1044 [    3.904853]  [<c1075dd1>] worker_thread+0x41/0x440
1045 [    3.906023]  [<c1075d90>] ? process_one_work+0x4c0/0x4c0
1046 [    3.907242]  [<c107b7c0>] kthread+0xb0/0xd0
1047 [    3.908188]  [<c1a3c651>] ret_from_kernel_thread+0x21/0x40
1048 [    3.909601]  [<c107b710>] ? __kthread_parkme+0x80/0x80
1049
1050 The reason is that start_shepherd_timer schedules the shepherd work item
1051 which uses vmstat_wq (vmstat_shepherd) before setup_vmstat allocates
1052 that workqueue, so if the further initialization takes more than HZ,
1053 we might end up scheduling on a NULL vmstat_wq. This is really unlikely
1054 but not impossible.
1055
1056 Fixes: 373ccbe59270 ("mm, vmstat: allow WQ concurrency to discover memory reclaim doesn't make any progress")
1057 Reported-by: kernel test robot <ying.huang@linux.intel.com>
1058 Signed-off-by: Michal Hocko <mhocko@suse.com>
1059 ---
1060 Hi Linus,
1061 I am not marking this for stable because I hope we can sneak it into 4.4.
1062 The patch is trivial and obvious. I am sorry about the breakage. If you prefer
1063 to postpone it to 4.5-rc1 because this is not really that critical and shouldn't
1064 happen most of the time then I will repost with stable tag added.
1065
1066 Thanks!
1067
1068  mm/vmstat.c | 2 +-
1069  1 file changed, 1 insertion(+), 1 deletion(-)
1070
1071 diff --git a/mm/vmstat.c b/mm/vmstat.c
1072 index 4ebc17d948cb..c54fd2924f25 100644
1073 --- a/mm/vmstat.c
1074 +++ b/mm/vmstat.c
1075 @@ -1483,6 +1483,7 @@ static void __init start_shepherd_timer(void)
1076                 BUG();
1077         cpumask_copy(cpu_stat_off, cpu_online_mask);
1078
1079 +       vmstat_wq = alloc_workqueue("vmstat", WQ_FREEZABLE|WQ_MEM_RECLAIM, 0);
1080         schedule_delayed_work(&shepherd,
1081                 round_jiffies_relative(sysctl_stat_interval));
1082  }
1083 @@ -1550,7 +1551,6 @@ static int __init setup_vmstat(void)
1084
1085         start_shepherd_timer();
1086         cpu_notifier_register_done();
1087 -       vmstat_wq = alloc_workqueue("vmstat", WQ_FREEZABLE|WQ_MEM_RECLAIM, 0);
1088  #endif
1089  #ifdef CONFIG_PROC_FS
1090         proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations);
1091 --
1092 2.6.4
1093