- rel 2

[packages/kernel.git] / kernel-small_fixes.patch
diff --git a/kernel-small_fixes.patch b/kernel-small_fixes.patch

index 392d1212b54690cc4ea01e399c80095dce704114..5fcf48ab9e00060302ea72f256a091861e15d1f1 100644 (file)
--- a/kernel-small_fixes.patch
+++ b/kernel-small_fixes.patch
@@ -1,48 +1,318 @@
---- linux-2.6.33/scripts/mod/modpost.c~        2010-02-24 19:52:17.000000000 +0100
-+++ linux-2.6.33/scripts/mod/modpost.c 2010-03-07 14:26:47.242168558 +0100
-@@ -15,7 +15,8 @@
- #include <stdio.h>
- #include <ctype.h>
- #include "modpost.h"
--#include "../../include/generated/autoconf.h"
-+// PLD architectures don't use CONFIG_SYMBOL_PREFIX
-+//#include "../../include/generated/autoconf.h"
- #include "../../include/linux/license.h"
+From: Shaohua Li <shli@fb.com>
+
+Basically this is a copy of commit 001e4a8775f6 ("ext4: implement cgroup
+writeback support"). Tested with a fio test: verified that writeback is
+throttled against the cgroup's io.max write bandwidth, and that after
+moving the fio test to another cgroup the writeback is throttled against
+the new cgroup's setting.
+
+Cc: Tejun Heo <tj@kernel.org>
+Signed-off-by: Shaohua Li <shli@fb.com>
+---
+ fs/xfs/xfs_aops.c  | 2 ++
+ fs/xfs/xfs_super.c | 1 +
+ 2 files changed, 3 insertions(+)
+
+--- linux-4.19/fs/xfs/xfs_aops.c.org   2018-11-21 10:31:12.348955352 +0100
++++ linux-4.19/fs/xfs/xfs_aops.c       2018-11-21 10:34:35.241764742 +0100
+@@ -613,8 +613,10 @@ xfs_add_to_ioend(
+                       list_add(&wpc->ioend->io_list, iolist);
+               wpc->ioend = xfs_alloc_ioend(inode, wpc->io_type, offset,
+                               bdev, sector);
++              wbc_init_bio(wbc, wpc->ioend->io_bio);
+       }
   
- /* Some toolchains use a `_' prefix for all user symbols. */
-
---- linux-3.0/scripts/kconfig/lxdialog/check-lxdialog.sh~      2011-07-22 04:17:23.000000000 +0200
-+++ linux-3.0/scripts/kconfig/lxdialog/check-lxdialog.sh       2011-08-25 21:26:04.799150642 +0200
-@@ -9,6 +9,12 @@
-                       $cc -print-file-name=lib${lib}.${ext} | grep -q /
-                       if [ $? -eq 0 ]; then
-                               echo "-l${lib}"
-+                              for libt in tinfow tinfo ; do
-+                                      $cc -print-file-name=lib${libt}.${ext} | grep -q /
-+                                      if [ $? -eq 0 ]; then
-+                                              echo "-l${libt}"
-+                                      fi
-+                              done
-                               exit
-                       fi
-               done
-
-diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c
-index 7a0c800..ec5ebbb 100644
---- a/drivers/net/ethernet/realtek/r8169.c
-+++ b/drivers/net/ethernet/realtek/r8169.c
-@@ -6927,6 +6927,14 @@ rtl_init_one(struct pci_dev *pdev, const
-       for (i = 0; i < ETH_ALEN; i++)
-               dev->dev_addr[i] = RTL_R8(MAC0 + i);
++      wbc_account_io(wbc, page, len);
+       if (!__bio_try_merge_page(wpc->ioend->io_bio, page, len, poff)) {
+               if (iop)
+                       atomic_inc(&iop->write_count);
+diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
+index 584cf2d..aea3bc2 100644
+--- a/fs/xfs/xfs_super.c
++++ b/fs/xfs/xfs_super.c
+@@ -1634,6 +1634,7 @@ xfs_fs_fill_super(
+       sb->s_max_links = XFS_MAXLINK;
+       sb->s_time_gran = 1;
+       set_posix_acl_flag(sb);
++      sb->s_iflags |= SB_I_CGROUPWB;
+ 
+       /* version 5 superblocks support inode version counters. */
+       if (XFS_SB_VERSION_NUM(&mp->m_sb) == XFS_SB_VERSION_5)
+From e820d55cb99dd93ac2dc949cf486bb187e5cd70d Mon Sep 17 00:00:00 2001
+From: Guoqing Jiang <gqjiang@suse.com>
+Date: Wed, 19 Dec 2018 14:19:25 +0800
+Subject: md: fix raid10 hang issue caused by barrier
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+When regular IO and resync IO happen at the same time, and the
+regular IO also needs to be split, tasks can hang due to the
+barrier.
+
+1. resync thread
+[ 1463.757205] INFO: task md1_resync:5215 blocked for more than 480 seconds.
+[ 1463.757207]       Not tainted 4.19.5-1-default #1
+[ 1463.757209] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
+[ 1463.757212] md1_resync      D    0  5215      2 0x80000000
+[ 1463.757216] Call Trace:
+[ 1463.757223]  ? __schedule+0x29a/0x880
+[ 1463.757231]  ? raise_barrier+0x8d/0x140 [raid10]
+[ 1463.757236]  schedule+0x78/0x110
+[ 1463.757243]  raise_barrier+0x8d/0x140 [raid10]
+[ 1463.757248]  ? wait_woken+0x80/0x80
+[ 1463.757257]  raid10_sync_request+0x1f6/0x1e30 [raid10]
+[ 1463.757265]  ? _raw_spin_unlock_irq+0x22/0x40
+[ 1463.757284]  ? is_mddev_idle+0x125/0x137 [md_mod]
+[ 1463.757302]  md_do_sync.cold.78+0x404/0x969 [md_mod]
+[ 1463.757311]  ? wait_woken+0x80/0x80
+[ 1463.757336]  ? md_rdev_init+0xb0/0xb0 [md_mod]
+[ 1463.757351]  md_thread+0xe9/0x140 [md_mod]
+[ 1463.757358]  ? _raw_spin_unlock_irqrestore+0x2e/0x60
+[ 1463.757364]  ? __kthread_parkme+0x4c/0x70
+[ 1463.757369]  kthread+0x112/0x130
+[ 1463.757374]  ? kthread_create_worker_on_cpu+0x40/0x40
+[ 1463.757380]  ret_from_fork+0x3a/0x50
+
+2. regular IO
+[ 1463.760679] INFO: task kworker/0:8:5367 blocked for more than 480 seconds.
+[ 1463.760683]       Not tainted 4.19.5-1-default #1
+[ 1463.760684] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
+[ 1463.760687] kworker/0:8     D    0  5367      2 0x80000000
+[ 1463.760718] Workqueue: md submit_flushes [md_mod]
+[ 1463.760721] Call Trace:
+[ 1463.760731]  ? __schedule+0x29a/0x880
+[ 1463.760741]  ? wait_barrier+0xdd/0x170 [raid10]
+[ 1463.760746]  schedule+0x78/0x110
+[ 1463.760753]  wait_barrier+0xdd/0x170 [raid10]
+[ 1463.760761]  ? wait_woken+0x80/0x80
+[ 1463.760768]  raid10_write_request+0xf2/0x900 [raid10]
+[ 1463.760774]  ? wait_woken+0x80/0x80
+[ 1463.760778]  ? mempool_alloc+0x55/0x160
+[ 1463.760795]  ? md_write_start+0xa9/0x270 [md_mod]
+[ 1463.760801]  ? try_to_wake_up+0x44/0x470
+[ 1463.760810]  raid10_make_request+0xc1/0x120 [raid10]
+[ 1463.760816]  ? wait_woken+0x80/0x80
+[ 1463.760831]  md_handle_request+0x121/0x190 [md_mod]
+[ 1463.760851]  md_make_request+0x78/0x190 [md_mod]
+[ 1463.760860]  generic_make_request+0x1c6/0x470
+[ 1463.760870]  raid10_write_request+0x77a/0x900 [raid10]
+[ 1463.760875]  ? wait_woken+0x80/0x80
+[ 1463.760879]  ? mempool_alloc+0x55/0x160
+[ 1463.760895]  ? md_write_start+0xa9/0x270 [md_mod]
+[ 1463.760904]  raid10_make_request+0xc1/0x120 [raid10]
+[ 1463.760910]  ? wait_woken+0x80/0x80
+[ 1463.760926]  md_handle_request+0x121/0x190 [md_mod]
+[ 1463.760931]  ? _raw_spin_unlock_irq+0x22/0x40
+[ 1463.760936]  ? finish_task_switch+0x74/0x260
+[ 1463.760954]  submit_flushes+0x21/0x40 [md_mod]
+
+So resync io is waiting for regular write io to complete and
+decrease nr_pending (conf->barrier++ is called before waiting).
+The regular write io splits off another bio after calling
+wait_barrier, which increments nr_pending; the split bio then
+continues with raid10_write_request -> wait_barrier, so it has
+to wait for barrier to become zero, and a deadlock happens as
+follows.
+
+       resync io               regular io
+
+       raise_barrier
+                               wait_barrier
+                               generic_make_request
+                               wait_barrier
+
+To resolve the issue, we need to call allow_barrier to decrease
+nr_pending before generic_make_request since regular IO is not
+issued to the underlying devices, and wait_barrier is called again
+to ensure no internal IO is happening.
+
+Fixes: fc9977dd069e ("md/raid10: simplify the splitting of requests.")
+Reported-and-tested-by: Siniša Bandin <sinisa@4net.rs>
+Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
+Signed-off-by: Shaohua Li <shli@fb.com>
+---
+ drivers/md/raid10.c | 4 ++++
+ 1 file changed, 4 insertions(+)
+
+diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
+index 76c92e31afc0..abb5d382f64d 100644
+--- a/drivers/md/raid10.c
++++ b/drivers/md/raid10.c
+@@ -1209,7 +1209,9 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
+               struct bio *split = bio_split(bio, max_sectors,
+                                             gfp, &conf->bio_split);
+               bio_chain(split, bio);
++              allow_barrier(conf);
+               generic_make_request(bio);
++              wait_barrier(conf);
+               bio = split;
+               r10_bio->master_bio = bio;
+               r10_bio->sectors = max_sectors;
+@@ -1492,7 +1494,9 @@ retry_write:
+               struct bio *split = bio_split(bio, r10_bio->sectors,
+                                             GFP_NOIO, &conf->bio_split);
+               bio_chain(split, bio);
++              allow_barrier(conf);
+               generic_make_request(bio);
++              wait_barrier(conf);
+               bio = split;
+               r10_bio->master_bio = bio;
+       }
+-- 
+cgit 1.2-0.3.lf.el7
+
+From 9c9e935fc038342c48461aabca666f1b544e32b1 Mon Sep 17 00:00:00 2001
+From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
+Date: Sun, 27 Jan 2019 23:51:37 +0900
+Subject: [PATCH v3] oom, oom_reaper: do not enqueue same task twice
+
+Arkadiusz reported that enabling memcg's group oom killing causes
+strange memcg statistics where there is no task in a memcg even though
+the number of tasks in that memcg is not 0. It turned out that there
+is a bug in wake_oom_reaper() which allows enqueuing the same task
+twice, which makes it impossible to decrease the number of tasks in
+that memcg due to a refcount leak.
+
+This bug existed since the OOM reaper became invokable from
+task_will_free_mem(current) path in out_of_memory() in Linux 4.7,
+but memcg's group oom killing made it easier to trigger this bug by
+calling wake_oom_reaper() on the same task from one out_of_memory()
+request.
+
+Fix this bug using an approach used by commit 855b018325737f76
+("oom, oom_reaper: disable oom_reaper for oom_kill_allocating_task").
+As a side effect of this patch, this patch also avoids enqueuing
+multiple threads sharing memory via task_will_free_mem(current) path.
+
+Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
+Reported-by: Arkadiusz Miśkiewicz <arekm@maven.pl>
+Tested-by: Arkadiusz Miśkiewicz <arekm@maven.pl>
+Fixes: af8e15cc85a25315 ("oom, oom_reaper: do not enqueue task if it is on the oom_reaper_list head")
+---
+ include/linux/sched/coredump.h | 1 +
+ mm/oom_kill.c                  | 4 ++--
+ 2 files changed, 3 insertions(+), 2 deletions(-)
+
+diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h
+index ec912d0..ecdc654 100644
+--- a/include/linux/sched/coredump.h
++++ b/include/linux/sched/coredump.h
+@@ -71,6 +71,7 @@ static inline int get_dumpable(struct mm_struct *mm)
+ #define MMF_HUGE_ZERO_PAGE    23      /* mm has ever used the global huge zero page */
+ #define MMF_DISABLE_THP               24      /* disable THP for all VMAs */
+ #define MMF_OOM_VICTIM                25      /* mm is the oom victim */
++#define MMF_OOM_REAP_QUEUED   26      /* mm was queued for oom_reaper */
+ #define MMF_DISABLE_THP_MASK  (1 << MMF_DISABLE_THP)
   
-+      if (!is_valid_ether_addr(dev->dev_addr)) {
-+              /* Report it and use a random ethernet address instead */
-+              netdev_err(dev, "Invalid MAC address: %pM\n", dev->dev_addr);
-+              random_ether_addr(dev->dev_addr);
-+              netdev_info(dev, "Using random MAC address: %pM\n",
-+                              dev->dev_addr);
-+      }
-+
-       SET_ETHTOOL_OPS(dev, &rtl8169_ethtool_ops);
-       dev->watchdog_timeo = RTL8169_TX_TIMEOUT;
+ #define MMF_INIT_MASK         (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\
+diff --git a/mm/oom_kill.c b/mm/oom_kill.c
+index f0e8cd9..059e617 100644
+--- a/mm/oom_kill.c
++++ b/mm/oom_kill.c
+@@ -647,8 +647,8 @@ static int oom_reaper(void *unused)
   
+ static void wake_oom_reaper(struct task_struct *tsk)
+ {
+-      /* tsk is already queued? */
+-      if (tsk == oom_reaper_list || tsk->oom_reaper_list)
++      /* mm is already queued? */
++      if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags))
+               return;
+ 
+       get_task_struct(tsk);
+-- 
+1.8.3.1
+
+From: Dave Chinner <dchinner@redhat.com>
+
+This reverts commit a76cf1a474d7dbcd9336b5f5afb0162baa142cf0.
+
+This change causes serious changes to page cache and inode cache
+behaviour and balance, resulting in major performance regressions
+when combining workloads such as large file copies and kernel
+compiles.
+
+https://bugzilla.kernel.org/show_bug.cgi?id=202441
+
+This change is a hack to work around the problems introduced by
+changing how aggressive shrinkers are on small caches in commit
+172b06c32b94 ("mm: slowly shrink slabs with a relatively small
+number of objects"). It creates more problems than it solves, wasn't
+adequately reviewed or tested, so it needs to be reverted.
+
+cc: <stable@vger.kernel.org>
+Signed-off-by: Dave Chinner <dchinner@redhat.com>
+---
+ fs/inode.c | 7 ++-----
+ 1 file changed, 2 insertions(+), 5 deletions(-)
+
+diff --git a/fs/inode.c b/fs/inode.c
+index 0cd47fe0dbe5..73432e64f874 100644
+--- a/fs/inode.c
++++ b/fs/inode.c
+@@ -730,11 +730,8 @@ static enum lru_status inode_lru_isolate(struct list_head *item,
+               return LRU_REMOVED;
+       }
+ 
+-      /*
+-       * Recently referenced inodes and inodes with many attached pages
+-       * get one more pass.
+-       */
+-      if (inode->i_state & I_REFERENCED || inode->i_data.nrpages > 1) {
++      /* recently referenced inodes get one more pass */
++      if (inode->i_state & I_REFERENCED) {
+               inode->i_state &= ~I_REFERENCED;
+               spin_unlock(&inode->i_lock);
+               return LRU_ROTATE;
+-- 
+2.20.1
+
+This reverts commit 172b06c32b949759fe6313abec514bc4f15014f4.
+
+This change changes the aggressiveness of shrinker reclaim, causing
+small cache and low priority reclaim to greatly increase
+scanning pressure on small caches. As a result, light memory
+pressure has a disproportionate effect on small caches, and causes
+large caches to be reclaimed much faster than previously.
+
+As a result, it greatly perturbs the delicate balance of the VFS
+caches (dentry/inode vs file page cache) such that the inode/dentry
+caches are reclaimed much, much faster than the page cache and this
+drives us into several other caching imbalance related problems.
+
+As such, this is a bad change and needs to be reverted.
+
+[ Needs some massaging to retain the later seekless shrinker
+modifications. ]
+
+cc: <stable@vger.kernel.org>
+Signed-off-by: Dave Chinner <dchinner@redhat.com>
+---
+ mm/vmscan.c | 10 ----------
+ 1 file changed, 10 deletions(-)
+
+diff --git a/mm/vmscan.c b/mm/vmscan.c
+index a714c4f800e9..e979705bbf32 100644
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -491,16 +491,6 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
+               delta = freeable / 2;
+       }
+ 
+-      /*
+-       * Make sure we apply some minimal pressure on default priority
+-       * even on small cgroups. Stale objects are not only consuming memory
+-       * by themselves, but can also hold a reference to a dying cgroup,
+-       * preventing it from being reclaimed. A dying cgroup with all
+-       * corresponding structures like per-cpu stats and kmem caches
+-       * can be really big, so it may lead to a significant waste of memory.
+-       */
+-      delta = max_t(unsigned long long, delta, min(freeable, batch_size));
+-
+       total_scan += delta;
+       if (total_scan < 0) {
+               pr_err("shrink_slab: %pF negative objects to delete nr=%ld\n",
+-- 
+2.20.1
+