]> git.pld-linux.org Git - packages/kernel.git/blob - kernel-small_fixes.patch
- rel 2; fix cgroup pids.current counting
[packages/kernel.git] / kernel-small_fixes.patch
1 From: Shaohua Li <shli@fb.com>
2
3 Basically this is a copy of commit 001e4a8775f6 (ext4: implement cgroup
4 writeback support). Tested with a fio test, verified writeback is
5 throttled against cgroup io.max write bandwidth, also verified moving
6 the fio test to another cgroup and the writeback is throttled against
7 new cgroup setting.
8
9 Cc: Tejun Heo <tj@kernel.org>
10 Signed-off-by: Shaohua Li <shli@fb.com>
11 ---
12  fs/xfs/xfs_aops.c  | 2 ++
13  fs/xfs/xfs_super.c | 1 +
14  2 files changed, 3 insertions(+)
15
16 --- linux-4.19/fs/xfs/xfs_aops.c.org    2018-11-21 10:31:12.348955352 +0100
17 +++ linux-4.19/fs/xfs/xfs_aops.c        2018-11-21 10:34:35.241764742 +0100
18 @@ -613,8 +613,10 @@ xfs_add_to_ioend(
19                         list_add(&wpc->ioend->io_list, iolist);
20                 wpc->ioend = xfs_alloc_ioend(inode, wpc->io_type, offset,
21                                 bdev, sector);
22 +               wbc_init_bio(wbc, wpc->ioend->io_bio);
23         }
24  
25 +       wbc_account_io(wbc, page, len);
26         if (!__bio_try_merge_page(wpc->ioend->io_bio, page, len, poff)) {
27                 if (iop)
28                         atomic_inc(&iop->write_count);
29 diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
30 index 584cf2d..aea3bc2 100644
31 --- a/fs/xfs/xfs_super.c
32 +++ b/fs/xfs/xfs_super.c
33 @@ -1634,6 +1634,7 @@ xfs_fs_fill_super(
34         sb->s_max_links = XFS_MAXLINK;
35         sb->s_time_gran = 1;
36         set_posix_acl_flag(sb);
37 +       sb->s_iflags |= SB_I_CGROUPWB;
38  
39         /* version 5 superblocks support inode version counters. */
40         if (XFS_SB_VERSION_NUM(&mp->m_sb) == XFS_SB_VERSION_5)
41 From e820d55cb99dd93ac2dc949cf486bb187e5cd70d Mon Sep 17 00:00:00 2001
42 From: Guoqing Jiang <gqjiang@suse.com>
43 Date: Wed, 19 Dec 2018 14:19:25 +0800
44 Subject: md: fix raid10 hang issue caused by barrier
45 MIME-Version: 1.0
46 Content-Type: text/plain; charset=UTF-8
47 Content-Transfer-Encoding: 8bit
48
49 When both regular IO and resync IO happen at the same time,
50 and if we also need to split the regular IO, then we can see
51 tasks hang due to the barrier.
52
53 1. resync thread
54 [ 1463.757205] INFO: task md1_resync:5215 blocked for more than 480 seconds.
55 [ 1463.757207]       Not tainted 4.19.5-1-default #1
56 [ 1463.757209] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
57 [ 1463.757212] md1_resync      D    0  5215      2 0x80000000
58 [ 1463.757216] Call Trace:
59 [ 1463.757223]  ? __schedule+0x29a/0x880
60 [ 1463.757231]  ? raise_barrier+0x8d/0x140 [raid10]
61 [ 1463.757236]  schedule+0x78/0x110
62 [ 1463.757243]  raise_barrier+0x8d/0x140 [raid10]
63 [ 1463.757248]  ? wait_woken+0x80/0x80
64 [ 1463.757257]  raid10_sync_request+0x1f6/0x1e30 [raid10]
65 [ 1463.757265]  ? _raw_spin_unlock_irq+0x22/0x40
66 [ 1463.757284]  ? is_mddev_idle+0x125/0x137 [md_mod]
67 [ 1463.757302]  md_do_sync.cold.78+0x404/0x969 [md_mod]
68 [ 1463.757311]  ? wait_woken+0x80/0x80
69 [ 1463.757336]  ? md_rdev_init+0xb0/0xb0 [md_mod]
70 [ 1463.757351]  md_thread+0xe9/0x140 [md_mod]
71 [ 1463.757358]  ? _raw_spin_unlock_irqrestore+0x2e/0x60
72 [ 1463.757364]  ? __kthread_parkme+0x4c/0x70
73 [ 1463.757369]  kthread+0x112/0x130
74 [ 1463.757374]  ? kthread_create_worker_on_cpu+0x40/0x40
75 [ 1463.757380]  ret_from_fork+0x3a/0x50
76
77 2. regular IO
78 [ 1463.760679] INFO: task kworker/0:8:5367 blocked for more than 480 seconds.
79 [ 1463.760683]       Not tainted 4.19.5-1-default #1
80 [ 1463.760684] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
81 [ 1463.760687] kworker/0:8     D    0  5367      2 0x80000000
82 [ 1463.760718] Workqueue: md submit_flushes [md_mod]
83 [ 1463.760721] Call Trace:
84 [ 1463.760731]  ? __schedule+0x29a/0x880
85 [ 1463.760741]  ? wait_barrier+0xdd/0x170 [raid10]
86 [ 1463.760746]  schedule+0x78/0x110
87 [ 1463.760753]  wait_barrier+0xdd/0x170 [raid10]
88 [ 1463.760761]  ? wait_woken+0x80/0x80
89 [ 1463.760768]  raid10_write_request+0xf2/0x900 [raid10]
90 [ 1463.760774]  ? wait_woken+0x80/0x80
91 [ 1463.760778]  ? mempool_alloc+0x55/0x160
92 [ 1463.760795]  ? md_write_start+0xa9/0x270 [md_mod]
93 [ 1463.760801]  ? try_to_wake_up+0x44/0x470
94 [ 1463.760810]  raid10_make_request+0xc1/0x120 [raid10]
95 [ 1463.760816]  ? wait_woken+0x80/0x80
96 [ 1463.760831]  md_handle_request+0x121/0x190 [md_mod]
97 [ 1463.760851]  md_make_request+0x78/0x190 [md_mod]
98 [ 1463.760860]  generic_make_request+0x1c6/0x470
99 [ 1463.760870]  raid10_write_request+0x77a/0x900 [raid10]
100 [ 1463.760875]  ? wait_woken+0x80/0x80
101 [ 1463.760879]  ? mempool_alloc+0x55/0x160
102 [ 1463.760895]  ? md_write_start+0xa9/0x270 [md_mod]
103 [ 1463.760904]  raid10_make_request+0xc1/0x120 [raid10]
104 [ 1463.760910]  ? wait_woken+0x80/0x80
105 [ 1463.760926]  md_handle_request+0x121/0x190 [md_mod]
106 [ 1463.760931]  ? _raw_spin_unlock_irq+0x22/0x40
107 [ 1463.760936]  ? finish_task_switch+0x74/0x260
108 [ 1463.760954]  submit_flushes+0x21/0x40 [md_mod]
109
110 So resync io is waiting for regular write io to complete to
111 decrease nr_pending (conf->barrier++ is called before waiting).
112 The regular write io splits off another bio after calling
113 wait_barrier (which does nr_pending++); the split bio then
114 continues with raid10_write_request -> wait_barrier, so the split
115 bio has to wait for barrier to be zero, and deadlock happens as
116 follows.
117
118         resync io               regular io
119
120         raise_barrier
121                                 wait_barrier
122                                 generic_make_request
123                                 wait_barrier
124
125 To resolve the issue, we need to call allow_barrier to decrease
126 nr_pending before generic_make_request since regular IO is not
127 issued to underlying devices, and wait_barrier is called again
128 to ensure no internal IO happening.
129
130 Fixes: fc9977dd069e ("md/raid10: simplify the splitting of requests.")
131 Reported-and-tested-by: Siniša Bandin <sinisa@4net.rs>
132 Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
133 Signed-off-by: Shaohua Li <shli@fb.com>
134 ---
135  drivers/md/raid10.c | 4 ++++
136  1 file changed, 4 insertions(+)
137
138 diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
139 index 76c92e31afc0..abb5d382f64d 100644
140 --- a/drivers/md/raid10.c
141 +++ b/drivers/md/raid10.c
142 @@ -1209,7 +1209,9 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
143                 struct bio *split = bio_split(bio, max_sectors,
144                                               gfp, &conf->bio_split);
145                 bio_chain(split, bio);
146 +               allow_barrier(conf);
147                 generic_make_request(bio);
148 +               wait_barrier(conf);
149                 bio = split;
150                 r10_bio->master_bio = bio;
151                 r10_bio->sectors = max_sectors;
152 @@ -1492,7 +1494,9 @@ retry_write:
153                 struct bio *split = bio_split(bio, r10_bio->sectors,
154                                               GFP_NOIO, &conf->bio_split);
155                 bio_chain(split, bio);
156 +               allow_barrier(conf);
157                 generic_make_request(bio);
158 +               wait_barrier(conf);
159                 bio = split;
160                 r10_bio->master_bio = bio;
161         }
162 -- 
163 cgit 1.2-0.3.lf.el7
164
165 From 48744b6339cf649a69b55997e138c17df1ecc897 Mon Sep 17 00:00:00 2001\r
166 From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>\r
167 Date: Sat, 26 Jan 2019 20:00:51 +0900\r
168 Subject: [PATCH] oom, oom_reaper: do not enqueue same task twice\r
169 \r
170 Arkadiusz reported that enabling memcg's group oom killing causes\r
171 strange memcg statistics where there is no task in a memcg even though\r
172 the number of tasks in that memcg is not 0. It turned out that there\r
173 is a bug in wake_oom_reaper() which allows enqueuing the same task\r
174 twice, which makes it impossible to decrease the number of tasks in\r
175 that memcg due to a refcount leak.\r
176 \r
177 This bug existed since the OOM reaper became invokable from\r
178 task_will_free_mem(current) path in out_of_memory() in Linux 4.7,\r
179 but memcg's group oom killing made it easier to trigger this bug by\r
180 calling wake_oom_reaper() on the same task from one out_of_memory()\r
181 request.\r
182 \r
183 Fix this bug using an approach used by commit 855b018325737f76\r
184 ("oom, oom_reaper: disable oom_reaper for oom_kill_allocating_task").\r
185 Since task_will_free_mem(p) == false if p->mm == NULL, we can assume that\r
186 p->mm != NULL when wake_oom_reaper() is called from task_will_free_mem()\r
187 paths. As a side effect of this patch, this patch also avoids enqueuing\r
188 multiple threads sharing memory via task_will_free_mem(current) path.\r
189 \r
190 Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>\r
191 Reported-by: Arkadiusz Miśkiewicz <a.miskiewicz@gmail.com>\r
192 Fixes: af8e15cc85a25315 ("oom, oom_reaper: do not enqueue task if it is on the oom_reaper_list head")\r
193 ---\r
194  mm/oom_kill.c | 28 +++++++++++++---------------\r
195  1 file changed, 13 insertions(+), 15 deletions(-)\r
196 \r
197 diff --git a/mm/oom_kill.c b/mm/oom_kill.c\r
198 index f0e8cd9..457f240 100644\r
199 --- a/mm/oom_kill.c\r
200 +++ b/mm/oom_kill.c\r
201 @@ -505,14 +505,6 @@ bool __oom_reap_task_mm(struct mm_struct *mm)\r
202         struct vm_area_struct *vma;\r
203         bool ret = true;\r
204  \r
205 -       /*\r
206 -        * Tell all users of get_user/copy_from_user etc... that the content\r
207 -        * is no longer stable. No barriers really needed because unmapping\r
208 -        * should imply barriers already and the reader would hit a page fault\r
209 -        * if it stumbled over a reaped memory.\r
210 -        */\r
211 -       set_bit(MMF_UNSTABLE, &mm->flags);\r
212 -\r
213         for (vma = mm->mmap ; vma; vma = vma->vm_next) {\r
214                 if (!can_madv_dontneed_vma(vma))\r
215                         continue;\r
216 @@ -645,10 +637,15 @@ static int oom_reaper(void *unused)\r
217         return 0;\r
218  }\r
219  \r
220 -static void wake_oom_reaper(struct task_struct *tsk)\r
221 +static void wake_oom_reaper(struct task_struct *tsk, struct mm_struct *mm)\r
222  {\r
223 -       /* tsk is already queued? */\r
224 -       if (tsk == oom_reaper_list || tsk->oom_reaper_list)\r
225 +       /*\r
226 +        * Tell all users of get_user/copy_from_user etc... that the content\r
227 +        * is no longer stable. No barriers really needed because unmapping\r
228 +        * should imply barriers already and the reader would hit a page fault\r
229 +        * if it stumbled over a reaped memory.\r
230 +        */\r
231 +       if (test_and_set_bit(MMF_UNSTABLE, &mm->flags))\r
232                 return;\r
233  \r
234         get_task_struct(tsk);\r
235 @@ -668,7 +665,8 @@ static int __init oom_init(void)\r
236  }\r
237  subsys_initcall(oom_init)\r
238  #else\r
239 -static inline void wake_oom_reaper(struct task_struct *tsk)\r
240 +static inline void wake_oom_reaper(struct task_struct *tsk,\r
241 +                                  struct mm_struct *mm)\r
242  {\r
243  }\r
244  #endif /* CONFIG_MMU */\r
245 @@ -915,7 +913,7 @@ static void __oom_kill_process(struct task_struct *victim)\r
246         rcu_read_unlock();\r
247  \r
248         if (can_oom_reap)\r
249 -               wake_oom_reaper(victim);\r
250 +               wake_oom_reaper(victim, mm);\r
251  \r
252         mmdrop(mm);\r
253         put_task_struct(victim);\r
254 @@ -955,7 +953,7 @@ static void oom_kill_process(struct oom_control *oc, const char *message)\r
255         task_lock(p);\r
256         if (task_will_free_mem(p)) {\r
257                 mark_oom_victim(p);\r
258 -               wake_oom_reaper(p);\r
259 +               wake_oom_reaper(p, p->mm);\r
260                 task_unlock(p);\r
261                 put_task_struct(p);\r
262                 return;\r
263 @@ -1085,7 +1083,7 @@ bool out_of_memory(struct oom_control *oc)\r
264          */\r
265         if (task_will_free_mem(current)) {\r
266                 mark_oom_victim(current);\r
267 -               wake_oom_reaper(current);\r
268 +               wake_oom_reaper(current, current->mm);\r
269                 return true;\r
270         }\r
271  \r
272 -- \r
273 1.8.3.1\r
274 \r
This page took 0.131105 seconds and 4 git commands to generate.