]> git.pld-linux.org Git - packages/kernel.git/blame - kernel-small_fixes.patch
- rel 3; v2 version of the patch
[packages/kernel.git] / kernel-small_fixes.patch
CommitLineData
5da6072d
AM
1From: Shaohua Li <shli@fb.com>
2
3Basically this is a copy of commit 001e4a8775f6(ext4: implement cgroup
4writeback support). Tested with a fio test, verified writeback is
5throttled against cgroup io.max write bandwidth, also verified moving
6the fio test to another cgroup and the writeback is throttled against
7new cgroup setting.
8
9Cc: Tejun Heo <tj@kernel.org>
10Signed-off-by: Shaohua Li <shli@fb.com>
11---
12 fs/xfs/xfs_aops.c | 2 ++
13 fs/xfs/xfs_super.c | 1 +
14 2 files changed, 3 insertions(+)
15
679d237f
AM
16--- linux-4.19/fs/xfs/xfs_aops.c.org 2018-11-21 10:31:12.348955352 +0100
17+++ linux-4.19/fs/xfs/xfs_aops.c 2018-11-21 10:34:35.241764742 +0100
5da6072d
AM
18@@ -613,8 +613,10 @@ xfs_add_to_ioend(
19 list_add(&wpc->ioend->io_list, iolist);
20 wpc->ioend = xfs_alloc_ioend(inode, wpc->io_type, offset,
21 bdev, sector);
22+ wbc_init_bio(wbc, wpc->ioend->io_bio);
23 }
24
679d237f 25+ wbc_account_io(wbc, page, len);
5da6072d
AM
26 if (!__bio_try_merge_page(wpc->ioend->io_bio, page, len, poff)) {
27 if (iop)
28 atomic_inc(&iop->write_count);
29diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
30index 584cf2d..aea3bc2 100644
31--- a/fs/xfs/xfs_super.c
32+++ b/fs/xfs/xfs_super.c
33@@ -1634,6 +1634,7 @@ xfs_fs_fill_super(
34 sb->s_max_links = XFS_MAXLINK;
35 sb->s_time_gran = 1;
36 set_posix_acl_flag(sb);
37+ sb->s_iflags |= SB_I_CGROUPWB;
38
39 /* version 5 superblocks support inode version counters. */
40 if (XFS_SB_VERSION_NUM(&mp->m_sb) == XFS_SB_VERSION_5)
28950590
AM
41From e820d55cb99dd93ac2dc949cf486bb187e5cd70d Mon Sep 17 00:00:00 2001
42From: Guoqing Jiang <gqjiang@suse.com>
43Date: Wed, 19 Dec 2018 14:19:25 +0800
44Subject: md: fix raid10 hang issue caused by barrier
45MIME-Version: 1.0
46Content-Type: text/plain; charset=UTF-8
47Content-Transfer-Encoding: 8bit
48
49When both regular IO and resync IO happen at the same time,
50and if we also need to split regular IO, then we can see
51tasks hang due to the barrier.
52
531. resync thread
54[ 1463.757205] INFO: task md1_resync:5215 blocked for more than 480 seconds.
55[ 1463.757207] Not tainted 4.19.5-1-default #1
56[ 1463.757209] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
57[ 1463.757212] md1_resync D 0 5215 2 0x80000000
58[ 1463.757216] Call Trace:
59[ 1463.757223] ? __schedule+0x29a/0x880
60[ 1463.757231] ? raise_barrier+0x8d/0x140 [raid10]
61[ 1463.757236] schedule+0x78/0x110
62[ 1463.757243] raise_barrier+0x8d/0x140 [raid10]
63[ 1463.757248] ? wait_woken+0x80/0x80
64[ 1463.757257] raid10_sync_request+0x1f6/0x1e30 [raid10]
65[ 1463.757265] ? _raw_spin_unlock_irq+0x22/0x40
66[ 1463.757284] ? is_mddev_idle+0x125/0x137 [md_mod]
67[ 1463.757302] md_do_sync.cold.78+0x404/0x969 [md_mod]
68[ 1463.757311] ? wait_woken+0x80/0x80
69[ 1463.757336] ? md_rdev_init+0xb0/0xb0 [md_mod]
70[ 1463.757351] md_thread+0xe9/0x140 [md_mod]
71[ 1463.757358] ? _raw_spin_unlock_irqrestore+0x2e/0x60
72[ 1463.757364] ? __kthread_parkme+0x4c/0x70
73[ 1463.757369] kthread+0x112/0x130
74[ 1463.757374] ? kthread_create_worker_on_cpu+0x40/0x40
75[ 1463.757380] ret_from_fork+0x3a/0x50
76
772. regular IO
78[ 1463.760679] INFO: task kworker/0:8:5367 blocked for more than 480 seconds.
79[ 1463.760683] Not tainted 4.19.5-1-default #1
80[ 1463.760684] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
81[ 1463.760687] kworker/0:8 D 0 5367 2 0x80000000
82[ 1463.760718] Workqueue: md submit_flushes [md_mod]
83[ 1463.760721] Call Trace:
84[ 1463.760731] ? __schedule+0x29a/0x880
85[ 1463.760741] ? wait_barrier+0xdd/0x170 [raid10]
86[ 1463.760746] schedule+0x78/0x110
87[ 1463.760753] wait_barrier+0xdd/0x170 [raid10]
88[ 1463.760761] ? wait_woken+0x80/0x80
89[ 1463.760768] raid10_write_request+0xf2/0x900 [raid10]
90[ 1463.760774] ? wait_woken+0x80/0x80
91[ 1463.760778] ? mempool_alloc+0x55/0x160
92[ 1463.760795] ? md_write_start+0xa9/0x270 [md_mod]
93[ 1463.760801] ? try_to_wake_up+0x44/0x470
94[ 1463.760810] raid10_make_request+0xc1/0x120 [raid10]
95[ 1463.760816] ? wait_woken+0x80/0x80
96[ 1463.760831] md_handle_request+0x121/0x190 [md_mod]
97[ 1463.760851] md_make_request+0x78/0x190 [md_mod]
98[ 1463.760860] generic_make_request+0x1c6/0x470
99[ 1463.760870] raid10_write_request+0x77a/0x900 [raid10]
100[ 1463.760875] ? wait_woken+0x80/0x80
101[ 1463.760879] ? mempool_alloc+0x55/0x160
102[ 1463.760895] ? md_write_start+0xa9/0x270 [md_mod]
103[ 1463.760904] raid10_make_request+0xc1/0x120 [raid10]
104[ 1463.760910] ? wait_woken+0x80/0x80
105[ 1463.760926] md_handle_request+0x121/0x190 [md_mod]
106[ 1463.760931] ? _raw_spin_unlock_irq+0x22/0x40
107[ 1463.760936] ? finish_task_switch+0x74/0x260
108[ 1463.760954] submit_flushes+0x21/0x40 [md_mod]
109
110So resync io is waiting for regular write io to complete to
111decrease nr_pending (conf->barrier++ is called before waiting).
112The regular write io splits off another bio after calling
113wait_barrier, which does nr_pending++; then the split bio would
114continue with raid10_write_request -> wait_barrier, so the split
115bio has to wait for the barrier to reach zero, and a deadlock
116happens as follows.
117
118 resync io regular io
119
120 raise_barrier
121 wait_barrier
122 generic_make_request
123 wait_barrier
124
125To resolve the issue, we need to call allow_barrier to decrease
126nr_pending before generic_make_request since regular IO is not
127issued to underlying devices, and wait_barrier is called again
128to ensure no internal IO is happening.
129
130Fixes: fc9977dd069e ("md/raid10: simplify the splitting of requests.")
131Reported-and-tested-by: Siniša Bandin <sinisa@4net.rs>
132Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
133Signed-off-by: Shaohua Li <shli@fb.com>
134---
135 drivers/md/raid10.c | 4 ++++
136 1 file changed, 4 insertions(+)
137
138diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
139index 76c92e31afc0..abb5d382f64d 100644
140--- a/drivers/md/raid10.c
141+++ b/drivers/md/raid10.c
142@@ -1209,7 +1209,9 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
143 struct bio *split = bio_split(bio, max_sectors,
144 gfp, &conf->bio_split);
145 bio_chain(split, bio);
146+ allow_barrier(conf);
147 generic_make_request(bio);
148+ wait_barrier(conf);
149 bio = split;
150 r10_bio->master_bio = bio;
151 r10_bio->sectors = max_sectors;
152@@ -1492,7 +1494,9 @@ retry_write:
153 struct bio *split = bio_split(bio, r10_bio->sectors,
154 GFP_NOIO, &conf->bio_split);
155 bio_chain(split, bio);
156+ allow_barrier(conf);
157 generic_make_request(bio);
158+ wait_barrier(conf);
159 bio = split;
160 r10_bio->master_bio = bio;
161 }
162--
163cgit 1.2-0.3.lf.el7
164
db555a62
AM
165
166From 9c9e935fc038342c48461aabca666f1b544e32b1 Mon Sep 17 00:00:00 2001\r
5d90ba12 167From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>\r
db555a62
AM
168Date: Sat, 26 Jan 2019 21:57:25 +0900\r
169Subject: [PATCH v2] oom, oom_reaper: do not enqueue same task twice\r
5d90ba12
AM
170\r
171Arkadiusz reported that enabling memcg's group oom killing causes\r
172strange memcg statistics where there is no task in a memcg although\r
173the number of tasks in that memcg is not 0. It turned out that there\r
174is a bug in wake_oom_reaper() which allows enqueuing the same task\r
175twice, which makes it impossible to decrease the number of tasks in\r
176that memcg due to a refcount leak.\r
177\r
178This bug existed since the OOM reaper became invokable from\r
179task_will_free_mem(current) path in out_of_memory() in Linux 4.7,\r
180but memcg's group oom killing made it easier to trigger this bug by\r
181calling wake_oom_reaper() on the same task from one out_of_memory()\r
182request.\r
183\r
184Fix this bug using an approach used by commit 855b018325737f76\r
185("oom, oom_reaper: disable oom_reaper for oom_kill_allocating_task").\r
db555a62 186As a side effect of this patch, this patch also avoids enqueuing\r
5d90ba12
AM
187multiple threads sharing memory via task_will_free_mem(current) path.\r
188\r
189Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>\r
db555a62
AM
190Reported-by: Arkadiusz Miśkiewicz <arekm@maven.pl>\r
191Tested-by: Arkadiusz Miśkiewicz <arekm@maven.pl>\r
5d90ba12
AM
192Fixes: af8e15cc85a25315 ("oom, oom_reaper: do not enqueue task if it is on the oom_reaper_list head")\r
193---\r
db555a62
AM
194 mm/oom_kill.c | 17 +++++++----------\r
195 1 file changed, 7 insertions(+), 10 deletions(-)\r
5d90ba12
AM
196\r
197diff --git a/mm/oom_kill.c b/mm/oom_kill.c\r
db555a62 198index f0e8cd9..057bfee 100644\r
5d90ba12
AM
199--- a/mm/oom_kill.c\r
200+++ b/mm/oom_kill.c\r
201@@ -505,14 +505,6 @@ bool __oom_reap_task_mm(struct mm_struct *mm)\r
202 struct vm_area_struct *vma;\r
203 bool ret = true;\r
204 \r
205- /*\r
206- * Tell all users of get_user/copy_from_user etc... that the content\r
207- * is no longer stable. No barriers really needed because unmapping\r
208- * should imply barriers already and the reader would hit a page fault\r
209- * if it stumbled over a reaped memory.\r
210- */\r
211- set_bit(MMF_UNSTABLE, &mm->flags);\r
212-\r
213 for (vma = mm->mmap ; vma; vma = vma->vm_next) {\r
214 if (!can_madv_dontneed_vma(vma))\r
215 continue;\r
db555a62 216@@ -647,8 +639,13 @@ static int oom_reaper(void *unused)\r
5d90ba12 217 \r
db555a62 218 static void wake_oom_reaper(struct task_struct *tsk)\r
5d90ba12
AM
219 {\r
220- /* tsk is already queued? */\r
221- if (tsk == oom_reaper_list || tsk->oom_reaper_list)\r
222+ /*\r
223+ * Tell all users of get_user/copy_from_user etc... that the content\r
224+ * is no longer stable. No barriers really needed because unmapping\r
225+ * should imply barriers already and the reader would hit a page fault\r
226+ * if it stumbled over a reaped memory.\r
227+ */\r
db555a62 228+ if (test_and_set_bit(MMF_UNSTABLE, &tsk->signal->oom_mm->flags))\r
5d90ba12
AM
229 return;\r
230 \r
231 get_task_struct(tsk);\r
5d90ba12
AM
232-- \r
2331.8.3.1\r
234\r
This page took 0.112681 seconds and 4 git commands to generate.