From: Shaohua Li <shli@fb.com>

Basically this is a copy of commit 001e4a8775f6 ("ext4: implement cgroup
writeback support"). Tested with a fio test: verified that writeback is
throttled against the cgroup's io.max write bandwidth, and also verified
that after moving the fio test to another cgroup, writeback is throttled
against the new cgroup's settings.
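
For reference, the pattern copied from ext4 boils down to two hooks in
the writeback path plus the SB_I_CGROUPWB opt-in flag set at mount time.
A minimal sketch of the hooks (fs_alloc_bio/fs_add_page are illustrative
helper names, not the actual XFS functions; wbc_init_bio() and
wbc_account_io() are the 4.19-era kernel API):

static struct bio *fs_alloc_bio(struct writeback_control *wbc)
{
        struct bio *bio = bio_alloc(GFP_NOFS, BIO_MAX_PAGES);

        /* Associate the new bio with the cgroup that dirtied the
         * pages so the block layer can throttle it (io.max etc). */
        wbc_init_bio(wbc, bio);
        return bio;
}

static void fs_add_page(struct writeback_control *wbc, struct bio *bio,
                        struct page *page, unsigned int len,
                        unsigned int off)
{
        /* Charge the written-back bytes to the page's cgroup. */
        wbc_account_io(wbc, page, len);
        bio_add_page(bio, page, len, off);
}

SB_I_CGROUPWB is what makes wbc->wb point at the per-cgroup writeback
domain in the first place; without it the wbc_* calls are no-ops.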

Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 fs/xfs/xfs_aops.c  | 2 ++
 fs/xfs/xfs_super.c | 1 +
 2 files changed, 3 insertions(+)

--- linux-4.19/fs/xfs/xfs_aops.c.org	2018-11-21 10:31:12.348955352 +0100
+++ linux-4.19/fs/xfs/xfs_aops.c	2018-11-21 10:34:35.241764742 +0100
@@ -613,8 +613,10 @@ xfs_add_to_ioend(
 			list_add(&wpc->ioend->io_list, iolist);
 		wpc->ioend = xfs_alloc_ioend(inode, wpc->io_type, offset,
 				bdev, sector);
+		wbc_init_bio(wbc, wpc->ioend->io_bio);
 	}
 
+	wbc_account_io(wbc, page, len);
 	if (!__bio_try_merge_page(wpc->ioend->io_bio, page, len, poff)) {
 		if (iop)
 			atomic_inc(&iop->write_count);
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 584cf2d..aea3bc2 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1634,6 +1634,7 @@ xfs_fs_fill_super(
 	sb->s_max_links = XFS_MAXLINK;
 	sb->s_time_gran = 1;
 	set_posix_acl_flag(sb);
+	sb->s_iflags |= SB_I_CGROUPWB;
 
 	/* version 5 superblocks support inode version counters. */
 	if (XFS_SB_VERSION_NUM(&mp->m_sb) == XFS_SB_VERSION_5)

From e820d55cb99dd93ac2dc949cf486bb187e5cd70d Mon Sep 17 00:00:00 2001
From: Guoqing Jiang <gqjiang@suse.com>
Date: Wed, 19 Dec 2018 14:19:25 +0800
Subject: md: fix raid10 hang issue caused by barrier
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When regular IO and resync IO happen at the same time, and the regular
IO also needs to be split, tasks can hang due to the barrier.

1. resync thread
[ 1463.757205] INFO: task md1_resync:5215 blocked for more than 480 seconds.
[ 1463.757207] Not tainted 4.19.5-1-default #1
[ 1463.757209] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 1463.757212] md1_resync D 0 5215 2 0x80000000
[ 1463.757216] Call Trace:
[ 1463.757223] ? __schedule+0x29a/0x880
[ 1463.757231] ? raise_barrier+0x8d/0x140 [raid10]
[ 1463.757236] schedule+0x78/0x110
[ 1463.757243] raise_barrier+0x8d/0x140 [raid10]
[ 1463.757248] ? wait_woken+0x80/0x80
[ 1463.757257] raid10_sync_request+0x1f6/0x1e30 [raid10]
[ 1463.757265] ? _raw_spin_unlock_irq+0x22/0x40
[ 1463.757284] ? is_mddev_idle+0x125/0x137 [md_mod]
[ 1463.757302] md_do_sync.cold.78+0x404/0x969 [md_mod]
[ 1463.757311] ? wait_woken+0x80/0x80
[ 1463.757336] ? md_rdev_init+0xb0/0xb0 [md_mod]
[ 1463.757351] md_thread+0xe9/0x140 [md_mod]
[ 1463.757358] ? _raw_spin_unlock_irqrestore+0x2e/0x60
[ 1463.757364] ? __kthread_parkme+0x4c/0x70
[ 1463.757369] kthread+0x112/0x130
[ 1463.757374] ? kthread_create_worker_on_cpu+0x40/0x40
[ 1463.757380] ret_from_fork+0x3a/0x50

2. regular IO
[ 1463.760679] INFO: task kworker/0:8:5367 blocked for more than 480 seconds.
[ 1463.760683] Not tainted 4.19.5-1-default #1
[ 1463.760684] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 1463.760687] kworker/0:8 D 0 5367 2 0x80000000
[ 1463.760718] Workqueue: md submit_flushes [md_mod]
[ 1463.760721] Call Trace:
[ 1463.760731] ? __schedule+0x29a/0x880
[ 1463.760741] ? wait_barrier+0xdd/0x170 [raid10]
[ 1463.760746] schedule+0x78/0x110
[ 1463.760753] wait_barrier+0xdd/0x170 [raid10]
[ 1463.760761] ? wait_woken+0x80/0x80
[ 1463.760768] raid10_write_request+0xf2/0x900 [raid10]
[ 1463.760774] ? wait_woken+0x80/0x80
[ 1463.760778] ? mempool_alloc+0x55/0x160
[ 1463.760795] ? md_write_start+0xa9/0x270 [md_mod]
[ 1463.760801] ? try_to_wake_up+0x44/0x470
[ 1463.760810] raid10_make_request+0xc1/0x120 [raid10]
[ 1463.760816] ? wait_woken+0x80/0x80
[ 1463.760831] md_handle_request+0x121/0x190 [md_mod]
[ 1463.760851] md_make_request+0x78/0x190 [md_mod]
[ 1463.760860] generic_make_request+0x1c6/0x470
[ 1463.760870] raid10_write_request+0x77a/0x900 [raid10]
[ 1463.760875] ? wait_woken+0x80/0x80
[ 1463.760879] ? mempool_alloc+0x55/0x160
[ 1463.760895] ? md_write_start+0xa9/0x270 [md_mod]
[ 1463.760904] raid10_make_request+0xc1/0x120 [raid10]
[ 1463.760910] ? wait_woken+0x80/0x80
[ 1463.760926] md_handle_request+0x121/0x190 [md_mod]
[ 1463.760931] ? _raw_spin_unlock_irq+0x22/0x40
[ 1463.760936] ? finish_task_switch+0x74/0x260
[ 1463.760954] submit_flushes+0x21/0x40 [md_mod]

So resync IO is waiting for the regular write IO to complete in order
to decrease nr_pending (conf->barrier++ is called before the wait).
The regular write IO splits off another bio after calling wait_barrier
(which does nr_pending++); the split bio then goes through
raid10_write_request -> wait_barrier again, so it has to wait for
barrier to drop back to zero, and the deadlock happens as follows.

resync io                       regular io

raise_barrier
                                wait_barrier
                                generic_make_request
                                wait_barrier

To resolve the issue, call allow_barrier to decrease nr_pending before
generic_make_request, since the regular IO has not yet been issued to
the underlying devices at that point, and then call wait_barrier again
to ensure no internal IO is in flight.
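
For context, a reduced sketch of the raid10 barrier accounting this fix
depends on (wait_until() stands in for the wait_event_lock_irq() calls
on conf->resync_lock in the real drivers/md/raid10.c; only the counting
that produces the deadlock is shown):

static void raise_barrier(struct r10conf *conf)
{
        conf->barrier++;                        /* block new regular IO */
        wait_until(conf->nr_pending == 0);      /* drain in-flight IO */
}

static void wait_barrier(struct r10conf *conf)
{
        wait_until(conf->barrier == 0);         /* resync raised it? */
        conf->nr_pending++;                     /* count this regular IO */
}

static void allow_barrier(struct r10conf *conf)
{
        conf->nr_pending--;                     /* let raise_barrier proceed */
        wake_up(&conf->wait_barrier);
}

With the fix, the split path drops its nr_pending count around
generic_make_request, so raise_barrier can finish draining and the
re-issued bio no longer deadlocks against it.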

Fixes: fc9977dd069e ("md/raid10: simplify the splitting of requests.")
Reported-and-tested-by: Siniša Bandin <sinisa@4net.rs>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid10.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 76c92e31afc0..abb5d382f64d 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1209,7 +1209,9 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
 		struct bio *split = bio_split(bio, max_sectors,
 					      gfp, &conf->bio_split);
 		bio_chain(split, bio);
+		allow_barrier(conf);
 		generic_make_request(bio);
+		wait_barrier(conf);
 		bio = split;
 		r10_bio->master_bio = bio;
 		r10_bio->sectors = max_sectors;
@@ -1492,7 +1494,9 @@ retry_write:
 		struct bio *split = bio_split(bio, r10_bio->sectors,
 					      GFP_NOIO, &conf->bio_split);
 		bio_chain(split, bio);
+		allow_barrier(conf);
 		generic_make_request(bio);
+		wait_barrier(conf);
 		bio = split;
 		r10_bio->master_bio = bio;
 	}
--

From 9c9e935fc038342c48461aabca666f1b544e32b1 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Sun, 27 Jan 2019 23:51:37 +0900
Subject: [PATCH v3] oom, oom_reaper: do not enqueue same task twice

Arkadiusz reported that enabling memcg's group oom killing causes
strange memcg statistics where a memcg's task count remains non-zero
even though the memcg no longer contains any task. It turned out that
there is a bug in wake_oom_reaper() which allows the same task to be
enqueued twice, making it impossible to decrease the number of tasks
in that memcg due to a refcount leak.

This bug has existed since the OOM reaper became invokable from the
task_will_free_mem(current) path in out_of_memory() in Linux 4.7, but
memcg's group oom killing made it easier to trigger by calling
wake_oom_reaper() on the same task from a single out_of_memory()
request.

Fix this bug using the approach of commit 855b018325737f76
("oom, oom_reaper: disable oom_reaper for oom_kill_allocating_task").
As a side effect, this patch also avoids enqueuing multiple threads
sharing memory via the task_will_free_mem(current) path.
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Reported-by: Arkadiusz Miśkiewicz <arekm@maven.pl>
Tested-by: Arkadiusz Miśkiewicz <arekm@maven.pl>
Fixes: af8e15cc85a25315 ("oom, oom_reaper: do not enqueue task if it is on the oom_reaper_list head")
---
 include/linux/sched/coredump.h | 1 +
 mm/oom_kill.c                  | 4 ++--
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h
index ec912d0..ecdc654 100644
--- a/include/linux/sched/coredump.h
+++ b/include/linux/sched/coredump.h
@@ -71,6 +71,7 @@ static inline int get_dumpable(struct mm_struct *mm)
 #define MMF_HUGE_ZERO_PAGE	23	/* mm has ever used the global huge zero page */
 #define MMF_DISABLE_THP		24	/* disable THP for all VMAs */
 #define MMF_OOM_VICTIM		25	/* mm is the oom victim */
+#define MMF_OOM_REAP_QUEUED	26	/* mm was queued for oom_reaper */
 #define MMF_DISABLE_THP_MASK	(1 << MMF_DISABLE_THP)
 
 #define MMF_INIT_MASK		(MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index f0e8cd9..059e617 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -647,8 +647,8 @@ static int oom_reaper(void *unused)
 
 static void wake_oom_reaper(struct task_struct *tsk)
 {
-	/* tsk is already queued? */
-	if (tsk == oom_reaper_list || tsk->oom_reaper_list)
+	/* mm is already queued? */
+	if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags))
 		return;
 
 	get_task_struct(tsk);
--
1.8.3.1
From: Dave Chinner <dchinner@redhat.com>

This reverts commit a76cf1a474d7dbcd9336b5f5afb0162baa142cf0.

This change causes serious changes to page cache and inode cache
behaviour and balance, resulting in major performance regressions
when combining workloads such as large file copies and kernel
compiles.

https://bugzilla.kernel.org/show_bug.cgi?id=202441

This change is a hack to work around the problems introduced by
changing how aggressive shrinkers are on small caches in commit
172b06c32b94 ("mm: slowly shrink slabs with a relatively small
number of objects"). It creates more problems than it solves and
wasn't adequately reviewed or tested, so it needs to be reverted.

cc: <stable@vger.kernel.org>
Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 fs/inode.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/fs/inode.c b/fs/inode.c
index 0cd47fe0dbe5..73432e64f874 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -730,11 +730,8 @@ static enum lru_status inode_lru_isolate(struct list_head *item,
 		return LRU_REMOVED;
 	}
 
-	/*
-	 * Recently referenced inodes and inodes with many attached pages
-	 * get one more pass.
-	 */
-	if (inode->i_state & I_REFERENCED || inode->i_data.nrpages > 1) {
+	/* recently referenced inodes get one more pass */
+	if (inode->i_state & I_REFERENCED) {
 		inode->i_state &= ~I_REFERENCED;
 		spin_unlock(&inode->i_lock);
 		return LRU_ROTATE;
--
2.20.1

This reverts commit 172b06c32b949759fe6313abec514bc4f15014f4.

This change alters the aggressiveness of shrinker reclaim, causing
small-cache and low-priority reclaim to greatly increase scanning
pressure on small caches. As a result, light memory pressure has a
disproportionate effect on small caches, and causes large caches to
be reclaimed much faster than previously.
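
For reference, the scan-pressure calculation in 4.19-era
do_shrink_slab() looks roughly like this (simplified sketch; the final
max_t() line is the one commit 172b06c32b94 added and this revert
removes):

        if (shrinker->seeks) {
                /* ~0 for small caches under light pressure */
                delta = freeable >> priority;
                delta *= 4;
                do_div(delta, shrinker->seeks);
        } else {
                /* seeks == 0 means the cache asked for max pressure */
                delta = freeable / 2;
        }

        /* The reverted line: force at least a full batch of scanning
         * even when freeable >> priority is 0, which is why light
         * pressure now hits small caches so hard. */
        delta = max_t(unsigned long long, delta, min(freeable, batch_size));

        total_scan += delta;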

As a result, it greatly perturbs the delicate balance of the VFS
caches (dentry/inode vs file page cache) such that the inode/dentry
caches are reclaimed much, much faster than the page cache, and this
drives us into several other caching-imbalance-related problems.

As such, this is a bad change and needs to be reverted.

[ Needs some massaging to retain the later seekless shrinker
modifications. ]

cc: <stable@vger.kernel.org>
Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 mm/vmscan.c | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index a714c4f800e9..e979705bbf32 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -491,16 +491,6 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
 		delta = freeable / 2;
 	}
 
-	/*
-	 * Make sure we apply some minimal pressure on default priority
-	 * even on small cgroups. Stale objects are not only consuming memory
-	 * by themselves, but can also hold a reference to a dying cgroup,
-	 * preventing it from being reclaimed. A dying cgroup with all
-	 * corresponding structures like per-cpu stats and kmem caches
-	 * can be really big, so it may lead to a significant waste of memory.
-	 */
-	delta = max_t(unsigned long long, delta, min(freeable, batch_size));
-
 	total_scan += delta;
 	if (total_scan < 0) {
 		pr_err("shrink_slab: %pF negative objects to delete nr=%ld\n",
--
2.20.1