1 From: Shaohua Li <shli@fb.com>
3 Basically this is a copy of commit 001e4a8775f6(ext4: implement cgroup
4 writeback support). Tested with a fio test, verified writeback is
5 throttled against cgroup io.max write bandwidth, also verified moving
6 the fio test to another cgroup and the writeback is throttled against
9 Cc: Tejun Heo <tj@kernel.org>
10 Signed-off-by: Shaohua Li <shli@fb.com>
12 fs/xfs/xfs_aops.c | 2 ++
13 fs/xfs/xfs_super.c | 1 +
14 2 files changed, 3 insertions(+)
16 --- linux-4.19/fs/xfs/xfs_aops.c.org 2018-11-21 10:31:12.348955352 +0100
17 +++ linux-4.19/fs/xfs/xfs_aops.c 2018-11-21 10:34:35.241764742 +0100
18 @@ -613,8 +613,10 @@ xfs_add_to_ioend(
19 list_add(&wpc->ioend->io_list, iolist);
20 wpc->ioend = xfs_alloc_ioend(inode, wpc->io_type, offset,
22 + wbc_init_bio(wbc, wpc->ioend->io_bio);
25 + wbc_account_io(wbc, page, len);
26 if (!__bio_try_merge_page(wpc->ioend->io_bio, page, len, poff)) {
28 atomic_inc(&iop->write_count);
29 diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
30 index 584cf2d..aea3bc2 100644
31 --- a/fs/xfs/xfs_super.c
32 +++ b/fs/xfs/xfs_super.c
33 @@ -1634,6 +1634,7 @@ xfs_fs_fill_super(
34 sb->s_max_links = XFS_MAXLINK;
36 set_posix_acl_flag(sb);
37 + sb->s_iflags |= SB_I_CGROUPWB;
39 /* version 5 superblocks support inode version counters. */
40 if (XFS_SB_VERSION_NUM(&mp->m_sb) == XFS_SB_VERSION_5)
41 From e820d55cb99dd93ac2dc949cf486bb187e5cd70d Mon Sep 17 00:00:00 2001
42 From: Guoqing Jiang <gqjiang@suse.com>
43 Date: Wed, 19 Dec 2018 14:19:25 +0800
44 Subject: md: fix raid10 hang issue caused by barrier
46 Content-Type: text/plain; charset=UTF-8
47 Content-Transfer-Encoding: 8bit
49 When both regular IO and resync IO happen at the same time,
50 and if we also need to split regular IO, then we can see tasks
54 [ 1463.757205] INFO: task md1_resync:5215 blocked for more than 480 seconds.
55 [ 1463.757207] Not tainted 4.19.5-1-default #1
56 [ 1463.757209] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
57 [ 1463.757212] md1_resync D 0 5215 2 0x80000000
58 [ 1463.757216] Call Trace:
59 [ 1463.757223] ? __schedule+0x29a/0x880
60 [ 1463.757231] ? raise_barrier+0x8d/0x140 [raid10]
61 [ 1463.757236] schedule+0x78/0x110
62 [ 1463.757243] raise_barrier+0x8d/0x140 [raid10]
63 [ 1463.757248] ? wait_woken+0x80/0x80
64 [ 1463.757257] raid10_sync_request+0x1f6/0x1e30 [raid10]
65 [ 1463.757265] ? _raw_spin_unlock_irq+0x22/0x40
66 [ 1463.757284] ? is_mddev_idle+0x125/0x137 [md_mod]
67 [ 1463.757302] md_do_sync.cold.78+0x404/0x969 [md_mod]
68 [ 1463.757311] ? wait_woken+0x80/0x80
69 [ 1463.757336] ? md_rdev_init+0xb0/0xb0 [md_mod]
70 [ 1463.757351] md_thread+0xe9/0x140 [md_mod]
71 [ 1463.757358] ? _raw_spin_unlock_irqrestore+0x2e/0x60
72 [ 1463.757364] ? __kthread_parkme+0x4c/0x70
73 [ 1463.757369] kthread+0x112/0x130
74 [ 1463.757374] ? kthread_create_worker_on_cpu+0x40/0x40
75 [ 1463.757380] ret_from_fork+0x3a/0x50
78 [ 1463.760679] INFO: task kworker/0:8:5367 blocked for more than 480 seconds.
79 [ 1463.760683] Not tainted 4.19.5-1-default #1
80 [ 1463.760684] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
81 [ 1463.760687] kworker/0:8 D 0 5367 2 0x80000000
82 [ 1463.760718] Workqueue: md submit_flushes [md_mod]
83 [ 1463.760721] Call Trace:
84 [ 1463.760731] ? __schedule+0x29a/0x880
85 [ 1463.760741] ? wait_barrier+0xdd/0x170 [raid10]
86 [ 1463.760746] schedule+0x78/0x110
87 [ 1463.760753] wait_barrier+0xdd/0x170 [raid10]
88 [ 1463.760761] ? wait_woken+0x80/0x80
89 [ 1463.760768] raid10_write_request+0xf2/0x900 [raid10]
90 [ 1463.760774] ? wait_woken+0x80/0x80
91 [ 1463.760778] ? mempool_alloc+0x55/0x160
92 [ 1463.760795] ? md_write_start+0xa9/0x270 [md_mod]
93 [ 1463.760801] ? try_to_wake_up+0x44/0x470
94 [ 1463.760810] raid10_make_request+0xc1/0x120 [raid10]
95 [ 1463.760816] ? wait_woken+0x80/0x80
96 [ 1463.760831] md_handle_request+0x121/0x190 [md_mod]
97 [ 1463.760851] md_make_request+0x78/0x190 [md_mod]
98 [ 1463.760860] generic_make_request+0x1c6/0x470
99 [ 1463.760870] raid10_write_request+0x77a/0x900 [raid10]
100 [ 1463.760875] ? wait_woken+0x80/0x80
101 [ 1463.760879] ? mempool_alloc+0x55/0x160
102 [ 1463.760895] ? md_write_start+0xa9/0x270 [md_mod]
103 [ 1463.760904] raid10_make_request+0xc1/0x120 [raid10]
104 [ 1463.760910] ? wait_woken+0x80/0x80
105 [ 1463.760926] md_handle_request+0x121/0x190 [md_mod]
106 [ 1463.760931] ? _raw_spin_unlock_irq+0x22/0x40
107 [ 1463.760936] ? finish_task_switch+0x74/0x260
108 [ 1463.760954] submit_flushes+0x21/0x40 [md_mod]
110 So resync io is waiting for regular write io to complete to
111 decrease nr_pending (conf->barrier++ is called before waiting).
112 The regular write io splits another bio after calling wait_barrier,
113 which calls nr_pending++, then the split bio would continue
114 with raid10_write_request -> wait_barrier, so the split bio
115 has to wait for barrier to be zero, then deadlock happens as
125 To resolve the issue, we need to call allow_barrier to decrease
126 nr_pending before generic_make_request since regular IO is not
127 issued to underlying devices, and wait_barrier is called again
128 to ensure no internal IO happening.
130 Fixes: fc9977dd069e ("md/raid10: simplify the splitting of requests.")
131 Reported-and-tested-by: Siniša Bandin <sinisa@4net.rs>
132 Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
133 Signed-off-by: Shaohua Li <shli@fb.com>
135 drivers/md/raid10.c | 4 ++++
136 1 file changed, 4 insertions(+)
138 diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
139 index 76c92e31afc0..abb5d382f64d 100644
140 --- a/drivers/md/raid10.c
141 +++ b/drivers/md/raid10.c
142 @@ -1209,7 +1209,9 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
143 struct bio *split = bio_split(bio, max_sectors,
144 gfp, &conf->bio_split);
145 bio_chain(split, bio);
146 + allow_barrier(conf);
147 generic_make_request(bio);
148 + wait_barrier(conf);
150 r10_bio->master_bio = bio;
151 r10_bio->sectors = max_sectors;
152 @@ -1492,7 +1494,9 @@ retry_write:
153 struct bio *split = bio_split(bio, r10_bio->sectors,
154 GFP_NOIO, &conf->bio_split);
155 bio_chain(split, bio);
156 + allow_barrier(conf);
157 generic_make_request(bio);
158 + wait_barrier(conf);
160 r10_bio->master_bio = bio;
165 From: Dave Chinner <dchinner@redhat.com>
167 This reverts commit a76cf1a474d7dbcd9336b5f5afb0162baa142cf0.
169 This change causes serious changes to page cache and inode cache
170 behaviour and balance, resulting in major performance regressions
171 when combining workloads such as large file copies and kernel
174 https://bugzilla.kernel.org/show_bug.cgi?id=202441
176 This change is a hack to work around the problems introduced by
177 changing how aggressive shrinkers are on small caches in commit
178 172b06c32b94 ("mm: slowly shrink slabs with a relatively small
179 number of objects"). It creates more problems than it solves, wasn't
180 adequately reviewed or tested, so it needs to be reverted.
182 cc: <stable@vger.kernel.org>
183 Signed-off-by: Dave Chinner <dchinner@redhat.com>
185 fs/inode.c | 7 ++-----
186 1 file changed, 2 insertions(+), 5 deletions(-)
188 diff --git a/fs/inode.c b/fs/inode.c
189 index 0cd47fe0dbe5..73432e64f874 100644
192 @@ -730,11 +730,8 @@ static enum lru_status inode_lru_isolate(struct list_head *item,
197 - * Recently referenced inodes and inodes with many attached pages
198 - * get one more pass.
200 - if (inode->i_state & I_REFERENCED || inode->i_data.nrpages > 1) {
201 + /* recently referenced inodes get one more pass */
202 + if (inode->i_state & I_REFERENCED) {
203 inode->i_state &= ~I_REFERENCED;
204 spin_unlock(&inode->i_lock);
209 This reverts commit 172b06c32b949759fe6313abec514bc4f15014f4.
211 This change alters the aggressiveness of shrinker reclaim, causing
212 small cache and low priority reclaim to greatly increase
213 scanning pressure on small caches. As a result, light memory
214 pressure has a disproportionate effect on small caches, and causes
215 large caches to be reclaimed much faster than previously.
217 As a result, it greatly perturbs the delicate balance of the VFS
218 caches (dentry/inode vs file page cache) such that the inode/dentry
219 caches are reclaimed much, much faster than the page cache and this
220 drives us into several other caching imbalance related problems.
222 As such, this is a bad change and needs to be reverted.
224 [ Needs some massaging to retain the later seekless shrinker
227 cc: <stable@vger.kernel.org>
228 Signed-off-by: Dave Chinner <dchinner@redhat.com>
230 mm/vmscan.c | 10 ----------
231 1 file changed, 10 deletions(-)
233 diff --git a/mm/vmscan.c b/mm/vmscan.c
234 index a714c4f800e9..e979705bbf32 100644
237 @@ -491,16 +491,6 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
238 delta = freeable / 2;
242 - * Make sure we apply some minimal pressure on default priority
243 - * even on small cgroups. Stale objects are not only consuming memory
244 - * by themselves, but can also hold a reference to a dying cgroup,
245 - * preventing it from being reclaimed. A dying cgroup with all
246 - * corresponding structures like per-cpu stats and kmem caches
247 - * can be really big, so it may lead to a significant waste of memory.
249 - delta = max_t(unsigned long long, delta, min(freeable, batch_size));
252 if (total_scan < 0) {
253 pr_err("shrink_slab: %pF negative objects to delete nr=%ld\n",