From: Shaohua Li Basically this is a copy of commit 001e4a8775f6(ext4: implement cgroup writeback support). Tested with a fio test, verified writeback is throttled against cgroup io.max write bandwidth, also verified moving the fio test to another cgroup and the writeback is throttled against new cgroup setting. Cc: Tejun Heo Signed-off-by: Shaohua Li --- fs/xfs/xfs_aops.c | 2 ++ fs/xfs/xfs_super.c | 1 + 2 files changed, 3 insertions(+) --- linux-4.19/fs/xfs/xfs_aops.c.org 2018-11-21 10:31:12.348955352 +0100 +++ linux-4.19/fs/xfs/xfs_aops.c 2018-11-21 10:34:35.241764742 +0100 @@ -613,8 +613,10 @@ xfs_add_to_ioend( list_add(&wpc->ioend->io_list, iolist); wpc->ioend = xfs_alloc_ioend(inode, wpc->io_type, offset, bdev, sector); + wbc_init_bio(wbc, wpc->ioend->io_bio); } + wbc_account_io(wbc, page, len); if (!__bio_try_merge_page(wpc->ioend->io_bio, page, len, poff)) { if (iop) atomic_inc(&iop->write_count); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 584cf2d..aea3bc2 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1634,6 +1634,7 @@ xfs_fs_fill_super( sb->s_max_links = XFS_MAXLINK; sb->s_time_gran = 1; set_posix_acl_flag(sb); + sb->s_iflags |= SB_I_CGROUPWB; /* version 5 superblocks support inode version counters. */ if (XFS_SB_VERSION_NUM(&mp->m_sb) == XFS_SB_VERSION_5) From e820d55cb99dd93ac2dc949cf486bb187e5cd70d Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Wed, 19 Dec 2018 14:19:25 +0800 Subject: md: fix raid10 hang issue caused by barrier MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When both regular IO and resync IO happen at the same time, and if we also need to split regular. Then we can see tasks hang due to barrier. 1. resync thread [ 1463.757205] INFO: task md1_resync:5215 blocked for more than 480 seconds. [ 1463.757207] Not tainted 4.19.5-1-default #1 [ 1463.757209] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 1463.757212] md1_resync D 0 5215 2 0x80000000 [ 1463.757216] Call Trace: [ 1463.757223] ? __schedule+0x29a/0x880 [ 1463.757231] ? raise_barrier+0x8d/0x140 [raid10] [ 1463.757236] schedule+0x78/0x110 [ 1463.757243] raise_barrier+0x8d/0x140 [raid10] [ 1463.757248] ? wait_woken+0x80/0x80 [ 1463.757257] raid10_sync_request+0x1f6/0x1e30 [raid10] [ 1463.757265] ? _raw_spin_unlock_irq+0x22/0x40 [ 1463.757284] ? is_mddev_idle+0x125/0x137 [md_mod] [ 1463.757302] md_do_sync.cold.78+0x404/0x969 [md_mod] [ 1463.757311] ? wait_woken+0x80/0x80 [ 1463.757336] ? md_rdev_init+0xb0/0xb0 [md_mod] [ 1463.757351] md_thread+0xe9/0x140 [md_mod] [ 1463.757358] ? _raw_spin_unlock_irqrestore+0x2e/0x60 [ 1463.757364] ? __kthread_parkme+0x4c/0x70 [ 1463.757369] kthread+0x112/0x130 [ 1463.757374] ? kthread_create_worker_on_cpu+0x40/0x40 [ 1463.757380] ret_from_fork+0x3a/0x50 2. regular IO [ 1463.760679] INFO: task kworker/0:8:5367 blocked for more than 480 seconds. [ 1463.760683] Not tainted 4.19.5-1-default #1 [ 1463.760684] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 1463.760687] kworker/0:8 D 0 5367 2 0x80000000 [ 1463.760718] Workqueue: md submit_flushes [md_mod] [ 1463.760721] Call Trace: [ 1463.760731] ? __schedule+0x29a/0x880 [ 1463.760741] ? wait_barrier+0xdd/0x170 [raid10] [ 1463.760746] schedule+0x78/0x110 [ 1463.760753] wait_barrier+0xdd/0x170 [raid10] [ 1463.760761] ? wait_woken+0x80/0x80 [ 1463.760768] raid10_write_request+0xf2/0x900 [raid10] [ 1463.760774] ? wait_woken+0x80/0x80 [ 1463.760778] ? mempool_alloc+0x55/0x160 [ 1463.760795] ? md_write_start+0xa9/0x270 [md_mod] [ 1463.760801] ? try_to_wake_up+0x44/0x470 [ 1463.760810] raid10_make_request+0xc1/0x120 [raid10] [ 1463.760816] ? wait_woken+0x80/0x80 [ 1463.760831] md_handle_request+0x121/0x190 [md_mod] [ 1463.760851] md_make_request+0x78/0x190 [md_mod] [ 1463.760860] generic_make_request+0x1c6/0x470 [ 1463.760870] raid10_write_request+0x77a/0x900 [raid10] [ 1463.760875] ? wait_woken+0x80/0x80 [ 1463.760879] ? mempool_alloc+0x55/0x160 [ 1463.760895] ? md_write_start+0xa9/0x270 [md_mod] [ 1463.760904] raid10_make_request+0xc1/0x120 [raid10] [ 1463.760910] ? wait_woken+0x80/0x80 [ 1463.760926] md_handle_request+0x121/0x190 [md_mod] [ 1463.760931] ? _raw_spin_unlock_irq+0x22/0x40 [ 1463.760936] ? finish_task_switch+0x74/0x260 [ 1463.760954] submit_flushes+0x21/0x40 [md_mod] So resync io is waiting for regular write io to complete to decrease nr_pending (conf->barrier++ is called before waiting). The regular write io splits another bio after call wait_barrier which call nr_pending++, then the splitted bio would continue with raid10_write_request -> wait_barrier, so the splitted bio has to wait for barrier to be zero, then deadlock happens as follows. resync io regular io raise_barrier wait_barrier generic_make_request wait_barrier To resolve the issue, we need to call allow_barrier to decrease nr_pending before generic_make_request since regular IO is not issued to underlying devices, and wait_barrier is called again to ensure no internal IO happening. Fixes: fc9977dd069e ("md/raid10: simplify the splitting of requests.") Reported-and-tested-by: Siniša Bandin Signed-off-by: Guoqing Jiang Signed-off-by: Shaohua Li --- drivers/md/raid10.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 76c92e31afc0..abb5d382f64d 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1209,7 +1209,9 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, struct bio *split = bio_split(bio, max_sectors, gfp, &conf->bio_split); bio_chain(split, bio); + allow_barrier(conf); generic_make_request(bio); + wait_barrier(conf); bio = split; r10_bio->master_bio = bio; r10_bio->sectors = max_sectors; @@ -1492,7 +1494,9 @@ retry_write: struct bio *split = bio_split(bio, r10_bio->sectors, GFP_NOIO, &conf->bio_split); bio_chain(split, bio); + allow_barrier(conf); generic_make_request(bio); + wait_barrier(conf); bio = split; r10_bio->master_bio = bio; } -- cgit 1.2-0.3.lf.el7 From 9c9e935fc038342c48461aabca666f1b544e32b1 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Sun, 27 Jan 2019 23:51:37 +0900 Subject: [PATCH v3] oom, oom_reaper: do not enqueue same task twice Arkadiusz reported that enabling memcg's group oom killing causes strange memcg statistics where there is no task in a memcg despite the number of tasks in that memcg is not 0. It turned out that there is a bug in wake_oom_reaper() which allows enqueuing same task twice which makes impossible to decrease the number of tasks in that memcg due to a refcount leak. This bug existed since the OOM reaper became invokable from task_will_free_mem(current) path in out_of_memory() in Linux 4.7, but memcg's group oom killing made it easier to trigger this bug by calling wake_oom_reaper() on the same task from one out_of_memory() request. Fix this bug using an approach used by commit 855b018325737f76 ("oom, oom_reaper: disable oom_reaper for oom_kill_allocating_task"). As a side effect of this patch, this patch also avoids enqueuing multiple threads sharing memory via task_will_free_mem(current) path. Signed-off-by: Tetsuo Handa Reported-by: Arkadiusz Miśkiewicz Tested-by: Arkadiusz Miśkiewicz Fixes: af8e15cc85a25315 ("oom, oom_reaper: do not enqueue task if it is on the oom_reaper_list head") --- include/linux/sched/coredump.h | 1 + mm/oom_kill.c | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h index ec912d0..ecdc654 100644 --- a/include/linux/sched/coredump.h +++ b/include/linux/sched/coredump.h @@ -71,6 +71,7 @@ static inline int get_dumpable(struct mm_struct *mm) #define MMF_HUGE_ZERO_PAGE 23 /* mm has ever used the global huge zero page */ #define MMF_DISABLE_THP 24 /* disable THP for all VMAs */ #define MMF_OOM_VICTIM 25 /* mm is the oom victim */ +#define MMF_OOM_REAP_QUEUED 26 /* mm was queued for oom_reaper */ #define MMF_DISABLE_THP_MASK (1 << MMF_DISABLE_THP) #define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\ diff --git a/mm/oom_kill.c b/mm/oom_kill.c index f0e8cd9..059e617 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -647,8 +647,8 @@ static int oom_reaper(void *unused) static void wake_oom_reaper(struct task_struct *tsk) { - /* tsk is already queued? */ - if (tsk == oom_reaper_list || tsk->oom_reaper_list) + /* mm is already queued? */ + if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags)) return; get_task_struct(tsk); -- 1.8.3.1 From: Dave Chinner This reverts commit a76cf1a474d7dbcd9336b5f5afb0162baa142cf0. This change causes serious changes to page cache and inode cache behaviour and balance, resulting in major performance regressions when combining worklaods such as large file copies and kernel compiles. https://bugzilla.kernel.org/show_bug.cgi?id=202441 This change is a hack to work around the problems introduced by changing how agressive shrinkers are on small caches in commit 172b06c32b94 ("mm: slowly shrink slabs with a relatively small number of objects"). It creates more problems than it solves, wasn't adequately reviewed or tested, so it needs to be reverted. cc: Signed-off-by: Dave Chinner --- fs/inode.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/fs/inode.c b/fs/inode.c index 0cd47fe0dbe5..73432e64f874 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -730,11 +730,8 @@ static enum lru_status inode_lru_isolate(struct list_head *item, return LRU_REMOVED; } - /* - * Recently referenced inodes and inodes with many attached pages - * get one more pass. - */ - if (inode->i_state & I_REFERENCED || inode->i_data.nrpages > 1) { + /* recently referenced inodes get one more pass */ + if (inode->i_state & I_REFERENCED) { inode->i_state &= ~I_REFERENCED; spin_unlock(&inode->i_lock); return LRU_ROTATE; -- 2.20.1 This reverts commit 172b06c32b949759fe6313abec514bc4f15014f4. This change changes the agressiveness of shrinker reclaim, causing small cache and low priority reclaim to greatly increase scanning pressure on small caches. As a result, light memory pressure has a disproportionate affect on small caches, and causes large caches to be reclaimed much faster than previously. As a result, it greatly perturbs the delicate balance of the VFS caches (dentry/inode vs file page cache) such that the inode/dentry caches are reclaimed much, much faster than the page cache and this drives us into several other caching imbalance related problems. As such, this is a bad change and needs to be reverted. [ Needs some massaging to retain the later seekless shrinker modifications. ] cc: Signed-off-by: Dave Chinner --- mm/vmscan.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index a714c4f800e9..e979705bbf32 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -491,16 +491,6 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, delta = freeable / 2; } - /* - * Make sure we apply some minimal pressure on default priority - * even on small cgroups. Stale objects are not only consuming memory - * by themselves, but can also hold a reference to a dying cgroup, - * preventing it from being reclaimed. A dying cgroup with all - * corresponding structures like per-cpu stats and kmem caches - * can be really big, so it may lead to a significant waste of memory. - */ - delta = max_t(unsigned long long, delta, min(freeable, batch_size)); - total_scan += delta; if (total_scan < 0) { pr_err("shrink_slab: %pF negative objects to delete nr=%ld\n", -- 2.20.1