-From: Shaohua Li <shli@fb.com>
-
-Basically this is a copy of commit 001e4a8775f6(ext4: implement cgroup
-writeback support). Tested with a fio test, verified writeback is
-throttled against cgroup io.max write bandwidth, also verified moving
-the fio test to another cgroup and the writeback is throttled against
-new cgroup setting.
-
-Cc: Tejun Heo <tj@kernel.org>
-Signed-off-by: Shaohua Li <shli@fb.com>
----
- fs/xfs/xfs_aops.c | 2 ++
- fs/xfs/xfs_super.c | 1 +
- 2 files changed, 3 insertions(+)
-
---- linux-4.19/fs/xfs/xfs_aops.c.org 2018-11-21 10:31:12.348955352 +0100
-+++ linux-4.19/fs/xfs/xfs_aops.c 2018-11-21 10:34:35.241764742 +0100
-@@ -613,8 +613,10 @@ xfs_add_to_ioend(
- list_add(&wpc->ioend->io_list, iolist);
- wpc->ioend = xfs_alloc_ioend(inode, wpc->io_type, offset,
- bdev, sector);
-+ wbc_init_bio(wbc, wpc->ioend->io_bio);
- }
-
-+ wbc_account_io(wbc, page, len);
- if (!__bio_try_merge_page(wpc->ioend->io_bio, page, len, poff)) {
- if (iop)
- atomic_inc(&iop->write_count);
-diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
-index 584cf2d..aea3bc2 100644
---- a/fs/xfs/xfs_super.c
-+++ b/fs/xfs/xfs_super.c
-@@ -1634,6 +1634,7 @@ xfs_fs_fill_super(
- sb->s_max_links = XFS_MAXLINK;
- sb->s_time_gran = 1;
- set_posix_acl_flag(sb);
-+ sb->s_iflags |= SB_I_CGROUPWB;
-
- /* version 5 superblocks support inode version counters. */
- if (XFS_SB_VERSION_NUM(&mp->m_sb) == XFS_SB_VERSION_5)
-From e820d55cb99dd93ac2dc949cf486bb187e5cd70d Mon Sep 17 00:00:00 2001
-From: Guoqing Jiang <gqjiang@suse.com>
-Date: Wed, 19 Dec 2018 14:19:25 +0800
-Subject: md: fix raid10 hang issue caused by barrier
-MIME-Version: 1.0
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: 8bit
-
-When both regular IO and resync IO happen at the same time,
-and if we also need to split regular. Then we can see tasks
-hang due to barrier.
-
-1. resync thread
-[ 1463.757205] INFO: task md1_resync:5215 blocked for more than 480 seconds.
-[ 1463.757207] Not tainted 4.19.5-1-default #1
-[ 1463.757209] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
-[ 1463.757212] md1_resync D 0 5215 2 0x80000000
-[ 1463.757216] Call Trace:
-[ 1463.757223] ? __schedule+0x29a/0x880
-[ 1463.757231] ? raise_barrier+0x8d/0x140 [raid10]
-[ 1463.757236] schedule+0x78/0x110
-[ 1463.757243] raise_barrier+0x8d/0x140 [raid10]
-[ 1463.757248] ? wait_woken+0x80/0x80
-[ 1463.757257] raid10_sync_request+0x1f6/0x1e30 [raid10]
-[ 1463.757265] ? _raw_spin_unlock_irq+0x22/0x40
-[ 1463.757284] ? is_mddev_idle+0x125/0x137 [md_mod]
-[ 1463.757302] md_do_sync.cold.78+0x404/0x969 [md_mod]
-[ 1463.757311] ? wait_woken+0x80/0x80
-[ 1463.757336] ? md_rdev_init+0xb0/0xb0 [md_mod]
-[ 1463.757351] md_thread+0xe9/0x140 [md_mod]
-[ 1463.757358] ? _raw_spin_unlock_irqrestore+0x2e/0x60
-[ 1463.757364] ? __kthread_parkme+0x4c/0x70
-[ 1463.757369] kthread+0x112/0x130
-[ 1463.757374] ? kthread_create_worker_on_cpu+0x40/0x40
-[ 1463.757380] ret_from_fork+0x3a/0x50
-
-2. regular IO
-[ 1463.760679] INFO: task kworker/0:8:5367 blocked for more than 480 seconds.
-[ 1463.760683] Not tainted 4.19.5-1-default #1
-[ 1463.760684] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
-[ 1463.760687] kworker/0:8 D 0 5367 2 0x80000000
-[ 1463.760718] Workqueue: md submit_flushes [md_mod]
-[ 1463.760721] Call Trace:
-[ 1463.760731] ? __schedule+0x29a/0x880
-[ 1463.760741] ? wait_barrier+0xdd/0x170 [raid10]
-[ 1463.760746] schedule+0x78/0x110
-[ 1463.760753] wait_barrier+0xdd/0x170 [raid10]
-[ 1463.760761] ? wait_woken+0x80/0x80
-[ 1463.760768] raid10_write_request+0xf2/0x900 [raid10]
-[ 1463.760774] ? wait_woken+0x80/0x80
-[ 1463.760778] ? mempool_alloc+0x55/0x160
-[ 1463.760795] ? md_write_start+0xa9/0x270 [md_mod]
-[ 1463.760801] ? try_to_wake_up+0x44/0x470
-[ 1463.760810] raid10_make_request+0xc1/0x120 [raid10]
-[ 1463.760816] ? wait_woken+0x80/0x80
-[ 1463.760831] md_handle_request+0x121/0x190 [md_mod]
-[ 1463.760851] md_make_request+0x78/0x190 [md_mod]
-[ 1463.760860] generic_make_request+0x1c6/0x470
-[ 1463.760870] raid10_write_request+0x77a/0x900 [raid10]
-[ 1463.760875] ? wait_woken+0x80/0x80
-[ 1463.760879] ? mempool_alloc+0x55/0x160
-[ 1463.760895] ? md_write_start+0xa9/0x270 [md_mod]
-[ 1463.760904] raid10_make_request+0xc1/0x120 [raid10]
-[ 1463.760910] ? wait_woken+0x80/0x80
-[ 1463.760926] md_handle_request+0x121/0x190 [md_mod]
-[ 1463.760931] ? _raw_spin_unlock_irq+0x22/0x40
-[ 1463.760936] ? finish_task_switch+0x74/0x260
-[ 1463.760954] submit_flushes+0x21/0x40 [md_mod]
-
-So resync io is waiting for regular write io to complete to
-decrease nr_pending (conf->barrier++ is called before waiting).
-The regular write io splits another bio after call wait_barrier
-which call nr_pending++, then the splitted bio would continue
-with raid10_write_request -> wait_barrier, so the splitted bio
-has to wait for barrier to be zero, then deadlock happens as
-follows.
-
- resync io regular io
-
- raise_barrier
- wait_barrier
- generic_make_request
- wait_barrier
-
-To resolve the issue, we need to call allow_barrier to decrease
-nr_pending before generic_make_request since regular IO is not
-issued to underlying devices, and wait_barrier is called again
-to ensure no internal IO happening.
-
-Fixes: fc9977dd069e ("md/raid10: simplify the splitting of requests.")
-Reported-and-tested-by: Siniša Bandin <sinisa@4net.rs>
-Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
-Signed-off-by: Shaohua Li <shli@fb.com>
----
- drivers/md/raid10.c | 4 ++++
- 1 file changed, 4 insertions(+)
-
-diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
-index 76c92e31afc0..abb5d382f64d 100644
---- a/drivers/md/raid10.c
-+++ b/drivers/md/raid10.c
-@@ -1209,7 +1209,9 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
- struct bio *split = bio_split(bio, max_sectors,
- gfp, &conf->bio_split);
- bio_chain(split, bio);
-+ allow_barrier(conf);
- generic_make_request(bio);
-+ wait_barrier(conf);
- bio = split;
- r10_bio->master_bio = bio;
- r10_bio->sectors = max_sectors;
-@@ -1492,7 +1494,9 @@ retry_write:
- struct bio *split = bio_split(bio, r10_bio->sectors,
- GFP_NOIO, &conf->bio_split);
- bio_chain(split, bio);
-+ allow_barrier(conf);
- generic_make_request(bio);
-+ wait_barrier(conf);
- bio = split;
- r10_bio->master_bio = bio;
- }
---
-cgit 1.2-0.3.lf.el7
-
-From 48744b6339cf649a69b55997e138c17df1ecc897 Mon Sep 17 00:00:00 2001\r
-From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>\r
-Date: Sat, 26 Jan 2019 20:00:51 +0900\r
-Subject: [PATCH] oom, oom_reaper: do not enqueue same task twice\r
+Move setting up operation and write hint to xfs_alloc_ioend, and\r
+then just copy over all needed information from the previous bio\r
+in xfs_chain_bio and stop passing various parameters to it.\r
\r
-Arkadiusz reported that enabling memcg's group oom killing causes\r
-strange memcg statistics where there is no task in a memcg despite\r
-the number of tasks in that memcg is not 0. It turned out that there\r
-is a bug in wake_oom_reaper() which allows enqueuing same task twice\r
-which makes impossible to decrease the number of tasks in that memcg\r
-due to a refcount leak.\r
-\r
-This bug existed since the OOM reaper became invokable from\r
-task_will_free_mem(current) path in out_of_memory() in Linux 4.7,\r
-but memcg's group oom killing made it easier to trigger this bug by\r
-calling wake_oom_reaper() on the same task from one out_of_memory()\r
-request.\r
-\r
-Fix this bug using an approach used by commit 855b018325737f76\r
-("oom, oom_reaper: disable oom_reaper for oom_kill_allocating_task").\r
-Since task_will_free_mem(p) == false if p->mm == NULL, we can assume that\r
-p->mm != NULL when wake_oom_reaper() is called from task_will_free_mem()\r
-paths. As a side effect of this patch, this patch also avoids enqueuing\r
-multiple threads sharing memory via task_will_free_mem(current) path.\r
-\r
-Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>\r
-Reported-by: Arkadiusz Miśkiewicz <a.miskiewicz@gmail.com>\r
-Fixes: af8e15cc85a25315 ("oom, oom_reaper: do not enqueue task if it is on the oom_reaper_list head")\r
+Signed-off-by: Christoph Hellwig <hch@lst.de>\r
---\r
- mm/oom_kill.c | 28 +++++++++++++---------------\r
- 1 file changed, 13 insertions(+), 15 deletions(-)\r
+ fs/xfs/xfs_aops.c | 35 +++++++++++++++++------------------\r
+ 1 file changed, 17 insertions(+), 18 deletions(-)\r
\r
-diff --git a/mm/oom_kill.c b/mm/oom_kill.c\r
-index f0e8cd9..457f240 100644\r
---- a/mm/oom_kill.c\r
-+++ b/mm/oom_kill.c\r
-@@ -505,14 +505,6 @@ bool __oom_reap_task_mm(struct mm_struct *mm)\r
- struct vm_area_struct *vma;\r
- bool ret = true;\r
+diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c\r
+index a6f0f4761a37..9cceb90e77c5 100644\r
+--- a/fs/xfs/xfs_aops.c\r
++++ b/fs/xfs/xfs_aops.c\r
+@@ -665,7 +665,6 @@ xfs_submit_ioend(\r
+ \r
+ ioend->io_bio->bi_private = ioend;\r
+ ioend->io_bio->bi_end_io = xfs_end_bio;\r
+- ioend->io_bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);\r
+ \r
+ /*\r
+ * If we are failing the IO now, just mark the ioend with an\r
+@@ -679,7 +678,6 @@ xfs_submit_ioend(\r
+ return status;\r
+ }\r
\r
-- /*\r
-- * Tell all users of get_user/copy_from_user etc... that the content\r
-- * is no longer stable. No barriers really needed because unmapping\r
-- * should imply barriers already and the reader would hit a page fault\r
-- * if it stumbled over a reaped memory.\r
-- */\r
-- set_bit(MMF_UNSTABLE, &mm->flags);\r
--\r
- for (vma = mm->mmap ; vma; vma = vma->vm_next) {\r
- if (!can_madv_dontneed_vma(vma))\r
- continue;\r
-@@ -645,10 +637,15 @@ static int oom_reaper(void *unused)\r
+- ioend->io_bio->bi_write_hint = ioend->io_inode->i_write_hint;\r
+ submit_bio(ioend->io_bio);\r
return 0;\r
}\r
- \r
--static void wake_oom_reaper(struct task_struct *tsk)\r
-+static void wake_oom_reaper(struct task_struct *tsk, struct mm_struct *mm)\r
+@@ -691,7 +689,8 @@ xfs_alloc_ioend(\r
+ xfs_exntst_t state,\r
+ xfs_off_t offset,\r
+ struct block_device *bdev,\r
+- sector_t sector)\r
++ sector_t sector,\r
++ struct writeback_control *wbc)\r
{\r
-- /* tsk is already queued? */\r
-- if (tsk == oom_reaper_list || tsk->oom_reaper_list)\r
-+ /*\r
-+ * Tell all users of get_user/copy_from_user etc... that the content\r
-+ * is no longer stable. No barriers really needed because unmapping\r
-+ * should imply barriers already and the reader would hit a page fault\r
-+ * if it stumbled over a reaped memory.\r
-+ */\r
-+ if (test_and_set_bit(MMF_UNSTABLE, &mm->flags))\r
- return;\r
+ struct xfs_ioend *ioend;\r
+ struct bio *bio;\r
+@@ -699,6 +698,8 @@ xfs_alloc_ioend(\r
+ bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, &xfs_ioend_bioset);\r
+ bio_set_dev(bio, bdev);\r
+ bio->bi_iter.bi_sector = sector;\r
++ bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);\r
++ bio->bi_write_hint = inode->i_write_hint;\r
\r
- get_task_struct(tsk);\r
-@@ -668,7 +665,8 @@ static int __init oom_init(void)\r
- }\r
- subsys_initcall(oom_init)\r
- #else\r
--static inline void wake_oom_reaper(struct task_struct *tsk)\r
-+static inline void wake_oom_reaper(struct task_struct *tsk,\r
-+ struct mm_struct *mm)\r
+ ioend = container_of(bio, struct xfs_ioend, io_inline_bio);\r
+ INIT_LIST_HEAD(&ioend->io_list);\r
+@@ -719,24 +720,22 @@ xfs_alloc_ioend(\r
+ * so that the bi_private linkage is set up in the right direction for the\r
+ * traversal in xfs_destroy_ioend().\r
+ */\r
+-static void\r
++static struct bio *\r
+ xfs_chain_bio(\r
+- struct xfs_ioend *ioend,\r
+- struct writeback_control *wbc,\r
+- struct block_device *bdev,\r
+- sector_t sector)\r
++ struct bio *prev)\r
{\r
+ struct bio *new;\r
+ \r
+ new = bio_alloc(GFP_NOFS, BIO_MAX_PAGES);\r
+- bio_set_dev(new, bdev);\r
+- new->bi_iter.bi_sector = sector;\r
+- bio_chain(ioend->io_bio, new);\r
+- bio_get(ioend->io_bio); /* for xfs_destroy_ioend */\r
+- ioend->io_bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);\r
+- ioend->io_bio->bi_write_hint = ioend->io_inode->i_write_hint;\r
+- submit_bio(ioend->io_bio);\r
+- ioend->io_bio = new;\r
++ bio_copy_dev(new, prev);\r
++ new->bi_iter.bi_sector = bio_end_sector(prev);\r
++ new->bi_opf = prev->bi_opf;\r
++ new->bi_write_hint = prev->bi_write_hint;\r
++\r
++ bio_chain(prev, new);\r
++ bio_get(prev); /* for xfs_destroy_ioend */\r
++ submit_bio(prev);\r
++ return new;\r
}\r
- #endif /* CONFIG_MMU */\r
-@@ -915,7 +913,7 @@ static void __oom_kill_process(struct task_struct *victim)\r
- rcu_read_unlock();\r
\r
- if (can_oom_reap)\r
-- wake_oom_reaper(victim);\r
-+ wake_oom_reaper(victim, mm);\r
+ /*\r
+@@ -614,14 +614,14 @@ xfs_add_to_ioend(\r
+ if (wpc->ioend)\r
+ list_add(&wpc->ioend->io_list, iolist);\r
+ wpc->ioend = xfs_alloc_ioend(inode, wpc->io_type, offset,\r
+- bdev, sector);\r
++ bdev, sector, wbc);\r
+ }\r
\r
- mmdrop(mm);\r
- put_task_struct(victim);\r
-@@ -955,7 +953,7 @@ static void oom_kill_process(struct oom_control *oc, const char *message)\r
- task_lock(p);\r
- if (task_will_free_mem(p)) {\r
- mark_oom_victim(p);\r
-- wake_oom_reaper(p);\r
-+ wake_oom_reaper(p, p->mm);\r
- task_unlock(p);\r
- put_task_struct(p);\r
- return;\r
-@@ -1085,7 +1083,7 @@ bool out_of_memory(struct oom_control *oc)\r
- */\r
- if (task_will_free_mem(current)) {\r
- mark_oom_victim(current);\r
-- wake_oom_reaper(current);\r
-+ wake_oom_reaper(current, current->mm);\r
- return true;\r
+ if (!__bio_try_merge_page(wpc->ioend->io_bio, page, len, poff)) {\r
+ if (iop)\r
+ atomic_inc(&iop->write_count);\r
+ if (bio_full(wpc->ioend->io_bio))\r
+- xfs_chain_bio(wpc->ioend, wbc, bdev, sector);\r
++ wpc->ioend->io_bio = xfs_chain_bio(wpc->ioend->io_bio);\r
+ __bio_add_page(wpc->ioend->io_bio, page, len, poff);\r
}\r
\r
-- \r
-1.8.3.1\r
+2.20.1\r
+\r
+\r
+Link every newly allocated writeback bio to the cgroup pointed to by the
+writeback control structure, and charge every byte written back to it.
+\r
+Tested-by: Stefan Priebe - Profihost AG <s.priebe@profihost.ag>\r
+Signed-off-by: Christoph Hellwig <hch@lst.de>\r
+---\r
+ fs/xfs/xfs_aops.c | 4 +++-\r
+ fs/xfs/xfs_super.c | 2 ++\r
+ 2 files changed, 5 insertions(+), 1 deletion(-)\r
+\r
+diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c\r
+index 9cceb90e77c5..73c291aeae17 100644\r
+--- a/fs/xfs/xfs_aops.c\r
++++ b/fs/xfs/xfs_aops.c\r
+@@ -700,6 +700,7 @@ xfs_alloc_ioend(\r
+ bio->bi_iter.bi_sector = sector;\r
+ bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);\r
+ bio->bi_write_hint = inode->i_write_hint;\r
++ wbc_init_bio(wbc, bio);\r
+ \r
+ ioend = container_of(bio, struct xfs_ioend, io_inline_bio);\r
+ INIT_LIST_HEAD(&ioend->io_list);\r
+@@ -727,7 +728,7 @@ xfs_chain_bio(\r
+ struct bio *new;\r
+ \r
+ new = bio_alloc(GFP_NOFS, BIO_MAX_PAGES);\r
+- bio_copy_dev(new, prev);\r
++ bio_copy_dev(new, prev);/* also copies over blkcg information */\r
+ new->bi_iter.bi_sector = bio_end_sector(prev);\r
+ new->bi_opf = prev->bi_opf;\r
+ new->bi_write_hint = prev->bi_write_hint;\r
+@@ -782,6 +783,7 @@ xfs_add_to_ioend(\r
+ }\r
+ \r
+ wpc->ioend->io_size += len;\r
++ wbc_account_io(wbc, page, len);\r
+ }\r
+ \r
+ STATIC void\r
+diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c\r
+index 594c119824cc..ee0df8f611ff 100644\r
+--- a/fs/xfs/xfs_super.c\r
++++ b/fs/xfs/xfs_super.c\r
+@@ -1685,6 +1685,8 @@ xfs_fs_fill_super(\r
+ sb->s_maxbytes = xfs_max_file_offset(sb->s_blocksize_bits);\r
+ sb->s_max_links = XFS_MAXLINK;\r
+ sb->s_time_gran = 1;\r
++ sb->s_iflags |= SB_I_CGROUPWB;\r
++\r
+ set_posix_acl_flag(sb);\r
+ \r
+ /* version 5 superblocks support inode version counters. */\r
+-- \r
+2.20.1\r
\r