- up to 4.19.68

[packages/kernel.git] / kernel-small_fixes.patch
diff --git a/kernel-small_fixes.patch b/kernel-small_fixes.patch

index 6d08eeea1e47c96e7731d644ef28e2c78f057f66..2674f5c5fb8db3fc4dc47f5652758fdc3e85f684 100644 (file)
--- a/kernel-small_fixes.patch
+++ b/kernel-small_fixes.patch
@@ -1,256 +1,160 @@
-From: Shaohua Li <shli@fb.com>
-
-Basically this is a copy of commit 001e4a8775f6(ext4: implement cgroup
-writeback support). Tested with a fio test, verified writeback is
-throttled against cgroup io.max write bandwidth, also verified moving
-the fio test to another cgroup and the writeback is throttled against
-new cgroup setting.
-
-Cc: Tejun Heo <tj@kernel.org>
-Signed-off-by: Shaohua Li <shli@fb.com>
----
- fs/xfs/xfs_aops.c  | 2 ++
- fs/xfs/xfs_super.c | 1 +
- 2 files changed, 3 insertions(+)
-
---- linux-4.19/fs/xfs/xfs_aops.c.org   2018-11-21 10:31:12.348955352 +0100
-+++ linux-4.19/fs/xfs/xfs_aops.c       2018-11-21 10:34:35.241764742 +0100
-@@ -613,8 +613,10 @@ xfs_add_to_ioend(
-                       list_add(&wpc->ioend->io_list, iolist);
-               wpc->ioend = xfs_alloc_ioend(inode, wpc->io_type, offset,
-                               bdev, sector);
-+              wbc_init_bio(wbc, wpc->ioend->io_bio);
-       }
- 
-+      wbc_account_io(wbc, page, len);
-       if (!__bio_try_merge_page(wpc->ioend->io_bio, page, len, poff)) {
-               if (iop)
-                       atomic_inc(&iop->write_count);
-diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
-index 584cf2d..aea3bc2 100644
---- a/fs/xfs/xfs_super.c
-+++ b/fs/xfs/xfs_super.c
-@@ -1634,6 +1634,7 @@ xfs_fs_fill_super(
-       sb->s_max_links = XFS_MAXLINK;
-       sb->s_time_gran = 1;
-       set_posix_acl_flag(sb);
-+      sb->s_iflags |= SB_I_CGROUPWB;
- 
-       /* version 5 superblocks support inode version counters. */
-       if (XFS_SB_VERSION_NUM(&mp->m_sb) == XFS_SB_VERSION_5)
-From e820d55cb99dd93ac2dc949cf486bb187e5cd70d Mon Sep 17 00:00:00 2001
-From: Guoqing Jiang <gqjiang@suse.com>
-Date: Wed, 19 Dec 2018 14:19:25 +0800
-Subject: md: fix raid10 hang issue caused by barrier
-MIME-Version: 1.0
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: 8bit
-
-When both regular IO and resync IO happen at the same time,
-and if we also need to split regular. Then we can see tasks
-hang due to barrier.
-
-1. resync thread
-[ 1463.757205] INFO: task md1_resync:5215 blocked for more than 480 seconds.
-[ 1463.757207]       Not tainted 4.19.5-1-default #1
-[ 1463.757209] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
-[ 1463.757212] md1_resync      D    0  5215      2 0x80000000
-[ 1463.757216] Call Trace:
-[ 1463.757223]  ? __schedule+0x29a/0x880
-[ 1463.757231]  ? raise_barrier+0x8d/0x140 [raid10]
-[ 1463.757236]  schedule+0x78/0x110
-[ 1463.757243]  raise_barrier+0x8d/0x140 [raid10]
-[ 1463.757248]  ? wait_woken+0x80/0x80
-[ 1463.757257]  raid10_sync_request+0x1f6/0x1e30 [raid10]
-[ 1463.757265]  ? _raw_spin_unlock_irq+0x22/0x40
-[ 1463.757284]  ? is_mddev_idle+0x125/0x137 [md_mod]
-[ 1463.757302]  md_do_sync.cold.78+0x404/0x969 [md_mod]
-[ 1463.757311]  ? wait_woken+0x80/0x80
-[ 1463.757336]  ? md_rdev_init+0xb0/0xb0 [md_mod]
-[ 1463.757351]  md_thread+0xe9/0x140 [md_mod]
-[ 1463.757358]  ? _raw_spin_unlock_irqrestore+0x2e/0x60
-[ 1463.757364]  ? __kthread_parkme+0x4c/0x70
-[ 1463.757369]  kthread+0x112/0x130
-[ 1463.757374]  ? kthread_create_worker_on_cpu+0x40/0x40
-[ 1463.757380]  ret_from_fork+0x3a/0x50
-
-2. regular IO
-[ 1463.760679] INFO: task kworker/0:8:5367 blocked for more than 480 seconds.
-[ 1463.760683]       Not tainted 4.19.5-1-default #1
-[ 1463.760684] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
-[ 1463.760687] kworker/0:8     D    0  5367      2 0x80000000
-[ 1463.760718] Workqueue: md submit_flushes [md_mod]
-[ 1463.760721] Call Trace:
-[ 1463.760731]  ? __schedule+0x29a/0x880
-[ 1463.760741]  ? wait_barrier+0xdd/0x170 [raid10]
-[ 1463.760746]  schedule+0x78/0x110
-[ 1463.760753]  wait_barrier+0xdd/0x170 [raid10]
-[ 1463.760761]  ? wait_woken+0x80/0x80
-[ 1463.760768]  raid10_write_request+0xf2/0x900 [raid10]
-[ 1463.760774]  ? wait_woken+0x80/0x80
-[ 1463.760778]  ? mempool_alloc+0x55/0x160
-[ 1463.760795]  ? md_write_start+0xa9/0x270 [md_mod]
-[ 1463.760801]  ? try_to_wake_up+0x44/0x470
-[ 1463.760810]  raid10_make_request+0xc1/0x120 [raid10]
-[ 1463.760816]  ? wait_woken+0x80/0x80
-[ 1463.760831]  md_handle_request+0x121/0x190 [md_mod]
-[ 1463.760851]  md_make_request+0x78/0x190 [md_mod]
-[ 1463.760860]  generic_make_request+0x1c6/0x470
-[ 1463.760870]  raid10_write_request+0x77a/0x900 [raid10]
-[ 1463.760875]  ? wait_woken+0x80/0x80
-[ 1463.760879]  ? mempool_alloc+0x55/0x160
-[ 1463.760895]  ? md_write_start+0xa9/0x270 [md_mod]
-[ 1463.760904]  raid10_make_request+0xc1/0x120 [raid10]
-[ 1463.760910]  ? wait_woken+0x80/0x80
-[ 1463.760926]  md_handle_request+0x121/0x190 [md_mod]
-[ 1463.760931]  ? _raw_spin_unlock_irq+0x22/0x40
-[ 1463.760936]  ? finish_task_switch+0x74/0x260
-[ 1463.760954]  submit_flushes+0x21/0x40 [md_mod]
-
-So resync io is waiting for regular write io to complete to
-decrease nr_pending (conf->barrier++ is called before waiting).
-The regular write io splits another bio after call wait_barrier
-which call nr_pending++, then the splitted bio would continue
-with raid10_write_request -> wait_barrier, so the splitted bio
-has to wait for barrier to be zero, then deadlock happens as
-follows.
-
-       resync io               regular io
-
-       raise_barrier
-                               wait_barrier
-                               generic_make_request
-                               wait_barrier
-
-To resolve the issue, we need to call allow_barrier to decrease
-nr_pending before generic_make_request since regular IO is not
-issued to underlying devices, and wait_barrier is called again
-to ensure no internal IO happening.
-
-Fixes: fc9977dd069e ("md/raid10: simplify the splitting of requests.")
-Reported-and-tested-by: Siniša Bandin <sinisa@4net.rs>
-Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
-Signed-off-by: Shaohua Li <shli@fb.com>
----
- drivers/md/raid10.c | 4 ++++
- 1 file changed, 4 insertions(+)
-
-diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
-index 76c92e31afc0..abb5d382f64d 100644
---- a/drivers/md/raid10.c
-+++ b/drivers/md/raid10.c
-@@ -1209,7 +1209,9 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
-               struct bio *split = bio_split(bio, max_sectors,
-                                             gfp, &conf->bio_split);
-               bio_chain(split, bio);
-+              allow_barrier(conf);
-               generic_make_request(bio);
-+              wait_barrier(conf);
-               bio = split;
-               r10_bio->master_bio = bio;
-               r10_bio->sectors = max_sectors;
-@@ -1492,7 +1494,9 @@ retry_write:
-               struct bio *split = bio_split(bio, r10_bio->sectors,
-                                             GFP_NOIO, &conf->bio_split);
-               bio_chain(split, bio);
-+              allow_barrier(conf);
-               generic_make_request(bio);
-+              wait_barrier(conf);
-               bio = split;
-               r10_bio->master_bio = bio;
-       }
--- 
-cgit 1.2-0.3.lf.el7
-
-From: Dave Chinner <dchinner@redhat.com>
-
-This reverts commit a76cf1a474d7dbcd9336b5f5afb0162baa142cf0.
-
-This change causes serious changes to page cache and inode cache
-behaviour and balance, resulting in major performance regressions
-when combining worklaods such as large file copies and kernel
-compiles.
-
-https://bugzilla.kernel.org/show_bug.cgi?id=202441
-
-This change is a hack to work around the problems introduced by
-changing how agressive shrinkers are on small caches in commit
-172b06c32b94 ("mm: slowly shrink slabs with a relatively small
-number of objects"). It creates more problems than it solves, wasn't
-adequately reviewed or tested, so it needs to be reverted.
-
-cc: <stable@vger.kernel.org>
-Signed-off-by: Dave Chinner <dchinner@redhat.com>
----
- fs/inode.c | 7 ++-----
- 1 file changed, 2 insertions(+), 5 deletions(-)
-
-diff --git a/fs/inode.c b/fs/inode.c
-index 0cd47fe0dbe5..73432e64f874 100644
---- a/fs/inode.c
-+++ b/fs/inode.c
-@@ -730,11 +730,8 @@ static enum lru_status inode_lru_isolate(struct list_head *item,
-               return LRU_REMOVED;
-       }
- 
--      /*
--       * Recently referenced inodes and inodes with many attached pages
--       * get one more pass.
--       */
--      if (inode->i_state & I_REFERENCED || inode->i_data.nrpages > 1) {
-+      /* recently referenced inodes get one more pass */
-+      if (inode->i_state & I_REFERENCED) {
-               inode->i_state &= ~I_REFERENCED;
-               spin_unlock(&inode->i_lock);
-               return LRU_ROTATE;
--- 
-2.20.1
-
-This reverts commit 172b06c32b949759fe6313abec514bc4f15014f4.
-
-This change changes the agressiveness of shrinker reclaim, causing
-small cache and low priority reclaim to greatly increase
-scanning pressure on small caches. As a result, light memory
-pressure has a disproportionate affect on small caches, and causes
-large caches to be reclaimed much faster than previously.
-
-As a result, it greatly perturbs the delicate balance of the VFS
-caches (dentry/inode vs file page cache) such that the inode/dentry
-caches are reclaimed much, much faster than the page cache and this
-drives us into several other caching imbalance related problems.
-
-As such, this is a bad change and needs to be reverted.
-
-[ Needs some massaging to retain the later seekless shrinker
-modifications. ]
-
-cc: <stable@vger.kernel.org>
-Signed-off-by: Dave Chinner <dchinner@redhat.com>
----
- mm/vmscan.c | 10 ----------
- 1 file changed, 10 deletions(-)
-
-diff --git a/mm/vmscan.c b/mm/vmscan.c
-index a714c4f800e9..e979705bbf32 100644
---- a/mm/vmscan.c
-+++ b/mm/vmscan.c
-@@ -491,16 +491,6 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
-               delta = freeable / 2;
-       }
- 
--      /*
--       * Make sure we apply some minimal pressure on default priority
--       * even on small cgroups. Stale objects are not only consuming memory
--       * by themselves, but can also hold a reference to a dying cgroup,
--       * preventing it from being reclaimed. A dying cgroup with all
--       * corresponding structures like per-cpu stats and kmem caches
--       * can be really big, so it may lead to a significant waste of memory.
--       */
--      delta = max_t(unsigned long long, delta, min(freeable, batch_size));
--
-       total_scan += delta;
-       if (total_scan < 0) {
-               pr_err("shrink_slab: %pF negative objects to delete nr=%ld\n",
--- 
-2.20.1
-
+Move the setup of the bio operation and write hint into xfs_alloc_ioend,\r
+then just copy all the needed information over from the previous bio\r
+in xfs_chain_bio and stop passing various parameters to it.\r
+\r
+Signed-off-by: Christoph Hellwig <hch@lst.de>\r
+---\r
+ fs/xfs/xfs_aops.c | 35 +++++++++++++++++------------------\r
+ 1 file changed, 17 insertions(+), 18 deletions(-)\r
+\r
+diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c\r
+index a6f0f4761a37..9cceb90e77c5 100644\r
+--- a/fs/xfs/xfs_aops.c\r
++++ b/fs/xfs/xfs_aops.c\r
+@@ -665,7 +665,6 @@ xfs_submit_ioend(\r
+ \r
+       ioend->io_bio->bi_private = ioend;\r
+       ioend->io_bio->bi_end_io = xfs_end_bio;\r
+-      ioend->io_bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);\r
+ \r
+       /*\r
+        * If we are failing the IO now, just mark the ioend with an\r
+@@ -679,7 +678,6 @@ xfs_submit_ioend(\r
+               return status;\r
+       }\r
+ \r
+-      ioend->io_bio->bi_write_hint = ioend->io_inode->i_write_hint;\r
+       submit_bio(ioend->io_bio);\r
+       return 0;\r
+ }\r
+@@ -691,7 +689,8 @@ xfs_alloc_ioend(\r
+       xfs_exntst_t            state,\r
+       xfs_off_t               offset,\r
+       struct block_device     *bdev,\r
+-      sector_t                sector)\r
++      sector_t                sector,\r
++      struct writeback_control *wbc)\r
+ {\r
+       struct xfs_ioend        *ioend;\r
+       struct bio              *bio;\r
+@@ -699,6 +698,8 @@ xfs_alloc_ioend(\r
+       bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, &xfs_ioend_bioset);\r
+       bio_set_dev(bio, bdev);\r
+       bio->bi_iter.bi_sector = sector;\r
++      bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);\r
++      bio->bi_write_hint = inode->i_write_hint;\r
+ \r
+       ioend = container_of(bio, struct xfs_ioend, io_inline_bio);\r
+       INIT_LIST_HEAD(&ioend->io_list);\r
+@@ -719,24 +720,22 @@ xfs_alloc_ioend(\r
+  * so that the bi_private linkage is set up in the right direction for the\r
+  * traversal in xfs_destroy_ioend().\r
+  */\r
+-static void\r
++static struct bio *\r
+ xfs_chain_bio(\r
+-      struct xfs_ioend        *ioend,\r
+-      struct writeback_control *wbc,\r
+-      struct block_device     *bdev,\r
+-      sector_t                sector)\r
++      struct bio              *prev)\r
+ {\r
+       struct bio *new;\r
+ \r
+       new = bio_alloc(GFP_NOFS, BIO_MAX_PAGES);\r
+-      bio_set_dev(new, bdev);\r
+-      new->bi_iter.bi_sector = sector;\r
+-      bio_chain(ioend->io_bio, new);\r
+-      bio_get(ioend->io_bio);         /* for xfs_destroy_ioend */\r
+-      ioend->io_bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);\r
+-      ioend->io_bio->bi_write_hint = ioend->io_inode->i_write_hint;\r
+-      submit_bio(ioend->io_bio);\r
+-      ioend->io_bio = new;\r
++      bio_copy_dev(new, prev);\r
++      new->bi_iter.bi_sector = bio_end_sector(prev);\r
++      new->bi_opf = prev->bi_opf;\r
++      new->bi_write_hint = prev->bi_write_hint;\r
++\r
++      bio_chain(prev, new);\r
++      bio_get(prev);          /* for xfs_destroy_ioend */\r
++      submit_bio(prev);\r
++      return new;\r
+ }\r
+ \r
+ /*\r
+@@ -614,14 +614,14 @@ xfs_add_to_ioend(\r
+               if (wpc->ioend)\r
+                       list_add(&wpc->ioend->io_list, iolist);\r
+               wpc->ioend = xfs_alloc_ioend(inode, wpc->io_type, offset,\r
+-                              bdev, sector);\r
++                              bdev, sector, wbc);\r
+       }\r
+ \r
+       if (!__bio_try_merge_page(wpc->ioend->io_bio, page, len, poff)) {\r
+               if (iop)\r
+                       atomic_inc(&iop->write_count);\r
+               if (bio_full(wpc->ioend->io_bio))\r
+-                      xfs_chain_bio(wpc->ioend, wbc, bdev, sector);\r
++                      wpc->ioend->io_bio = xfs_chain_bio(wpc->ioend->io_bio);\r
+               __bio_add_page(wpc->ioend->io_bio, page, len, poff);\r
+       }\r
+ \r
+-- \r
+2.20.1\r
+\r
+\r
+Link every newly allocated writeback bio to the cgroup pointed to by the\r
+writeback control structure, and charge every byte written back to it.\r
+\r
+Tested-by: Stefan Priebe - Profihost AG <s.priebe@profihost.ag>\r
+Signed-off-by: Christoph Hellwig <hch@lst.de>\r
+---\r
+ fs/xfs/xfs_aops.c  | 4 +++-\r
+ fs/xfs/xfs_super.c | 2 ++\r
+ 2 files changed, 5 insertions(+), 1 deletion(-)\r
+\r
+diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c\r
+index 9cceb90e77c5..73c291aeae17 100644\r
+--- a/fs/xfs/xfs_aops.c\r
++++ b/fs/xfs/xfs_aops.c\r
+@@ -700,6 +700,7 @@ xfs_alloc_ioend(\r
+       bio->bi_iter.bi_sector = sector;\r
+       bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);\r
+       bio->bi_write_hint = inode->i_write_hint;\r
++      wbc_init_bio(wbc, bio);\r
+ \r
+       ioend = container_of(bio, struct xfs_ioend, io_inline_bio);\r
+       INIT_LIST_HEAD(&ioend->io_list);\r
+@@ -727,7 +728,7 @@ xfs_chain_bio(\r
+       struct bio *new;\r
+ \r
+       new = bio_alloc(GFP_NOFS, BIO_MAX_PAGES);\r
+-      bio_copy_dev(new, prev);\r
++      bio_copy_dev(new, prev);/* also copies over blkcg information */\r
+       new->bi_iter.bi_sector = bio_end_sector(prev);\r
+       new->bi_opf = prev->bi_opf;\r
+       new->bi_write_hint = prev->bi_write_hint;\r
+@@ -782,6 +783,7 @@ xfs_add_to_ioend(\r
+       }\r
+ \r
+       wpc->ioend->io_size += len;\r
++      wbc_account_io(wbc, page, len);\r
+ }\r
+ \r
+ STATIC void\r
+diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c\r
+index 594c119824cc..ee0df8f611ff 100644\r
+--- a/fs/xfs/xfs_super.c\r
++++ b/fs/xfs/xfs_super.c\r
+@@ -1685,6 +1685,8 @@ xfs_fs_fill_super(\r
+       sb->s_maxbytes = xfs_max_file_offset(sb->s_blocksize_bits);\r
+       sb->s_max_links = XFS_MAXLINK;\r
+       sb->s_time_gran = 1;\r
++      sb->s_iflags |= SB_I_CGROUPWB;\r
++\r
+       set_posix_acl_flag(sb);\r
+ \r
+       /* version 5 superblocks support inode version counters. */\r
+-- \r
+2.20.1\r
+\r