summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorArkadiusz Miśkiewicz2016-01-11 07:00:34 (GMT)
committerArkadiusz Miśkiewicz2016-01-11 07:00:34 (GMT)
commit28b2046740f32f01e3b8c623715bb9418de6cc9f (patch)
tree489fb5ccda0685d36c68880b48f07ce538ecab8f
parent65e2daa20f668f7fec617adaac52545bbbfa65d8 (diff)
downloadkernel-28b2046740f32f01e3b8c623715bb9418de6cc9f.zip
kernel-28b2046740f32f01e3b8c623715bb9418de6cc9f.tar.gz
- v3 version of the patch (split into two chunks)
-rw-r--r--kernel-small_fixes.patch329
1 files changed, 216 insertions, 113 deletions
diff --git a/kernel-small_fixes.patch b/kernel-small_fixes.patch
index e4cb3a8..70e9047 100644
--- a/kernel-small_fixes.patch
+++ b/kernel-small_fixes.patch
@@ -784,6 +784,208 @@ index 41d70bc..84e597e 100644
tbio->bi_rw = WRITE;
tbio->bi_private = r10_bio;
tbio->bi_iter.bi_sector = r10_bio->devs[i].addr;
+From: Michal Hocko <mhocko@suse.com>
+
+kernel test robot has reported the following crash:
+[ 3.870718] BUG: unable to handle kernel NULL pointer dereferenceNULL pointer dereference at 00000100
+ at 00000100
+[ 3.872615] IP: [<c1074df6>] __queue_work+0x26/0x390 [<c1074df6>] __queue_work+0x26/0x390
+[ 3.873758] *pdpt = 0000000000000000 *pde = f000ff53f000ff53 *pde = f000ff53f000ff53
+[ 3.875096] Oops: 0000 [#1] PREEMPT PREEMPT SMP SMP
+[ 3.876130] CPU: 0 PID: 24 Comm: kworker/0:1 Not tainted 4.4.0-rc4-00139-g373ccbe #1
+[ 3.878135] Workqueue: events vmstat_shepherd
+[ 3.879207] task: cb684600 ti: cb7ba000 task.ti: cb7ba000
+[ 3.880445] EIP: 0060:[<c1074df6>] EFLAGS: 00010046 CPU: 0
+[ 3.881704] EIP is at __queue_work+0x26/0x390
+[ 3.882823] EAX: 00000046 EBX: cbb37800 ECX: cbb37800 EDX: 00000000
+[ 3.884457] ESI: 00000000 EDI: 00000000 EBP: cb7bbe68 ESP: cb7bbe38
+[ 3.886005] DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068
+[ 3.887229] CR0: 8005003b CR2: 00000100 CR3: 01fd5000 CR4: 000006b0
+[ 3.888663] Stack:
+[ 3.895204] Call Trace:
+[ 3.895854] [<c1a381dd>] ? mutex_unlock+0xd/0x10
+[ 3.897120] [<c1075221>] __queue_delayed_work+0xa1/0x160
+[ 3.898530] [<c10764c6>] queue_delayed_work_on+0x36/0x60
+[ 3.899790] [<c11494bd>] vmstat_shepherd+0xad/0xf0
+[ 3.900899] [<c1075a7a>] process_one_work+0x1aa/0x4c0
+[ 3.902093] [<c10759e2>] ? process_one_work+0x112/0x4c0
+[ 3.903520] [<c10ac31e>] ? do_raw_spin_lock+0xe/0x150
+[ 3.904853] [<c1075dd1>] worker_thread+0x41/0x440
+[ 3.906023] [<c1075d90>] ? process_one_work+0x4c0/0x4c0
+[ 3.907242] [<c107b7c0>] kthread+0xb0/0xd0
+[ 3.908188] [<c1a3c651>] ret_from_kernel_thread+0x21/0x40
+[ 3.909601] [<c107b710>] ? __kthread_parkme+0x80/0x80
+
+The reason is that start_shepherd_timer schedules the shepherd work item
+which uses vmstat_wq (vmstat_shepherd) before setup_vmstat allocates
+that workqueue so if the further initialization takes more than HZ
+we might end up scheduling on a NULL vmstat_wq. This is really unlikely
+but not impossible.
+
+Fixes: 373ccbe59270 ("mm, vmstat: allow WQ concurrency to discover memory reclaim doesn't make any progress")
+Reported-by: kernel test robot <ying.huang@linux.intel.com>
+Signed-off-by: Michal Hocko <mhocko@suse.com>
+---
+Hi Linus,
+I am not marking this for stable because I hope we can sneak it into 4.4.
+The patch is trivial and obvious. I am sorry about the breakage. If you prefer
+to postpone it to 4.5-rc1 because this is not really that critical and shouldn't
+happen most of the time then I will repost with stable tag added.
+
+Thanks!
+
+ mm/vmstat.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/mm/vmstat.c b/mm/vmstat.c
+index 4ebc17d948cb..c54fd2924f25 100644
+--- a/mm/vmstat.c
++++ b/mm/vmstat.c
+@@ -1483,6 +1483,7 @@ static void __init start_shepherd_timer(void)
+ BUG();
+ cpumask_copy(cpu_stat_off, cpu_online_mask);
+
++ vmstat_wq = alloc_workqueue("vmstat", WQ_FREEZABLE|WQ_MEM_RECLAIM, 0);
+ schedule_delayed_work(&shepherd,
+ round_jiffies_relative(sysctl_stat_interval));
+ }
+@@ -1550,7 +1551,6 @@ static int __init setup_vmstat(void)
+
+ start_shepherd_timer();
+ cpu_notifier_register_done();
+- vmstat_wq = alloc_workqueue("vmstat", WQ_FREEZABLE|WQ_MEM_RECLAIM, 0);
+ #endif
+ #ifdef CONFIG_PROC_FS
+ proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations);
+--
+2.6.4
+
+From: Dave Chinner <dchinner@redhat.com>
+
+When we do inode readahead in log recovery, we do can do the
+readahead before we've replayed the icreate transaction that stamps
+the buffer with inode cores. The inode readahead verifier catches
+this and marks the buffer as !done to indicate that it doesn't yet
+contain valid inodes.
+
+In adding buffer error notification (i.e. setting b_error = -EIO at
+the same time as as we clear the done flag) to such a readahead
+verifier failure, we can then get subsequent inode recovery failing
+with this error:
+
+XFS (dm-0): metadata I/O error: block 0xa00060 ("xlog_recover_do..(read#2)") error 5 numblks 32
+
+This occurs when readahead completion races with icreate item replay
+such as:
+
+ inode readahead
+ find buffer
+ lock buffer
+ submit RA io
+ ....
+ icreate recovery
+ xfs_trans_get_buffer
+ find buffer
+ lock buffer
+ <blocks on RA completion>
+ .....
+ <ra completion>
+ fails verifier
+ clear XBF_DONE
+ set bp->b_error = -EIO
+ release and unlock buffer
+ <icreate gains lock>
+ icreate initialises buffer
+ marks buffer as done
+ adds buffer to delayed write queue
+ releases buffer
+
+At this point, we have an initialised inode buffer that is up to
+date but has an -EIO state registered against it. When we finally
+get to recovering an inode in that buffer:
+
+ inode item recovery
+ xfs_trans_read_buffer
+ find buffer
+ lock buffer
+ sees XBF_DONE is set, returns buffer
+ sees bp->b_error is set
+ fail log recovery!
+
+Essentially, we need xfs_trans_get_buf_map() to clear the error status of
+the buffer when doing a lookup. This function returns uninitialised
+buffers, so the buffer returned can not be in an error state and
+none of the code that uses this function expects b_error to be set
+on return. Indeed, there is an ASSERT(!bp->b_error); in the
+transaction case in xfs_trans_get_buf_map() that would have caught
+this if log recovery used transactions....
+
+This patch firstly changes the inode readahead failure to set -EIO
+on the buffer, and secondly changes xfs_buf_get_map() to never
+return a buffer with an error state set so this first change doesn't
+cause unexpected log recovery failures.
+
+Signed-off-by: Dave Chinner <dchinner@redhat.com>
+---
+ fs/xfs/libxfs/xfs_inode_buf.c | 12 +++++++-----
+ fs/xfs/xfs_buf.c | 7 +++++++
+ 2 files changed, 14 insertions(+), 5 deletions(-)
+
+diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
+index 1b8d98a..ff17c48 100644
+--- a/fs/xfs/libxfs/xfs_inode_buf.c
++++ b/fs/xfs/libxfs/xfs_inode_buf.c
+@@ -62,11 +62,12 @@ xfs_inobp_check(
+ * has not had the inode cores stamped into it. Hence for readahead, the buffer
+ * may be potentially invalid.
+ *
+- * If the readahead buffer is invalid, we don't want to mark it with an error,
+- * but we do want to clear the DONE status of the buffer so that a followup read
+- * will re-read it from disk. This will ensure that we don't get an unnecessary
+- * warnings during log recovery and we don't get unnecssary panics on debug
+- * kernels.
++ * If the readahead buffer is invalid, we need to mark it with an error and
++ * clear the DONE status of the buffer so that a followup read will re-read it
++ * from disk. We don't report the error otherwise to avoid warnings during log
++ * recovery and we don't get unnecssary panics on debug kernels. We use EIO here
++ * because all we want to do is say readahead failed; there is no-one to report
++ * the error to, so this will distinguish it from a non-ra verifier failure.
+ */
+ static void
+ xfs_inode_buf_verify(
+@@ -93,6 +94,7 @@ xfs_inode_buf_verify(
+ XFS_RANDOM_ITOBP_INOTOBP))) {
+ if (readahead) {
+ bp->b_flags &= ~XBF_DONE;
++ xfs_buf_ioerror(bp, -EIO);
+ return;
+ }
+
+diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
+index 45a8ea7..ae86b16 100644
+--- a/fs/xfs/xfs_buf.c
++++ b/fs/xfs/xfs_buf.c
+@@ -604,6 +604,13 @@ found:
+ }
+ }
+
++ /*
++ * Clear b_error if this is a lookup from a caller that doesn't expect
++ * valid data to be found in the buffer.
++ */
++ if (!(flags & XBF_READ))
++ xfs_buf_ioerror(bp, 0);
++
+ XFS_STATS_INC(xb_get);
+ trace_xfs_buf_get(bp, flags, _RET_IP_);
+ return bp;
+--
+2.5.0
+
+_______________________________________________
+xfs mailing list
+xfs@oss.sgi.com
+http://oss.sgi.com/mailman/listinfo/xfs
From: Dave Chinner <dchinner@redhat.com>
When we do dquot readahead in log recovery, we do not use a verifier
@@ -824,21 +1026,15 @@ together so we don't forget this behavioural link in future.
cc: <stable@vger.kernel.org> # 3.12 - current
Signed-off-by: Dave Chinner <dchinner@redhat.com>
+Reviewed-by: Brian Foster <bfoster@redhat.com>
+Signed-off-by: Dave Chinner <david@fromorbit.com>
---
-
-Version 2
-- fix logic error in determining if verify failed
-- set error on buffer when verifier fails
-- fix inode buffer readahead verifier to set error when it fails
-- better comments, link dquot and inode buffer ra verifiers in the
- comments
-
fs/xfs/libxfs/xfs_dquot_buf.c | 36 ++++++++++++++++++++++++++++++------
- fs/xfs/libxfs/xfs_inode_buf.c | 14 +++++++++-----
+ fs/xfs/libxfs/xfs_inode_buf.c | 2 ++
fs/xfs/libxfs/xfs_quota_defs.h | 2 +-
fs/xfs/libxfs/xfs_shared.h | 1 +
fs/xfs/xfs_log_recover.c | 9 +++++++--
- 5 files changed, 48 insertions(+), 14 deletions(-)
+ 5 files changed, 41 insertions(+), 9 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c
index 11cefb2..3cc3cf7 100644
@@ -927,37 +1123,18 @@ index 11cefb2..3cc3cf7 100644
+ .verify_write = xfs_dquot_buf_write_verify,
+};
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
-index 1b8d98a..4816209 100644
+index ff17c48..1aabfda 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
-@@ -62,11 +62,14 @@ xfs_inobp_check(
- * has not had the inode cores stamped into it. Hence for readahead, the buffer
- * may be potentially invalid.
- *
-- * If the readahead buffer is invalid, we don't want to mark it with an error,
-- * but we do want to clear the DONE status of the buffer so that a followup read
-- * will re-read it from disk. This will ensure that we don't get an unnecessary
-- * warnings during log recovery and we don't get unnecssary panics on debug
-- * kernels.
-+ * If the readahead buffer is invalid, we need to mark it with an error and
-+ * clear the DONE status of the buffer so that a followup read will re-read it
-+ * from disk. We don't report the error otherwise to avoid warnings during log
-+ * recovery and we don't get unnecssary panics on debug kernels. We use EIO here
-+ * because all we want to do is say readahead failed; there is no-one to report
-+ * the error to, so this will distinguish it from a non-ra verifier failure.
+@@ -68,6 +68,8 @@ xfs_inobp_check(
+ * recovery and we don't get unnecssary panics on debug kernels. We use EIO here
+ * because all we want to do is say readahead failed; there is no-one to report
+ * the error to, so this will distinguish it from a non-ra verifier failure.
+ * Changes to this readahead error behavour also need to be reflected in
+ * xfs_dquot_buf_readahead_verify().
*/
static void
xfs_inode_buf_verify(
-@@ -92,6 +95,7 @@ xfs_inode_buf_verify(
- XFS_ERRTAG_ITOBP_INOTOBP,
- XFS_RANDOM_ITOBP_INOTOBP))) {
- if (readahead) {
-+ xfs_buf_ioerror(bp, -EIO);
- bp->b_flags &= ~XBF_DONE;
- return;
- }
diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h
index 1b0a083..f51078f 100644
--- a/fs/xfs/libxfs/xfs_quota_defs.h
@@ -984,10 +1161,10 @@ index 5be5297..15c3ceb 100644
extern const struct xfs_buf_ops xfs_sb_quiet_buf_ops;
extern const struct xfs_buf_ops xfs_symlink_buf_ops;
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
-index 26e67b4..da37beb 100644
+index c5ecaac..5991cdc 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
-@@ -3521,6 +3521,7 @@ xlog_recover_dquot_ra_pass2(
+@@ -3204,6 +3204,7 @@ xlog_recover_dquot_ra_pass2(
struct xfs_disk_dquot *recddq;
struct xfs_dq_logformat *dq_f;
uint type;
@@ -995,7 +1172,7 @@ index 26e67b4..da37beb 100644
if (mp->m_qflags == 0)
-@@ -3541,8 +3542,12 @@ xlog_recover_dquot_ra_pass2(
+@@ -3224,8 +3225,12 @@ xlog_recover_dquot_ra_pass2(
ASSERT(dq_f);
ASSERT(dq_f->qlf_len == 1);
@@ -1010,84 +1187,10 @@ index 26e67b4..da37beb 100644
}
STATIC void
+--
+2.5.0
_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs
-From: Michal Hocko <mhocko@suse.com>
-
-kernel test robot has reported the following crash:
-[ 3.870718] BUG: unable to handle kernel NULL pointer dereferenceNULL pointer dereference at 00000100
- at 00000100
-[ 3.872615] IP: [<c1074df6>] __queue_work+0x26/0x390 [<c1074df6>] __queue_work+0x26/0x390
-[ 3.873758] *pdpt = 0000000000000000 *pde = f000ff53f000ff53 *pde = f000ff53f000ff53
-[ 3.875096] Oops: 0000 [#1] PREEMPT PREEMPT SMP SMP
-[ 3.876130] CPU: 0 PID: 24 Comm: kworker/0:1 Not tainted 4.4.0-rc4-00139-g373ccbe #1
-[ 3.878135] Workqueue: events vmstat_shepherd
-[ 3.879207] task: cb684600 ti: cb7ba000 task.ti: cb7ba000
-[ 3.880445] EIP: 0060:[<c1074df6>] EFLAGS: 00010046 CPU: 0
-[ 3.881704] EIP is at __queue_work+0x26/0x390
-[ 3.882823] EAX: 00000046 EBX: cbb37800 ECX: cbb37800 EDX: 00000000
-[ 3.884457] ESI: 00000000 EDI: 00000000 EBP: cb7bbe68 ESP: cb7bbe38
-[ 3.886005] DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068
-[ 3.887229] CR0: 8005003b CR2: 00000100 CR3: 01fd5000 CR4: 000006b0
-[ 3.888663] Stack:
-[ 3.895204] Call Trace:
-[ 3.895854] [<c1a381dd>] ? mutex_unlock+0xd/0x10
-[ 3.897120] [<c1075221>] __queue_delayed_work+0xa1/0x160
-[ 3.898530] [<c10764c6>] queue_delayed_work_on+0x36/0x60
-[ 3.899790] [<c11494bd>] vmstat_shepherd+0xad/0xf0
-[ 3.900899] [<c1075a7a>] process_one_work+0x1aa/0x4c0
-[ 3.902093] [<c10759e2>] ? process_one_work+0x112/0x4c0
-[ 3.903520] [<c10ac31e>] ? do_raw_spin_lock+0xe/0x150
-[ 3.904853] [<c1075dd1>] worker_thread+0x41/0x440
-[ 3.906023] [<c1075d90>] ? process_one_work+0x4c0/0x4c0
-[ 3.907242] [<c107b7c0>] kthread+0xb0/0xd0
-[ 3.908188] [<c1a3c651>] ret_from_kernel_thread+0x21/0x40
-[ 3.909601] [<c107b710>] ? __kthread_parkme+0x80/0x80
-
-The reason is that start_shepherd_timer schedules the shepherd work item
-which uses vmstat_wq (vmstat_shepherd) before setup_vmstat allocates
-that workqueue so if the further initialization takes more than HZ
-we might end up scheduling on a NULL vmstat_wq. This is really unlikely
-but not impossible.
-
-Fixes: 373ccbe59270 ("mm, vmstat: allow WQ concurrency to discover memory reclaim doesn't make any progress")
-Reported-by: kernel test robot <ying.huang@linux.intel.com>
-Signed-off-by: Michal Hocko <mhocko@suse.com>
----
-Hi Linus,
-I am not marking this for stable because I hope we can sneak it into 4.4.
-The patch is trivial and obvious. I am sorry about the breakage. If you prefer
-to postpone it to 4.5-rc1 because this is not really that critical and shouldn't
-happen most of the time then I will repost with stable tag added.
-
-Thanks!
-
- mm/vmstat.c | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/mm/vmstat.c b/mm/vmstat.c
-index 4ebc17d948cb..c54fd2924f25 100644
---- a/mm/vmstat.c
-+++ b/mm/vmstat.c
-@@ -1483,6 +1483,7 @@ static void __init start_shepherd_timer(void)
- BUG();
- cpumask_copy(cpu_stat_off, cpu_online_mask);
-
-+ vmstat_wq = alloc_workqueue("vmstat", WQ_FREEZABLE|WQ_MEM_RECLAIM, 0);
- schedule_delayed_work(&shepherd,
- round_jiffies_relative(sysctl_stat_interval));
- }
-@@ -1550,7 +1551,6 @@ static int __init setup_vmstat(void)
-
- start_shepherd_timer();
- cpu_notifier_register_done();
-- vmstat_wq = alloc_workqueue("vmstat", WQ_FREEZABLE|WQ_MEM_RECLAIM, 0);
- #endif
- #ifdef CONFIG_PROC_FS
- proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations);
---
-2.6.4
-