From: Arkadiusz Miƛkiewicz Date: Mon, 11 Jan 2016 07:00:34 +0000 (+0100) Subject: - v3 version of the patch (split into two chunks) X-Git-Tag: auto/th/kernel-4.3.3-2~1 X-Git-Url: https://git.pld-linux.org/?a=commitdiff_plain;h=28b2046740f32f01e3b8c623715bb9418de6cc9f;p=packages%2Fkernel.git - v3 version of the patch (split into two chunks) --- diff --git a/kernel-small_fixes.patch b/kernel-small_fixes.patch index e4cb3a85..70e90479 100644 --- a/kernel-small_fixes.patch +++ b/kernel-small_fixes.patch @@ -784,6 +784,208 @@ index 41d70bc..84e597e 100644 tbio->bi_rw = WRITE; tbio->bi_private = r10_bio; tbio->bi_iter.bi_sector = r10_bio->devs[i].addr; +From: Michal Hocko + +kernel test robot has reported the following crash: +[ 3.870718] BUG: unable to handle kernel NULL pointer dereferenceNULL pointer dereference at 00000100 + at 00000100 +[ 3.872615] IP: [] __queue_work+0x26/0x390 [] __queue_work+0x26/0x390 +[ 3.873758] *pdpt = 0000000000000000 *pde = f000ff53f000ff53 *pde = f000ff53f000ff53 +[ 3.875096] Oops: 0000 [#1] PREEMPT PREEMPT SMP SMP +[ 3.876130] CPU: 0 PID: 24 Comm: kworker/0:1 Not tainted 4.4.0-rc4-00139-g373ccbe #1 +[ 3.878135] Workqueue: events vmstat_shepherd +[ 3.879207] task: cb684600 ti: cb7ba000 task.ti: cb7ba000 +[ 3.880445] EIP: 0060:[] EFLAGS: 00010046 CPU: 0 +[ 3.881704] EIP is at __queue_work+0x26/0x390 +[ 3.882823] EAX: 00000046 EBX: cbb37800 ECX: cbb37800 EDX: 00000000 +[ 3.884457] ESI: 00000000 EDI: 00000000 EBP: cb7bbe68 ESP: cb7bbe38 +[ 3.886005] DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068 +[ 3.887229] CR0: 8005003b CR2: 00000100 CR3: 01fd5000 CR4: 000006b0 +[ 3.888663] Stack: +[ 3.895204] Call Trace: +[ 3.895854] [] ? mutex_unlock+0xd/0x10 +[ 3.897120] [] __queue_delayed_work+0xa1/0x160 +[ 3.898530] [] queue_delayed_work_on+0x36/0x60 +[ 3.899790] [] vmstat_shepherd+0xad/0xf0 +[ 3.900899] [] process_one_work+0x1aa/0x4c0 +[ 3.902093] [] ? process_one_work+0x112/0x4c0 +[ 3.903520] [] ? do_raw_spin_lock+0xe/0x150 +[ 3.904853] [] worker_thread+0x41/0x440 +[ 3.906023] [] ? process_one_work+0x4c0/0x4c0 +[ 3.907242] [] kthread+0xb0/0xd0 +[ 3.908188] [] ret_from_kernel_thread+0x21/0x40 +[ 3.909601] [] ? __kthread_parkme+0x80/0x80 + +The reason is that start_shepherd_timer schedules the shepherd work item +which uses vmstat_wq (vmstat_shepherd) before setup_vmstat allocates +that workqueue so if the further initialization takes more than HZ +we might end up scheduling on a NULL vmstat_wq. This is really unlikely +but not impossible. + +Fixes: 373ccbe59270 ("mm, vmstat: allow WQ concurrency to discover memory reclaim doesn't make any progress") +Reported-by: kernel test robot +Signed-off-by: Michal Hocko +--- +Hi Linus, +I am not marking this for stable because I hope we can sneak it into 4.4. +The patch is trivial and obvious. I am sorry about the breakage. If you prefer +to postpone it to 4.5-rc1 because this is not really that critical and shouldn't +happen most of the time then I will repost with stable tag added. + +Thanks! + + mm/vmstat.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/mm/vmstat.c b/mm/vmstat.c +index 4ebc17d948cb..c54fd2924f25 100644 +--- a/mm/vmstat.c ++++ b/mm/vmstat.c +@@ -1483,6 +1483,7 @@ static void __init start_shepherd_timer(void) + BUG(); + cpumask_copy(cpu_stat_off, cpu_online_mask); + ++ vmstat_wq = alloc_workqueue("vmstat", WQ_FREEZABLE|WQ_MEM_RECLAIM, 0); + schedule_delayed_work(&shepherd, + round_jiffies_relative(sysctl_stat_interval)); + } +@@ -1550,7 +1551,6 @@ static int __init setup_vmstat(void) + + start_shepherd_timer(); + cpu_notifier_register_done(); +- vmstat_wq = alloc_workqueue("vmstat", WQ_FREEZABLE|WQ_MEM_RECLAIM, 0); + #endif + #ifdef CONFIG_PROC_FS + proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations); +-- +2.6.4 + +From: Dave Chinner + +When we do inode readahead in log recovery, we do can do the +readahead before we've replayed the icreate transaction that stamps +the buffer with inode cores. The inode readahead verifier catches +this and marks the buffer as !done to indicate that it doesn't yet +contain valid inodes. + +In adding buffer error notification (i.e. setting b_error = -EIO at +the same time as as we clear the done flag) to such a readahead +verifier failure, we can then get subsequent inode recovery failing +with this error: + +XFS (dm-0): metadata I/O error: block 0xa00060 ("xlog_recover_do..(read#2)") error 5 numblks 32 + +This occurs when readahead completion races with icreate item replay +such as: + + inode readahead + find buffer + lock buffer + submit RA io + .... + icreate recovery + xfs_trans_get_buffer + find buffer + lock buffer + + ..... + + fails verifier + clear XBF_DONE + set bp->b_error = -EIO + release and unlock buffer + + icreate initialises buffer + marks buffer as done + adds buffer to delayed write queue + releases buffer + +At this point, we have an initialised inode buffer that is up to +date but has an -EIO state registered against it. When we finally +get to recovering an inode in that buffer: + + inode item recovery + xfs_trans_read_buffer + find buffer + lock buffer + sees XBF_DONE is set, returns buffer + sees bp->b_error is set + fail log recovery! + +Essentially, we need xfs_trans_get_buf_map() to clear the error status of +the buffer when doing a lookup. This function returns uninitialised +buffers, so the buffer returned can not be in an error state and +none of the code that uses this function expects b_error to be set +on return. Indeed, there is an ASSERT(!bp->b_error); in the +transaction case in xfs_trans_get_buf_map() that would have caught +this if log recovery used transactions.... + +This patch firstly changes the inode readahead failure to set -EIO +on the buffer, and secondly changes xfs_buf_get_map() to never +return a buffer with an error state set so this first change doesn't +cause unexpected log recovery failures. + +Signed-off-by: Dave Chinner +--- + fs/xfs/libxfs/xfs_inode_buf.c | 12 +++++++----- + fs/xfs/xfs_buf.c | 7 +++++++ + 2 files changed, 14 insertions(+), 5 deletions(-) + +diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c +index 1b8d98a..ff17c48 100644 +--- a/fs/xfs/libxfs/xfs_inode_buf.c ++++ b/fs/xfs/libxfs/xfs_inode_buf.c +@@ -62,11 +62,12 @@ xfs_inobp_check( + * has not had the inode cores stamped into it. Hence for readahead, the buffer + * may be potentially invalid. + * +- * If the readahead buffer is invalid, we don't want to mark it with an error, +- * but we do want to clear the DONE status of the buffer so that a followup read +- * will re-read it from disk. This will ensure that we don't get an unnecessary +- * warnings during log recovery and we don't get unnecssary panics on debug +- * kernels. ++ * If the readahead buffer is invalid, we need to mark it with an error and ++ * clear the DONE status of the buffer so that a followup read will re-read it ++ * from disk. We don't report the error otherwise to avoid warnings during log ++ * recovery and we don't get unnecssary panics on debug kernels. We use EIO here ++ * because all we want to do is say readahead failed; there is no-one to report ++ * the error to, so this will distinguish it from a non-ra verifier failure. + */ + static void + xfs_inode_buf_verify( +@@ -93,6 +94,7 @@ xfs_inode_buf_verify( + XFS_RANDOM_ITOBP_INOTOBP))) { + if (readahead) { + bp->b_flags &= ~XBF_DONE; ++ xfs_buf_ioerror(bp, -EIO); + return; + } + +diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c +index 45a8ea7..ae86b16 100644 +--- a/fs/xfs/xfs_buf.c ++++ b/fs/xfs/xfs_buf.c +@@ -604,6 +604,13 @@ found: + } + } + ++ /* ++ * Clear b_error if this is a lookup from a caller that doesn't expect ++ * valid data to be found in the buffer. ++ */ ++ if (!(flags & XBF_READ)) ++ xfs_buf_ioerror(bp, 0); ++ + XFS_STATS_INC(xb_get); + trace_xfs_buf_get(bp, flags, _RET_IP_); + return bp; +-- +2.5.0 + +_______________________________________________ +xfs mailing list +xfs@oss.sgi.com +http://oss.sgi.com/mailman/listinfo/xfs From: Dave Chinner When we do dquot readahead in log recovery, we do not use a verifier @@ -824,21 +1026,15 @@ together so we don't forget this behavioural link in future. cc: # 3.12 - current Signed-off-by: Dave Chinner +Reviewed-by: Brian Foster +Signed-off-by: Dave Chinner --- - -Version 2 -- fix logic error in determining if verify failed -- set error on buffer when verifier fails -- fix inode buffer readahead verifier to set error when it fails -- better comments, link dquot and inode buffer ra verifiers in the - comments - fs/xfs/libxfs/xfs_dquot_buf.c | 36 ++++++++++++++++++++++++++++++------ - fs/xfs/libxfs/xfs_inode_buf.c | 14 +++++++++----- + fs/xfs/libxfs/xfs_inode_buf.c | 2 ++ fs/xfs/libxfs/xfs_quota_defs.h | 2 +- fs/xfs/libxfs/xfs_shared.h | 1 + fs/xfs/xfs_log_recover.c | 9 +++++++-- - 5 files changed, 48 insertions(+), 14 deletions(-) + 5 files changed, 41 insertions(+), 9 deletions(-) diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c index 11cefb2..3cc3cf7 100644 @@ -927,37 +1123,18 @@ index 11cefb2..3cc3cf7 100644 + .verify_write = xfs_dquot_buf_write_verify, +}; diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c -index 1b8d98a..4816209 100644 +index ff17c48..1aabfda 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c -@@ -62,11 +62,14 @@ xfs_inobp_check( - * has not had the inode cores stamped into it. Hence for readahead, the buffer - * may be potentially invalid. - * -- * If the readahead buffer is invalid, we don't want to mark it with an error, -- * but we do want to clear the DONE status of the buffer so that a followup read -- * will re-read it from disk. This will ensure that we don't get an unnecessary -- * warnings during log recovery and we don't get unnecssary panics on debug -- * kernels. -+ * If the readahead buffer is invalid, we need to mark it with an error and -+ * clear the DONE status of the buffer so that a followup read will re-read it -+ * from disk. We don't report the error otherwise to avoid warnings during log -+ * recovery and we don't get unnecssary panics on debug kernels. We use EIO here -+ * because all we want to do is say readahead failed; there is no-one to report -+ * the error to, so this will distinguish it from a non-ra verifier failure. +@@ -68,6 +68,8 @@ xfs_inobp_check( + * recovery and we don't get unnecssary panics on debug kernels. We use EIO here + * because all we want to do is say readahead failed; there is no-one to report + * the error to, so this will distinguish it from a non-ra verifier failure. + * Changes to this readahead error behavour also need to be reflected in + * xfs_dquot_buf_readahead_verify(). */ static void xfs_inode_buf_verify( -@@ -92,6 +95,7 @@ xfs_inode_buf_verify( - XFS_ERRTAG_ITOBP_INOTOBP, - XFS_RANDOM_ITOBP_INOTOBP))) { - if (readahead) { -+ xfs_buf_ioerror(bp, -EIO); - bp->b_flags &= ~XBF_DONE; - return; - } diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h index 1b0a083..f51078f 100644 --- a/fs/xfs/libxfs/xfs_quota_defs.h @@ -984,10 +1161,10 @@ index 5be5297..15c3ceb 100644 extern const struct xfs_buf_ops xfs_sb_quiet_buf_ops; extern const struct xfs_buf_ops xfs_symlink_buf_ops; diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c -index 26e67b4..da37beb 100644 +index c5ecaac..5991cdc 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c -@@ -3521,6 +3521,7 @@ xlog_recover_dquot_ra_pass2( +@@ -3204,6 +3204,7 @@ xlog_recover_dquot_ra_pass2( struct xfs_disk_dquot *recddq; struct xfs_dq_logformat *dq_f; uint type; @@ -995,7 +1172,7 @@ index 26e67b4..da37beb 100644 if (mp->m_qflags == 0) -@@ -3541,8 +3542,12 @@ xlog_recover_dquot_ra_pass2( +@@ -3224,8 +3225,12 @@ xlog_recover_dquot_ra_pass2( ASSERT(dq_f); ASSERT(dq_f->qlf_len == 1); @@ -1010,84 +1187,10 @@ index 26e67b4..da37beb 100644 } STATIC void +-- +2.5.0 _______________________________________________ xfs mailing list xfs@oss.sgi.com http://oss.sgi.com/mailman/listinfo/xfs -From: Michal Hocko - -kernel test robot has reported the following crash: -[ 3.870718] BUG: unable to handle kernel NULL pointer dereferenceNULL pointer dereference at 00000100 - at 00000100 -[ 3.872615] IP: [] __queue_work+0x26/0x390 [] __queue_work+0x26/0x390 -[ 3.873758] *pdpt = 0000000000000000 *pde = f000ff53f000ff53 *pde = f000ff53f000ff53 -[ 3.875096] Oops: 0000 [#1] PREEMPT PREEMPT SMP SMP -[ 3.876130] CPU: 0 PID: 24 Comm: kworker/0:1 Not tainted 4.4.0-rc4-00139-g373ccbe #1 -[ 3.878135] Workqueue: events vmstat_shepherd -[ 3.879207] task: cb684600 ti: cb7ba000 task.ti: cb7ba000 -[ 3.880445] EIP: 0060:[] EFLAGS: 00010046 CPU: 0 -[ 3.881704] EIP is at __queue_work+0x26/0x390 -[ 3.882823] EAX: 00000046 EBX: cbb37800 ECX: cbb37800 EDX: 00000000 -[ 3.884457] ESI: 00000000 EDI: 00000000 EBP: cb7bbe68 ESP: cb7bbe38 -[ 3.886005] DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068 -[ 3.887229] CR0: 8005003b CR2: 00000100 CR3: 01fd5000 CR4: 000006b0 -[ 3.888663] Stack: -[ 3.895204] Call Trace: -[ 3.895854] [] ? mutex_unlock+0xd/0x10 -[ 3.897120] [] __queue_delayed_work+0xa1/0x160 -[ 3.898530] [] queue_delayed_work_on+0x36/0x60 -[ 3.899790] [] vmstat_shepherd+0xad/0xf0 -[ 3.900899] [] process_one_work+0x1aa/0x4c0 -[ 3.902093] [] ? process_one_work+0x112/0x4c0 -[ 3.903520] [] ? do_raw_spin_lock+0xe/0x150 -[ 3.904853] [] worker_thread+0x41/0x440 -[ 3.906023] [] ? process_one_work+0x4c0/0x4c0 -[ 3.907242] [] kthread+0xb0/0xd0 -[ 3.908188] [] ret_from_kernel_thread+0x21/0x40 -[ 3.909601] [] ? __kthread_parkme+0x80/0x80 - -The reason is that start_shepherd_timer schedules the shepherd work item -which uses vmstat_wq (vmstat_shepherd) before setup_vmstat allocates -that workqueue so if the further initialization takes more than HZ -we might end up scheduling on a NULL vmstat_wq. This is really unlikely -but not impossible. - -Fixes: 373ccbe59270 ("mm, vmstat: allow WQ concurrency to discover memory reclaim doesn't make any progress") -Reported-by: kernel test robot -Signed-off-by: Michal Hocko ---- -Hi Linus, -I am not marking this for stable because I hope we can sneak it into 4.4. -The patch is trivial and obvious. I am sorry about the breakage. If you prefer -to postpone it to 4.5-rc1 because this is not really that critical and shouldn't -happen most of the time then I will repost with stable tag added. - -Thanks! - - mm/vmstat.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/mm/vmstat.c b/mm/vmstat.c -index 4ebc17d948cb..c54fd2924f25 100644 ---- a/mm/vmstat.c -+++ b/mm/vmstat.c -@@ -1483,6 +1483,7 @@ static void __init start_shepherd_timer(void) - BUG(); - cpumask_copy(cpu_stat_off, cpu_online_mask); - -+ vmstat_wq = alloc_workqueue("vmstat", WQ_FREEZABLE|WQ_MEM_RECLAIM, 0); - schedule_delayed_work(&shepherd, - round_jiffies_relative(sysctl_stat_interval)); - } -@@ -1550,7 +1551,6 @@ static int __init setup_vmstat(void) - - start_shepherd_timer(); - cpu_notifier_register_done(); -- vmstat_wq = alloc_workqueue("vmstat", WQ_FREEZABLE|WQ_MEM_RECLAIM, 0); - #endif - #ifdef CONFIG_PROC_FS - proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations); --- -2.6.4 -