From: Jan Rękorajski
Date: Wed, 25 Jul 2018 11:49:07 +0000 (+0200)
Subject: - fix tcp out-of-order packets issue
X-Git-Tag: auto/th/kernel-4.17.9-3
X-Git-Url: http://git.pld-linux.org/gitweb.cgi?p=packages%2Fkernel.git;a=commitdiff_plain;h=b876ae37eed247f5842db180eb857bf35d5820a9

- fix tcp out-of-order packets issue
- rel 3
---

diff --git a/kernel.spec b/kernel.spec
index 2869c952..a42b35d5 100644
--- a/kernel.spec
+++ b/kernel.spec
@@ -67,7 +67,7 @@
 %define	have_pcmcia	0
 %endif
 
-%define		rel		2
+%define		rel		3
 %define		basever		4.17
 %define		postver		.9
 
@@ -215,6 +215,8 @@ Patch2000:	kernel-small_fixes.patch
 Patch2001:	kernel-pwc-uncompress.patch
 Patch2003:	kernel-regressions.patch
 
+Patch2004:	tcp-ooo.patch
+
 # for rescuecd
 # based on ftp://ftp.leg.uct.ac.za/pub/linux/rip/tmpfs_root-2.6.30.diff.gz
 Patch7000:	kernel-inittmpfs.patch
@@ -685,6 +687,7 @@ rm -f localversion-rt
 %patch2000 -p1
 %patch2001 -p1
 #%patch2003 -p1
+%patch2004 -p1
 
 # Do not remove this, please!
 #%%patch50000 -p1
diff --git a/tcp-ooo.patch b/tcp-ooo.patch
new file mode 100644
index 00000000..002ccec3
--- /dev/null
+++ b/tcp-ooo.patch
@@ -0,0 +1,285 @@
+commit 58152ecbbcc6a0ce7fddd5bf5f6ee535834ece0c
+Author: Eric Dumazet
+Date:   Mon Jul 23 09:28:21 2018 -0700
+
+    tcp: add tcp_ooo_try_coalesce() helper
+
+    In case skb in out_of_order_queue is the result of
+    multiple skbs coalescing, we would like to get a proper gso_segs
+    counter tracking, so that future tcp_drop() can report an accurate
+    number.
+
+    I chose to not implement this tracking for skbs in receive queue,
+    since they are not dropped, unless socket is disconnected.
+
+    Signed-off-by: Eric Dumazet
+    Acked-by: Soheil Hassas Yeganeh
+    Acked-by: Yuchung Cheng
+    Signed-off-by: David S. Miller
+
+diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
+index b062a7692238..3bcd30a2ba06 100644
+--- a/net/ipv4/tcp_input.c
++++ b/net/ipv4/tcp_input.c
+@@ -4358,6 +4358,23 @@ static bool tcp_try_coalesce(struct sock *sk,
+ 	return true;
+ }
+ 
++static bool tcp_ooo_try_coalesce(struct sock *sk,
++				 struct sk_buff *to,
++				 struct sk_buff *from,
++				 bool *fragstolen)
++{
++	bool res = tcp_try_coalesce(sk, to, from, fragstolen);
++
++	/* In case tcp_drop() is called later, update to->gso_segs */
++	if (res) {
++		u32 gso_segs = max_t(u16, 1, skb_shinfo(to)->gso_segs) +
++			       max_t(u16, 1, skb_shinfo(from)->gso_segs);
++
++		skb_shinfo(to)->gso_segs = min_t(u32, gso_segs, 0xFFFF);
++	}
++	return res;
++}
++
+ static void tcp_drop(struct sock *sk, struct sk_buff *skb)
+ {
+ 	sk_drops_add(sk, skb);
+@@ -4481,8 +4498,8 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
+ 	/* In the typical case, we are adding an skb to the end of the list.
+ 	 * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup.
+ 	 */
+-	if (tcp_try_coalesce(sk, tp->ooo_last_skb,
+-			     skb, &fragstolen)) {
++	if (tcp_ooo_try_coalesce(sk, tp->ooo_last_skb,
++				 skb, &fragstolen)) {
+ coalesce_done:
+ 		tcp_grow_window(sk, skb);
+ 		kfree_skb_partial(skb, fragstolen);
+@@ -4532,8 +4549,8 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
+ 			tcp_drop(sk, skb1);
+ 			goto merge_right;
+ 		}
+-	} else if (tcp_try_coalesce(sk, skb1,
+-				    skb, &fragstolen)) {
++	} else if (tcp_ooo_try_coalesce(sk, skb1,
++					skb, &fragstolen)) {
+ 		goto coalesce_done;
+ 	}
+ 	p = &parent->rb_right;
+commit 8541b21e781a22dce52a74fef0b9bed00404a1cd
+Author: Eric Dumazet
+Date:   Mon Jul 23 09:28:20 2018 -0700
+
+    tcp: call tcp_drop() from tcp_data_queue_ofo()
+
+    In order to be able to give better diagnostics and detect
+    malicious traffic, we need to have better sk->sk_drops tracking.
+
+    Fixes: 9f5afeae5152 ("tcp: use an RB tree for ooo receive queue")
+    Signed-off-by: Eric Dumazet
+    Acked-by: Soheil Hassas Yeganeh
+    Acked-by: Yuchung Cheng
+    Signed-off-by: David S. Miller
+
+diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
+index 78068b902e7b..b062a7692238 100644
+--- a/net/ipv4/tcp_input.c
++++ b/net/ipv4/tcp_input.c
+@@ -4510,7 +4510,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
+ 			/* All the bits are present. Drop. */
+ 			NET_INC_STATS(sock_net(sk),
+ 				      LINUX_MIB_TCPOFOMERGE);
+-			__kfree_skb(skb);
++			tcp_drop(sk, skb);
+ 			skb = NULL;
+ 			tcp_dsack_set(sk, seq, end_seq);
+ 			goto add_sack;
+@@ -4529,7 +4529,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
+ 				TCP_SKB_CB(skb1)->end_seq);
+ 			NET_INC_STATS(sock_net(sk),
+ 				      LINUX_MIB_TCPOFOMERGE);
+-			__kfree_skb(skb1);
++			tcp_drop(sk, skb1);
+ 			goto merge_right;
+ 		}
+ 	} else if (tcp_try_coalesce(sk, skb1,
+commit 3d4bf93ac12003f9b8e1e2de37fe27983deebdcf
+Author: Eric Dumazet
+Date:   Mon Jul 23 09:28:19 2018 -0700
+
+    tcp: detect malicious patterns in tcp_collapse_ofo_queue()
+
+    In case an attacker feeds tiny packets completely out of order,
+    tcp_collapse_ofo_queue() might scan the whole rb-tree, performing
+    expensive copies, but not changing socket memory usage at all.
+
+    1) Do not attempt to collapse tiny skbs.
+    2) Add logic to exit early when too many tiny skbs are detected.
+
+    We prefer not doing aggressive collapsing (which copies packets)
+    for pathological flows, and revert to tcp_prune_ofo_queue() which
+    will be less expensive.
+
+    In the future, we might add the possibility of terminating flows
+    that are proven to be malicious.
+
+    Signed-off-by: Eric Dumazet
+    Acked-by: Soheil Hassas Yeganeh
+    Signed-off-by: David S. Miller
Miller + +diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c +index 53289911362a..78068b902e7b 100644 +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -4902,6 +4902,7 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct rb_root *root, + static void tcp_collapse_ofo_queue(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); ++ u32 range_truesize, sum_tiny = 0; + struct sk_buff *skb, *head; + u32 start, end; + +@@ -4913,6 +4914,7 @@ static void tcp_collapse_ofo_queue(struct sock *sk) + } + start = TCP_SKB_CB(skb)->seq; + end = TCP_SKB_CB(skb)->end_seq; ++ range_truesize = skb->truesize; + + for (head = skb;;) { + skb = skb_rb_next(skb); +@@ -4923,11 +4925,20 @@ static void tcp_collapse_ofo_queue(struct sock *sk) + if (!skb || + after(TCP_SKB_CB(skb)->seq, end) || + before(TCP_SKB_CB(skb)->end_seq, start)) { +- tcp_collapse(sk, NULL, &tp->out_of_order_queue, +- head, skb, start, end); ++ /* Do not attempt collapsing tiny skbs */ ++ if (range_truesize != head->truesize || ++ end - start >= SKB_WITH_OVERHEAD(SK_MEM_QUANTUM)) { ++ tcp_collapse(sk, NULL, &tp->out_of_order_queue, ++ head, skb, start, end); ++ } else { ++ sum_tiny += range_truesize; ++ if (sum_tiny > sk->sk_rcvbuf >> 3) ++ return; ++ } + goto new_range; + } + ++ range_truesize += skb->truesize; + if (unlikely(before(TCP_SKB_CB(skb)->seq, start))) + start = TCP_SKB_CB(skb)->seq; + if (after(TCP_SKB_CB(skb)->end_seq, end)) +commit f4a3313d8e2ca9fd8d8f45e40a2903ba782607e7 +Author: Eric Dumazet +Date: Mon Jul 23 09:28:18 2018 -0700 + + tcp: avoid collapses in tcp_prune_queue() if possible + + Right after a TCP flow is created, receiving tiny out of order + packets allways hit the condition : + + if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) + tcp_clamp_window(sk); + + tcp_clamp_window() increases sk_rcvbuf to match sk_rmem_alloc + (guarded by tcp_rmem[2]) + + Calling tcp_collapse_ofo_queue() in this case is not useful, + and offers a O(N^2) surface attack to malicious peers. + + Better not attempt anything before full queue capacity is reached, + forcing attacker to spend lots of resource and allow us to more + easily detect the abuse. + + Signed-off-by: Eric Dumazet + Acked-by: Soheil Hassas Yeganeh + Acked-by: Yuchung Cheng + Signed-off-by: David S. Miller + +diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c +index 64e45b279431..53289911362a 100644 +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -5004,6 +5004,9 @@ static int tcp_prune_queue(struct sock *sk) + else if (tcp_under_memory_pressure(sk)) + tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss); + ++ if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) ++ return 0; ++ + tcp_collapse_ofo_queue(sk); + if (!skb_queue_empty(&sk->sk_receive_queue)) + tcp_collapse(sk, &sk->sk_receive_queue, NULL, +commit 72cd43ba64fc172a443410ce01645895850844c8 +Author: Eric Dumazet +Date: Mon Jul 23 09:28:17 2018 -0700 + + tcp: free batches of packets in tcp_prune_ofo_queue() + + Juha-Matti Tilli reported that malicious peers could inject tiny + packets in out_of_order_queue, forcing very expensive calls + to tcp_collapse_ofo_queue() and tcp_prune_ofo_queue() for + every incoming packet. out_of_order_queue rb-tree can contain + thousands of nodes, iterating over all of them is not nice. + + Before linux-4.9, we would have pruned all packets in ofo_queue + in one go, every XXXX packets. XXXX depends on sk_rcvbuf and skbs + truesize, but is about 7000 packets with tcp_rmem[2] default of 6 MB. 
+
+    Since we plan to increase tcp_rmem[2] in the future to cope with
+    modern BDP, we cannot revert to the old behavior without great pain.
+
+    The strategy taken in this patch is to purge ~12.5 % of the queue capacity.
+
+    Fixes: 36a6503fedda ("tcp: refine tcp_prune_ofo_queue() to not drop all packets")
+    Signed-off-by: Eric Dumazet
+    Reported-by: Juha-Matti Tilli
+    Acked-by: Yuchung Cheng
+    Acked-by: Soheil Hassas Yeganeh
+    Signed-off-by: David S. Miller
+
+diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
+index 6bade06aaf72..64e45b279431 100644
+--- a/net/ipv4/tcp_input.c
++++ b/net/ipv4/tcp_input.c
+@@ -4942,6 +4942,7 @@ static void tcp_collapse_ofo_queue(struct sock *sk)
+  * 2) not add too big latencies if thousands of packets sit there.
+  *    (But if application shrinks SO_RCVBUF, we could still end up
+  *    freeing whole queue here)
++ * 3) Drop at least 12.5 % of sk_rcvbuf to avoid malicious attacks.
+  *
+  * Return true if queue has shrunk.
+  */
+@@ -4949,20 +4950,26 @@ static bool tcp_prune_ofo_queue(struct sock *sk)
+ {
+ 	struct tcp_sock *tp = tcp_sk(sk);
+ 	struct rb_node *node, *prev;
++	int goal;
+ 
+ 	if (RB_EMPTY_ROOT(&tp->out_of_order_queue))
+ 		return false;
+ 
+ 	NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED);
++	goal = sk->sk_rcvbuf >> 3;
+ 	node = &tp->ooo_last_skb->rbnode;
+ 	do {
+ 		prev = rb_prev(node);
+ 		rb_erase(node, &tp->out_of_order_queue);
++		goal -= rb_to_skb(node)->truesize;
+ 		tcp_drop(sk, rb_to_skb(node));
+-		sk_mem_reclaim(sk);
+-		if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
+-		    !tcp_under_memory_pressure(sk))
+-			break;
++		if (!prev || goal <= 0) {
++			sk_mem_reclaim(sk);
++			if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
++			    !tcp_under_memory_pressure(sk))
++				break;
++			goal = sk->sk_rcvbuf >> 3;
++		}
+ 		node = prev;
+ 	} while (node);
+ 	tp->ooo_last_skb = rb_to_skb(prev);
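
The last hunk's strategy is worth spelling out: tcp_prune_ofo_queue() now walks the out-of-order rb-tree from its tail and frees packets in batches of roughly sk_rcvbuf/8 bytes of truesize, paying for sk_mem_reclaim() and the "back under budget?" check once per batch instead of once per freed packet. The stand-alone C sketch below models that batching idea in user space; every name in it (struct toy_skb, toy_prune, the 7000-packet and 6 MB figures taken from the commit message above) is made up for illustration and is not a kernel API or part of tcp-ooo.patch.

/* Illustrative user-space sketch only -- not kernel code.
 * Models the batched pruning from tcp_prune_ofo_queue(): drop entries
 * from the queue tail until ~1/8 of the receive buffer (rcvbuf >> 3)
 * worth of truesize has been freed, then do one reclaim/re-check pass,
 * and repeat until the socket is back under budget.
 */
#include <stdio.h>
#include <stdlib.h>

struct toy_skb {
	int truesize;          /* memory charged to the socket for this packet */
	struct toy_skb *prev;  /* previous queue entry, an rb_prev() stand-in */
};

/* Returns how many reclaim/re-check passes were needed. */
static int toy_prune(struct toy_skb *tail, int rcvbuf, int *rmem_alloc)
{
	int goal = rcvbuf >> 3;  /* free ~12.5 % of the buffer per batch */
	int passes = 0;
	struct toy_skb *node = tail, *prev;

	while (node) {
		prev = node->prev;
		*rmem_alloc -= node->truesize;  /* "tcp_drop()": uncharge the packet */
		goal -= node->truesize;
		if (!prev || goal <= 0) {
			passes++;                   /* "sk_mem_reclaim()" point */
			if (*rmem_alloc <= rcvbuf)
				break;              /* back under budget: stop pruning */
			goal = rcvbuf >> 3;         /* start the next batch */
		}
		node = prev;
	}
	return passes;
}

int main(void)
{
	enum { N = 7000, TRUESIZE = 1024, RCVBUF = 6 * 1024 * 1024 };
	struct toy_skb *q = calloc(N, sizeof(*q));
	int rmem = 0, i;

	if (!q)
		return 1;
	for (i = 0; i < N; i++) {
		q[i].truesize = TRUESIZE;
		q[i].prev = i ? &q[i - 1] : NULL;
		rmem += TRUESIZE;
	}
	/* The socket is over its 6 MB budget; prune from the tail. */
	printf("reclaim passes: %d (vs. one per freed packet before the patch)\n",
	       toy_prune(&q[N - 1], RCVBUF, &rmem));
	free(q);
	return 0;
}

With these assumed numbers (7000 tiny packets of 1 kB truesize against a 6 MB budget) the sketch needs only two reclaim passes to get back under the budget, rather than one pass per dropped packet.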