From: Jan Rękorajski
Date: Wed, 25 Jul 2018 11:49:07 +0000 (+0200)
Subject: - fix tcp out-of-order packets issue
X-Git-Tag: auto/th/kernel-4.17.9-3
X-Git-Url: http://git.pld-linux.org/gitweb.cgi?p=packages%2Fkernel.git;a=commitdiff_plain;h=b876ae37eed247f5842db180eb857bf35d5820a9

- fix tcp out-of-order packets issue
- rel 3
---

diff --git a/kernel.spec b/kernel.spec
index 2869c952..a42b35d5 100644
--- a/kernel.spec
+++ b/kernel.spec
@@ -67,7 +67,7 @@
 %define	have_pcmcia	0
 %endif
 
-%define		rel		2
+%define		rel		3
 %define		basever		4.17
 %define		postver		.9
 
@@ -215,6 +215,8 @@ Patch2000:	kernel-small_fixes.patch
 Patch2001:	kernel-pwc-uncompress.patch
 Patch2003:	kernel-regressions.patch
 
+Patch2004:	tcp-ooo.patch
+
 # for rescuecd
 # based on ftp://ftp.leg.uct.ac.za/pub/linux/rip/tmpfs_root-2.6.30.diff.gz
 Patch7000:	kernel-inittmpfs.patch
@@ -685,6 +687,7 @@ rm -f localversion-rt
 %patch2000 -p1
 %patch2001 -p1
 #%patch2003 -p1
+%patch2004 -p1
 
 # Do not remove this, please!
 #%%patch50000 -p1
diff --git a/tcp-ooo.patch b/tcp-ooo.patch
new file mode 100644
index 00000000..002ccec3
--- /dev/null
+++ b/tcp-ooo.patch
@@ -0,0 +1,285 @@
+commit 58152ecbbcc6a0ce7fddd5bf5f6ee535834ece0c
+Author: Eric Dumazet
+Date:   Mon Jul 23 09:28:21 2018 -0700
+
+    tcp: add tcp_ooo_try_coalesce() helper
+
+    In case skb in out_of_order_queue is the result of
+    multiple skbs coalescing, we would like to get a proper gso_segs
+    counter tracking, so that future tcp_drop() can report an accurate
+    number.
+
+    I chose to not implement this tracking for skbs in receive queue,
+    since they are not dropped, unless socket is disconnected.
+
+    Signed-off-by: Eric Dumazet
+    Acked-by: Soheil Hassas Yeganeh
+    Acked-by: Yuchung Cheng
+    Signed-off-by: David S. Miller
+
+diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
+index b062a7692238..3bcd30a2ba06 100644
+--- a/net/ipv4/tcp_input.c
++++ b/net/ipv4/tcp_input.c
+@@ -4358,6 +4358,23 @@ static bool tcp_try_coalesce(struct sock *sk,
+ 	return true;
+ }
+ 
++static bool tcp_ooo_try_coalesce(struct sock *sk,
++				 struct sk_buff *to,
++				 struct sk_buff *from,
++				 bool *fragstolen)
++{
++	bool res = tcp_try_coalesce(sk, to, from, fragstolen);
++
++	/* In case tcp_drop() is called later, update to->gso_segs */
++	if (res) {
++		u32 gso_segs = max_t(u16, 1, skb_shinfo(to)->gso_segs) +
++			       max_t(u16, 1, skb_shinfo(from)->gso_segs);
++
++		skb_shinfo(to)->gso_segs = min_t(u32, gso_segs, 0xFFFF);
++	}
++	return res;
++}
++
+ static void tcp_drop(struct sock *sk, struct sk_buff *skb)
+ {
+ 	sk_drops_add(sk, skb);
+@@ -4481,8 +4498,8 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
+ 	/* In the typical case, we are adding an skb to the end of the list.
+ 	 * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup.
+ 	 */
+-	if (tcp_try_coalesce(sk, tp->ooo_last_skb,
+-			     skb, &fragstolen)) {
++	if (tcp_ooo_try_coalesce(sk, tp->ooo_last_skb,
++				 skb, &fragstolen)) {
+ coalesce_done:
+ 		tcp_grow_window(sk, skb);
+ 		kfree_skb_partial(skb, fragstolen);
+@@ -4532,8 +4549,8 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
+ 			tcp_drop(sk, skb1);
+ 			goto merge_right;
+ 		}
+-	} else if (tcp_try_coalesce(sk, skb1,
+-				    skb, &fragstolen)) {
++	} else if (tcp_ooo_try_coalesce(sk, skb1,
++					skb, &fragstolen)) {
+ 		goto coalesce_done;
+ 	}
+ 	p = &parent->rb_right;
+commit 8541b21e781a22dce52a74fef0b9bed00404a1cd
+Author: Eric Dumazet
+Date:   Mon Jul 23 09:28:20 2018 -0700
+
+    tcp: call tcp_drop() from tcp_data_queue_ofo()
+
+    In order to be able to give better diagnostics and detect
+    malicious traffic, we need to have better sk->sk_drops tracking.
+
+    Fixes: 9f5afeae5152 ("tcp: use an RB tree for ooo receive queue")
+    Signed-off-by: Eric Dumazet
+    Acked-by: Soheil Hassas Yeganeh
+    Acked-by: Yuchung Cheng
+    Signed-off-by: David S. Miller
+
+diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
+index 78068b902e7b..b062a7692238 100644
+--- a/net/ipv4/tcp_input.c
++++ b/net/ipv4/tcp_input.c
+@@ -4510,7 +4510,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
+ 			/* All the bits are present. Drop. */
+ 			NET_INC_STATS(sock_net(sk),
+ 				      LINUX_MIB_TCPOFOMERGE);
+-			__kfree_skb(skb);
++			tcp_drop(sk, skb);
+ 			skb = NULL;
+ 			tcp_dsack_set(sk, seq, end_seq);
+ 			goto add_sack;
+@@ -4529,7 +4529,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
+ 				TCP_SKB_CB(skb1)->end_seq);
+ 			NET_INC_STATS(sock_net(sk),
+ 				      LINUX_MIB_TCPOFOMERGE);
+-			__kfree_skb(skb1);
++			tcp_drop(sk, skb1);
+ 			goto merge_right;
+ 		}
+ 	} else if (tcp_try_coalesce(sk, skb1,
+commit 3d4bf93ac12003f9b8e1e2de37fe27983deebdcf
+Author: Eric Dumazet
+Date:   Mon Jul 23 09:28:19 2018 -0700
+
+    tcp: detect malicious patterns in tcp_collapse_ofo_queue()
+
+    In case an attacker feeds tiny packets completely out of order,
+    tcp_collapse_ofo_queue() might scan the whole rb-tree, performing
+    expensive copies, but not changing socket memory usage at all.
+
+    1) Do not attempt to collapse tiny skbs.
+    2) Add logic to exit early when too many tiny skbs are detected.
+
+    We prefer not doing aggressive collapsing (which copies packets)
+    for pathological flows, and revert to tcp_prune_ofo_queue() which
+    will be less expensive.
+
+    In the future, we might add the possibility of terminating flows
+    that are proven to be malicious.
+
+    Signed-off-by: Eric Dumazet
+    Acked-by: Soheil Hassas Yeganeh
+    Signed-off-by: David S. Miller
Miller + +diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c +index 53289911362a..78068b902e7b 100644 +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -4902,6 +4902,7 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct rb_root *root, + static void tcp_collapse_ofo_queue(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); ++ u32 range_truesize, sum_tiny = 0; + struct sk_buff *skb, *head; + u32 start, end; + +@@ -4913,6 +4914,7 @@ static void tcp_collapse_ofo_queue(struct sock *sk) + } + start = TCP_SKB_CB(skb)->seq; + end = TCP_SKB_CB(skb)->end_seq; ++ range_truesize = skb->truesize; + + for (head = skb;;) { + skb = skb_rb_next(skb); +@@ -4923,11 +4925,20 @@ static void tcp_collapse_ofo_queue(struct sock *sk) + if (!skb || + after(TCP_SKB_CB(skb)->seq, end) || + before(TCP_SKB_CB(skb)->end_seq, start)) { +- tcp_collapse(sk, NULL, &tp->out_of_order_queue, +- head, skb, start, end); ++ /* Do not attempt collapsing tiny skbs */ ++ if (range_truesize != head->truesize || ++ end - start >= SKB_WITH_OVERHEAD(SK_MEM_QUANTUM)) { ++ tcp_collapse(sk, NULL, &tp->out_of_order_queue, ++ head, skb, start, end); ++ } else { ++ sum_tiny += range_truesize; ++ if (sum_tiny > sk->sk_rcvbuf >> 3) ++ return; ++ } + goto new_range; + } + ++ range_truesize += skb->truesize; + if (unlikely(before(TCP_SKB_CB(skb)->seq, start))) + start = TCP_SKB_CB(skb)->seq; + if (after(TCP_SKB_CB(skb)->end_seq, end)) +commit f4a3313d8e2ca9fd8d8f45e40a2903ba782607e7 +Author: Eric Dumazet +Date: Mon Jul 23 09:28:18 2018 -0700 + + tcp: avoid collapses in tcp_prune_queue() if possible + + Right after a TCP flow is created, receiving tiny out of order + packets allways hit the condition : + + if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) + tcp_clamp_window(sk); + + tcp_clamp_window() increases sk_rcvbuf to match sk_rmem_alloc + (guarded by tcp_rmem[2]) + + Calling tcp_collapse_ofo_queue() in this case is not useful, + and offers a O(N^2) surface attack to malicious peers. + + Better not attempt anything before full queue capacity is reached, + forcing attacker to spend lots of resource and allow us to more + easily detect the abuse. + + Signed-off-by: Eric Dumazet + Acked-by: Soheil Hassas Yeganeh + Acked-by: Yuchung Cheng + Signed-off-by: David S. Miller + +diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c +index 64e45b279431..53289911362a 100644 +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -5004,6 +5004,9 @@ static int tcp_prune_queue(struct sock *sk) + else if (tcp_under_memory_pressure(sk)) + tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss); + ++ if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) ++ return 0; ++ + tcp_collapse_ofo_queue(sk); + if (!skb_queue_empty(&sk->sk_receive_queue)) + tcp_collapse(sk, &sk->sk_receive_queue, NULL, +commit 72cd43ba64fc172a443410ce01645895850844c8 +Author: Eric Dumazet +Date: Mon Jul 23 09:28:17 2018 -0700 + + tcp: free batches of packets in tcp_prune_ofo_queue() + + Juha-Matti Tilli reported that malicious peers could inject tiny + packets in out_of_order_queue, forcing very expensive calls + to tcp_collapse_ofo_queue() and tcp_prune_ofo_queue() for + every incoming packet. out_of_order_queue rb-tree can contain + thousands of nodes, iterating over all of them is not nice. + + Before linux-4.9, we would have pruned all packets in ofo_queue + in one go, every XXXX packets. XXXX depends on sk_rcvbuf and skbs + truesize, but is about 7000 packets with tcp_rmem[2] default of 6 MB. 
+
+    Since we plan to increase tcp_rmem[2] in the future to cope with
+    modern BDP, we cannot revert to the old behavior without great pain.
+
+    The strategy taken in this patch is to purge ~12.5 % of the queue capacity.
+
+    Fixes: 36a6503fedda ("tcp: refine tcp_prune_ofo_queue() to not drop all packets")
+    Signed-off-by: Eric Dumazet
+    Reported-by: Juha-Matti Tilli
+    Acked-by: Yuchung Cheng
+    Acked-by: Soheil Hassas Yeganeh
+    Signed-off-by: David S. Miller
+
+diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
+index 6bade06aaf72..64e45b279431 100644
+--- a/net/ipv4/tcp_input.c
++++ b/net/ipv4/tcp_input.c
+@@ -4942,6 +4942,7 @@ static void tcp_collapse_ofo_queue(struct sock *sk)
+  * 2) not add too big latencies if thousands of packets sit there.
+  *    (But if application shrinks SO_RCVBUF, we could still end up
+  *    freeing whole queue here)
++ * 3) Drop at least 12.5 % of sk_rcvbuf to avoid malicious attacks.
+  *
+  * Return true if queue has shrunk.
+  */
+@@ -4949,20 +4950,26 @@ static bool tcp_prune_ofo_queue(struct sock *sk)
+ {
+ 	struct tcp_sock *tp = tcp_sk(sk);
+ 	struct rb_node *node, *prev;
++	int goal;
+ 
+ 	if (RB_EMPTY_ROOT(&tp->out_of_order_queue))
+ 		return false;
+ 
+ 	NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED);
++	goal = sk->sk_rcvbuf >> 3;
+ 	node = &tp->ooo_last_skb->rbnode;
+ 	do {
+ 		prev = rb_prev(node);
+ 		rb_erase(node, &tp->out_of_order_queue);
++		goal -= rb_to_skb(node)->truesize;
+ 		tcp_drop(sk, rb_to_skb(node));
+-		sk_mem_reclaim(sk);
+-		if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
+-		    !tcp_under_memory_pressure(sk))
+-			break;
++		if (!prev || goal <= 0) {
++			sk_mem_reclaim(sk);
++			if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
++			    !tcp_under_memory_pressure(sk))
++				break;
++			goal = sk->sk_rcvbuf >> 3;
++		}
+ 		node = prev;
+ 	} while (node);
+ 	tp->ooo_last_skb = rb_to_skb(prev);
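
The last hunk's strategy is worth spelling out: tcp_prune_ofo_queue() now walks the out-of-order rb-tree from its tail and frees packets in batches of roughly sk_rcvbuf/8 bytes of truesize, paying for sk_mem_reclaim() and the "back under budget?" check once per batch instead of once per freed packet. The stand-alone C sketch below models that batching idea in user space; every name in it (struct toy_skb, toy_prune, the 7000-packet and 6 MB figures taken from the commit message above) is made up for illustration and is not a kernel API or part of tcp-ooo.patch.

/* Illustrative user-space sketch only -- not kernel code.
 * Models the batched pruning from tcp_prune_ofo_queue(): drop entries
 * from the queue tail until ~1/8 of the receive buffer (rcvbuf >> 3)
 * worth of truesize has been freed, then do one reclaim/re-check pass,
 * and repeat until the socket is back under budget.
 */
#include <stdio.h>
#include <stdlib.h>

struct toy_skb {
	int truesize;          /* memory charged to the socket for this packet */
	struct toy_skb *prev;  /* previous queue entry, an rb_prev() stand-in */
};

/* Returns how many reclaim/re-check passes were needed. */
static int toy_prune(struct toy_skb *tail, int rcvbuf, int *rmem_alloc)
{
	int goal = rcvbuf >> 3;  /* free ~12.5 % of the buffer per batch */
	int passes = 0;
	struct toy_skb *node = tail, *prev;

	while (node) {
		prev = node->prev;
		*rmem_alloc -= node->truesize;  /* "tcp_drop()": uncharge the packet */
		goal -= node->truesize;
		if (!prev || goal <= 0) {
			passes++;                   /* "sk_mem_reclaim()" point */
			if (*rmem_alloc <= rcvbuf)
				break;              /* back under budget: stop pruning */
			goal = rcvbuf >> 3;         /* start the next batch */
		}
		node = prev;
	}
	return passes;
}

int main(void)
{
	enum { N = 7000, TRUESIZE = 1024, RCVBUF = 6 * 1024 * 1024 };
	struct toy_skb *q = calloc(N, sizeof(*q));
	int rmem = 0, i;

	if (!q)
		return 1;
	for (i = 0; i < N; i++) {
		q[i].truesize = TRUESIZE;
		q[i].prev = i ? &q[i - 1] : NULL;
		rmem += TRUESIZE;
	}
	/* The socket is over its 6 MB budget; prune from the tail. */
	printf("reclaim passes: %d (vs. one per freed packet before the patch)\n",
	       toy_prune(&q[N - 1], RCVBUF, &rmem));
	free(q);
	return 0;
}

With these assumed numbers (7000 tiny packets of 1 kB truesize against a 6 MB budget) the sketch needs only two reclaim passes to get back under the budget, rather than one pass per dropped packet.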