commit 58152ecbbcc6a0ce7fddd5bf5f6ee535834ece0c
Author: Eric Dumazet <edumazet@google.com>
Date:   Mon Jul 23 09:28:21 2018 -0700

    tcp: add tcp_ooo_try_coalesce() helper
    
    In case skb in out_or_order_queue is the result of
    multiple skbs coalescing, we would like to get a proper gso_segs
    counter tracking, so that future tcp_drop() can report an accurate
    number.
    
    I chose to not implement this tracking for skbs in receive queue,
    since they are not dropped, unless socket is disconnected.
    
    Signed-off-by: Eric Dumazet <edumazet@google.com>
    Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
    Acked-by: Yuchung Cheng <ycheng@google.com>
    Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index b062a7692238..3bcd30a2ba06 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4358,6 +4358,23 @@ static bool tcp_try_coalesce(struct sock *sk,
 	return true;
 }
 
+static bool tcp_ooo_try_coalesce(struct sock *sk,
+			     struct sk_buff *to,
+			     struct sk_buff *from,
+			     bool *fragstolen)
+{
+	bool res = tcp_try_coalesce(sk, to, from, fragstolen);
+
+	/* In case tcp_drop() is called later, update to->gso_segs */
+	if (res) {
+		u32 gso_segs = max_t(u16, 1, skb_shinfo(to)->gso_segs) +
+			       max_t(u16, 1, skb_shinfo(from)->gso_segs);
+
+		skb_shinfo(to)->gso_segs = min_t(u32, gso_segs, 0xFFFF);
+	}
+	return res;
+}
+
 static void tcp_drop(struct sock *sk, struct sk_buff *skb)
 {
 	sk_drops_add(sk, skb);
@@ -4481,8 +4498,8 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 	/* In the typical case, we are adding an skb to the end of the list.
 	 * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup.
 	 */
-	if (tcp_try_coalesce(sk, tp->ooo_last_skb,
-			     skb, &fragstolen)) {
+	if (tcp_ooo_try_coalesce(sk, tp->ooo_last_skb,
+				 skb, &fragstolen)) {
 coalesce_done:
 		tcp_grow_window(sk, skb);
 		kfree_skb_partial(skb, fragstolen);
@@ -4532,8 +4549,8 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 				tcp_drop(sk, skb1);
 				goto merge_right;
 			}
-		} else if (tcp_try_coalesce(sk, skb1,
-					    skb, &fragstolen)) {
+		} else if (tcp_ooo_try_coalesce(sk, skb1,
+						skb, &fragstolen)) {
 			goto coalesce_done;
 		}
 		p = &parent->rb_right;
commit 8541b21e781a22dce52a74fef0b9bed00404a1cd
Author: Eric Dumazet <edumazet@google.com>
Date:   Mon Jul 23 09:28:20 2018 -0700

    tcp: call tcp_drop() from tcp_data_queue_ofo()
    
    In order to be able to give better diagnostics and detect
    malicious traffic, we need to have better sk->sk_drops tracking.
    
    Fixes: 9f5afeae5152 ("tcp: use an RB tree for ooo receive queue")
    Signed-off-by: Eric Dumazet <edumazet@google.com>
    Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
    Acked-by: Yuchung Cheng <ycheng@google.com>
    Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 78068b902e7b..b062a7692238 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4510,7 +4510,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 				/* All the bits are present. Drop. */
 				NET_INC_STATS(sock_net(sk),
 					      LINUX_MIB_TCPOFOMERGE);
-				__kfree_skb(skb);
+				tcp_drop(sk, skb);
 				skb = NULL;
 				tcp_dsack_set(sk, seq, end_seq);
 				goto add_sack;
@@ -4529,7 +4529,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
 						 TCP_SKB_CB(skb1)->end_seq);
 				NET_INC_STATS(sock_net(sk),
 					      LINUX_MIB_TCPOFOMERGE);
-				__kfree_skb(skb1);
+				tcp_drop(sk, skb1);
 				goto merge_right;
 			}
 		} else if (tcp_try_coalesce(sk, skb1,
commit 3d4bf93ac12003f9b8e1e2de37fe27983deebdcf
Author: Eric Dumazet <edumazet@google.com>
Date:   Mon Jul 23 09:28:19 2018 -0700

    tcp: detect malicious patterns in tcp_collapse_ofo_queue()
    
    In case an attacker feeds tiny packets completely out of order,
    tcp_collapse_ofo_queue() might scan the whole rb-tree, performing
    expensive copies, but not changing socket memory usage at all.
    
    1) Do not attempt to collapse tiny skbs.
    2) Add logic to exit early when too many tiny skbs are detected.
    
    We prefer not doing aggressive collapsing (which copies packets)
    for pathological flows, and revert to tcp_prune_ofo_queue() which
    will be less expensive.
    
    In the future, we might add the possibility of terminating flows
    that are proven to be malicious.
    
    Signed-off-by: Eric Dumazet <edumazet@google.com>
    Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
    Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 53289911362a..78068b902e7b 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4902,6 +4902,7 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct rb_root *root,
 static void tcp_collapse_ofo_queue(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
+	u32 range_truesize, sum_tiny = 0;
 	struct sk_buff *skb, *head;
 	u32 start, end;
 
@@ -4913,6 +4914,7 @@ static void tcp_collapse_ofo_queue(struct sock *sk)
 	}
 	start = TCP_SKB_CB(skb)->seq;
 	end = TCP_SKB_CB(skb)->end_seq;
+	range_truesize = skb->truesize;
 
 	for (head = skb;;) {
 		skb = skb_rb_next(skb);
@@ -4923,11 +4925,20 @@ static void tcp_collapse_ofo_queue(struct sock *sk)
 		if (!skb ||
 		    after(TCP_SKB_CB(skb)->seq, end) ||
 		    before(TCP_SKB_CB(skb)->end_seq, start)) {
-			tcp_collapse(sk, NULL, &tp->out_of_order_queue,
-				     head, skb, start, end);
+			/* Do not attempt collapsing tiny skbs */
+			if (range_truesize != head->truesize ||
+			    end - start >= SKB_WITH_OVERHEAD(SK_MEM_QUANTUM)) {
+				tcp_collapse(sk, NULL, &tp->out_of_order_queue,
+					     head, skb, start, end);
+			} else {
+				sum_tiny += range_truesize;
+				if (sum_tiny > sk->sk_rcvbuf >> 3)
+					return;
+			}
 			goto new_range;
 		}
 
+		range_truesize += skb->truesize;
 		if (unlikely(before(TCP_SKB_CB(skb)->seq, start)))
 			start = TCP_SKB_CB(skb)->seq;
 		if (after(TCP_SKB_CB(skb)->end_seq, end))
commit f4a3313d8e2ca9fd8d8f45e40a2903ba782607e7
Author: Eric Dumazet <edumazet@google.com>
Date:   Mon Jul 23 09:28:18 2018 -0700

    tcp: avoid collapses in tcp_prune_queue() if possible
    
    Right after a TCP flow is created, receiving tiny out of order
    packets allways hit the condition :
    
    if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
            tcp_clamp_window(sk);
    
    tcp_clamp_window() increases sk_rcvbuf to match sk_rmem_alloc
    (guarded by tcp_rmem[2])
    
    Calling tcp_collapse_ofo_queue() in this case is not useful,
    and offers a O(N^2) surface attack to malicious peers.
    
    Better not attempt anything before full queue capacity is reached,
    forcing attacker to spend lots of resource and allow us to more
    easily detect the abuse.
    
    Signed-off-by: Eric Dumazet <edumazet@google.com>
    Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
    Acked-by: Yuchung Cheng <ycheng@google.com>
    Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 64e45b279431..53289911362a 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5004,6 +5004,9 @@ static int tcp_prune_queue(struct sock *sk)
 	else if (tcp_under_memory_pressure(sk))
 		tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss);
 
+	if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
+		return 0;
+
 	tcp_collapse_ofo_queue(sk);
 	if (!skb_queue_empty(&sk->sk_receive_queue))
 		tcp_collapse(sk, &sk->sk_receive_queue, NULL,
commit 72cd43ba64fc172a443410ce01645895850844c8
Author: Eric Dumazet <edumazet@google.com>
Date:   Mon Jul 23 09:28:17 2018 -0700

    tcp: free batches of packets in tcp_prune_ofo_queue()
    
    Juha-Matti Tilli reported that malicious peers could inject tiny
    packets in out_of_order_queue, forcing very expensive calls
    to tcp_collapse_ofo_queue() and tcp_prune_ofo_queue() for
    every incoming packet. out_of_order_queue rb-tree can contain
    thousands of nodes, iterating over all of them is not nice.
    
    Before linux-4.9, we would have pruned all packets in ofo_queue
    in one go, every XXXX packets. XXXX depends on sk_rcvbuf and skbs
    truesize, but is about 7000 packets with tcp_rmem[2] default of 6 MB.
    
    Since we plan to increase tcp_rmem[2] in the future to cope with
    modern BDP, can not revert to the old behavior, without great pain.
    
    Strategy taken in this patch is to purge ~12.5 % of the queue capacity.
    
    Fixes: 36a6503fedda ("tcp: refine tcp_prune_ofo_queue() to not drop all packets")
    Signed-off-by: Eric Dumazet <edumazet@google.com>
    Reported-by: Juha-Matti Tilli <juha-matti.tilli@iki.fi>
    Acked-by: Yuchung Cheng <ycheng@google.com>
    Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
    Signed-off-by: David S. Miller <davem@davemloft.net>

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 6bade06aaf72..64e45b279431 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4942,6 +4942,7 @@ static void tcp_collapse_ofo_queue(struct sock *sk)
  * 2) not add too big latencies if thousands of packets sit there.
  *    (But if application shrinks SO_RCVBUF, we could still end up
  *     freeing whole queue here)
+ * 3) Drop at least 12.5 % of sk_rcvbuf to avoid malicious attacks.
  *
  * Return true if queue has shrunk.
  */
@@ -4949,20 +4950,26 @@ static bool tcp_prune_ofo_queue(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct rb_node *node, *prev;
+	int goal;
 
 	if (RB_EMPTY_ROOT(&tp->out_of_order_queue))
 		return false;
 
 	NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED);
+	goal = sk->sk_rcvbuf >> 3;
 	node = &tp->ooo_last_skb->rbnode;
 	do {
 		prev = rb_prev(node);
 		rb_erase(node, &tp->out_of_order_queue);
+		goal -= rb_to_skb(node)->truesize;
 		tcp_drop(sk, rb_to_skb(node));
-		sk_mem_reclaim(sk);
-		if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
-		    !tcp_under_memory_pressure(sk))
-			break;
+		if (!prev || goal <= 0) {
+			sk_mem_reclaim(sk);
+			if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
+			    !tcp_under_memory_pressure(sk))
+				break;
+			goal = sk->sk_rcvbuf >> 3;
+		}
 		node = prev;
 	} while (node);
 	tp->ooo_last_skb = rb_to_skb(prev);