--- /dev/null
+commit 58152ecbbcc6a0ce7fddd5bf5f6ee535834ece0c
+Author: Eric Dumazet <edumazet@google.com>
+Date: Mon Jul 23 09:28:21 2018 -0700
+
+ tcp: add tcp_ooo_try_coalesce() helper
+
+ In case an skb in out_of_order_queue is the result of
+ coalescing multiple skbs, we would like to keep a proper gso_segs
+ counter, so that a future tcp_drop() can report an accurate
+ number.
+
+ I chose not to implement this tracking for skbs in the receive
+ queue, since they are not dropped unless the socket is disconnected.
+
+ Signed-off-by: Eric Dumazet <edumazet@google.com>
+ Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
+ Acked-by: Yuchung Cheng <ycheng@google.com>
+ Signed-off-by: David S. Miller <davem@davemloft.net>
+
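+ A minimal userspace sketch of the accumulation the new helper performs
+ (an illustration, not kernel code): each skb counts for at least one
+ segment, and the sum saturates at 0xFFFF because gso_segs is a u16
+ field of skb_shared_info.
+
+     #include <stdint.h>
+     #include <stdio.h>
+
+     /* Model of the gso_segs bookkeeping in tcp_ooo_try_coalesce():
+      * treat a zero count as one segment, then clamp the sum to u16. */
+     static uint16_t coalesced_gso_segs(uint16_t to, uint16_t from)
+     {
+             uint32_t sum = (uint32_t)(to ? to : 1) + (from ? from : 1);
+
+             return sum > 0xFFFF ? 0xFFFF : (uint16_t)sum;
+     }
+
+     int main(void)
+     {
+             printf("%u\n", (unsigned)coalesced_gso_segs(0, 0));         /* 2 */
+             printf("%u\n", (unsigned)coalesced_gso_segs(40000, 30000)); /* 65535 */
+             return 0;
+     }
+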
+diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
+index b062a7692238..3bcd30a2ba06 100644
+--- a/net/ipv4/tcp_input.c
++++ b/net/ipv4/tcp_input.c
+@@ -4358,6 +4358,23 @@ static bool tcp_try_coalesce(struct sock *sk,
+ return true;
+ }
+
++static bool tcp_ooo_try_coalesce(struct sock *sk,
++ struct sk_buff *to,
++ struct sk_buff *from,
++ bool *fragstolen)
++{
++ bool res = tcp_try_coalesce(sk, to, from, fragstolen);
++
++ /* In case tcp_drop() is called later, update to->gso_segs */
++ if (res) {
++ u32 gso_segs = max_t(u16, 1, skb_shinfo(to)->gso_segs) +
++ max_t(u16, 1, skb_shinfo(from)->gso_segs);
++
++ skb_shinfo(to)->gso_segs = min_t(u32, gso_segs, 0xFFFF);
++ }
++ return res;
++}
++
+ static void tcp_drop(struct sock *sk, struct sk_buff *skb)
+ {
+ sk_drops_add(sk, skb);
+@@ -4481,8 +4498,8 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
+ /* In the typical case, we are adding an skb to the end of the list.
+ * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup.
+ */
+- if (tcp_try_coalesce(sk, tp->ooo_last_skb,
+- skb, &fragstolen)) {
++ if (tcp_ooo_try_coalesce(sk, tp->ooo_last_skb,
++ skb, &fragstolen)) {
+ coalesce_done:
+ tcp_grow_window(sk, skb);
+ kfree_skb_partial(skb, fragstolen);
+@@ -4532,8 +4549,8 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
+ tcp_drop(sk, skb1);
+ goto merge_right;
+ }
+- } else if (tcp_try_coalesce(sk, skb1,
+- skb, &fragstolen)) {
++ } else if (tcp_ooo_try_coalesce(sk, skb1,
++ skb, &fragstolen)) {
+ goto coalesce_done;
+ }
+ p = &parent->rb_right;
+commit 8541b21e781a22dce52a74fef0b9bed00404a1cd
+Author: Eric Dumazet <edumazet@google.com>
+Date: Mon Jul 23 09:28:20 2018 -0700
+
+ tcp: call tcp_drop() from tcp_data_queue_ofo()
+
+ In order to be able to give better diagnostics and detect
+ malicious traffic, we need to have better sk->sk_drops tracking.
+
+ Fixes: 9f5afeae5152 ("tcp: use an RB tree for ooo receive queue")
+ Signed-off-by: Eric Dumazet <edumazet@google.com>
+ Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
+ Acked-by: Yuchung Cheng <ycheng@google.com>
+ Signed-off-by: David S. Miller <davem@davemloft.net>
+
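+ For context, tcp_drop() (visible in the context lines of the first diff
+ above) calls sk_drops_add() before freeing the skb; as far as I can
+ tell, the drop counter is credited with max(1, gso_segs) per freed skb,
+ which is why the tcp_ooo_try_coalesce() patch above keeps gso_segs
+ accurate on coalesced ooo skbs. A rough userspace model of that
+ accounting (an assumption-labelled sketch, not the kernel helpers):
+
+     #include <stdint.h>
+     #include <stdio.h>
+
+     struct fake_sock { uint32_t drops; };
+     struct fake_skb  { uint16_t gso_segs; };
+
+     /* Approximation of sk_drops_add(): at least one segment per skb. */
+     static void drops_add(struct fake_sock *sk, const struct fake_skb *skb)
+     {
+             sk->drops += skb->gso_segs ? skb->gso_segs : 1;
+     }
+
+     int main(void)
+     {
+             struct fake_sock sk = { 0 };
+             struct fake_skb merged = { .gso_segs = 17 }; /* 17 coalesced packets */
+             struct fake_skb single = { .gso_segs = 0 };
+
+             drops_add(&sk, &merged);
+             drops_add(&sk, &single);
+             printf("sk_drops = %u\n", sk.drops); /* 18 */
+             return 0;
+     }
+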
+diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
+index 78068b902e7b..b062a7692238 100644
+--- a/net/ipv4/tcp_input.c
++++ b/net/ipv4/tcp_input.c
+@@ -4510,7 +4510,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
+ /* All the bits are present. Drop. */
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPOFOMERGE);
+- __kfree_skb(skb);
++ tcp_drop(sk, skb);
+ skb = NULL;
+ tcp_dsack_set(sk, seq, end_seq);
+ goto add_sack;
+@@ -4529,7 +4529,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
+ TCP_SKB_CB(skb1)->end_seq);
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPOFOMERGE);
+- __kfree_skb(skb1);
++ tcp_drop(sk, skb1);
+ goto merge_right;
+ }
+ } else if (tcp_try_coalesce(sk, skb1,
+commit 3d4bf93ac12003f9b8e1e2de37fe27983deebdcf
+Author: Eric Dumazet <edumazet@google.com>
+Date: Mon Jul 23 09:28:19 2018 -0700
+
+ tcp: detect malicious patterns in tcp_collapse_ofo_queue()
+
+ In case an attacker feeds tiny packets completely out of order,
+ tcp_collapse_ofo_queue() might scan the whole rb-tree, performing
+ expensive copies, but not changing socket memory usage at all.
+
+ 1) Do not attempt to collapse tiny skbs.
+ 2) Add logic to exit early when too many tiny skbs are detected.
+
+ We prefer not doing aggressive collapsing (which copies packets)
+ for pathological flows, and instead revert to tcp_prune_ofo_queue(),
+ which will be less expensive.
+
+ In the future, we might add the possibility of terminating flows
+ that are proven to be malicious.
+
+ Signed-off-by: Eric Dumazet <edumazet@google.com>
+ Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
+ Signed-off-by: David S. Miller <davem@davemloft.net>
+
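+ A rough userspace model of the two checks (the constants here are
+ assumptions for a 4 KB page: SK_MEM_QUANTUM is page-sized and
+ SKB_WITH_OVERHEAD() subtracts the skb_shared_info footprint):
+
+     #include <stdbool.h>
+     #include <stdint.h>
+     #include <stdio.h>
+
+     #define QUANTUM        4096u
+     #define SHINFO_SIZE     320u                 /* rough overhead */
+     #define WITH_OVERHEAD  (QUANTUM - SHINFO_SIZE)
+
+     /* Collapse a range only if it merged more than one skb (truesize
+      * grew past the head skb) or spans nearly a page of sequence space. */
+     static bool worth_collapsing(uint32_t range_truesize, uint32_t head_truesize,
+                                  uint32_t start, uint32_t end)
+     {
+             return range_truesize != head_truesize ||
+                    end - start >= WITH_OVERHEAD;
+     }
+
+     /* Once skipped tiny ranges exceed 1/8 of sk_rcvbuf, give up and let
+      * tcp_prune_ofo_queue() do cheaper bulk drops instead. */
+     static bool too_many_tiny(uint32_t sum_tiny, uint32_t rcvbuf)
+     {
+             return sum_tiny > (rcvbuf >> 3);
+     }
+
+     int main(void)
+     {
+             /* one tiny 80-byte skb: not worth a copy-collapse */
+             printf("%d\n", worth_collapsing(768, 768, 1000, 1080)); /* 0 */
+             /* skipped tiny ranges already cover 1 MB of a 6 MB buffer */
+             printf("%d\n", too_many_tiny(1 << 20, 6 << 20));        /* 1 */
+             return 0;
+     }
+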
+diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
+index 53289911362a..78068b902e7b 100644
+--- a/net/ipv4/tcp_input.c
++++ b/net/ipv4/tcp_input.c
+@@ -4902,6 +4902,7 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct rb_root *root,
+ static void tcp_collapse_ofo_queue(struct sock *sk)
+ {
+ struct tcp_sock *tp = tcp_sk(sk);
++ u32 range_truesize, sum_tiny = 0;
+ struct sk_buff *skb, *head;
+ u32 start, end;
+
+@@ -4913,6 +4914,7 @@ static void tcp_collapse_ofo_queue(struct sock *sk)
+ }
+ start = TCP_SKB_CB(skb)->seq;
+ end = TCP_SKB_CB(skb)->end_seq;
++ range_truesize = skb->truesize;
+
+ for (head = skb;;) {
+ skb = skb_rb_next(skb);
+@@ -4923,11 +4925,20 @@ static void tcp_collapse_ofo_queue(struct sock *sk)
+ if (!skb ||
+ after(TCP_SKB_CB(skb)->seq, end) ||
+ before(TCP_SKB_CB(skb)->end_seq, start)) {
+- tcp_collapse(sk, NULL, &tp->out_of_order_queue,
+- head, skb, start, end);
++ /* Do not attempt collapsing tiny skbs */
++ if (range_truesize != head->truesize ||
++ end - start >= SKB_WITH_OVERHEAD(SK_MEM_QUANTUM)) {
++ tcp_collapse(sk, NULL, &tp->out_of_order_queue,
++ head, skb, start, end);
++ } else {
++ sum_tiny += range_truesize;
++ if (sum_tiny > sk->sk_rcvbuf >> 3)
++ return;
++ }
+ goto new_range;
+ }
+
++ range_truesize += skb->truesize;
+ if (unlikely(before(TCP_SKB_CB(skb)->seq, start)))
+ start = TCP_SKB_CB(skb)->seq;
+ if (after(TCP_SKB_CB(skb)->end_seq, end))
+commit f4a3313d8e2ca9fd8d8f45e40a2903ba782607e7
+Author: Eric Dumazet <edumazet@google.com>
+Date: Mon Jul 23 09:28:18 2018 -0700
+
+ tcp: avoid collapses in tcp_prune_queue() if possible
+
+ Right after a TCP flow is created, receiving tiny out-of-order
+ packets always hits the condition:
+
+ if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
+ tcp_clamp_window(sk);
+
+ tcp_clamp_window() increases sk_rcvbuf to match sk_rmem_alloc
+ (guarded by tcp_rmem[2]).
+
+ Calling tcp_collapse_ofo_queue() in this case is not useful,
+ and offers an O(N^2) attack surface to malicious peers.
+
+ It is better not to attempt anything before full queue capacity is
+ reached, forcing the attacker to spend a lot of resources and
+ allowing us to detect the abuse more easily.
+
+ Signed-off-by: Eric Dumazet <edumazet@google.com>
+ Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
+ Acked-by: Yuchung Cheng <ycheng@google.com>
+ Signed-off-by: David S. Miller <davem@davemloft.net>
+
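+ A minimal sketch of the ordering this relies on (a userspace model,
+ not the kernel function): tcp_clamp_window() has just raised sk_rcvbuf
+ toward sk_rmem_alloc (bounded by tcp_rmem[2]), so if allocated receive
+ memory now fits under sk_rcvbuf there is nothing left to prune and the
+ expensive collapse passes can be skipped.
+
+     #include <stdio.h>
+
+     /* The new early exit: only keep pruning if memory still overflows. */
+     static int needs_collapse(long rmem_alloc, long rcvbuf)
+     {
+             return rmem_alloc > rcvbuf;
+     }
+
+     int main(void)
+     {
+             /* young flow, tiny ooo skbs: clamp already grew rcvbuf */
+             printf("%d\n", needs_collapse(128 * 1024, 6 * 1024 * 1024));     /* 0 */
+             /* genuinely full receive buffer: collapsing still runs */
+             printf("%d\n", needs_collapse(7 * 1024 * 1024, 6 * 1024 * 1024)); /* 1 */
+             return 0;
+     }
+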
+diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
+index 64e45b279431..53289911362a 100644
+--- a/net/ipv4/tcp_input.c
++++ b/net/ipv4/tcp_input.c
+@@ -5004,6 +5004,9 @@ static int tcp_prune_queue(struct sock *sk)
+ else if (tcp_under_memory_pressure(sk))
+ tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss);
+
++ if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
++ return 0;
++
+ tcp_collapse_ofo_queue(sk);
+ if (!skb_queue_empty(&sk->sk_receive_queue))
+ tcp_collapse(sk, &sk->sk_receive_queue, NULL,
+commit 72cd43ba64fc172a443410ce01645895850844c8
+Author: Eric Dumazet <edumazet@google.com>
+Date: Mon Jul 23 09:28:17 2018 -0700
+
+ tcp: free batches of packets in tcp_prune_ofo_queue()
+
+ Juha-Matti Tilli reported that malicious peers could inject tiny
+ packets in out_of_order_queue, forcing very expensive calls
+ to tcp_collapse_ofo_queue() and tcp_prune_ofo_queue() for
+ every incoming packet. The out_of_order_queue rb-tree can contain
+ thousands of nodes, and iterating over all of them is not nice.
+
+ Before linux-4.9, we would have pruned all packets in the ofo_queue
+ in one go, every XXXX packets. XXXX depends on sk_rcvbuf and skb
+ truesize, but is about 7000 packets with the tcp_rmem[2] default
+ of 6 MB.
+
+ Since we plan to increase tcp_rmem[2] in the future to cope with
+ modern BDPs, we cannot revert to the old behavior without great pain.
+
+ The strategy taken in this patch is to purge ~12.5 % of the queue
+ capacity at a time.
+
+ Fixes: 36a6503fedda ("tcp: refine tcp_prune_ofo_queue() to not drop all packets")
+ Signed-off-by: Eric Dumazet <edumazet@google.com>
+ Reported-by: Juha-Matti Tilli <juha-matti.tilli@iki.fi>
+ Acked-by: Yuchung Cheng <ycheng@google.com>
+ Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
+ Signed-off-by: David S. Miller <davem@davemloft.net>
+
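+ A back-of-the-envelope illustration of the batching (example numbers,
+ assuming a tiny-skb truesize of roughly 768 bytes): with the 6 MB
+ tcp_rmem[2] default, the per-batch goal of sk_rcvbuf >> 3 is 768 KB,
+ i.e. on the order of a thousand skbs are freed between calls to
+ sk_mem_reclaim() instead of reclaiming after every single skb.
+
+     #include <stdio.h>
+
+     int main(void)
+     {
+             int rcvbuf = 6 * 1024 * 1024;  /* tcp_rmem[2] default (2018) */
+             int goal = rcvbuf >> 3;        /* ~12.5 % of capacity per batch */
+             int tiny_truesize = 768;       /* assumed truesize of a tiny skb */
+
+             printf("batch goal: %d bytes (~%d tiny skbs)\n",
+                    goal, goal / tiny_truesize);
+             printf("whole queue: ~%d tiny skbs\n", rcvbuf / tiny_truesize);
+             return 0;
+     }
+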
+diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
+index 6bade06aaf72..64e45b279431 100644
+--- a/net/ipv4/tcp_input.c
++++ b/net/ipv4/tcp_input.c
+@@ -4942,6 +4942,7 @@ static void tcp_collapse_ofo_queue(struct sock *sk)
+ * 2) not add too big latencies if thousands of packets sit there.
+ * (But if application shrinks SO_RCVBUF, we could still end up
+ * freeing whole queue here)
++ * 3) Drop at least 12.5 % of sk_rcvbuf to avoid malicious attacks.
+ *
+ * Return true if queue has shrunk.
+ */
+@@ -4949,20 +4950,26 @@ static bool tcp_prune_ofo_queue(struct sock *sk)
+ {
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct rb_node *node, *prev;
++ int goal;
+
+ if (RB_EMPTY_ROOT(&tp->out_of_order_queue))
+ return false;
+
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED);
++ goal = sk->sk_rcvbuf >> 3;
+ node = &tp->ooo_last_skb->rbnode;
+ do {
+ prev = rb_prev(node);
+ rb_erase(node, &tp->out_of_order_queue);
++ goal -= rb_to_skb(node)->truesize;
+ tcp_drop(sk, rb_to_skb(node));
+- sk_mem_reclaim(sk);
+- if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
+- !tcp_under_memory_pressure(sk))
+- break;
++ if (!prev || goal <= 0) {
++ sk_mem_reclaim(sk);
++ if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
++ !tcp_under_memory_pressure(sk))
++ break;
++ goal = sk->sk_rcvbuf >> 3;
++ }
+ node = prev;
+ } while (node);
+ tp->ooo_last_skb = rb_to_skb(prev);