--- /dev/null
+commit 58152ecbbcc6a0ce7fddd5bf5f6ee535834ece0c
+Author: Eric Dumazet <edumazet@google.com>
+Date: Mon Jul 23 09:28:21 2018 -0700
+
+ tcp: add tcp_ooo_try_coalesce() helper
+
+ In case an skb in out_of_order_queue is the result of
+ coalescing multiple skbs, we would like to keep a proper gso_segs
+ counter, so that a future tcp_drop() can report an accurate
+ number.
+
+ I chose not to implement this tracking for skbs in the receive
+ queue, since they are not dropped unless the socket is disconnected.
+
+ Signed-off-by: Eric Dumazet <edumazet@google.com>
+ Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
+ Acked-by: Yuchung Cheng <ycheng@google.com>
+ Signed-off-by: David S. Miller <davem@davemloft.net>
+
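+ A minimal userspace sketch of the accumulation the new helper performs
+ (an illustration, not kernel code): each skb counts for at least one
+ segment, and the sum saturates at 0xFFFF because gso_segs is a u16
+ field of skb_shared_info.
+
+     #include <stdint.h>
+     #include <stdio.h>
+
+     /* Model of the gso_segs bookkeeping in tcp_ooo_try_coalesce():
+      * treat a zero count as one segment, then clamp the sum to u16. */
+     static uint16_t coalesced_gso_segs(uint16_t to, uint16_t from)
+     {
+             uint32_t sum = (uint32_t)(to ? to : 1) + (from ? from : 1);
+
+             return sum > 0xFFFF ? 0xFFFF : (uint16_t)sum;
+     }
+
+     int main(void)
+     {
+             printf("%u\n", (unsigned)coalesced_gso_segs(0, 0));         /* 2 */
+             printf("%u\n", (unsigned)coalesced_gso_segs(40000, 30000)); /* 65535 */
+             return 0;
+     }
+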
+diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
+index b062a7692238..3bcd30a2ba06 100644
+--- a/net/ipv4/tcp_input.c
++++ b/net/ipv4/tcp_input.c
+@@ -4358,6 +4358,23 @@ static bool tcp_try_coalesce(struct sock *sk,
+ return true;
+ }
+
++static bool tcp_ooo_try_coalesce(struct sock *sk,
++ struct sk_buff *to,
++ struct sk_buff *from,
++ bool *fragstolen)
++{
++ bool res = tcp_try_coalesce(sk, to, from, fragstolen);
++
++ /* In case tcp_drop() is called later, update to->gso_segs */
++ if (res) {
++ u32 gso_segs = max_t(u16, 1, skb_shinfo(to)->gso_segs) +
++ max_t(u16, 1, skb_shinfo(from)->gso_segs);
++
++ skb_shinfo(to)->gso_segs = min_t(u32, gso_segs, 0xFFFF);
++ }
++ return res;
++}
++
+ static void tcp_drop(struct sock *sk, struct sk_buff *skb)
+ {
+ sk_drops_add(sk, skb);
+@@ -4481,8 +4498,8 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
+ /* In the typical case, we are adding an skb to the end of the list.
+ * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup.
+ */
+- if (tcp_try_coalesce(sk, tp->ooo_last_skb,
+- skb, &fragstolen)) {
++ if (tcp_ooo_try_coalesce(sk, tp->ooo_last_skb,
++ skb, &fragstolen)) {
+ coalesce_done:
+ tcp_grow_window(sk, skb);
+ kfree_skb_partial(skb, fragstolen);
+@@ -4532,8 +4549,8 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
+ tcp_drop(sk, skb1);
+ goto merge_right;
+ }
+- } else if (tcp_try_coalesce(sk, skb1,
+- skb, &fragstolen)) {
++ } else if (tcp_ooo_try_coalesce(sk, skb1,
++ skb, &fragstolen)) {
+ goto coalesce_done;
+ }
+ p = &parent->rb_right;
+commit 8541b21e781a22dce52a74fef0b9bed00404a1cd
+Author: Eric Dumazet <edumazet@google.com>
+Date: Mon Jul 23 09:28:20 2018 -0700
+
+ tcp: call tcp_drop() from tcp_data_queue_ofo()
+
+ In order to be able to give better diagnostics and detect
+ malicious traffic, we need to have better sk->sk_drops tracking.
+
+ Fixes: 9f5afeae5152 ("tcp: use an RB tree for ooo receive queue")
+ Signed-off-by: Eric Dumazet <edumazet@google.com>
+ Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
+ Acked-by: Yuchung Cheng <ycheng@google.com>
+ Signed-off-by: David S. Miller <davem@davemloft.net>
+
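+ For context, tcp_drop() (visible in the context lines of the first diff
+ above) calls sk_drops_add() before freeing the skb; as far as I can
+ tell, the drop counter is credited with max(1, gso_segs) per freed skb,
+ which is why the tcp_ooo_try_coalesce() patch above keeps gso_segs
+ accurate on coalesced ooo skbs. A rough userspace model of that
+ accounting (an assumption-labelled sketch, not the kernel helpers):
+
+     #include <stdint.h>
+     #include <stdio.h>
+
+     struct fake_sock { uint32_t drops; };
+     struct fake_skb  { uint16_t gso_segs; };
+
+     /* Approximation of sk_drops_add(): at least one segment per skb. */
+     static void drops_add(struct fake_sock *sk, const struct fake_skb *skb)
+     {
+             sk->drops += skb->gso_segs ? skb->gso_segs : 1;
+     }
+
+     int main(void)
+     {
+             struct fake_sock sk = { 0 };
+             struct fake_skb merged = { .gso_segs = 17 }; /* 17 coalesced packets */
+             struct fake_skb single = { .gso_segs = 0 };
+
+             drops_add(&sk, &merged);
+             drops_add(&sk, &single);
+             printf("sk_drops = %u\n", sk.drops); /* 18 */
+             return 0;
+     }
+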
+diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
+index 78068b902e7b..b062a7692238 100644
+--- a/net/ipv4/tcp_input.c
++++ b/net/ipv4/tcp_input.c
+@@ -4510,7 +4510,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
+ /* All the bits are present. Drop. */
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPOFOMERGE);
+- __kfree_skb(skb);
++ tcp_drop(sk, skb);
+ skb = NULL;
+ tcp_dsack_set(sk, seq, end_seq);
+ goto add_sack;
+@@ -4529,7 +4529,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
+ TCP_SKB_CB(skb1)->end_seq);
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPOFOMERGE);
+- __kfree_skb(skb1);
++ tcp_drop(sk, skb1);
+ goto merge_right;
+ }
+ } else if (tcp_try_coalesce(sk, skb1,
+commit 3d4bf93ac12003f9b8e1e2de37fe27983deebdcf
+Author: Eric Dumazet <edumazet@google.com>
+Date: Mon Jul 23 09:28:19 2018 -0700
+
+ tcp: detect malicious patterns in tcp_collapse_ofo_queue()
+
+ In case an attacker feeds tiny packets completely out of order,
+ tcp_collapse_ofo_queue() might scan the whole rb-tree, performing
+ expensive copies, but not changing socket memory usage at all.
+
+ 1) Do not attempt to collapse tiny skbs.
+ 2) Add logic to exit early when too many tiny skbs are detected.
+
+ We prefer not doing aggressive collapsing (which copies packets)
+ for pathological flows, and instead revert to tcp_prune_ofo_queue(),
+ which will be less expensive.
+
+ In the future, we might add the possibility of terminating flows
+ that are proven to be malicious.
+
+ Signed-off-by: Eric Dumazet <edumazet@google.com>
+ Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
+ Signed-off-by: David S. Miller <davem@davemloft.net>
+
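+ A rough userspace model of the two checks (the constants here are
+ assumptions for a 4 KB page: SK_MEM_QUANTUM is page-sized and
+ SKB_WITH_OVERHEAD() subtracts the skb_shared_info footprint):
+
+     #include <stdbool.h>
+     #include <stdint.h>
+     #include <stdio.h>
+
+     #define QUANTUM        4096u
+     #define SHINFO_SIZE     320u                 /* rough overhead */
+     #define WITH_OVERHEAD  (QUANTUM - SHINFO_SIZE)
+
+     /* Collapse a range only if it merged more than one skb (truesize
+      * grew past the head skb) or spans nearly a page of sequence space. */
+     static bool worth_collapsing(uint32_t range_truesize, uint32_t head_truesize,
+                                  uint32_t start, uint32_t end)
+     {
+             return range_truesize != head_truesize ||
+                    end - start >= WITH_OVERHEAD;
+     }
+
+     /* Once skipped tiny ranges exceed 1/8 of sk_rcvbuf, give up and let
+      * tcp_prune_ofo_queue() do cheaper bulk drops instead. */
+     static bool too_many_tiny(uint32_t sum_tiny, uint32_t rcvbuf)
+     {
+             return sum_tiny > (rcvbuf >> 3);
+     }
+
+     int main(void)
+     {
+             /* one tiny 80-byte skb: not worth a copy-collapse */
+             printf("%d\n", worth_collapsing(768, 768, 1000, 1080)); /* 0 */
+             /* skipped tiny ranges already cover 1 MB of a 6 MB buffer */
+             printf("%d\n", too_many_tiny(1 << 20, 6 << 20));        /* 1 */
+             return 0;
+     }
+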
+diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
+index 53289911362a..78068b902e7b 100644
+--- a/net/ipv4/tcp_input.c
++++ b/net/ipv4/tcp_input.c
+@@ -4902,6 +4902,7 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct rb_root *root,
+ static void tcp_collapse_ofo_queue(struct sock *sk)
+ {
+ struct tcp_sock *tp = tcp_sk(sk);
++ u32 range_truesize, sum_tiny = 0;
+ struct sk_buff *skb, *head;
+ u32 start, end;
+
+@@ -4913,6 +4914,7 @@ static void tcp_collapse_ofo_queue(struct sock *sk)
+ }
+ start = TCP_SKB_CB(skb)->seq;
+ end = TCP_SKB_CB(skb)->end_seq;
++ range_truesize = skb->truesize;
+
+ for (head = skb;;) {
+ skb = skb_rb_next(skb);
+@@ -4923,11 +4925,20 @@ static void tcp_collapse_ofo_queue(struct sock *sk)
+ if (!skb ||
+ after(TCP_SKB_CB(skb)->seq, end) ||
+ before(TCP_SKB_CB(skb)->end_seq, start)) {
+- tcp_collapse(sk, NULL, &tp->out_of_order_queue,
+- head, skb, start, end);
++ /* Do not attempt collapsing tiny skbs */
++ if (range_truesize != head->truesize ||
++ end - start >= SKB_WITH_OVERHEAD(SK_MEM_QUANTUM)) {
++ tcp_collapse(sk, NULL, &tp->out_of_order_queue,
++ head, skb, start, end);
++ } else {
++ sum_tiny += range_truesize;
++ if (sum_tiny > sk->sk_rcvbuf >> 3)
++ return;
++ }
+ goto new_range;
+ }
+
++ range_truesize += skb->truesize;
+ if (unlikely(before(TCP_SKB_CB(skb)->seq, start)))
+ start = TCP_SKB_CB(skb)->seq;
+ if (after(TCP_SKB_CB(skb)->end_seq, end))
+commit f4a3313d8e2ca9fd8d8f45e40a2903ba782607e7
+Author: Eric Dumazet <edumazet@google.com>
+Date: Mon Jul 23 09:28:18 2018 -0700
+
+ tcp: avoid collapses in tcp_prune_queue() if possible
+
+ Right after a TCP flow is created, receiving tiny out-of-order
+ packets always hits the condition:
+
+ if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
+ tcp_clamp_window(sk);
+
+ tcp_clamp_window() increases sk_rcvbuf to match sk_rmem_alloc
+ (guarded by tcp_rmem[2]).
+
+ Calling tcp_collapse_ofo_queue() in this case is not useful,
+ and offers an O(N^2) attack surface to malicious peers.
+
+ It is better not to attempt anything before full queue capacity is
+ reached, forcing the attacker to spend a lot of resources and
+ allowing us to detect the abuse more easily.
+
+ Signed-off-by: Eric Dumazet <edumazet@google.com>
+ Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
+ Acked-by: Yuchung Cheng <ycheng@google.com>
+ Signed-off-by: David S. Miller <davem@davemloft.net>
+
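+ A minimal sketch of the ordering this relies on (a userspace model,
+ not the kernel function): tcp_clamp_window() has just raised sk_rcvbuf
+ toward sk_rmem_alloc (bounded by tcp_rmem[2]), so if allocated receive
+ memory now fits under sk_rcvbuf there is nothing left to prune and the
+ expensive collapse passes can be skipped.
+
+     #include <stdio.h>
+
+     /* The new early exit: only keep pruning if memory still overflows. */
+     static int needs_collapse(long rmem_alloc, long rcvbuf)
+     {
+             return rmem_alloc > rcvbuf;
+     }
+
+     int main(void)
+     {
+             /* young flow, tiny ooo skbs: clamp already grew rcvbuf */
+             printf("%d\n", needs_collapse(128 * 1024, 6 * 1024 * 1024));     /* 0 */
+             /* genuinely full receive buffer: collapsing still runs */
+             printf("%d\n", needs_collapse(7 * 1024 * 1024, 6 * 1024 * 1024)); /* 1 */
+             return 0;
+     }
+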
+diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
+index 64e45b279431..53289911362a 100644
+--- a/net/ipv4/tcp_input.c
++++ b/net/ipv4/tcp_input.c
+@@ -5004,6 +5004,9 @@ static int tcp_prune_queue(struct sock *sk)
+ else if (tcp_under_memory_pressure(sk))
+ tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss);
+
++ if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
++ return 0;
++
+ tcp_collapse_ofo_queue(sk);
+ if (!skb_queue_empty(&sk->sk_receive_queue))
+ tcp_collapse(sk, &sk->sk_receive_queue, NULL,
+commit 72cd43ba64fc172a443410ce01645895850844c8
+Author: Eric Dumazet <edumazet@google.com>
+Date: Mon Jul 23 09:28:17 2018 -0700
+
+ tcp: free batches of packets in tcp_prune_ofo_queue()
+
+ Juha-Matti Tilli reported that malicious peers could inject tiny
+ packets in out_of_order_queue, forcing very expensive calls
+ to tcp_collapse_ofo_queue() and tcp_prune_ofo_queue() for
+ every incoming packet. The out_of_order_queue rb-tree can contain
+ thousands of nodes, and iterating over all of them is not nice.
+
+ Before linux-4.9, we would have pruned all packets in the ofo_queue
+ in one go, every XXXX packets. XXXX depends on sk_rcvbuf and skb
+ truesize, but is about 7000 packets with the tcp_rmem[2] default
+ of 6 MB.
+
+ Since we plan to increase tcp_rmem[2] in the future to cope with
+ modern BDPs, we cannot revert to the old behavior without great pain.
+
+ The strategy taken in this patch is to purge ~12.5 % of the queue
+ capacity at a time.
+
+ Fixes: 36a6503fedda ("tcp: refine tcp_prune_ofo_queue() to not drop all packets")
+ Signed-off-by: Eric Dumazet <edumazet@google.com>
+ Reported-by: Juha-Matti Tilli <juha-matti.tilli@iki.fi>
+ Acked-by: Yuchung Cheng <ycheng@google.com>
+ Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
+ Signed-off-by: David S. Miller <davem@davemloft.net>
+
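+ A back-of-the-envelope illustration of the batching (example numbers,
+ assuming a tiny-skb truesize of roughly 768 bytes): with the 6 MB
+ tcp_rmem[2] default, the per-batch goal of sk_rcvbuf >> 3 is 768 KB,
+ i.e. on the order of a thousand skbs are freed between calls to
+ sk_mem_reclaim() instead of reclaiming after every single skb.
+
+     #include <stdio.h>
+
+     int main(void)
+     {
+             int rcvbuf = 6 * 1024 * 1024;  /* tcp_rmem[2] default (2018) */
+             int goal = rcvbuf >> 3;        /* ~12.5 % of capacity per batch */
+             int tiny_truesize = 768;       /* assumed truesize of a tiny skb */
+
+             printf("batch goal: %d bytes (~%d tiny skbs)\n",
+                    goal, goal / tiny_truesize);
+             printf("whole queue: ~%d tiny skbs\n", rcvbuf / tiny_truesize);
+             return 0;
+     }
+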
+diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
+index 6bade06aaf72..64e45b279431 100644
+--- a/net/ipv4/tcp_input.c
++++ b/net/ipv4/tcp_input.c
+@@ -4942,6 +4942,7 @@ static void tcp_collapse_ofo_queue(struct sock *sk)
+ * 2) not add too big latencies if thousands of packets sit there.
+ * (But if application shrinks SO_RCVBUF, we could still end up
+ * freeing whole queue here)
++ * 3) Drop at least 12.5 % of sk_rcvbuf to avoid malicious attacks.
+ *
+ * Return true if queue has shrunk.
+ */
+@@ -4949,20 +4950,26 @@ static bool tcp_prune_ofo_queue(struct sock *sk)
+ {
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct rb_node *node, *prev;
++ int goal;
+
+ if (RB_EMPTY_ROOT(&tp->out_of_order_queue))
+ return false;
+
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED);
++ goal = sk->sk_rcvbuf >> 3;
+ node = &tp->ooo_last_skb->rbnode;
+ do {
+ prev = rb_prev(node);
+ rb_erase(node, &tp->out_of_order_queue);
++ goal -= rb_to_skb(node)->truesize;
+ tcp_drop(sk, rb_to_skb(node));
+- sk_mem_reclaim(sk);
+- if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
+- !tcp_under_memory_pressure(sk))
+- break;
++ if (!prev || goal <= 0) {
++ sk_mem_reclaim(sk);
++ if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
++ !tcp_under_memory_pressure(sk))
++ break;
++ goal = sk->sk_rcvbuf >> 3;
++ }
+ node = prev;
+ } while (node);
+ tp->ooo_last_skb = rb_to_skb(prev);