@@ -2129,10 +2129,25 @@ static bool tcp_pause_early_retransmit(struct sock *sk, int flag)
  * F.e. after RTO, when all the queue is considered as lost,
  * lost_out = packets_out and in_flight = retrans_out.
  *
- * Essentially, we have now two algorithms counting
+ * Essentially, we have now a few algorithms detecting
  * lost packets.
  *
- * FACK: It is the simplest heuristics. As soon as we decided
+ * If the receiver supports SACK:
+ *
+ * RFC6675/3517: It is the conventional algorithm. A packet is
+ * considered lost if the number of higher sequence packets
+ * SACKed is greater than or equal to the DUPACK threshold
+ * (reordering). This is implemented in tcp_mark_head_lost and
+ * tcp_update_scoreboard.
+ *
+ * RACK (draft-ietf-tcpm-rack-01): it is a newer algorithm
+ * (2017-) that checks timing instead of counting DUPACKs.
+ * Essentially a packet is considered lost if it's not S/ACKed
+ * after RTT + reordering_window, where both metrics are
+ * dynamically measured and adjusted. This is implemented in
+ * tcp_rack_mark_lost.
+ *
+ * FACK: it is the simplest heuristic. As soon as we decided
  * that something is lost, we decide that _all_ not SACKed
  * packets until the most forward SACK are lost. I.e.
  * lost_out = fackets_out - sacked_out and left_out = fackets_out.
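
To make the RFC6675/3517 rule above concrete, here is a minimal
user-space C sketch of DUPACK-threshold marking. The struct and the
function are illustrative stand-ins, not the kernel's
tcp_mark_head_lost/tcp_update_scoreboard:

#include <stdbool.h>
#include <stddef.h>

struct pkt {
	unsigned int seq;	/* starting sequence number */
	bool sacked;		/* selectively acknowledged? */
	bool lost;		/* marked lost on the scoreboard */
};

/* Mark every un-SACKed packet lost once at least dupthresh packets
 * with higher sequence numbers have been SACKed.  pkts[] is assumed
 * sorted by seq. */
static void mark_lost_rfc6675(struct pkt *pkts, size_t n, int dupthresh)
{
	int higher_sacked = 0;	/* SACKed packets above index i */

	for (size_t i = n; i-- > 0; ) {
		if (pkts[i].sacked)
			higher_sacked++;
		else if (higher_sacked >= dupthresh)
			pkts[i].lost = true;
	}
}

With the classic dupthresh of 3, a hole is declared lost as soon as
three packets above it have been SACKed.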
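
The RACK paragraph boils down to a single timing predicate, sketched
below in hedged, user-space form; rack_state and rack_pkt_lost are
illustrative names, not the internals of tcp_rack_mark_lost:

#include <stdbool.h>
#include <stdint.h>

struct rack_state {
	uint64_t mstamp_us;	/* send time of most recently (s)acked pkt */
	uint64_t rtt_us;	/* latest RTT measurement */
	uint64_t reo_wnd_us;	/* adaptive reordering window */
};

/* A packet sent before the most recently delivered one is deemed
 * lost once it has gone unacknowledged for RTT + reordering window
 * past its transmit time. */
static bool rack_pkt_lost(const struct rack_state *rs,
			  uint64_t pkt_xmit_us, uint64_t now_us)
{
	if (pkt_xmit_us >= rs->mstamp_us)
		return false;	/* no newer delivery; no evidence yet */
	return now_us >= pkt_xmit_us + rs->rtt_us + rs->reo_wnd_us;
}

Unlike a DUPACK count, this predicate can fire even when too few
packets remain in flight to generate three SACKs, which is what lets
timing-based detection cover small-window cases that counting cannot.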
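
The FACK bookkeeping above is a single subtraction; a worked example:
if the most forward SACK sits 10 packets ahead (fackets_out = 10) and
4 of those are SACKed, the remaining 6 are counted lost. A trivial
sketch with illustrative names:

/* lost_out under FACK: everything up to the most forward SACK that
 * was not itself SACKed is presumed lost. */
static unsigned int fack_lost_out(unsigned int fackets_out,
				  unsigned int sacked_out)
{
	return fackets_out - sacked_out;
}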
@@ -2141,16 +2156,14 @@ static bool tcp_pause_early_retransmit(struct sock *sk, int flag)
  * takes place. We use FACK by default until reordering
  * is suspected on the path to this destination.
  *
- * NewReno: when Recovery is entered, we assume that one segment
+ * If the receiver does not support SACK:
+ *
+ * NewReno (RFC6582): in Recovery we assume that one segment
  * is lost (classic Reno). While we are in Recovery and
  * a partial ACK arrives, we assume that one more packet
  * is lost (NewReno). These heuristics are the same in NewReno
  * and SACK.
  *
- * Imagine, that's all! Forget about all this shamanism about CWND inflation
- * deflation etc. CWND is real congestion window, never inflated, changes
- * only according to classic VJ rules.
- *
  * The really tricky (and requiring careful tuning) part of the algorithm
  * is hidden in functions tcp_time_to_recover() and tcp_xmit_retransmit_queue().
  * The first determines the moment _when_ we should reduce CWND and,
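
The NewReno rule above (one presumed loss at entry, one more per
partial ACK) fits in a few lines. This is a hedged user-space sketch
with illustrative names; real code must also use wrap-safe sequence
comparisons (the kernel's before()/after()):

#include <stdbool.h>
#include <stdint.h>

struct newreno_state {
	uint32_t high_seq;	/* snd_nxt when Recovery was entered */
	uint32_t lost_out;	/* estimated lost segments */
	bool in_recovery;
};

static void newreno_enter_recovery(struct newreno_state *s, uint32_t snd_nxt)
{
	s->high_seq = snd_nxt;
	s->lost_out = 1;	/* classic Reno: assume one loss */
	s->in_recovery = true;
}

static void newreno_on_ack(struct newreno_state *s, uint32_t ack_seq)
{
	if (!s->in_recovery)
		return;
	if (ack_seq >= s->high_seq)
		s->in_recovery = false;	/* full ACK ends Recovery */
	else
		s->lost_out++;		/* partial ACK: one more loss */
}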
@@ -2807,7 +2820,7 @@ static void tcp_rack_identify_loss(struct sock *sk, int *ack_flag,
 	struct tcp_sock *tp = tcp_sk(sk);

 	/* Use RACK to detect loss */
-	if (sysctl_tcp_recovery & TCP_RACK_LOST_RETRANS) {
+	if (sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION) {
 		u32 prior_retrans = tp->retrans_out;

 		tcp_rack_mark_lost(sk, ack_time);
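
Two small patterns are visible in this hunk: gating a detector on one
bit of a sysctl bitmask, and snapshotting retrans_out around the call
so the caller can tell whether anything was newly marked lost. A
hedged sketch; the constant's value and all names besides
TCP_RACK_LOSS_DETECTION are illustrative:

#include <stdint.h>

#define TCP_RACK_LOSS_DETECTION 0x1	/* one bit of the bitmask */

static unsigned int sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;

struct conn {
	uint32_t retrans_out;	/* retransmitted-and-unacked segments */
};

/* Stand-in for the real detector: pretend one retransmission timed
 * out; the real check is the RACK timing rule sketched earlier. */
static void rack_mark_lost(struct conn *c)
{
	if (c->retrans_out)
		c->retrans_out--;
}

/* Returns nonzero iff RACK is enabled and just marked some
 * retransmission lost (mirrors the prior_retrans comparison). */
static int rack_detect_loss(struct conn *c)
{
	uint32_t prior_retrans;

	if (!(sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION))
		return 0;

	prior_retrans = c->retrans_out;
	rack_mark_lost(c);
	return prior_retrans != c->retrans_out;
}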