
Merge branch 'tcp-default-RACK-loss-recovery'

Yuchung Cheng says:

====================
tcp: default RACK loss recovery

This patch set implements the features corresponding to the
draft-ietf-tcpm-rack-03 version of the RACK draft.
https://datatracker.ietf.org/meeting/101/materials/slides-101-tcpm-update-on-tcp-rack-00

1. SACK: implement an equivalent DUPACK threshold heuristic in RACK to
   replace the existing RFC6675 recovery (tcp_mark_head_lost).

2. Non-SACK: simplify the RFC6582 NewReno implementation.

3. RTO: apply RACK's time-based approach to avoid spuriously
   marking very recently sent packets lost.

4. With (1), (2), and (3), make RACK the exclusive fast recovery mechanism
   to mark losses based on time on S/ACK. Tail loss probe and F-RTO remain
   enabled by default as complementary mechanisms to send probes in the
   CA_Open and CA_Loss states. The probes solicit S/ACKs to trigger RACK's
   time-based loss detection (a sketch of this rule follows the list).
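
For illustration, here is a minimal stand-alone sketch of the time-based
rule these patches build on. This is not the kernel code: the struct and
helper names are hypothetical, and the real implementation additionally
breaks ties on sequence numbers and adapts the reordering window
dynamically.

#include <stdbool.h>
#include <stdint.h>

/* Hypothetical per-packet record, for illustration only. */
struct pkt {
	uint64_t xmit_time_us;	/* time of the last (re)transmission */
	bool	 sacked;	/* selectively or cumulatively acked */
};

/*
 * RACK's core rule: an unacked packet is deemed lost once some packet
 * sent after it has been s/acked and the time elapsed since its own
 * transmission exceeds the RTT of the most recently s/acked packet
 * plus a reordering window.
 */
static bool rack_is_lost(const struct pkt *p, uint64_t now_us,
			 uint64_t rack_xmit_time_us,
			 uint64_t rack_rtt_us, uint64_t reo_wnd_us)
{
	if (p->sacked)
		return false;
	/* only packets sent before the most recently s/acked one qualify */
	if (p->xmit_time_us >= rack_xmit_time_us)
		return false;
	return now_us - p->xmit_time_us > rack_rtt_us + reo_wnd_us;
}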

All Google web and internal servers have been running RACK-only mode
(4) for a while now. A/B experiments indicate that RACK/TLP on average
reduces recovery latency by 10% compared to RFC6675. RFC6675 is now
off by default but can be re-enabled by disabling RACK (sysctl
net.ipv4.tcp_recovery=0) in case of unforeseen issues.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
Commit: 10e361e100 (merged by David S. Miller)
4 changed files with 124 additions and 64 deletions:

  Documentation/networking/ip-sysctl.txt   (+3, -1)
  include/net/tcp.h                        (+5, -0)
  net/ipv4/tcp_input.c                     (+53, -46)
  net/ipv4/tcp_recovery.c                  (+63, -17)

Documentation/networking/ip-sysctl.txt (+3, -1)

@@ -449,8 +449,10 @@ tcp_recovery - INTEGER
 	features.
 
 	RACK: 0x1 enables the RACK loss detection for fast detection of lost
-	      retransmissions and tail drops.
+	      retransmissions and tail drops. It also subsumes and disables
+	      RFC6675 recovery for SACK connections.
 	RACK: 0x2 makes RACK's reordering window static (min_rtt/4).
+	RACK: 0x4 disables RACK's DUPACK threshold heuristic
 
 	Default: 0x1
 

include/net/tcp.h (+5, -0)

@@ -245,6 +245,7 @@ extern long sysctl_tcp_mem[3];
 
 #define TCP_RACK_LOSS_DETECTION  0x1 /* Use RACK to detect losses */
 #define TCP_RACK_STATIC_REO_WND  0x2 /* Use static RACK reo wnd */
+#define TCP_RACK_NO_DUPTHRESH    0x4 /* Do not use DUPACK threshold in RACK */
 
 extern atomic_long_t tcp_memory_allocated;
 extern struct percpu_counter tcp_sockets_allocated;
@@ -1876,6 +1877,10 @@ void tcp_v4_init(void);
 void tcp_init(void);
 
 /* tcp_recovery.c */
+void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb);
+void tcp_newreno_mark_lost(struct sock *sk, bool snd_una_advanced);
+extern s32 tcp_rack_skb_timeout(struct tcp_sock *tp, struct sk_buff *skb,
+				u32 reo_wnd);
 extern void tcp_rack_mark_lost(struct sock *sk);
 extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq,
 			     u64 xmit_time);

net/ipv4/tcp_input.c (+53, -46)

@@ -1917,19 +1917,54 @@ static inline void tcp_init_undo(struct tcp_sock *tp)
 	tp->undo_retrans = tp->retrans_out ? : -1;
 }
 
-/* Enter Loss state. If we detect SACK reneging, forget all SACK information
+static bool tcp_is_rack(const struct sock *sk)
+{
+	return sock_net(sk)->ipv4.sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION;
+}
+
+/* If we detect SACK reneging, forget all SACK information
  * and reset tags completely, otherwise preserve SACKs. If receiver
  * dropped its ofo queue, we will know this due to reneging detection.
  */
+static void tcp_timeout_mark_lost(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *skb, *head;
+	bool is_reneg;			/* is receiver reneging on SACKs? */
+
+	head = tcp_rtx_queue_head(sk);
+	is_reneg = head && (TCP_SKB_CB(head)->sacked & TCPCB_SACKED_ACKED);
+	if (is_reneg) {
+		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
+		tp->sacked_out = 0;
+		/* Mark SACK reneging until we recover from this loss event. */
+		tp->is_sack_reneg = 1;
+	} else if (tcp_is_reno(tp)) {
+		tcp_reset_reno_sack(tp);
+	}
+
+	skb = head;
+	skb_rbtree_walk_from(skb) {
+		if (is_reneg)
+			TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
+		else if (tcp_is_rack(sk) && skb != head &&
+			 tcp_rack_skb_timeout(tp, skb, 0) > 0)
+			continue; /* Don't mark recently sent ones lost yet */
+		tcp_mark_skb_lost(sk, skb);
+	}
+	tcp_verify_left_out(tp);
+	tcp_clear_all_retrans_hints(tp);
+}
+
+/* Enter Loss state. */
 void tcp_enter_loss(struct sock *sk)
 {
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct net *net = sock_net(sk);
-	struct sk_buff *skb;
 	bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery;
-	bool is_reneg;			/* is receiver reneging on SACKs? */
-	bool mark_lost;
+
+	tcp_timeout_mark_lost(sk);
 
 	/* Reduce ssthresh if it has not yet been made inside this window. */
 	if (icsk->icsk_ca_state <= TCP_CA_Disorder ||
@@ -1941,40 +1976,10 @@ void tcp_enter_loss(struct sock *sk)
 		tcp_ca_event(sk, CA_EVENT_LOSS);
 		tcp_init_undo(tp);
 	}
-	tp->snd_cwnd	   = 1;
+	tp->snd_cwnd	   = tcp_packets_in_flight(tp) + 1;
 	tp->snd_cwnd_cnt   = 0;
 	tp->snd_cwnd_stamp = tcp_jiffies32;
 
-	tp->retrans_out = 0;
-	tp->lost_out = 0;
-
-	if (tcp_is_reno(tp))
-		tcp_reset_reno_sack(tp);
-
-	skb = tcp_rtx_queue_head(sk);
-	is_reneg = skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED);
-	if (is_reneg) {
-		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
-		tp->sacked_out = 0;
-		/* Mark SACK reneging until we recover from this loss event. */
-		tp->is_sack_reneg = 1;
-	}
-	tcp_clear_all_retrans_hints(tp);
-
-	skb_rbtree_walk_from(skb) {
-		mark_lost = (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) ||
-			     is_reneg);
-		if (mark_lost)
-			tcp_sum_lost(tp, skb);
-		TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED;
-		if (mark_lost) {
-			TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
-			TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
-			tp->lost_out += tcp_skb_pcount(skb);
-		}
-	}
-	tcp_verify_left_out(tp);
-
 	/* Timeout in disordered state after receiving substantial DUPACKs
 	 * suggests that the degree of reordering is over-estimated.
 	 */
@@ -2141,7 +2146,7 @@ static bool tcp_time_to_recover(struct sock *sk, int flag)
 		return true;
 
 	/* Not-A-Trick#2 : Classic rule... */
-	if (tcp_dupack_heuristics(tp) > tp->reordering)
+	if (!tcp_is_rack(sk) && tcp_dupack_heuristics(tp) > tp->reordering)
 		return true;
 
 	return false;
@@ -2218,9 +2223,7 @@ static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
-	if (tcp_is_reno(tp)) {
-		tcp_mark_head_lost(sk, 1, 1);
-	} else {
+	if (tcp_is_sack(tp)) {
 		int sacked_upto = tp->sacked_out - tp->reordering;
 		if (sacked_upto >= 0)
 			tcp_mark_head_lost(sk, sacked_upto, 0);
@@ -2718,12 +2721,16 @@ static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una)
 	return false;
 }
 
-static void tcp_rack_identify_loss(struct sock *sk, int *ack_flag)
+static void tcp_identify_packet_loss(struct sock *sk, int *ack_flag)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
-	/* Use RACK to detect loss */
-	if (sock_net(sk)->ipv4.sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION) {
+	if (tcp_rtx_queue_empty(sk))
+		return;
+
+	if (unlikely(tcp_is_reno(tp))) {
+		tcp_newreno_mark_lost(sk, *ack_flag & FLAG_SND_UNA_ADVANCED);
+	} else if (tcp_is_rack(sk)) {
 		u32 prior_retrans = tp->retrans_out;
 
 		tcp_rack_mark_lost(sk);
@@ -2819,11 +2826,11 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
 			tcp_try_keep_open(sk);
 			return;
 		}
-		tcp_rack_identify_loss(sk, ack_flag);
+		tcp_identify_packet_loss(sk, ack_flag);
 		break;
 	case TCP_CA_Loss:
 		tcp_process_loss(sk, flag, is_dupack, rexmit);
-		tcp_rack_identify_loss(sk, ack_flag);
+		tcp_identify_packet_loss(sk, ack_flag);
 		if (!(icsk->icsk_ca_state == TCP_CA_Open ||
 		      (*ack_flag & FLAG_LOST_RETRANS)))
 			return;
@@ -2840,7 +2847,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
 		if (icsk->icsk_ca_state <= TCP_CA_Disorder)
 			tcp_try_undo_dsack(sk);
 
-		tcp_rack_identify_loss(sk, ack_flag);
+		tcp_identify_packet_loss(sk, ack_flag);
 		if (!tcp_time_to_recover(sk, flag)) {
 			tcp_try_to_open(sk, flag);
 			return;
@@ -2862,7 +2869,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
 		fast_rexmit = 1;
 	}
 
-	if (do_lost)
+	if (!tcp_is_rack(sk) && do_lost)
 		tcp_update_scoreboard(sk, fast_rexmit);
 	*rexmit = REXMIT_LOST;
 }

net/ipv4/tcp_recovery.c (+63, -17)

@@ -2,7 +2,7 @@
 #include <linux/tcp.h>
 #include <net/tcp.h>
 
-static void tcp_rack_mark_skb_lost(struct sock *sk, struct sk_buff *skb)
+void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
@@ -21,6 +21,38 @@ static bool tcp_rack_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2)
 	return t1 > t2 || (t1 == t2 && after(seq1, seq2));
 }
 
+u32 tcp_rack_reo_wnd(const struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (!tp->rack.reord) {
+		/* If reordering has not been observed, be aggressive during
+		 * the recovery or starting the recovery by DUPACK threshold.
+		 */
+		if (inet_csk(sk)->icsk_ca_state >= TCP_CA_Recovery)
+			return 0;
+
+		if (tp->sacked_out >= tp->reordering &&
+		    !(sock_net(sk)->ipv4.sysctl_tcp_recovery & TCP_RACK_NO_DUPTHRESH))
+			return 0;
+	}
+
+	/* To be more reordering resilient, allow min_rtt/4 settling delay.
+	 * Use min_rtt instead of the smoothed RTT because reordering is
+	 * often a path property and less related to queuing or delayed ACKs.
+	 * Upon receiving DSACKs, linearly increase the window up to the
+	 * smoothed RTT.
+	 */
+	return min((tcp_min_rtt(tp) >> 2) * tp->rack.reo_wnd_steps,
+		   tp->srtt_us >> 3);
+}
+
+s32 tcp_rack_skb_timeout(struct tcp_sock *tp, struct sk_buff *skb, u32 reo_wnd)
+{
+	return tp->rack.rtt_us + reo_wnd -
+	       tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp);
+}
+
 /* RACK loss detection (IETF draft draft-ietf-tcpm-rack-01):
  *
  * Marks a packet lost, if some packet sent later has been (s)acked.
@@ -44,23 +76,11 @@ static bool tcp_rack_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2)
 static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	u32 min_rtt = tcp_min_rtt(tp);
 	struct sk_buff *skb, *n;
 	u32 reo_wnd;
 
 	*reo_timeout = 0;
-	/* To be more reordering resilient, allow min_rtt/4 settling delay
-	 * (lower-bounded to 1000uS). We use min_rtt instead of the smoothed
-	 * RTT because reordering is often a path property and less related
-	 * to queuing or delayed ACKs.
-	 */
-	reo_wnd = 1000;
-	if ((tp->rack.reord || inet_csk(sk)->icsk_ca_state < TCP_CA_Recovery) &&
-	    min_rtt != ~0U) {
-		reo_wnd = max((min_rtt >> 2) * tp->rack.reo_wnd_steps, reo_wnd);
-		reo_wnd = min(reo_wnd, tp->srtt_us >> 3);
-	}
-
+	reo_wnd = tcp_rack_reo_wnd(sk);
 	list_for_each_entry_safe(skb, n, &tp->tsorted_sent_queue,
 				 tcp_tsorted_anchor) {
 		struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
@@ -78,10 +98,9 @@ static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout)
 		/* A packet is lost if it has not been s/acked beyond
 		 * the recent RTT plus the reordering window.
 		 */
-		remaining = tp->rack.rtt_us + reo_wnd -
-			    tcp_stamp_us_delta(tp->tcp_mstamp, skb->skb_mstamp);
+		remaining = tcp_rack_skb_timeout(tp, skb, reo_wnd);
 		if (remaining <= 0) {
-			tcp_rack_mark_skb_lost(sk, skb);
+			tcp_mark_skb_lost(sk, skb);
 			list_del_init(&skb->tcp_tsorted_anchor);
 		} else {
 			/* Record maximum wait time */
@@ -202,3 +221,30 @@ void tcp_rack_update_reo_wnd(struct sock *sk, struct rate_sample *rs)
 		tp->rack.reo_wnd_steps = 1;
 	}
 }
+
+/* RFC6582 NewReno recovery for non-SACK connection. It simply retransmits
+ * the next unacked packet upon receiving
+ * a) three or more DUPACKs to start the fast recovery
+ * b) an ACK acknowledging new data during the fast recovery.
+ */
+void tcp_newreno_mark_lost(struct sock *sk, bool snd_una_advanced)
+{
+	const u8 state = inet_csk(sk)->icsk_ca_state;
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if ((state < TCP_CA_Recovery && tp->sacked_out >= tp->reordering) ||
+	    (state == TCP_CA_Recovery && snd_una_advanced)) {
+		struct sk_buff *skb = tcp_rtx_queue_head(sk);
+		u32 mss;
+
+		if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST)
+			return;
+
+		mss = tcp_skb_mss(skb);
+		if (tcp_skb_pcount(skb) > 1 && skb->len > mss)
+			tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb,
+				     mss, mss, GFP_ATOMIC);
+
+		tcp_skb_mark_lost_uncond_verify(tp, skb);
+	}
+}
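
For reference, a stand-alone sketch of the reo_wnd arithmetic in
tcp_rack_reo_wnd() above, fed with made-up sample values. One
simplification: srtt is passed here in plain microseconds, whereas the
kernel stores tp->srtt_us left-shifted by 3, hence the ">> 3" in the
patch.

#include <stdint.h>
#include <stdio.h>

/* min_rtt/4 scaled by the DSACK-driven step count, capped at srtt. */
static uint32_t rack_reo_wnd_us(uint32_t min_rtt_us, uint32_t srtt_us,
				uint8_t reo_wnd_steps)
{
	uint32_t wnd = (min_rtt_us >> 2) * reo_wnd_steps;

	return wnd < srtt_us ? wnd : srtt_us;
}

int main(void)
{
	/* e.g. min RTT 20 ms, smoothed RTT 25 ms, one step:
	 * min(20000/4 * 1, 25000) = 5000 us of settling delay.
	 */
	printf("reo_wnd = %u us\n", rack_reo_wnd_us(20000, 25000, 1));
	return 0;
}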