10 年之前 · 4f41b1c58a
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -433,6 +433,15 @@ tcp_orphan_retries - INTEGER
 
				 	you should think about lowering this value, such sockets
			
 
				 	may consume significant resources. Cf. tcp_max_orphans.
			
 
				 
			
 
				+tcp_recovery - INTEGER
			
 
				+	This value is a bitmap to enable various experimental loss recovery
			
 
				+	features.
			
 
				+
			
 
				+	RACK: 0x1 enables the RACK loss detection for fast detection of lost
			
 
				+	      retransmissions and tail drops.
			
 
				+
			
 
				+	Default: 0x1
			
 
				+
			
 
				 tcp_reordering - INTEGER
			
 
				 	Initial reordering level of packets in a TCP stream.
			
 
				 	TCP stack can then dynamically adjust flow reordering level
			
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -567,6 +567,7 @@ void tcp_resume_early_retransmit(struct sock *sk);
 
				 void tcp_rearm_rto(struct sock *sk);
			
 
				 void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req);
			
 
				 void tcp_reset(struct sock *sk);
			
 
				+void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb);
			
 
				 
			
 
				 /* tcp_timer.c */
			
 
				 void tcp_init_xmit_timers(struct sock *);
			
@@ -1752,6 +1753,14 @@ void tcp_init(void);
 
				 
			
 
				 /* tcp_recovery.c */
			
 
				 
			
 
				+/* Flags to enable various loss recovery features. See below */
			
 
				+extern int sysctl_tcp_recovery;
			
 
				+
			
 
				+/* Use TCP RACK to detect (some) tail and retransmit losses */
			
 
				+#define TCP_RACK_LOST_RETRANS  0x1
			
 
				+
			
 
				+extern int tcp_rack_mark_lost(struct sock *sk);
			
 
				+
			
 
				 extern void tcp_rack_advance(struct tcp_sock *tp,
			
 
				 			     const struct skb_mstamp *xmit_time, u8 sacked);
			
 
				 
			
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -495,6 +495,13 @@ static struct ctl_table ipv4_table[] = {
 
				 		.mode		= 0644,
			
 
				 		.proc_handler	= proc_dointvec
			
 
				 	},
			
 
				+	{
			
 
				+		.procname	= "tcp_recovery",
			
 
				+		.data		= &sysctl_tcp_recovery,
			
 
				+		.maxlen		= sizeof(int),
			
 
				+		.mode		= 0644,
			
 
				+		.proc_handler	= proc_dointvec,
			
 
				+	},
			
 
				 	{
			
 
				 		.procname	= "tcp_reordering",
			
 
				 		.data		= &sysctl_tcp_reordering,
			
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -881,6 +881,7 @@ static void tcp_update_reordering(struct sock *sk, const int metric,
 
				 
			
 
				 	if (metric > 0)
			
 
				 		tcp_disable_early_retrans(tp);
			
 
				+	tp->rack.reord = 1;
			
 
				 }
			
 
				 
			
 
				 /* This must be called before lost_out is incremented */
			
@@ -906,8 +907,7 @@ static void tcp_skb_mark_lost(struct tcp_sock *tp, struct sk_buff *skb)
 
				 	}
			
 
				 }
			
 
				 
			
 
				-static void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp,
			
 
				-					    struct sk_buff *skb)
			
 
				+void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb)
			
 
				 {
			
 
				 	tcp_verify_retransmit_hint(tp, skb);
			
 
				 
			
@@ -2806,6 +2806,11 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
 
				 		}
			
 
				 	}
			
 
				 
			
 
				+	/* Use RACK to detect loss */
			
 
				+	if (sysctl_tcp_recovery & TCP_RACK_LOST_RETRANS &&
			
 
				+	    tcp_rack_mark_lost(sk))
			
 
				+		flag |= FLAG_LOST_RETRANS;
			
 
				+
			
 
				 	/* E. Process state. */
			
 
				 	switch (icsk->icsk_ca_state) {
			
 
				 	case TCP_CA_Recovery:
			
--- a/net/ipv4/tcp_recovery.c
+++ b/net/ipv4/tcp_recovery.c
@@ -1,6 +1,83 @@
 
				 #include <linux/tcp.h>
			
 
				 #include <net/tcp.h>
			
 
				 
			
 
				+int sysctl_tcp_recovery __read_mostly = TCP_RACK_LOST_RETRANS;
			
 
				+
			
 
				+/* Marks a packet lost, if some packet sent later has been (s)acked.
			
 
				+ * The underlying idea is similar to the traditional dupthresh and FACK
			
 
				+ * but they look at different metrics:
			
 
				+ *
			
 
				+ * dupthresh: 3 OOO packets delivered (packet count)
			
 
				+ * FACK: sequence delta to highest sacked sequence (sequence space)
			
 
				+ * RACK: sent time delta to the latest delivered packet (time domain)
			
 
				+ *
			
 
				+ * The advantage of RACK is it applies to both original and retransmitted
			
 
				+ * packets and therefore is robust against tail losses. Another advantage
			
 
				+ * is being more resilient to reordering by simply allowing some
			
 
				+ * "settling delay", instead of tweaking the dupthresh.
			
 
				+ *
			
 
				+ * The current version is only used after recovery starts but can be
			
 
				+ * easily extended to detect the first loss.
			
 
				+ */
			
 
				+int tcp_rack_mark_lost(struct sock *sk)
			
 
				+{
			
 
				+	struct tcp_sock *tp = tcp_sk(sk);
			
 
				+	struct sk_buff *skb;
			
 
				+	u32 reo_wnd, prior_retrans = tp->retrans_out;
			
 
				+
			
 
				+	if (inet_csk(sk)->icsk_ca_state < TCP_CA_Recovery || !tp->rack.advanced)
			
 
				+		return 0;
			
 
				+
			
 
				+	/* Reset the advanced flag to avoid unnecessary queue scanning */
			
 
				+	tp->rack.advanced = 0;
			
 
				+
			
 
				+	/* To be more reordering resilient, allow min_rtt/4 settling delay
			
 
				+	 * (lower-bounded to 1000 usec). We use min_rtt instead of the smoothed
			
 
				+	 * RTT because reordering is often a path property and less related
			
 
				+	 * to queuing or delayed ACKs.
			
 
				+	 *
			
 
				+	 * TODO: measure and adapt to the observed reordering delay, and
			
 
				+	 * use a timer to retransmit like the delayed early retransmit.
			
 
				+	 */
			
 
				+	reo_wnd = 1000;
			
 
				+	if (tp->rack.reord && tcp_min_rtt(tp) != ~0U)
			
 
				+		reo_wnd = max(tcp_min_rtt(tp) >> 2, reo_wnd);
			
 
				+
			
 
				+	tcp_for_write_queue(skb, sk) {
			
 
				+		struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
			
 
				+
			
 
				+		if (skb == tcp_send_head(sk))
			
 
				+			break;
			
 
				+
			
 
				+		/* Skip ones already (s)acked */
			
 
				+		if (!after(scb->end_seq, tp->snd_una) ||
			
 
				+		    scb->sacked & TCPCB_SACKED_ACKED)
			
 
				+			continue;
			
 
				+
			
 
				+		if (skb_mstamp_after(&tp->rack.mstamp, &skb->skb_mstamp)) {
			
 
				+
			
 
				+			if (skb_mstamp_us_delta(&tp->rack.mstamp,
			
 
				+						&skb->skb_mstamp) <= reo_wnd)
			
 
				+				continue;
			
 
				+
			
 
				+			/* skb is lost if a packet sent later is (s)acked */
			
 
				+			tcp_skb_mark_lost_uncond_verify(tp, skb);
			
 
				+			if (scb->sacked & TCPCB_SACKED_RETRANS) {
			
 
				+				scb->sacked &= ~TCPCB_SACKED_RETRANS;
			
 
				+				tp->retrans_out -= tcp_skb_pcount(skb);
			
 
				+				NET_INC_STATS_BH(sock_net(sk),
			
 
				+						 LINUX_MIB_TCPLOSTRETRANSMIT);
			
 
				+			}
			
 
				+		} else if (!(scb->sacked & TCPCB_RETRANS)) {
			
 
				+			/* Original data are sent sequentially so stop early
			
 
				+			 * because the rest were all sent after tp->rack.mstamp
			
 
				+			 */
			
 
				+			break;
			
 
				+		}
			
 
				+	}
			
 
				+	return prior_retrans - tp->retrans_out;
			
 
				+}
			
 
				+
			
 
				 /* Record the most recently (re)sent time among the (s)acked packets */
			
 
				 void tcp_rack_advance(struct tcp_sock *tp,
			
 
				 		      const struct skb_mstamp *xmit_time, u8 sacked)