@@ -904,6 +904,72 @@ out:
 	sk_free(sk);
 }
 
+/* Note: Called under hard irq.
+ * We cannot call the TCP stack right away.
+ */
+enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer)
+{
+	struct tcp_sock *tp = container_of(timer, struct tcp_sock, pacing_timer);
+	struct sock *sk = (struct sock *)tp;
+	unsigned long nval, oval;
+
+	for (oval = READ_ONCE(sk->sk_tsq_flags);; oval = nval) {
+		struct tsq_tasklet *tsq;
+		bool empty;
+
+		if (oval & TSQF_QUEUED)
+			break;
+
+		nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED | TCPF_TSQ_DEFERRED;
+		nval = cmpxchg(&sk->sk_tsq_flags, oval, nval);
+		if (nval != oval)
+			continue;
+
+		if (!atomic_inc_not_zero(&sk->sk_wmem_alloc))
+			break;
+		/* queue this socket to tasklet queue */
+		tsq = this_cpu_ptr(&tsq_tasklet);
+		empty = list_empty(&tsq->head);
+		list_add(&tp->tsq_node, &tsq->head);
+		if (empty)
+			tasklet_schedule(&tsq->tasklet);
+		break;
+	}
+	return HRTIMER_NORESTART;
+}
+
+/* BBR congestion control needs pacing.
+ * The same applies to SO_MAX_PACING_RATE.
+ * The sch_fq packet scheduler handles pacing efficiently,
+ * but it is not always installed/used.
+ * Return true if the TCP stack should pace packets itself.
+ */
+static bool tcp_needs_internal_pacing(const struct sock *sk)
+{
+	return smp_load_acquire(&sk->sk_pacing_status) == SK_PACING_NEEDED;
+}
+
+static void tcp_internal_pacing(struct sock *sk, const struct sk_buff *skb)
+{
+	u64 len_ns;
+	u32 rate;
+
+	if (!tcp_needs_internal_pacing(sk))
+		return;
+	rate = sk->sk_pacing_rate;
+	if (!rate || rate == ~0U)
+		return;
+
+	/* Should account for header sizes as sch_fq does,
+	 * but let's keep things simple.
+	 */
+	len_ns = (u64)skb->len * NSEC_PER_SEC;
+	do_div(len_ns, rate);
+	hrtimer_start(&tcp_sk(sk)->pacing_timer,
+		      ktime_add_ns(ktime_get(), len_ns),
+		      HRTIMER_MODE_ABS_PINNED);
+}
+
 /* This routine actually transmits TCP packets queued in by
  * tcp_do_sendmsg(). This is used by both the initial
  * transmission and possible later retransmissions.
@@ -1034,6 +1100,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 	if (skb->len != tcp_header_size) {
 		tcp_event_data_sent(tp, sk);
 		tp->data_segs_out += tcp_skb_pcount(skb);
+		tcp_internal_pacing(sk, skb);
 	}
 
 	if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)
@@ -2086,6 +2153,12 @@ static int tcp_mtu_probe(struct sock *sk)
 	return -1;
 }
 
+static bool tcp_pacing_check(const struct sock *sk)
+{
+	return tcp_needs_internal_pacing(sk) &&
+	       hrtimer_active(&tcp_sk(sk)->pacing_timer);
+}
+
 /* TCP Small Queues :
  * Control number of packets in qdisc/devices to two packets / or ~1 ms.
  * (These limits are doubled for retransmits)
@@ -2210,6 +2283,9 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 	while ((skb = tcp_send_head(sk))) {
 		unsigned int limit;
 
+		if (tcp_pacing_check(sk))
+			break;
+
 		tso_segs = tcp_init_tso_segs(skb, mss_now);
 		BUG_ON(!tso_segs);
 
@@ -2878,6 +2954,10 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
 
 		if (skb == tcp_send_head(sk))
			break;
+
+		if (tcp_pacing_check(sk))
+			break;
+
 		/* we could do better than to assign each time */
 		if (!hole)
 			tp->retransmit_skb_hint = skb;