@@ -904,6 +904,72 @@ out:
 	sk_free(sk);
 }
 
+/* Note: Called under hard irq.
+ * We cannot call the TCP stack right away.
+ */
+enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer)
+{
+	struct tcp_sock *tp = container_of(timer, struct tcp_sock, pacing_timer);
+	struct sock *sk = (struct sock *)tp;
+	unsigned long nval, oval;
+
+	for (oval = READ_ONCE(sk->sk_tsq_flags);; oval = nval) {
+		struct tsq_tasklet *tsq;
+		bool empty;
+
+		if (oval & TSQF_QUEUED)
+			break;
+
+		nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED | TCPF_TSQ_DEFERRED;
+		nval = cmpxchg(&sk->sk_tsq_flags, oval, nval);
+		if (nval != oval)
+			continue;
+
+		if (!atomic_inc_not_zero(&sk->sk_wmem_alloc))
+			break;
+		/* queue this socket to tasklet queue */
+		tsq = this_cpu_ptr(&tsq_tasklet);
+		empty = list_empty(&tsq->head);
+		list_add(&tp->tsq_node, &tsq->head);
+		if (empty)
+			tasklet_schedule(&tsq->tasklet);
+		break;
+	}
+	return HRTIMER_NORESTART;
+}
+
+/* BBR congestion control needs pacing.
+ * The same applies to SO_MAX_PACING_RATE.
+ * The sch_fq packet scheduler handles pacing efficiently,
+ * but it is not always installed/used.
+ * Return true if the TCP stack should pace packets itself.
+ */
+static bool tcp_needs_internal_pacing(const struct sock *sk)
+{
+	return smp_load_acquire(&sk->sk_pacing_status) == SK_PACING_NEEDED;
+}
+
+static void tcp_internal_pacing(struct sock *sk, const struct sk_buff *skb)
+{
+	u64 len_ns;
+	u32 rate;
+
+	if (!tcp_needs_internal_pacing(sk))
+		return;
+	rate = sk->sk_pacing_rate;
+	if (!rate || rate == ~0U)
+		return;
+
+	/* Should account for header sizes as sch_fq does,
+	 * but let's keep things simple.
+	 */
+	len_ns = (u64)skb->len * NSEC_PER_SEC;
+	do_div(len_ns, rate);
+	hrtimer_start(&tcp_sk(sk)->pacing_timer,
+		      ktime_add_ns(ktime_get(), len_ns),
+		      HRTIMER_MODE_ABS_PINNED);
+}
+
 /* This routine actually transmits TCP packets queued in by
  * tcp_do_sendmsg(). This is used by both the initial
  * transmission and possible later retransmissions.
@@ -1034,6 +1100,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 	if (skb->len != tcp_header_size) {
 		tcp_event_data_sent(tp, sk);
 		tp->data_segs_out += tcp_skb_pcount(skb);
+		tcp_internal_pacing(sk, skb);
 	}
 
 	if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)
@@ -2086,6 +2153,12 @@ static int tcp_mtu_probe(struct sock *sk)
 	return -1;
 }
 
+static bool tcp_pacing_check(const struct sock *sk)
+{
+	return tcp_needs_internal_pacing(sk) &&
+	       hrtimer_active(&tcp_sk(sk)->pacing_timer);
+}
+
 /* TCP Small Queues :
  * Control number of packets in qdisc/devices to two packets / or ~1 ms.
  * (These limits are doubled for retransmits)
@@ -2210,6 +2283,9 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 	while ((skb = tcp_send_head(sk))) {
 		unsigned int limit;
 
+		if (tcp_pacing_check(sk))
+			break;
+
 		tso_segs = tcp_init_tso_segs(skb, mss_now);
 		BUG_ON(!tso_segs);
 
@@ -2878,6 +2954,10 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
 
 		if (skb == tcp_send_head(sk))
			break;
+
+		if (tcp_pacing_check(sk))
+			break;
+
 		/* we could do better than to assign each time */
 		if (!hole)
 			tp->retransmit_skb_hint = skb;