@@ -66,15 +66,17 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 			   int push_one, gfp_t gfp);
 
 /* Account for new data that has been sent to the network. */
-static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
+static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	unsigned int prior_packets = tp->packets_out;
 
-	tcp_advance_send_head(sk, skb);
 	tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
 
+	__skb_unlink(skb, &sk->sk_write_queue);
+	tcp_rbtree_insert(&sk->tcp_rtx_queue, skb);
+
 	tp->packets_out += tcp_skb_pcount(skb);
 	if (!prior_packets || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)
 		tcp_rearm_rto(sk);
@@ -1249,12 +1251,25 @@ static void tcp_skb_fragment_eor(struct sk_buff *skb, struct sk_buff *skb2)
 	TCP_SKB_CB(skb)->eor = 0;
 }
 
+/* Insert buff after skb on the write or rtx queue of sk. */
+static void tcp_insert_write_queue_after(struct sk_buff *skb,
+					 struct sk_buff *buff,
+					 struct sock *sk,
+					 enum tcp_queue tcp_queue)
+{
+	if (tcp_queue == TCP_FRAG_IN_WRITE_QUEUE)
+		__skb_queue_after(&sk->sk_write_queue, skb, buff);
+	else
+		tcp_rbtree_insert(&sk->tcp_rtx_queue, buff);
+}
+
 /* Function to create two new TCP segments. Shrinks the given segment
  * to the specified size and appends a new segment with the rest of the
  * packet to the list. This won't be called frequently, I hope.
  * Remember, these are still headerless SKBs at this point.
  */
-int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
+int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
+		 struct sk_buff *skb, u32 len,
 		 unsigned int mss_now, gfp_t gfp)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
@@ -1337,7 +1352,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
 
 	/* Link BUFF into the send queue. */
 	__skb_header_release(buff);
-	tcp_insert_write_queue_after(skb, buff, sk);
+	tcp_insert_write_queue_after(skb, buff, sk, tcp_queue);
 	list_add(&buff->tcp_tsorted_anchor, &skb->tcp_tsorted_anchor);
 
 	return 0;
@@ -1625,10 +1640,10 @@ static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited)
 		 * is caused by insufficient sender buffer:
 		 * 1) just sent some data (see tcp_write_xmit)
 		 * 2) not cwnd limited (this else condition)
-		 * 3) no more data to send (null tcp_send_head )
+		 * 3) no more data to send (tcp_write_queue_empty())
 		 * 4) application is hitting buffer limit (SOCK_NOSPACE)
 		 */
-		if (!tcp_send_head(sk) && sk->sk_socket &&
+		if (tcp_write_queue_empty(sk) && sk->sk_socket &&
 		    test_bit(SOCK_NOSPACE, &sk->sk_socket->flags) &&
 		    (1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
 			tcp_chrono_start(sk, TCP_CHRONO_SNDBUF_LIMITED);
@@ -1824,7 +1839,8 @@ static bool tcp_snd_wnd_test(const struct tcp_sock *tp,
  * know that all the data is in scatter-gather pages, and that the
  * packet has never been sent out before (and thus is not cloned).
  */
-static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
+static int tso_fragment(struct sock *sk, enum tcp_queue tcp_queue,
+			struct sk_buff *skb, unsigned int len,
 			unsigned int mss_now, gfp_t gfp)
 {
 	struct sk_buff *buff;
@@ -1833,7 +1849,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
 
 	/* All of a TSO frame must be composed of paged data. */
 	if (skb->len != skb->data_len)
-		return tcp_fragment(sk, skb, len, mss_now, gfp);
+		return tcp_fragment(sk, tcp_queue, skb, len, mss_now, gfp);
 
 	buff = sk_stream_alloc_skb(sk, 0, gfp, true);
 	if (unlikely(!buff))
@@ -1869,7 +1885,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
 
 	/* Link BUFF into the send queue. */
 	__skb_header_release(buff);
-	tcp_insert_write_queue_after(skb, buff, sk);
+	tcp_insert_write_queue_after(skb, buff, sk, tcp_queue);
 
 	return 0;
 }
@@ -1939,8 +1955,10 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
 			goto send_now;
 	}
 
-	head = tcp_write_queue_head(sk);
-
+	/* TODO : use tsorted_sent_queue ? */
+	head = tcp_rtx_queue_head(sk);
+	if (!head)
+		goto send_now;
 	age = tcp_stamp_us_delta(tp->tcp_mstamp, head->skb_mstamp);
 	/* If next ACK is likely to come too late (half srtt), do not defer */
 	if (age < (tp->srtt_us >> 4))
@@ -2158,13 +2176,12 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb,
 	limit <<= factor;
 
 	if (refcount_read(&sk->sk_wmem_alloc) > limit) {
-		/* Always send the 1st or 2nd skb in write queue.
+		/* Always send skb if rtx queue is empty.
 		 * No need to wait for TX completion to call us back,
 		 * after softirq/tasklet schedule.
 		 * This helps when TX completions are delayed too much.
 		 */
-		if (skb == sk->sk_write_queue.next ||
-		    skb->prev == sk->sk_write_queue.next)
+		if (tcp_rtx_queue_empty(sk))
 			return false;
 
 		set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
@@ -2215,7 +2232,7 @@ void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type)
 	 * it's the "most interesting" or current chrono we are
 	 * tracking and starts busy chrono if we have pending data.
 	 */
-	if (tcp_write_queue_empty(sk))
+	if (tcp_rtx_and_write_queues_empty(sk))
 		tcp_chrono_set(tp, TCP_CHRONO_UNSPEC);
 	else if (type == tp->chrono_type)
 		tcp_chrono_set(tp, TCP_CHRONO_BUSY);
@@ -2310,7 +2327,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 						    nonagle);
 
 		if (skb->len > limit &&
-		    unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
+		    unlikely(tso_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE,
+					  skb, limit, mss_now, gfp)))
 			break;
 
 		if (test_bit(TCP_TSQ_DEFERRED, &sk->sk_tsq_flags))
@@ -2350,7 +2368,7 @@ repair:
 		tcp_cwnd_validate(sk, is_cwnd_limited);
 		return false;
 	}
-	return !tp->packets_out && tcp_send_head(sk);
+	return !tp->packets_out && !tcp_write_queue_empty(sk);
 }
 
 bool tcp_schedule_loss_probe(struct sock *sk)
@@ -2374,7 +2392,7 @@ bool tcp_schedule_loss_probe(struct sock *sk)
 		return false;
 
 	if ((tp->snd_cwnd > tcp_packets_in_flight(tp)) &&
-	     tcp_send_head(sk))
+	     !tcp_write_queue_empty(sk))
 		return false;
 
 	/* Probe timeout is 2*rtt. Add minimum RTO to account
@@ -2427,18 +2445,14 @@ void tcp_send_loss_probe(struct sock *sk)
 	int mss = tcp_current_mss(sk);
 
 	skb = tcp_send_head(sk);
-	if (skb) {
-		if (tcp_snd_wnd_test(tp, skb, mss)) {
-			pcount = tp->packets_out;
-			tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC);
-			if (tp->packets_out > pcount)
-				goto probe_sent;
-			goto rearm_timer;
-		}
-		skb = tcp_write_queue_prev(sk, skb);
-	} else {
-		skb = tcp_write_queue_tail(sk);
+	if (skb && tcp_snd_wnd_test(tp, skb, mss)) {
+		pcount = tp->packets_out;
+		tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC);
+		if (tp->packets_out > pcount)
+			goto probe_sent;
+		goto rearm_timer;
 	}
+	skb = skb_rb_last(&sk->tcp_rtx_queue);
 
 	/* At most one outstanding TLP retransmission. */
 	if (tp->tlp_high_seq)
@@ -2456,10 +2470,11 @@ void tcp_send_loss_probe(struct sock *sk)
 		goto rearm_timer;
 
 	if ((pcount > 1) && (skb->len > (pcount - 1) * mss)) {
-		if (unlikely(tcp_fragment(sk, skb, (pcount - 1) * mss, mss,
+		if (unlikely(tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb,
+					  (pcount - 1) * mss, mss,
 					  GFP_ATOMIC)))
 			goto rearm_timer;
-		skb = tcp_write_queue_next(sk, skb);
+		skb = skb_rb_next(skb);
 	}
 
 	if (WARN_ON(!skb || !tcp_skb_pcount(skb)))
@@ -2659,7 +2674,7 @@ void tcp_skb_collapse_tstamp(struct sk_buff *skb,
 static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	struct sk_buff *next_skb = tcp_write_queue_next(sk, skb);
+	struct sk_buff *next_skb = skb_rb_next(skb);
 	int skb_size, next_skb_size;
 
 	skb_size = skb->len;
@@ -2676,8 +2691,6 @@ static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
 	}
 	tcp_highest_sack_combine(sk, next_skb, skb);
 
-	tcp_unlink_write_queue(next_skb, sk);
-
 	if (next_skb->ip_summed == CHECKSUM_PARTIAL)
 		skb->ip_summed = CHECKSUM_PARTIAL;
 
@@ -2705,7 +2718,7 @@ static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
 
 	tcp_skb_collapse_tstamp(skb, next_skb);
 
-	sk_wmem_free_skb(sk, next_skb);
+	tcp_rtx_queue_unlink_and_free(next_skb, sk);
 	return true;
 }
 
@@ -2716,8 +2729,6 @@ static bool tcp_can_collapse(const struct sock *sk, const struct sk_buff *skb)
 		return false;
 	if (skb_cloned(skb))
 		return false;
-	if (skb == tcp_send_head(sk))
-		return false;
 	/* Some heuristics for collapsing over SACK'd could be invented */
 	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
 		return false;
@@ -2740,7 +2751,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
 	if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
 		return;
 
-	tcp_for_write_queue_from_safe(skb, tmp, sk) {
+	skb_rbtree_walk_from_safe(skb, tmp) {
 		if (!tcp_can_collapse(sk, skb))
 			break;
 
@@ -2815,7 +2826,8 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
 
 	len = cur_mss * segs;
 	if (skb->len > len) {
-		if (tcp_fragment(sk, skb, len, cur_mss, GFP_ATOMIC))
+		if (tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb, len,
+				 cur_mss, GFP_ATOMIC))
 			return -ENOMEM; /* We'll try again later. */
 	} else {
 		if (skb_unclone(skb, GFP_ATOMIC))
@@ -2906,29 +2918,24 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
 void tcp_xmit_retransmit_queue(struct sock *sk)
 {
 	const struct inet_connection_sock *icsk = inet_csk(sk);
+	struct sk_buff *skb, *rtx_head = NULL, *hole = NULL;
 	struct tcp_sock *tp = tcp_sk(sk);
-	struct sk_buff *skb;
-	struct sk_buff *hole = NULL;
 	u32 max_segs;
 	int mib_idx;
 
 	if (!tp->packets_out)
 		return;
 
-	if (tp->retransmit_skb_hint) {
-		skb = tp->retransmit_skb_hint;
-	} else {
-		skb = tcp_write_queue_head(sk);
+	skb = tp->retransmit_skb_hint;
+	if (!skb) {
+		rtx_head = tcp_rtx_queue_head(sk);
+		skb = rtx_head;
 	}
-
 	max_segs = tcp_tso_segs(sk, tcp_current_mss(sk));
-	tcp_for_write_queue_from(skb, sk) {
+	skb_rbtree_walk_from(skb) {
 		__u8 sacked;
 		int segs;
 
-		if (skb == tcp_send_head(sk))
-			break;
-
 		if (tcp_pacing_check(sk))
 			break;
 
@@ -2973,7 +2980,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
 		if (tcp_in_cwnd_reduction(sk))
 			tp->prr_out += tcp_skb_pcount(skb);
 
-		if (skb == tcp_write_queue_head(sk) &&
+		if (skb == rtx_head &&
 		    icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT)
 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
 						  inet_csk(sk)->icsk_rto,
@@ -3015,12 +3022,15 @@ void tcp_send_fin(struct sock *sk)
 	 * Note: in the latter case, FIN packet will be sent after a timeout,
 	 * as TCP stack thinks it has already been transmitted.
 	 */
-	if (tskb && (tcp_send_head(sk) || tcp_under_memory_pressure(sk))) {
+	if (!tskb && tcp_under_memory_pressure(sk))
+		tskb = skb_rb_last(&sk->tcp_rtx_queue);
+
+	if (tskb) {
 coalesce:
 		TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN;
 		TCP_SKB_CB(tskb)->end_seq++;
 		tp->write_seq++;
-		if (!tcp_send_head(sk)) {
+		if (tcp_write_queue_empty(sk)) {
 			/* This means tskb was already sent.
 			 * Pretend we included the FIN on previous transmit.
 			 * We need to set tp->snd_nxt to the value it would have
@@ -3086,9 +3096,9 @@ int tcp_send_synack(struct sock *sk)
 {
 	struct sk_buff *skb;
 
-	skb = tcp_write_queue_head(sk);
+	skb = tcp_rtx_queue_head(sk);
 	if (!skb || !(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
-		pr_debug("%s: wrong queue state\n", __func__);
+		pr_err("%s: wrong queue state\n", __func__);
 		return -EFAULT;
 	}
 	if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) {
@@ -3101,10 +3111,9 @@ int tcp_send_synack(struct sock *sk)
 		if (!nskb)
 			return -ENOMEM;
 		INIT_LIST_HEAD(&nskb->tcp_tsorted_anchor);
-		tcp_unlink_write_queue(skb, sk);
+		tcp_rtx_queue_unlink_and_free(skb, sk);
 		__skb_header_release(nskb);
-		__tcp_add_write_queue_head(sk, nskb);
-		sk_wmem_free_skb(sk, skb);
+		tcp_rbtree_insert(&sk->tcp_rtx_queue, nskb);
 		sk->sk_wmem_queued += nskb->truesize;
 		sk_mem_charge(sk, nskb->truesize);
 		skb = nskb;
@@ -3327,7 +3336,6 @@ static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb)
 
 	tcb->end_seq += skb->len;
 	__skb_header_release(skb);
-	__tcp_add_write_queue_tail(sk, skb);
 	sk->sk_wmem_queued += skb->truesize;
 	sk_mem_charge(sk, skb->truesize);
 	tp->write_seq = tcb->end_seq;
@@ -3405,12 +3413,13 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
 	TCP_SKB_CB(syn_data)->tcp_flags = TCPHDR_ACK | TCPHDR_PSH;
 	if (!err) {
 		tp->syn_data = (fo->copied > 0);
+		tcp_rbtree_insert(&sk->tcp_rtx_queue, syn_data);
 		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT);
 		goto done;
 	}
 
-	/* data was not sent, this is our new send_head */
-	sk->sk_send_head = syn_data;
+	/* data was not sent, put it in write_queue */
+	__skb_queue_tail(&sk->sk_write_queue, syn_data);
 	tp->packets_out -= tcp_skb_pcount(syn_data);
 
 fallback:
@@ -3453,6 +3462,7 @@ int tcp_connect(struct sock *sk)
 	tp->retrans_stamp = tcp_time_stamp(tp);
 	tcp_connect_queue_skb(sk, buff);
 	tcp_ecn_send_syn(sk, buff);
+	tcp_rbtree_insert(&sk->tcp_rtx_queue, buff);
 
 	/* Send off SYN; include data in Fast Open. */
 	err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) :
@@ -3647,7 +3657,8 @@ int tcp_write_wakeup(struct sock *sk, int mib)
 		    skb->len > mss) {
 			seg_size = min(seg_size, mss);
 			TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH;
-			if (tcp_fragment(sk, skb, seg_size, mss, GFP_ATOMIC))
+			if (tcp_fragment(sk, TCP_FRAG_IN_WRITE_QUEUE,
+					 skb, seg_size, mss, GFP_ATOMIC))
 				return -1;
 		} else if (!tcp_skb_pcount(skb))
 			tcp_set_skb_tso_segs(skb, mss);
@@ -3677,7 +3688,7 @@ void tcp_send_probe0(struct sock *sk)
 
 	err = tcp_write_wakeup(sk, LINUX_MIB_TCPWINPROBE);
 
-	if (tp->packets_out || !tcp_send_head(sk)) {
+	if (tp->packets_out || tcp_write_queue_empty(sk)) {
 		/* Cancel probe timer, if it is not required. */
 		icsk->icsk_probes_out = 0;
 		icsk->icsk_backoff = 0;
|