aboutsummaryrefslogtreecommitdiff
path: root/net
diff options
context:
space:
mode:
authorJohn Heffner <jheffner@psc.edu>2006-03-20 17:53:41 -0800
committerDavid S. Miller <davem@davemloft.net>2006-03-20 17:53:41 -0800
commit5d424d5a674f782d0659a3b66d951f412901faee (patch)
tree579871172044e02e626a90388d19ec55cf2d1fc4 /net
parent1d60290f27e7dc4bce2c43922d0bfa9abd246fc9 (diff)
[TCP]: MTU probing
Implementation of packetization layer path mtu discovery for TCP, based on the internet-draft currently found at <http://www.ietf.org/internet-drafts/draft-ietf-pmtud-method-05.txt>. Signed-off-by: John Heffner <jheffner@psc.edu> Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net')
-rw-r--r--net/ipv4/sysctl_net_ipv4.c16
-rw-r--r--net/ipv4/tcp_input.c49
-rw-r--r--net/ipv4/tcp_ipv4.c1
-rw-r--r--net/ipv4/tcp_output.c236
-rw-r--r--net/ipv4/tcp_timer.c36
-rw-r--r--net/ipv6/tcp_ipv6.c1
6 files changed, 302 insertions, 37 deletions
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 16984d4a8a0..ebf2e0b363c 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -664,6 +664,22 @@ ctl_table ipv4_table[] = {
.mode = 0644,
.proc_handler = &proc_dointvec,
},
+ {
+ .ctl_name = NET_TCP_MTU_PROBING,
+ .procname = "tcp_mtu_probing",
+ .data = &sysctl_tcp_mtu_probing,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+ {
+ .ctl_name = NET_TCP_BASE_MSS,
+ .procname = "tcp_base_mss",
+ .data = &sysctl_tcp_base_mss,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
{ .ctl_name = 0 }
};
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index e9a54ae7d69..0ac388e3d01 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1891,6 +1891,34 @@ static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag)
}
}
+static void tcp_mtup_probe_failed(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ icsk->icsk_mtup.search_high = icsk->icsk_mtup.probe_size - 1;
+ icsk->icsk_mtup.probe_size = 0;
+}
+
+static void tcp_mtup_probe_success(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ /* FIXME: breaks with very large cwnd */
+ tp->prior_ssthresh = tcp_current_ssthresh(sk);
+ tp->snd_cwnd = tp->snd_cwnd *
+ tcp_mss_to_mtu(sk, tp->mss_cache) /
+ icsk->icsk_mtup.probe_size;
+ tp->snd_cwnd_cnt = 0;
+ tp->snd_cwnd_stamp = tcp_time_stamp;
+ tp->rcv_ssthresh = tcp_current_ssthresh(sk);
+
+ icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size;
+ icsk->icsk_mtup.probe_size = 0;
+ tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
+}
+
+
/* Process an event, which can update packets-in-flight not trivially.
* Main goal of this function is to calculate new estimate for left_out,
* taking into account both packets sitting in receiver's buffer and
@@ -2023,6 +2051,17 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
return;
}
+ /* MTU probe failure: don't reduce cwnd */
+ if (icsk->icsk_ca_state < TCP_CA_CWR &&
+ icsk->icsk_mtup.probe_size &&
+ tp->snd_una == icsk->icsk_mtup.probe_seq_start) {
+ tcp_mtup_probe_failed(sk);
+ /* Restores the reduction we did in tcp_mtup_probe() */
+ tp->snd_cwnd++;
+ tcp_simple_retransmit(sk);
+ return;
+ }
+
/* Otherwise enter Recovery state */
if (IsReno(tp))
@@ -2243,6 +2282,13 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
tp->retrans_stamp = 0;
}
+ /* MTU probing checks */
+ if (icsk->icsk_mtup.probe_size) {
+ if (!after(icsk->icsk_mtup.probe_seq_end, TCP_SKB_CB(skb)->end_seq)) {
+ tcp_mtup_probe_success(sk, skb);
+ }
+ }
+
if (sacked) {
if (sacked & TCPCB_RETRANS) {
if(sacked & TCPCB_SACKED_RETRANS)
@@ -4101,6 +4147,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
if (tp->rx_opt.sack_ok && sysctl_tcp_fack)
tp->rx_opt.sack_ok |= 2;
+ tcp_mtup_init(sk);
tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
tcp_initialize_rcv_mss(sk);
@@ -4211,6 +4258,7 @@ discard:
if (tp->ecn_flags&TCP_ECN_OK)
sock_set_flag(sk, SOCK_NO_LARGESEND);
+ tcp_mtup_init(sk);
tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
tcp_initialize_rcv_mss(sk);
@@ -4399,6 +4447,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
*/
tp->lsndtime = tcp_time_stamp;
+ tcp_mtup_init(sk);
tcp_initialize_rcv_mss(sk);
tcp_init_buffer_space(sk);
tcp_fast_path_on(tp);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 233bdf25996..57e7a26e821 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -900,6 +900,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
newinet->id = newtp->write_seq ^ jiffies;
+ tcp_mtup_init(newsk);
tcp_sync_mss(newsk, dst_mtu(dst));
newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
tcp_initialize_rcv_mss(newsk);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 9f498a6c889..8197b5e12f1 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -51,6 +51,12 @@ int sysctl_tcp_retrans_collapse = 1;
*/
int sysctl_tcp_tso_win_divisor = 3;
+int sysctl_tcp_mtu_probing = 0;
+int sysctl_tcp_base_mss = 512;
+
+EXPORT_SYMBOL(sysctl_tcp_mtu_probing);
+EXPORT_SYMBOL(sysctl_tcp_base_mss);
+
static void update_send_head(struct sock *sk, struct tcp_sock *tp,
struct sk_buff *skb)
{
@@ -681,6 +687,62 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
return 0;
}
+/* Not accounting for SACKs here. */
+int tcp_mtu_to_mss(struct sock *sk, int pmtu)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ int mss_now;
+
+ /* Calculate base mss without TCP options:
+ It is MMS_S - sizeof(tcphdr) of rfc1122
+ */
+ mss_now = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr);
+
+ /* Clamp it (mss_clamp does not include tcp options) */
+ if (mss_now > tp->rx_opt.mss_clamp)
+ mss_now = tp->rx_opt.mss_clamp;
+
+ /* Now subtract optional transport overhead */
+ mss_now -= icsk->icsk_ext_hdr_len;
+
+ /* Then reserve room for full set of TCP options and 8 bytes of data */
+ if (mss_now < 48)
+ mss_now = 48;
+
+ /* Now subtract TCP options size, not including SACKs */
+ mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);
+
+ return mss_now;
+}
+
+/* Inverse of above */
+int tcp_mss_to_mtu(struct sock *sk, int mss)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ int mtu;
+
+ mtu = mss +
+ tp->tcp_header_len +
+ icsk->icsk_ext_hdr_len +
+ icsk->icsk_af_ops->net_header_len;
+
+ return mtu;
+}
+
+void tcp_mtup_init(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ icsk->icsk_mtup.enabled = sysctl_tcp_mtu_probing > 1;
+ icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp + sizeof(struct tcphdr) +
+ icsk->icsk_af_ops->net_header_len;
+ icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, sysctl_tcp_base_mss);
+ icsk->icsk_mtup.probe_size = 0;
+}
+
/* This function synchronize snd mss to current pmtu/exthdr set.
tp->rx_opt.user_mss is mss set by user by TCP_MAXSEG. It does NOT counts
@@ -708,25 +770,12 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
{
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
- /* Calculate base mss without TCP options:
- It is MMS_S - sizeof(tcphdr) of rfc1122
- */
- int mss_now = (pmtu - icsk->icsk_af_ops->net_header_len -
- sizeof(struct tcphdr));
+ int mss_now;
- /* Clamp it (mss_clamp does not include tcp options) */
- if (mss_now > tp->rx_opt.mss_clamp)
- mss_now = tp->rx_opt.mss_clamp;
+ if (icsk->icsk_mtup.search_high > pmtu)
+ icsk->icsk_mtup.search_high = pmtu;
- /* Now subtract optional transport overhead */
- mss_now -= icsk->icsk_ext_hdr_len;
-
- /* Then reserve room for full set of TCP options and 8 bytes of data */
- if (mss_now < 48)
- mss_now = 48;
-
- /* Now subtract TCP options size, not including SACKs */
- mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);
+ mss_now = tcp_mtu_to_mss(sk, pmtu);
/* Bound mss with half of window */
if (tp->max_window && mss_now > (tp->max_window>>1))
@@ -734,6 +783,8 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
/* And store cached results */
icsk->icsk_pmtu_cookie = pmtu;
+ if (icsk->icsk_mtup.enabled)
+ mss_now = min(mss_now, tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low));
tp->mss_cache = mss_now;
return mss_now;
@@ -1063,6 +1114,140 @@ static int tcp_tso_should_defer(struct sock *sk, struct tcp_sock *tp, struct sk_
return 1;
}
+/* Create a new MTU probe if we are ready.
+ * Returns 0 if we should wait to probe (no cwnd available),
+ * 1 if a probe was sent,
+ * -1 otherwise */
+static int tcp_mtu_probe(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct sk_buff *skb, *nskb, *next;
+ int len;
+ int probe_size;
+ unsigned int pif;
+ int copy;
+ int mss_now;
+
+ /* Not currently probing/verifying,
+ * not in recovery,
+ * have enough cwnd, and
+ * not SACKing (the variable headers throw things off) */
+ if (!icsk->icsk_mtup.enabled ||
+ icsk->icsk_mtup.probe_size ||
+ inet_csk(sk)->icsk_ca_state != TCP_CA_Open ||
+ tp->snd_cwnd < 11 ||
+ tp->rx_opt.eff_sacks)
+ return -1;
+
+ /* Very simple search strategy: just double the MSS. */
+ mss_now = tcp_current_mss(sk, 0);
+ probe_size = 2*tp->mss_cache;
+ if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high)) {
+ /* TODO: set timer for probe_converge_event */
+ return -1;
+ }
+
+ /* Have enough data in the send queue to probe? */
+ len = 0;
+ if ((skb = sk->sk_send_head) == NULL)
+ return -1;
+ while ((len += skb->len) < probe_size && !tcp_skb_is_last(sk, skb))
+ skb = skb->next;
+ if (len < probe_size)
+ return -1;
+
+ /* Receive window check. */
+ if (after(TCP_SKB_CB(skb)->seq + probe_size, tp->snd_una + tp->snd_wnd)) {
+ if (tp->snd_wnd < probe_size)
+ return -1;
+ else
+ return 0;
+ }
+
+ /* Do we need to wait to drain cwnd? */
+ pif = tcp_packets_in_flight(tp);
+ if (pif + 2 > tp->snd_cwnd) {
+ /* With no packets in flight, don't stall. */
+ if (pif == 0)
+ return -1;
+ else
+ return 0;
+ }
+
+ /* We're allowed to probe. Build it now. */
+ if ((nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC)) == NULL)
+ return -1;
+ sk_charge_skb(sk, nskb);
+
+ skb = sk->sk_send_head;
+ __skb_insert(nskb, skb->prev, skb, &sk->sk_write_queue);
+ sk->sk_send_head = nskb;
+
+ TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq;
+ TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size;
+ TCP_SKB_CB(nskb)->flags = TCPCB_FLAG_ACK;
+ TCP_SKB_CB(nskb)->sacked = 0;
+ nskb->csum = 0;
+ if (skb->ip_summed == CHECKSUM_HW)
+ nskb->ip_summed = CHECKSUM_HW;
+
+ len = 0;
+ while (len < probe_size) {
+ next = skb->next;
+
+ copy = min_t(int, skb->len, probe_size - len);
+ if (nskb->ip_summed)
+ skb_copy_bits(skb, 0, skb_put(nskb, copy), copy);
+ else
+ nskb->csum = skb_copy_and_csum_bits(skb, 0,
+ skb_put(nskb, copy), copy, nskb->csum);
+
+ if (skb->len <= copy) {
+ /* We've eaten all the data from this skb.
+ * Throw it away. */
+ TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags;
+ __skb_unlink(skb, &sk->sk_write_queue);
+ sk_stream_free_skb(sk, skb);
+ } else {
+ TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags &
+ ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH);
+ if (!skb_shinfo(skb)->nr_frags) {
+ skb_pull(skb, copy);
+ if (skb->ip_summed != CHECKSUM_HW)
+ skb->csum = csum_partial(skb->data, skb->len, 0);
+ } else {
+ __pskb_trim_head(skb, copy);
+ tcp_set_skb_tso_segs(sk, skb, mss_now);
+ }
+ TCP_SKB_CB(skb)->seq += copy;
+ }
+
+ len += copy;
+ skb = next;
+ }
+ tcp_init_tso_segs(sk, nskb, nskb->len);
+
+ /* We're ready to send. If this fails, the probe will
+ * be resegmented into mss-sized pieces by tcp_write_xmit(). */
+ TCP_SKB_CB(nskb)->when = tcp_time_stamp;
+ if (!tcp_transmit_skb(sk, nskb, 1, GFP_ATOMIC)) {
+ /* Decrement cwnd here because we are sending
+ * effectively two packets. */
+ tp->snd_cwnd--;
+ update_send_head(sk, tp, nskb);
+
+ icsk->icsk_mtup.probe_size = tcp_mss_to_mtu(sk, nskb->len);
+ icsk->icsk_mtup.probe_seq_start = TCP_SKB_CB(nskb)->seq;
+ icsk->icsk_mtup.probe_seq_end = TCP_SKB_CB(nskb)->end_seq;
+
+ return 1;
+ }
+
+ return -1;
+}
+
+
/* This routine writes packets to the network. It advances the
* send_head. This happens as incoming acks open up the remote
* window for us.
@@ -1076,6 +1261,7 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
struct sk_buff *skb;
unsigned int tso_segs, sent_pkts;
int cwnd_quota;
+ int result;
/* If we are closed, the bytes will have to remain here.
* In time closedown will finish, we empty the write queue and all
@@ -1085,6 +1271,14 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
return 0;
sent_pkts = 0;
+
+ /* Do MTU probing. */
+ if ((result = tcp_mtu_probe(sk)) == 0) {
+ return 0;
+ } else if (result > 0) {
+ sent_pkts = 1;
+ }
+
while ((skb = sk->sk_send_head)) {
unsigned int limit;
@@ -1455,9 +1649,15 @@ void tcp_simple_retransmit(struct sock *sk)
int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
unsigned int cur_mss = tcp_current_mss(sk, 0);
int err;
+ /* Inconslusive MTU probe */
+ if (icsk->icsk_mtup.probe_size) {
+ icsk->icsk_mtup.probe_size = 0;
+ }
+
/* Do not sent more than we queued. 1/4 is reserved for possible
* copying overhead: fragmentation, tunneling, mangling etc.
*/
@@ -1883,6 +2083,7 @@ static void tcp_connect_init(struct sock *sk)
if (tp->rx_opt.user_mss)
tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
tp->max_window = 0;
+ tcp_mtup_init(sk);
tcp_sync_mss(sk, dst_mtu(dst));
if (!tp->window_clamp)
@@ -2180,3 +2381,4 @@ EXPORT_SYMBOL(tcp_make_synack);
EXPORT_SYMBOL(tcp_simple_retransmit);
EXPORT_SYMBOL(tcp_sync_mss);
EXPORT_SYMBOL(sysctl_tcp_tso_win_divisor);
+EXPORT_SYMBOL(tcp_mtup_init);
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index e1880959614..7c1bde3cd6c 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -119,8 +119,10 @@ static int tcp_orphan_retries(struct sock *sk, int alive)
/* A write timeout has occurred. Process the after effects. */
static int tcp_write_timeout(struct sock *sk)
{
- const struct inet_connection_sock *icsk = inet_csk(sk);
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
int retry_until;
+ int mss;
if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
if (icsk->icsk_retransmits)
@@ -128,25 +130,19 @@ static int tcp_write_timeout(struct sock *sk)
retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
} else {
if (icsk->icsk_retransmits >= sysctl_tcp_retries1) {
- /* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires pmtu black
- hole detection. :-(
-
- It is place to make it. It is not made. I do not want
- to make it. It is disgusting. It does not work in any
- case. Let me to cite the same draft, which requires for
- us to implement this:
-
- "The one security concern raised by this memo is that ICMP black holes
- are often caused by over-zealous security administrators who block
- all ICMP messages. It is vitally important that those who design and
- deploy security systems understand the impact of strict filtering on
- upper-layer protocols. The safest web site in the world is worthless
- if most TCP implementations cannot transfer data from it. It would
- be far nicer to have all of the black holes fixed rather than fixing
- all of the TCP implementations."
-
- Golden words :-).
- */
+ /* Black hole detection */
+ if (sysctl_tcp_mtu_probing) {
+ if (!icsk->icsk_mtup.enabled) {
+ icsk->icsk_mtup.enabled = 1;
+ tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
+ } else {
+ mss = min(sysctl_tcp_base_mss,
+ tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low)/2);
+ mss = max(mss, 68 - tp->tcp_header_len);
+ icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);
+ tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
+ }
+ }
dst_negative_advice(&sk->sk_dst_cache);
}
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index ca9cf685375..14de50380f4 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -987,6 +987,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
inet_csk(newsk)->icsk_ext_hdr_len = (newnp->opt->opt_nflen +
newnp->opt->opt_flen);
+ tcp_mtup_init(newsk);
tcp_sync_mss(newsk, dst_mtu(dst));
newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
tcp_initialize_rcv_mss(newsk);