From 3dc789320e1b310cb505dcd94512c279abcd5e1c Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Tue, 8 Dec 2009 20:07:54 -0800
Subject: tcp: Remove runtime check that can never be true.

GCC even warns about it, as reported by Andrew Morton:

net/ipv4/tcp.c: In function 'do_tcp_getsockopt':
net/ipv4/tcp.c:2544: warning: comparison is always false due to limited range of data type

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp.c | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index c8666b70cde..b0a26bb25e2 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2540,11 +2540,6 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
 			ctd.tcpct_cookie_desired = cvp->cookie_desired;
 			ctd.tcpct_s_data_desired = cvp->s_data_desired;
 
-			/* Cookie(s) saved, return as nonce */
-			if (sizeof(ctd.tcpct_value) < cvp->cookie_pair_size) {
-				/* impossible? */
-				return -EINVAL;
-			}
 			memcpy(&ctd.tcpct_value[0], &cvp->cookie_pair[0],
 			       cvp->cookie_pair_size);
 			ctd.tcpct_used = cvp->cookie_pair_size;
-- 
cgit v1.2.3


From 9327f7053e3993c125944fdb137a0618319ef2a0 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Fri, 4 Dec 2009 03:46:54 +0000
Subject: tcp: Fix a connect() race with timewait sockets

First patch changes __inet_hash_nolisten() and __inet6_hash()
to get a timewait parameter to be able to unhash it from ehash
at same time the new socket is inserted in hash.

This makes sure timewait socket wont be found by a concurrent
writer in __inet_check_established()

Reported-by: kapil dakhane <kdakhane@gmail.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/inet_hashtables.c | 22 ++++++++++++++++------
 net/ipv4/tcp_ipv4.c        |  2 +-
 2 files changed, 17 insertions(+), 7 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 21e5e32d8c6..c4201b7ece3 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -351,12 +351,13 @@ static inline u32 inet_sk_port_offset(const struct sock *sk)
 					  inet->inet_dport);
 }
 
-void __inet_hash_nolisten(struct sock *sk)
+int __inet_hash_nolisten(struct sock *sk, struct inet_timewait_sock *tw)
 {
 	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
 	struct hlist_nulls_head *list;
 	spinlock_t *lock;
 	struct inet_ehash_bucket *head;
+	int twrefcnt = 0;
 
 	WARN_ON(!sk_unhashed(sk));
 
@@ -367,8 +368,13 @@ void __inet_hash_nolisten(struct sock *sk)
 
 	spin_lock(lock);
 	__sk_nulls_add_node_rcu(sk, list);
+	if (tw) {
+		WARN_ON(sk->sk_hash != tw->tw_hash);
+		twrefcnt = inet_twsk_unhash(tw);
+	}
 	spin_unlock(lock);
 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+	return twrefcnt;
 }
 EXPORT_SYMBOL_GPL(__inet_hash_nolisten);
 
@@ -378,7 +384,7 @@ static void __inet_hash(struct sock *sk)
 	struct inet_listen_hashbucket *ilb;
 
 	if (sk->sk_state != TCP_LISTEN) {
-		__inet_hash_nolisten(sk);
+		__inet_hash_nolisten(sk, NULL);
 		return;
 	}
 
@@ -427,7 +433,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
 		struct sock *sk, u32 port_offset,
 		int (*check_established)(struct inet_timewait_death_row *,
 			struct sock *, __u16, struct inet_timewait_sock **),
-		void (*hash)(struct sock *sk))
+		int (*hash)(struct sock *sk, struct inet_timewait_sock *twp))
 {
 	struct inet_hashinfo *hinfo = death_row->hashinfo;
 	const unsigned short snum = inet_sk(sk)->inet_num;
@@ -435,6 +441,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
 	struct inet_bind_bucket *tb;
 	int ret;
 	struct net *net = sock_net(sk);
+	int twrefcnt = 1;
 
 	if (!snum) {
 		int i, remaining, low, high, port;
@@ -493,13 +500,16 @@ ok:
 		inet_bind_hash(sk, tb, port);
 		if (sk_unhashed(sk)) {
 			inet_sk(sk)->inet_sport = htons(port);
-			hash(sk);
+			twrefcnt += hash(sk, tw);
 		}
 		spin_unlock(&head->lock);
 
 		if (tw) {
 			inet_twsk_deschedule(tw, death_row);
-			inet_twsk_put(tw);
+			while (twrefcnt) {
+				twrefcnt--;
+				inet_twsk_put(tw);
+			}
 		}
 
 		ret = 0;
@@ -510,7 +520,7 @@ ok:
 	tb  = inet_csk(sk)->icsk_bind_hash;
 	spin_lock_bh(&head->lock);
 	if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
-		hash(sk);
+		hash(sk, NULL);
 		spin_unlock_bh(&head->lock);
 		return 0;
 	} else {
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 29002ab26e0..15e96030ce4 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1464,7 +1464,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 	}
 #endif
 
-	__inet_hash_nolisten(newsk);
+	__inet_hash_nolisten(newsk, NULL);
 	__inet_inherit_port(sk, newsk);
 
 	return newsk;
-- 
cgit v1.2.3


From 3cdaedae635b17ce23c738ce7d364b442310cdec Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Fri, 4 Dec 2009 03:47:42 +0000
Subject: tcp: Fix a connect() race with timewait sockets

When we find a timewait connection in __inet_hash_connect() and reuse
it for a new connection request, we have a race window, releasing bind
list lock and reacquiring it in __inet_twsk_kill() to remove timewait
socket from list.

Another thread might find the timewait socket we already chose, leading to
list corruption and crashes.

Fix is to remove timewait socket from bind list before releasing the bind lock.

Note: This problem happens if sysctl_tcp_tw_reuse is set.

Reported-by: kapil dakhane <kdakhane@gmail.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/inet_hashtables.c    |  2 ++
 net/ipv4/inet_timewait_sock.c | 29 +++++++++++++++++++++--------
 2 files changed, 23 insertions(+), 8 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index c4201b7ece3..2b79377b468 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -502,6 +502,8 @@ ok:
 			inet_sk(sk)->inet_sport = htons(port);
 			twrefcnt += hash(sk, tw);
 		}
+		if (tw)
+			twrefcnt += inet_twsk_bind_unhash(tw, hinfo);
 		spin_unlock(&head->lock);
 
 		if (tw) {
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 0fdf45e4c90..bf4b1e2a430 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -29,12 +29,29 @@ int inet_twsk_unhash(struct inet_timewait_sock *tw)
 	return 1;
 }
 
+/*
+ * unhash a timewait socket from bind hash
+ * lock must be hold by caller
+ */
+int inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
+			  struct inet_hashinfo *hashinfo)
+{
+	struct inet_bind_bucket *tb = tw->tw_tb;
+
+	if (!tb)
+		return 0;
+
+	__hlist_del(&tw->tw_bind_node);
+	tw->tw_tb = NULL;
+	inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
+	return 1;
+}
+
 /* Must be called with locally disabled BHs. */
 static void __inet_twsk_kill(struct inet_timewait_sock *tw,
 			     struct inet_hashinfo *hashinfo)
 {
 	struct inet_bind_hashbucket *bhead;
-	struct inet_bind_bucket *tb;
 	int refcnt;
 	/* Unlink from established hashes. */
 	spinlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash);
@@ -46,15 +63,11 @@ static void __inet_twsk_kill(struct inet_timewait_sock *tw,
 	/* Disassociate with bind bucket. */
 	bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), tw->tw_num,
 			hashinfo->bhash_size)];
+
 	spin_lock(&bhead->lock);
-	tb = tw->tw_tb;
-	if (tb) {
-		__hlist_del(&tw->tw_bind_node);
-		tw->tw_tb = NULL;
-		inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
-		refcnt++;
-	}
+	refcnt += inet_twsk_bind_unhash(tw, hashinfo);
 	spin_unlock(&bhead->lock);
+
 #ifdef SOCK_REFCNT_DEBUG
 	if (atomic_read(&tw->tw_refcnt) != 1) {
 		printk(KERN_DEBUG "%s timewait_sock %p refcnt=%d\n",
-- 
cgit v1.2.3


From 2a8875e73ffb18165ceb245f99c2ccad77378051 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 8 Dec 2009 20:19:53 -0800
Subject: [PATCH] tcp: documents timewait refcnt tricks

Adds kerneldoc for inet_twsk_unhash() & inet_twsk_bind_unhash().

With help from Randy Dunlap.

Suggested-by: Evgeniy Polyakov <zbr@ioremap.net>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/inet_timewait_sock.c | 38 ++++++++++++++++++++++++--------------
 1 file changed, 24 insertions(+), 14 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index bf4b1e2a430..cc94cc2d8b2 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -15,9 +15,13 @@
 #include <net/ip.h>
 
 
-/*
- * unhash a timewait socket from established hash
- * lock must be hold by caller
+/**
+ *	inet_twsk_unhash - unhash a timewait socket from established hash
+ *	@tw: timewait socket
+ *
+ *	unhash a timewait socket from established hash, if hashed.
+ *	ehash lock must be held by caller.
+ *	Returns 1 if caller should call inet_twsk_put() after lock release.
  */
 int inet_twsk_unhash(struct inet_timewait_sock *tw)
 {
@@ -26,12 +30,21 @@ int inet_twsk_unhash(struct inet_timewait_sock *tw)
 
 	hlist_nulls_del_rcu(&tw->tw_node);
 	sk_nulls_node_init(&tw->tw_node);
+	/*
+	 * We cannot call inet_twsk_put() ourself under lock,
+	 * caller must call it for us.
+	 */
 	return 1;
 }
 
-/*
- * unhash a timewait socket from bind hash
- * lock must be hold by caller
+/**
+ *	inet_twsk_bind_unhash - unhash a timewait socket from bind hash
+ *	@tw: timewait socket
+ *	@hashinfo: hashinfo pointer
+ *
+ *	unhash a timewait socket from bind hash, if hashed.
+ *	bind hash lock must be held by caller.
+ *	Returns 1 if caller should call inet_twsk_put() after lock release.
  */
 int inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
 			  struct inet_hashinfo *hashinfo)
@@ -44,6 +57,10 @@ int inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
 	__hlist_del(&tw->tw_bind_node);
 	tw->tw_tb = NULL;
 	inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
+	/*
+	 * We cannot call inet_twsk_put() ourself under lock,
+	 * caller must call it for us.
+	 */
 	return 1;
 }
 
@@ -139,7 +156,7 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
 
 	/*
 	 * Notes :
- 	 * - We initially set tw_refcnt to 0 in inet_twsk_alloc()
+	 * - We initially set tw_refcnt to 0 in inet_twsk_alloc()
 	 * - We add one reference for the bhash link
 	 * - We add one reference for the ehash link
 	 * - We want this refcnt update done before allowing other
@@ -149,7 +166,6 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
 
 	spin_unlock(lock);
 }
-
 EXPORT_SYMBOL_GPL(__inet_twsk_hashdance);
 
 struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int state)
@@ -190,7 +206,6 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int stat
 
 	return tw;
 }
-
 EXPORT_SYMBOL_GPL(inet_twsk_alloc);
 
 /* Returns non-zero if quota exceeded.  */
@@ -269,7 +284,6 @@ void inet_twdr_hangman(unsigned long data)
 out:
 	spin_unlock(&twdr->death_lock);
 }
-
 EXPORT_SYMBOL_GPL(inet_twdr_hangman);
 
 void inet_twdr_twkill_work(struct work_struct *work)
@@ -300,7 +314,6 @@ void inet_twdr_twkill_work(struct work_struct *work)
 		spin_unlock_bh(&twdr->death_lock);
 	}
 }
-
 EXPORT_SYMBOL_GPL(inet_twdr_twkill_work);
 
 /* These are always called from BH context.  See callers in
@@ -320,7 +333,6 @@ void inet_twsk_deschedule(struct inet_timewait_sock *tw,
 	spin_unlock(&twdr->death_lock);
 	__inet_twsk_kill(tw, twdr->hashinfo);
 }
-
 EXPORT_SYMBOL(inet_twsk_deschedule);
 
 void inet_twsk_schedule(struct inet_timewait_sock *tw,
@@ -401,7 +413,6 @@ void inet_twsk_schedule(struct inet_timewait_sock *tw,
 		mod_timer(&twdr->tw_timer, jiffies + twdr->period);
 	spin_unlock(&twdr->death_lock);
 }
-
 EXPORT_SYMBOL_GPL(inet_twsk_schedule);
 
 void inet_twdr_twcal_tick(unsigned long data)
@@ -462,7 +473,6 @@ out:
 #endif
 	spin_unlock(&twdr->death_lock);
 }
-
 EXPORT_SYMBOL_GPL(inet_twdr_twcal_tick);
 
 void inet_twsk_purge(struct inet_hashinfo *hashinfo,
-- 
cgit v1.2.3


From 2f7de5710a4d394920405febc2a9937c69e16dda Mon Sep 17 00:00:00 2001
From: Damian Lukowski <damian@tvk.rwth-aachen.de>
Date: Mon, 7 Dec 2009 06:06:16 +0000
Subject: tcp: Stalling connections: Move timeout calculation routine
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch moves retransmits_timed_out() from include/net/tcp.h
to tcp_timer.c, where it is used.

Reported-by: Frederic Leroy <fredo@starox.org>
Signed-off-by: Damian Lukowski <damian@tvk.rwth-aachen.de>
Acked-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_timer.c | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 8353a538cd4..8816a20c259 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -132,6 +132,35 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk)
 	}
 }
 
+/* This function calculates a "timeout" which is equivalent to the timeout of a
+ * TCP connection after "boundary" unsucessful, exponentially backed-off
+ * retransmissions with an initial RTO of TCP_RTO_MIN.
+ */
+static bool retransmits_timed_out(struct sock *sk,
+				  unsigned int boundary)
+{
+	unsigned int timeout, linear_backoff_thresh;
+	unsigned int start_ts;
+
+	if (!inet_csk(sk)->icsk_retransmits)
+		return false;
+
+	if (unlikely(!tcp_sk(sk)->retrans_stamp))
+		start_ts = TCP_SKB_CB(tcp_write_queue_head(sk))->when;
+	else
+		start_ts = tcp_sk(sk)->retrans_stamp;
+
+	linear_backoff_thresh = ilog2(TCP_RTO_MAX/TCP_RTO_MIN);
+
+	if (boundary <= linear_backoff_thresh)
+		timeout = ((2 << boundary) - 1) * TCP_RTO_MIN;
+	else
+		timeout = ((2 << linear_backoff_thresh) - 1) * TCP_RTO_MIN +
+			  (boundary - linear_backoff_thresh) * TCP_RTO_MAX;
+
+	return (tcp_time_stamp - start_ts) >= timeout;
+}
+
 /* A write timeout has occurred. Process the after effects. */
 static int tcp_write_timeout(struct sock *sk)
 {
-- 
cgit v1.2.3


From 77722b177a1606669c0b95dde03347e37d13b8fe Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= <ilpo.jarvinen@helsinki.fi>
Date: Tue, 8 Dec 2009 20:54:11 -0800
Subject: tcp: fix retrans_stamp advancing in error cases
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

It can happen, that tcp_retransmit_skb fails due to some error.
In such cases we might end up into a state where tp->retrans_out
is zero but that's only because we removed the TCPCB_SACKED_RETRANS
bit from a segment but couldn't retransmit it because of the error
that happened. Therefore some assumptions that retrans_out checks
are based do not necessarily hold, as there still can be an old
retransmission but that is only visible in TCPCB_EVER_RETRANS bit.
As retransmission happen in sequential order (except for some very
rare corner cases), it's enough to check the head skb for that bit.

Main reason for all this complexity is the fact that connection dying
time now depends on the validity of the retrans_stamp, in particular,
that successive retransmissions of a segment must not advance
retrans_stamp under any conditions. It seems after quick thinking that
this has relatively low impact as eventually TCP will go into CA_Loss
and either use the existing check for !retrans_stamp case or send a
retransmission successfully, setting a new base time for the dying
timer (can happen only once). At worst, the dying time will be
approximately the double of the intented time. In addition,
tcp_packet_delayed() will return wrong result (has some cc aspects
but due to rarity of these errors, it's hardly an issue).

One of retrans_stamp clearing happens indirectly through first going
into CA_Open state and then a later ACK lets the clearing to happen.
Thus tcp_try_keep_open has to be modified too.

Thanks to Damian Lukowski <damian@tvk.rwth-aachen.de> for hinting
that this possibility exists (though the particular case discussed
didn't after all have it happening but was just a debug patch
artifact).

Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 35 ++++++++++++++++++++++++++++++++---
 1 file changed, 32 insertions(+), 3 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 57ae96a0422..12cab7d74db 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2717,6 +2717,35 @@ static void tcp_try_undo_dsack(struct sock *sk)
 	}
 }
 
+/* We can clear retrans_stamp when there are no retransmissions in the
+ * window. It would seem that it is trivially available for us in
+ * tp->retrans_out, however, that kind of assumptions doesn't consider
+ * what will happen if errors occur when sending retransmission for the
+ * second time. ...It could the that such segment has only
+ * TCPCB_EVER_RETRANS set at the present time. It seems that checking
+ * the head skb is enough except for some reneging corner cases that
+ * are not worth the effort.
+ *
+ * Main reason for all this complexity is the fact that connection dying
+ * time now depends on the validity of the retrans_stamp, in particular,
+ * that successive retransmissions of a segment must not advance
+ * retrans_stamp under any conditions.
+ */
+static int tcp_any_retrans_done(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *skb;
+
+	if (tp->retrans_out)
+		return 1;
+
+	skb = tcp_write_queue_head(sk);
+	if (unlikely(skb && TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS))
+		return 1;
+
+	return 0;
+}
+
 /* Undo during fast recovery after partial ACK. */
 
 static int tcp_try_undo_partial(struct sock *sk, int acked)
@@ -2729,7 +2758,7 @@ static int tcp_try_undo_partial(struct sock *sk, int acked)
 		/* Plain luck! Hole if filled with delayed
 		 * packet, rather than with a retransmit.
 		 */
-		if (tp->retrans_out == 0)
+		if (!tcp_any_retrans_done(sk))
 			tp->retrans_stamp = 0;
 
 		tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1);
@@ -2788,7 +2817,7 @@ static void tcp_try_keep_open(struct sock *sk)
 	struct tcp_sock *tp = tcp_sk(sk);
 	int state = TCP_CA_Open;
 
-	if (tcp_left_out(tp) || tp->retrans_out || tp->undo_marker)
+	if (tcp_left_out(tp) || tcp_any_retrans_done(sk) || tp->undo_marker)
 		state = TCP_CA_Disorder;
 
 	if (inet_csk(sk)->icsk_ca_state != state) {
@@ -2803,7 +2832,7 @@ static void tcp_try_to_open(struct sock *sk, int flag)
 
 	tcp_verify_left_out(tp);
 
-	if (!tp->frto_counter && tp->retrans_out == 0)
+	if (!tp->frto_counter && !tcp_any_retrans_done(sk))
 		tp->retrans_stamp = 0;
 
 	if (flag & FLAG_ECE)
-- 
cgit v1.2.3