From aa3c487f355ff1477b8369d9f0b9860387ae21d4 Mon Sep 17 00:00:00 2001
From: Jan Engelhardt <jengelh@medozas.de>
Date: Thu, 29 Oct 2009 15:35:10 +0100
Subject: netfilter: xt_socket: make module available for INPUT chain

This should make it possible to test for the existence of local
sockets in the INPUT path.

References: http://marc.info/?l=netfilter-devel&m=125380481517129&w=2

Signed-off-by: Jan Engelhardt <jengelh@medozas.de>
Signed-off-by: Balazs Scheidler <bazsi@balabit.hu>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/netfilter/xt_socket.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'net/netfilter')

diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c
index 362afbd60a9..6a902564d24 100644
--- a/net/netfilter/xt_socket.c
+++ b/net/netfilter/xt_socket.c
@@ -192,7 +192,8 @@ static struct xt_match socket_mt_reg[] __read_mostly = {
 		.revision	= 0,
 		.family		= NFPROTO_IPV4,
 		.match		= socket_mt_v0,
-		.hooks		= 1 << NF_INET_PRE_ROUTING,
+		.hooks		= (1 << NF_INET_PRE_ROUTING) |
+				  (1 << NF_INET_LOCAL_IN),
 		.me		= THIS_MODULE,
 	},
 	{
@@ -201,7 +202,8 @@ static struct xt_match socket_mt_reg[] __read_mostly = {
 		.family		= NFPROTO_IPV4,
 		.match		= socket_mt_v1,
 		.matchsize	= sizeof(struct xt_socket_mtinfo1),
-		.hooks		= 1 << NF_INET_PRE_ROUTING,
+		.hooks		= (1 << NF_INET_PRE_ROUTING) |
+				  (1 << NF_INET_LOCAL_IN),
 		.me		= THIS_MODULE,
 	},
 };
-- 
cgit v1.2.3


From 5ae27aa2b16478a84d833ab4065798e752941c5a Mon Sep 17 00:00:00 2001
From: Changli Gao <xiaosuo@gmail.com>
Date: Thu, 5 Nov 2009 14:51:31 +0100
Subject: netfilter: nf_conntrack: avoid additional compare.

Signed-off-by: Changli Gao <xiaosuo@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/netfilter/nf_conntrack_core.c | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

(limited to 'net/netfilter')

diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 7c9ec3dee96..8e572d7c08c 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -511,11 +511,17 @@ static noinline int early_drop(struct net *net, unsigned int hash)
 			cnt++;
 		}
 
-		if (ct && unlikely(nf_ct_is_dying(ct) ||
-				   !atomic_inc_not_zero(&ct->ct_general.use)))
-			ct = NULL;
-		if (ct || cnt >= NF_CT_EVICTION_RANGE)
+		if (ct != NULL) {
+			if (likely(!nf_ct_is_dying(ct) &&
+				   atomic_inc_not_zero(&ct->ct_general.use)))
+				break;
+			else
+				ct = NULL;
+		}
+
+		if (cnt >= NF_CT_EVICTION_RANGE)
 			break;
+
 		hash = (hash + 1) % nf_conntrack_htable_size;
 	}
 	rcu_read_unlock();
-- 
cgit v1.2.3


From dee5817e88ac8195e5938d6671f434a071e35698 Mon Sep 17 00:00:00 2001
From: Patrick McHardy <kaber@trash.net>
Date: Fri, 6 Nov 2009 17:04:00 +0100
Subject: netfilter: remove unnecessary checks from netlink notifiers

The NETLINK_URELEASE notifier is only invoked for bound sockets, so
there is no need to check ->pid again.

Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/netfilter/nfnetlink_log.c   | 3 +--
 net/netfilter/nfnetlink_queue.c | 3 +--
 2 files changed, 2 insertions(+), 4 deletions(-)

(limited to 'net/netfilter')

diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index f900dc3194a..3aa66b2f9e8 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -666,8 +666,7 @@ nfulnl_rcv_nl_event(struct notifier_block *this,
 {
 	struct netlink_notify *n = ptr;
 
-	if (event == NETLINK_URELEASE &&
-	    n->protocol == NETLINK_NETFILTER && n->pid) {
+	if (event == NETLINK_URELEASE && n->protocol == NETLINK_NETFILTER) {
 		int i;
 
 		/* destroy all instances for this pid */
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index 7a9dec9fb82..7e3fa410641 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -574,8 +574,7 @@ nfqnl_rcv_nl_event(struct notifier_block *this,
 {
 	struct netlink_notify *n = ptr;
 
-	if (event == NETLINK_URELEASE &&
-	    n->protocol == NETLINK_NETFILTER && n->pid) {
+	if (event == NETLINK_URELEASE && n->protocol == NETLINK_NETFILTER) {
 		int i;
 
 		/* destroy all instances for this pid */
-- 
cgit v1.2.3


From c4832c7bbc3f7a4813347e871d7238651bf437d3 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Mon, 23 Nov 2009 10:34:39 +0100
Subject: netfilter: nf_ct_tcp: improve out-of-sync situation in TCP tracking

Without this patch, if we receive a SYN packet from the client while
the firewall is out-of-sync, we let it go through. Then, if we see
the SYN/ACK reply coming from the server, we destroy the conntrack
entry and drop the packet to trigger a new retransmission. Then,
the retransmission from the client is used to start a new clean
session.

This patch improves the current handling. Basically, if we see an
unexpected SYN packet, we annotate the TCP options. Then, if we
see the reply SYN/ACK, this means that the firewall was indeed
out-of-sync. Therefore, we set a clean new session from the existing
entry based on the annotated values.

This patch adds two new 8-bit fields that fit in a 16-bit gap of
the ip_ct_tcp structure.

This patch is particularly useful for conntrackd since the
asynchronous nature of the state-synchronization allows backup
nodes to be imperfect copies of the master. This helps
to improve the recovery under some worst-case scenarios.

I have tested this by creating lots of conntrack entries in the wrong
state:

for ((i=1024;i<65535;i++)); do conntrack -I -p tcp -s 192.168.2.101 -d 192.168.2.2 --sport $i --dport 80 -t 800 --state ESTABLISHED -u ASSURED,SEEN_REPLY; done

Then, I make some TCP connections:

$ echo GET / | nc 192.168.2.2 80

The events show the result:

 [UPDATE] tcp      6 60 SYN_RECV src=192.168.2.101 dst=192.168.2.2 sport=33220 dport=80 src=192.168.2.2 dst=192.168.2.101 sport=80 dport=33220 [ASSURED]
 [UPDATE] tcp      6 432000 ESTABLISHED src=192.168.2.101 dst=192.168.2.2 sport=33220 dport=80 src=192.168.2.2 dst=192.168.2.101 sport=80 dport=33220 [ASSURED]
 [UPDATE] tcp      6 120 FIN_WAIT src=192.168.2.101 dst=192.168.2.2 sport=33220 dport=80 src=192.168.2.2 dst=192.168.2.101 sport=80 dport=33220 [ASSURED]
 [UPDATE] tcp      6 30 LAST_ACK src=192.168.2.101 dst=192.168.2.2 sport=33220 dport=80 src=192.168.2.2 dst=192.168.2.101 sport=80 dport=33220 [ASSURED]
 [UPDATE] tcp      6 120 TIME_WAIT src=192.168.2.101 dst=192.168.2.2 sport=33220 dport=80 src=192.168.2.2 dst=192.168.2.101 sport=80 dport=33220 [ASSURED]

and tcpdump shows no retransmissions:

20:47:57.271951 IP 192.168.2.101.33221 > 192.168.2.2.www: S 435402517:435402517(0) win 5840 <mss 1460,sackOK,timestamp 4294961827 0,nop,wscale 6>
20:47:57.273538 IP 192.168.2.2.www > 192.168.2.101.33221: S 3509927945:3509927945(0) ack 435402518 win 5792 <mss 1460,sackOK,timestamp 235681024 4294961827,nop,wscale 4>
20:47:57.273608 IP 192.168.2.101.33221 > 192.168.2.2.www: . ack 3509927946 win 92 <nop,nop,timestamp 4294961827 235681024>
20:47:57.273693 IP 192.168.2.101.33221 > 192.168.2.2.www: P 435402518:435402524(6) ack 3509927946 win 92 <nop,nop,timestamp 4294961827 235681024>
20:47:57.275492 IP 192.168.2.2.www > 192.168.2.101.33221: . ack 435402524 win 362 <nop,nop,timestamp 235681024 4294961827>
20:47:57.276492 IP 192.168.2.2.www > 192.168.2.101.33221: P 3509927946:3509928082(136) ack 435402524 win 362 <nop,nop,timestamp 235681025 4294961827>
20:47:57.276515 IP 192.168.2.101.33221 > 192.168.2.2.www: . ack 3509928082 win 108 <nop,nop,timestamp 4294961828 235681025>
20:47:57.276521 IP 192.168.2.2.www > 192.168.2.101.33221: F 3509928082:3509928082(0) ack 435402524 win 362 <nop,nop,timestamp 235681025 4294961827>
20:47:57.277369 IP 192.168.2.101.33221 > 192.168.2.2.www: F 435402524:435402524(0) ack 3509928083 win 108 <nop,nop,timestamp 4294961828 235681025>
20:47:57.279491 IP 192.168.2.2.www > 192.168.2.101.33221: . ack 435402525 win 362 <nop,nop,timestamp 235681025 4294961828>

I also added a rule to log invalid packets, with no occurrences  :-) .

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Acked-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/netfilter/nf_conntrack_proto_tcp.c | 51 +++++++++++++++++++++++++++-------
 1 file changed, 41 insertions(+), 10 deletions(-)

(limited to 'net/netfilter')

diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index 97a82ba7537..9cc6b5cb06a 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -908,23 +908,54 @@ static int tcp_packet(struct nf_conn *ct,
 			/* b) This SYN/ACK acknowledges a SYN that we earlier
 			 * ignored as invalid. This means that the client and
 			 * the server are both in sync, while the firewall is
-			 * not. We kill this session and block the SYN/ACK so
-			 * that the client cannot but retransmit its SYN and
-			 * thus initiate a clean new session.
+			 * not. We get in sync from the previously annotated
+			 * values.
 			 */
-			spin_unlock_bh(&ct->lock);
-			if (LOG_INVALID(net, IPPROTO_TCP))
-				nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
-					  "nf_ct_tcp: killing out of sync session ");
-			nf_ct_kill(ct);
-			return NF_DROP;
+			old_state = TCP_CONNTRACK_SYN_SENT;
+			new_state = TCP_CONNTRACK_SYN_RECV;
+			ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_end =
+				ct->proto.tcp.last_end;
+			ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_maxend =
+				ct->proto.tcp.last_end;
+			ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_maxwin =
+				ct->proto.tcp.last_win == 0 ?
+					1 : ct->proto.tcp.last_win;
+			ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_scale =
+				ct->proto.tcp.last_wscale;
+			ct->proto.tcp.seen[ct->proto.tcp.last_dir].flags =
+				ct->proto.tcp.last_flags;
+			memset(&ct->proto.tcp.seen[dir], 0,
+			       sizeof(struct ip_ct_tcp_state));
+			break;
 		}
 		ct->proto.tcp.last_index = index;
 		ct->proto.tcp.last_dir = dir;
 		ct->proto.tcp.last_seq = ntohl(th->seq);
 		ct->proto.tcp.last_end =
 		    segment_seq_plus_len(ntohl(th->seq), skb->len, dataoff, th);
-
+		ct->proto.tcp.last_win = ntohs(th->window);
+
+		/* a) This is a SYN in ORIGINAL. The client and the server
+		 * may be in sync but we are not. In that case, we annotate
+		 * the TCP options and let the packet go through. If it is a
+		 * valid SYN packet, the server will reply with a SYN/ACK, and
+		 * then we'll get in sync. Otherwise, the server ignores it. */
+		if (index == TCP_SYN_SET && dir == IP_CT_DIR_ORIGINAL) {
+			struct ip_ct_tcp_state seen = {};
+
+			ct->proto.tcp.last_flags =
+			ct->proto.tcp.last_wscale = 0;
+			tcp_options(skb, dataoff, th, &seen);
+			if (seen.flags & IP_CT_TCP_FLAG_WINDOW_SCALE) {
+				ct->proto.tcp.last_flags |=
+					IP_CT_TCP_FLAG_WINDOW_SCALE;
+				ct->proto.tcp.last_wscale = seen.td_scale;
+			}
+			if (seen.flags & IP_CT_TCP_FLAG_SACK_PERM) {
+				ct->proto.tcp.last_flags |=
+					IP_CT_TCP_FLAG_SACK_PERM;
+			}
+		}
 		spin_unlock_bh(&ct->lock);
 		if (LOG_INVALID(net, IPPROTO_TCP))
 			nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
-- 
cgit v1.2.3


From 3a0429292daa0e1ec848bd26479f5e48b0d54a42 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Mon, 23 Nov 2009 10:43:57 +0100
Subject: netfilter: xtables: fix conntrack match v1 ipt-save output

commit d6d3f08b0fd998b647a05540cedd11a067b72867
(netfilter: xtables: conntrack match revision 2) breaks the
v1 conntrack match iptables-save output in a subtle way.

The problem is as follows:

    up = kmalloc(sizeof(*up), GFP_KERNEL);
[..]
   /*
    * The strategy here is to minimize the overhead of v1 matching,
    * by prebuilding a v2 struct and putting the pointer into the
    * v1 dataspace.
    */
    memcpy(up, info, offsetof(typeof(*info), state_mask));
[..]
    *(void **)info  = up;

As the v2 struct pointer is saved in the match data space,
it clobbers the first structure member (->origsrc_addr).

Because the _v1 match function grabs this pointer and does not actually
look at the v1 origsrc, run time functionality does not break.
But iptables -nvL (or iptables-save) cannot know that v1 origsrc_addr
has been overloaded in this way:

$ iptables -p tcp -A OUTPUT -m conntrack --ctorigsrc 10.0.0.1 -j ACCEPT
$ iptables-save
-A OUTPUT -p tcp -m conntrack --ctorigsrc 128.173.134.206 -j ACCEPT

(128.173... is the address to the v2 match structure).

To fix this, we take advantage of the fact that the v1 and v2 structures
are identical with the exception of the last two structure members (u8 in
v1, u16 in v2).

We extract them as early as possible and prevent the v2 matching function
from looking at those two members directly.

Previously reported by Michel Messerschmidt via Ben Hutchings, also
see Debian Bug tracker #556587.

Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/netfilter/xt_conntrack.c | 61 ++++++++++++--------------------------------
 1 file changed, 17 insertions(+), 44 deletions(-)

(limited to 'net/netfilter')

diff --git a/net/netfilter/xt_conntrack.c b/net/netfilter/xt_conntrack.c
index 6dc4652f2fe..ae66305f0fe 100644
--- a/net/netfilter/xt_conntrack.c
+++ b/net/netfilter/xt_conntrack.c
@@ -113,7 +113,8 @@ ct_proto_port_check(const struct xt_conntrack_mtinfo2 *info,
 }
 
 static bool
-conntrack_mt(const struct sk_buff *skb, const struct xt_match_param *par)
+conntrack_mt(const struct sk_buff *skb, const struct xt_match_param *par,
+             u16 state_mask, u16 status_mask)
 {
 	const struct xt_conntrack_mtinfo2 *info = par->matchinfo;
 	enum ip_conntrack_info ctinfo;
@@ -136,7 +137,7 @@ conntrack_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 			if (test_bit(IPS_DST_NAT_BIT, &ct->status))
 				statebit |= XT_CONNTRACK_STATE_DNAT;
 		}
-		if (!!(info->state_mask & statebit) ^
+		if (!!(state_mask & statebit) ^
 		    !(info->invert_flags & XT_CONNTRACK_STATE))
 			return false;
 	}
@@ -172,7 +173,7 @@ conntrack_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 		return false;
 
 	if ((info->match_flags & XT_CONNTRACK_STATUS) &&
-	    (!!(info->status_mask & ct->status) ^
+	    (!!(status_mask & ct->status) ^
 	    !(info->invert_flags & XT_CONNTRACK_STATUS)))
 		return false;
 
@@ -192,11 +193,17 @@ conntrack_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 static bool
 conntrack_mt_v1(const struct sk_buff *skb, const struct xt_match_param *par)
 {
-	const struct xt_conntrack_mtinfo2 *const *info = par->matchinfo;
-	struct xt_match_param newpar = *par;
+	const struct xt_conntrack_mtinfo1 *info = par->matchinfo;
 
-	newpar.matchinfo = *info;
-	return conntrack_mt(skb, &newpar);
+	return conntrack_mt(skb, par, info->state_mask, info->status_mask);
+}
+
+static bool
+conntrack_mt_v2(const struct sk_buff *skb, const struct xt_match_param *par)
+{
+	const struct xt_conntrack_mtinfo2 *info = par->matchinfo;
+
+	return conntrack_mt(skb, par, info->state_mask, info->status_mask);
 }
 
 static bool conntrack_mt_check(const struct xt_mtchk_param *par)
@@ -209,45 +216,11 @@ static bool conntrack_mt_check(const struct xt_mtchk_param *par)
 	return true;
 }
 
-static bool conntrack_mt_check_v1(const struct xt_mtchk_param *par)
-{
-	struct xt_conntrack_mtinfo1 *info = par->matchinfo;
-	struct xt_conntrack_mtinfo2 *up;
-	int ret = conntrack_mt_check(par);
-
-	if (ret < 0)
-		return ret;
-
-	up = kmalloc(sizeof(*up), GFP_KERNEL);
-	if (up == NULL) {
-		nf_ct_l3proto_module_put(par->family);
-		return -ENOMEM;
-	}
-
-	/*
-	 * The strategy here is to minimize the overhead of v1 matching,
-	 * by prebuilding a v2 struct and putting the pointer into the
-	 * v1 dataspace.
-	 */
-	memcpy(up, info, offsetof(typeof(*info), state_mask));
-	up->state_mask  = info->state_mask;
-	up->status_mask = info->status_mask;
-	*(void **)info  = up;
-	return true;
-}
-
 static void conntrack_mt_destroy(const struct xt_mtdtor_param *par)
 {
 	nf_ct_l3proto_module_put(par->family);
 }
 
-static void conntrack_mt_destroy_v1(const struct xt_mtdtor_param *par)
-{
-	struct xt_conntrack_mtinfo2 **info = par->matchinfo;
-	kfree(*info);
-	conntrack_mt_destroy(par);
-}
-
 static struct xt_match conntrack_mt_reg[] __read_mostly = {
 	{
 		.name       = "conntrack",
@@ -255,8 +228,8 @@ static struct xt_match conntrack_mt_reg[] __read_mostly = {
 		.family     = NFPROTO_UNSPEC,
 		.matchsize  = sizeof(struct xt_conntrack_mtinfo1),
 		.match      = conntrack_mt_v1,
-		.checkentry = conntrack_mt_check_v1,
-		.destroy    = conntrack_mt_destroy_v1,
+		.checkentry = conntrack_mt_check,
+		.destroy    = conntrack_mt_destroy,
 		.me         = THIS_MODULE,
 	},
 	{
@@ -264,7 +237,7 @@ static struct xt_match conntrack_mt_reg[] __read_mostly = {
 		.revision   = 2,
 		.family     = NFPROTO_UNSPEC,
 		.matchsize  = sizeof(struct xt_conntrack_mtinfo2),
-		.match      = conntrack_mt,
+		.match      = conntrack_mt_v2,
 		.checkentry = conntrack_mt_check,
 		.destroy    = conntrack_mt_destroy,
 		.me         = THIS_MODULE,
-- 
cgit v1.2.3