From 66bf79182d6531c14c1f9a507b6bbf374a2ae4cd Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Sun, 7 Sep 2008 18:19:25 -0700 Subject: netfilter: nf_conntrack_sip: de-static helper pointers Helper's ->help hook can run concurrently with itself, so iterating over SIP helpers with static pointer won't work reliably. Signed-off-by: Alexey Dobriyan Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/netfilter/nf_conntrack_sip.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net/netfilter') diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c index 2f9bbc058b4..1fa306be60f 100644 --- a/net/netfilter/nf_conntrack_sip.c +++ b/net/netfilter/nf_conntrack_sip.c @@ -1193,7 +1193,6 @@ static const struct sip_handler sip_handlers[] = { static int process_sip_response(struct sk_buff *skb, const char **dptr, unsigned int *datalen) { - static const struct sip_handler *handler; enum ip_conntrack_info ctinfo; struct nf_conn *ct = nf_ct_get(skb, &ctinfo); unsigned int matchoff, matchlen; @@ -1214,6 +1213,8 @@ static int process_sip_response(struct sk_buff *skb, dataoff = matchoff + matchlen + 1; for (i = 0; i < ARRAY_SIZE(sip_handlers); i++) { + const struct sip_handler *handler; + handler = &sip_handlers[i]; if (handler->response == NULL) continue; @@ -1228,13 +1229,14 @@ static int process_sip_response(struct sk_buff *skb, static int process_sip_request(struct sk_buff *skb, const char **dptr, unsigned int *datalen) { - static const struct sip_handler *handler; enum ip_conntrack_info ctinfo; struct nf_conn *ct = nf_ct_get(skb, &ctinfo); unsigned int matchoff, matchlen; unsigned int cseq, i; for (i = 0; i < ARRAY_SIZE(sip_handlers); i++) { + const struct sip_handler *handler; + handler = &sip_handlers[i]; if (handler->request == NULL) continue; -- cgit v1.2.3 From 887464a41fde7e9e1e11ca86748338033c502446 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Sun, 7 Sep 2008 18:20:08 -0700 Subject: netfilter: nf_conntrack_gre: more locking around keymap list gre_keymap_list should be protected in all places. (unless I'm misreading something) Signed-off-by: Alexey Dobriyan Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/netfilter/nf_conntrack_proto_gre.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net/netfilter') diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c index 654a4f7f12c..b308bb4c12b 100644 --- a/net/netfilter/nf_conntrack_proto_gre.c +++ b/net/netfilter/nf_conntrack_proto_gre.c @@ -97,10 +97,14 @@ int nf_ct_gre_keymap_add(struct nf_conn *ct, enum ip_conntrack_dir dir, kmp = &help->help.ct_pptp_info.keymap[dir]; if (*kmp) { /* check whether it's a retransmission */ + read_lock_bh(&nf_ct_gre_lock); list_for_each_entry(km, &gre_keymap_list, list) { - if (gre_key_cmpfn(km, t) && km == *kmp) + if (gre_key_cmpfn(km, t) && km == *kmp) { + read_unlock_bh(&nf_ct_gre_lock); return 0; + } } + read_unlock_bh(&nf_ct_gre_lock); pr_debug("trying to override keymap_%s for ct %p\n", dir == IP_CT_DIR_REPLY ? "reply" : "orig", ct); return -EEXIST; -- cgit v1.2.3 From 51807e91a76a531d059ec7ce3395c435e4df52a8 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Sun, 7 Sep 2008 18:20:36 -0700 Subject: netfilter: nf_conntrack_gre: nf_ct_gre_keymap_flush() fixlet It does "kfree(list_head)" which looks wrong because entity that was allocated is definitely not list_head. However, this all works because list_head is first item in struct nf_ct_gre_keymap. Signed-off-by: Alexey Dobriyan Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/netfilter/nf_conntrack_proto_gre.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net/netfilter') diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c index b308bb4c12b..9bd03967fea 100644 --- a/net/netfilter/nf_conntrack_proto_gre.c +++ b/net/netfilter/nf_conntrack_proto_gre.c @@ -45,12 +45,12 @@ static LIST_HEAD(gre_keymap_list); void nf_ct_gre_keymap_flush(void) { - struct list_head *pos, *n; + struct nf_ct_gre_keymap *km, *tmp; write_lock_bh(&nf_ct_gre_lock); - list_for_each_safe(pos, n, &gre_keymap_list) { - list_del(pos); - kfree(pos); + list_for_each_entry_safe(km, tmp, &gre_keymap_list, list) { + list_del(&km->list); + kfree(km); } write_unlock_bh(&nf_ct_gre_lock); } -- cgit v1.2.3 From e3b802ba885b54f4050164c3cfd9e0ba9c73173a Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Sun, 7 Sep 2008 18:21:24 -0700 Subject: netfilter: nf_conntrack_irc: make sure string is terminated before calling simple_strtoul Alexey Dobriyan points out: 1. simple_strtoul() silently accepts all characters for given base even if result won't fit into unsigned long. This is amazing stupidity in itself, but 2. nf_conntrack_irc helper use simple_strtoul() for DCC request parsing. Data first copied into 64KB buffer, so theoretically nothing prevents reading past the end of it, since data comes from network given 1). This is not actually a problem currently since we're guaranteed to have a 0 byte in skb_shared_info or in the buffer the data is copied to, but to make this more robust, make sure the string is actually terminated. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/netfilter/nf_conntrack_irc.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'net/netfilter') diff --git a/net/netfilter/nf_conntrack_irc.c b/net/netfilter/nf_conntrack_irc.c index 1b1226d6653..20633fdf7e6 100644 --- a/net/netfilter/nf_conntrack_irc.c +++ b/net/netfilter/nf_conntrack_irc.c @@ -68,11 +68,21 @@ static const char *const dccprotos[] = { static int parse_dcc(char *data, const char *data_end, u_int32_t *ip, u_int16_t *port, char **ad_beg_p, char **ad_end_p) { + char *tmp; + /* at least 12: "AAAAAAAA P\1\n" */ while (*data++ != ' ') if (data > data_end - 12) return -1; + /* Make sure we have a newline character within the packet boundaries + * because simple_strtoul parses until the first invalid character. */ + for (tmp = data; tmp <= data_end; tmp++) + if (*tmp == '\n') + break; + if (tmp > data_end || *tmp != '\n') + return -1; + *ad_beg_p = data; *ip = simple_strtoul(data, &data, 10); -- cgit v1.2.3 From 2cdc55751c33829f00510e0104562d0f8d8a9b85 Mon Sep 17 00:00:00 2001 From: Kaihui Luo Date: Mon, 22 Sep 2008 19:02:36 -0700 Subject: netfilter: xt_time gives a wrong monthday in a leap year The function localtime_3 in xt_time.c gives a wrong monthday in a leap year after 28th 2. calculating monthday should use the array days_since_leapyear[] not days_since_year[] in a leap year. Signed-off-by: Kaihui Luo Acked-by: Jan Engelhardt Signed-off-by: Andrew Morton Signed-off-by: David S. Miller --- net/netfilter/xt_time.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net/netfilter') diff --git a/net/netfilter/xt_time.c b/net/netfilter/xt_time.c index 9f328593287..307a2c3c2df 100644 --- a/net/netfilter/xt_time.c +++ b/net/netfilter/xt_time.c @@ -136,17 +136,19 @@ static void localtime_3(struct xtm *r, time_t time) * from w repeatedly while counting.) */ if (is_leap(year)) { + /* use days_since_leapyear[] in a leap year */ for (i = ARRAY_SIZE(days_since_leapyear) - 1; - i > 0 && days_since_year[i] > w; --i) + i > 0 && days_since_leapyear[i] > w; --i) /* just loop */; + r->monthday = w - days_since_leapyear[i] + 1; } else { for (i = ARRAY_SIZE(days_since_year) - 1; i > 0 && days_since_year[i] > w; --i) /* just loop */; + r->monthday = w - days_since_year[i] + 1; } r->month = i + 1; - r->monthday = w - days_since_year[i] + 1; return; } -- cgit v1.2.3 From cb7f6a7b716e801097b564dec3ccb58d330aef56 Mon Sep 17 00:00:00 2001 From: Julius Volz Date: Fri, 19 Sep 2008 12:32:57 +0200 Subject: IPVS: Move IPVS to net/netfilter/ipvs Since IPVS now has partial IPv6 support, this patch moves IPVS from net/ipv4/ipvs to net/netfilter/ipvs. It's a result of: $ git mv net/ipv4/ipvs net/netfilter and adapting the relevant Kconfigs/Makefiles to the new path. Signed-off-by: Julius Volz Signed-off-by: Simon Horman --- net/netfilter/Kconfig | 2 + net/netfilter/Makefile | 3 + net/netfilter/ipvs/Kconfig | 239 +++ net/netfilter/ipvs/Makefile | 33 + net/netfilter/ipvs/ip_vs_app.c | 622 ++++++ net/netfilter/ipvs/ip_vs_conn.c | 1110 ++++++++++ net/netfilter/ipvs/ip_vs_core.c | 1542 ++++++++++++++ net/netfilter/ipvs/ip_vs_ctl.c | 3443 +++++++++++++++++++++++++++++++ net/netfilter/ipvs/ip_vs_dh.c | 261 +++ net/netfilter/ipvs/ip_vs_est.c | 166 ++ net/netfilter/ipvs/ip_vs_ftp.c | 410 ++++ net/netfilter/ipvs/ip_vs_lblc.c | 555 +++++ net/netfilter/ipvs/ip_vs_lblcr.c | 755 +++++++ net/netfilter/ipvs/ip_vs_lc.c | 103 + net/netfilter/ipvs/ip_vs_nq.c | 138 ++ net/netfilter/ipvs/ip_vs_proto.c | 288 +++ net/netfilter/ipvs/ip_vs_proto_ah_esp.c | 235 +++ net/netfilter/ipvs/ip_vs_proto_tcp.c | 732 +++++++ net/netfilter/ipvs/ip_vs_proto_udp.c | 533 +++++ net/netfilter/ipvs/ip_vs_rr.c | 112 + net/netfilter/ipvs/ip_vs_sched.c | 251 +++ net/netfilter/ipvs/ip_vs_sed.c | 140 ++ net/netfilter/ipvs/ip_vs_sh.c | 258 +++ net/netfilter/ipvs/ip_vs_sync.c | 942 +++++++++ net/netfilter/ipvs/ip_vs_wlc.c | 128 ++ net/netfilter/ipvs/ip_vs_wrr.c | 237 +++ net/netfilter/ipvs/ip_vs_xmit.c | 1004 +++++++++ 27 files changed, 14242 insertions(+) create mode 100644 net/netfilter/ipvs/Kconfig create mode 100644 net/netfilter/ipvs/Makefile create mode 100644 net/netfilter/ipvs/ip_vs_app.c create mode 100644 net/netfilter/ipvs/ip_vs_conn.c create mode 100644 net/netfilter/ipvs/ip_vs_core.c create mode 100644 net/netfilter/ipvs/ip_vs_ctl.c create mode 100644 net/netfilter/ipvs/ip_vs_dh.c create mode 100644 net/netfilter/ipvs/ip_vs_est.c create mode 100644 net/netfilter/ipvs/ip_vs_ftp.c create mode 100644 net/netfilter/ipvs/ip_vs_lblc.c create mode 100644 net/netfilter/ipvs/ip_vs_lblcr.c create mode 100644 net/netfilter/ipvs/ip_vs_lc.c create mode 100644 net/netfilter/ipvs/ip_vs_nq.c create mode 100644 net/netfilter/ipvs/ip_vs_proto.c create mode 100644 net/netfilter/ipvs/ip_vs_proto_ah_esp.c create mode 100644 net/netfilter/ipvs/ip_vs_proto_tcp.c create mode 100644 net/netfilter/ipvs/ip_vs_proto_udp.c create mode 100644 net/netfilter/ipvs/ip_vs_rr.c create mode 100644 net/netfilter/ipvs/ip_vs_sched.c create mode 100644 net/netfilter/ipvs/ip_vs_sed.c create mode 100644 net/netfilter/ipvs/ip_vs_sh.c create mode 100644 net/netfilter/ipvs/ip_vs_sync.c create mode 100644 net/netfilter/ipvs/ip_vs_wlc.c create mode 100644 net/netfilter/ipvs/ip_vs_wrr.c create mode 100644 net/netfilter/ipvs/ip_vs_xmit.c (limited to 'net/netfilter') diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index ee898e74808..73f9378e1bf 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -838,3 +838,5 @@ config NETFILTER_XT_MATCH_HASHLIMIT endmenu +source "net/netfilter/ipvs/Kconfig" + diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 3bd2cc556ae..cf75055be83 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -83,3 +83,6 @@ obj-$(CONFIG_NETFILTER_XT_MATCH_STRING) += xt_string.o obj-$(CONFIG_NETFILTER_XT_MATCH_TCPMSS) += xt_tcpmss.o obj-$(CONFIG_NETFILTER_XT_MATCH_TIME) += xt_time.o obj-$(CONFIG_NETFILTER_XT_MATCH_U32) += xt_u32.o + +# IPVS +obj-$(CONFIG_IP_VS) += ipvs/ diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig new file mode 100644 index 00000000000..de6004de80b --- /dev/null +++ b/net/netfilter/ipvs/Kconfig @@ -0,0 +1,239 @@ +# +# IP Virtual Server configuration +# +menuconfig IP_VS + tristate "IP virtual server support (EXPERIMENTAL)" + depends on NETFILTER + ---help--- + IP Virtual Server support will let you build a high-performance + virtual server based on cluster of two or more real servers. This + option must be enabled for at least one of the clustered computers + that will take care of intercepting incoming connections to a + single IP address and scheduling them to real servers. + + Three request dispatching techniques are implemented, they are + virtual server via NAT, virtual server via tunneling and virtual + server via direct routing. The several scheduling algorithms can + be used to choose which server the connection is directed to, + thus load balancing can be achieved among the servers. For more + information and its administration program, please visit the + following URL: . + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +if IP_VS + +config IP_VS_IPV6 + bool "IPv6 support for IPVS (DANGEROUS)" + depends on EXPERIMENTAL && (IPV6 = y || IP_VS = IPV6) + ---help--- + Add IPv6 support to IPVS. This is incomplete and might be dangerous. + + Say N if unsure. + +config IP_VS_DEBUG + bool "IP virtual server debugging" + ---help--- + Say Y here if you want to get additional messages useful in + debugging the IP virtual server code. You can change the debug + level in /proc/sys/net/ipv4/vs/debug_level + +config IP_VS_TAB_BITS + int "IPVS connection table size (the Nth power of 2)" + range 8 20 + default 12 + ---help--- + The IPVS connection hash table uses the chaining scheme to handle + hash collisions. Using a big IPVS connection hash table will greatly + reduce conflicts when there are hundreds of thousands of connections + in the hash table. + + Note the table size must be power of 2. The table size will be the + value of 2 to the your input number power. The number to choose is + from 8 to 20, the default number is 12, which means the table size + is 4096. Don't input the number too small, otherwise you will lose + performance on it. You can adapt the table size yourself, according + to your virtual server application. It is good to set the table size + not far less than the number of connections per second multiplying + average lasting time of connection in the table. For example, your + virtual server gets 200 connections per second, the connection lasts + for 200 seconds in average in the connection table, the table size + should be not far less than 200x200, it is good to set the table + size 32768 (2**15). + + Another note that each connection occupies 128 bytes effectively and + each hash entry uses 8 bytes, so you can estimate how much memory is + needed for your box. + +comment "IPVS transport protocol load balancing support" + +config IP_VS_PROTO_TCP + bool "TCP load balancing support" + ---help--- + This option enables support for load balancing TCP transport + protocol. Say Y if unsure. + +config IP_VS_PROTO_UDP + bool "UDP load balancing support" + ---help--- + This option enables support for load balancing UDP transport + protocol. Say Y if unsure. + +config IP_VS_PROTO_AH_ESP + bool + depends on UNDEFINED + +config IP_VS_PROTO_ESP + bool "ESP load balancing support" + select IP_VS_PROTO_AH_ESP + ---help--- + This option enables support for load balancing ESP (Encapsulation + Security Payload) transport protocol. Say Y if unsure. + +config IP_VS_PROTO_AH + bool "AH load balancing support" + select IP_VS_PROTO_AH_ESP + ---help--- + This option enables support for load balancing AH (Authentication + Header) transport protocol. Say Y if unsure. + +comment "IPVS scheduler" + +config IP_VS_RR + tristate "round-robin scheduling" + ---help--- + The robin-robin scheduling algorithm simply directs network + connections to different real servers in a round-robin manner. + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +config IP_VS_WRR + tristate "weighted round-robin scheduling" + ---help--- + The weighted robin-robin scheduling algorithm directs network + connections to different real servers based on server weights + in a round-robin manner. Servers with higher weights receive + new connections first than those with less weights, and servers + with higher weights get more connections than those with less + weights and servers with equal weights get equal connections. + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +config IP_VS_LC + tristate "least-connection scheduling" + ---help--- + The least-connection scheduling algorithm directs network + connections to the server with the least number of active + connections. + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +config IP_VS_WLC + tristate "weighted least-connection scheduling" + ---help--- + The weighted least-connection scheduling algorithm directs network + connections to the server with the least active connections + normalized by the server weight. + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +config IP_VS_LBLC + tristate "locality-based least-connection scheduling" + ---help--- + The locality-based least-connection scheduling algorithm is for + destination IP load balancing. It is usually used in cache cluster. + This algorithm usually directs packet destined for an IP address to + its server if the server is alive and under load. If the server is + overloaded (its active connection numbers is larger than its weight) + and there is a server in its half load, then allocate the weighted + least-connection server to this IP address. + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +config IP_VS_LBLCR + tristate "locality-based least-connection with replication scheduling" + ---help--- + The locality-based least-connection with replication scheduling + algorithm is also for destination IP load balancing. It is + usually used in cache cluster. It differs from the LBLC scheduling + as follows: the load balancer maintains mappings from a target + to a set of server nodes that can serve the target. Requests for + a target are assigned to the least-connection node in the target's + server set. If all the node in the server set are over loaded, + it picks up a least-connection node in the cluster and adds it + in the sever set for the target. If the server set has not been + modified for the specified time, the most loaded node is removed + from the server set, in order to avoid high degree of replication. + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +config IP_VS_DH + tristate "destination hashing scheduling" + ---help--- + The destination hashing scheduling algorithm assigns network + connections to the servers through looking up a statically assigned + hash table by their destination IP addresses. + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +config IP_VS_SH + tristate "source hashing scheduling" + ---help--- + The source hashing scheduling algorithm assigns network + connections to the servers through looking up a statically assigned + hash table by their source IP addresses. + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +config IP_VS_SED + tristate "shortest expected delay scheduling" + ---help--- + The shortest expected delay scheduling algorithm assigns network + connections to the server with the shortest expected delay. The + expected delay that the job will experience is (Ci + 1) / Ui if + sent to the ith server, in which Ci is the number of connections + on the ith server and Ui is the fixed service rate (weight) + of the ith server. + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +config IP_VS_NQ + tristate "never queue scheduling" + ---help--- + The never queue scheduling algorithm adopts a two-speed model. + When there is an idle server available, the job will be sent to + the idle server, instead of waiting for a fast one. When there + is no idle server available, the job will be sent to the server + that minimize its expected delay (The Shortest Expected Delay + scheduling algorithm). + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +comment 'IPVS application helper' + +config IP_VS_FTP + tristate "FTP protocol helper" + depends on IP_VS_PROTO_TCP + ---help--- + FTP is a protocol that transfers IP address and/or port number in + the payload. In the virtual server via Network Address Translation, + the IP address and port number of real servers cannot be sent to + clients in ftp connections directly, so FTP protocol helper is + required for tracking the connection and mangling it back to that of + virtual service. + + If you want to compile it in kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +endif # IP_VS diff --git a/net/netfilter/ipvs/Makefile b/net/netfilter/ipvs/Makefile new file mode 100644 index 00000000000..73a46fe1fe4 --- /dev/null +++ b/net/netfilter/ipvs/Makefile @@ -0,0 +1,33 @@ +# +# Makefile for the IPVS modules on top of IPv4. +# + +# IPVS transport protocol load balancing support +ip_vs_proto-objs-y := +ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_TCP) += ip_vs_proto_tcp.o +ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_UDP) += ip_vs_proto_udp.o +ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_AH_ESP) += ip_vs_proto_ah_esp.o + +ip_vs-objs := ip_vs_conn.o ip_vs_core.o ip_vs_ctl.o ip_vs_sched.o \ + ip_vs_xmit.o ip_vs_app.o ip_vs_sync.o \ + ip_vs_est.o ip_vs_proto.o \ + $(ip_vs_proto-objs-y) + + +# IPVS core +obj-$(CONFIG_IP_VS) += ip_vs.o + +# IPVS schedulers +obj-$(CONFIG_IP_VS_RR) += ip_vs_rr.o +obj-$(CONFIG_IP_VS_WRR) += ip_vs_wrr.o +obj-$(CONFIG_IP_VS_LC) += ip_vs_lc.o +obj-$(CONFIG_IP_VS_WLC) += ip_vs_wlc.o +obj-$(CONFIG_IP_VS_LBLC) += ip_vs_lblc.o +obj-$(CONFIG_IP_VS_LBLCR) += ip_vs_lblcr.o +obj-$(CONFIG_IP_VS_DH) += ip_vs_dh.o +obj-$(CONFIG_IP_VS_SH) += ip_vs_sh.o +obj-$(CONFIG_IP_VS_SED) += ip_vs_sed.o +obj-$(CONFIG_IP_VS_NQ) += ip_vs_nq.o + +# IPVS application helpers +obj-$(CONFIG_IP_VS_FTP) += ip_vs_ftp.o diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c new file mode 100644 index 00000000000..201b8ea3020 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_app.c @@ -0,0 +1,622 @@ +/* + * ip_vs_app.c: Application module support for IPVS + * + * Authors: Wensong Zhang + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Most code here is taken from ip_masq_app.c in kernel 2.2. The difference + * is that ip_vs_app module handles the reverse direction (incoming requests + * and outgoing responses). + * + * IP_MASQ_APP application masquerading module + * + * Author: Juan Jose Ciarlante, + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +EXPORT_SYMBOL(register_ip_vs_app); +EXPORT_SYMBOL(unregister_ip_vs_app); +EXPORT_SYMBOL(register_ip_vs_app_inc); + +/* ipvs application list head */ +static LIST_HEAD(ip_vs_app_list); +static DEFINE_MUTEX(__ip_vs_app_mutex); + + +/* + * Get an ip_vs_app object + */ +static inline int ip_vs_app_get(struct ip_vs_app *app) +{ + return try_module_get(app->module); +} + + +static inline void ip_vs_app_put(struct ip_vs_app *app) +{ + module_put(app->module); +} + + +/* + * Allocate/initialize app incarnation and register it in proto apps. + */ +static int +ip_vs_app_inc_new(struct ip_vs_app *app, __u16 proto, __u16 port) +{ + struct ip_vs_protocol *pp; + struct ip_vs_app *inc; + int ret; + + if (!(pp = ip_vs_proto_get(proto))) + return -EPROTONOSUPPORT; + + if (!pp->unregister_app) + return -EOPNOTSUPP; + + inc = kmemdup(app, sizeof(*inc), GFP_KERNEL); + if (!inc) + return -ENOMEM; + INIT_LIST_HEAD(&inc->p_list); + INIT_LIST_HEAD(&inc->incs_list); + inc->app = app; + inc->port = htons(port); + atomic_set(&inc->usecnt, 0); + + if (app->timeouts) { + inc->timeout_table = + ip_vs_create_timeout_table(app->timeouts, + app->timeouts_size); + if (!inc->timeout_table) { + ret = -ENOMEM; + goto out; + } + } + + ret = pp->register_app(inc); + if (ret) + goto out; + + list_add(&inc->a_list, &app->incs_list); + IP_VS_DBG(9, "%s application %s:%u registered\n", + pp->name, inc->name, inc->port); + + return 0; + + out: + kfree(inc->timeout_table); + kfree(inc); + return ret; +} + + +/* + * Release app incarnation + */ +static void +ip_vs_app_inc_release(struct ip_vs_app *inc) +{ + struct ip_vs_protocol *pp; + + if (!(pp = ip_vs_proto_get(inc->protocol))) + return; + + if (pp->unregister_app) + pp->unregister_app(inc); + + IP_VS_DBG(9, "%s App %s:%u unregistered\n", + pp->name, inc->name, inc->port); + + list_del(&inc->a_list); + + kfree(inc->timeout_table); + kfree(inc); +} + + +/* + * Get reference to app inc (only called from softirq) + * + */ +int ip_vs_app_inc_get(struct ip_vs_app *inc) +{ + int result; + + atomic_inc(&inc->usecnt); + if (unlikely((result = ip_vs_app_get(inc->app)) != 1)) + atomic_dec(&inc->usecnt); + return result; +} + + +/* + * Put the app inc (only called from timer or net softirq) + */ +void ip_vs_app_inc_put(struct ip_vs_app *inc) +{ + ip_vs_app_put(inc->app); + atomic_dec(&inc->usecnt); +} + + +/* + * Register an application incarnation in protocol applications + */ +int +register_ip_vs_app_inc(struct ip_vs_app *app, __u16 proto, __u16 port) +{ + int result; + + mutex_lock(&__ip_vs_app_mutex); + + result = ip_vs_app_inc_new(app, proto, port); + + mutex_unlock(&__ip_vs_app_mutex); + + return result; +} + + +/* + * ip_vs_app registration routine + */ +int register_ip_vs_app(struct ip_vs_app *app) +{ + /* increase the module use count */ + ip_vs_use_count_inc(); + + mutex_lock(&__ip_vs_app_mutex); + + list_add(&app->a_list, &ip_vs_app_list); + + mutex_unlock(&__ip_vs_app_mutex); + + return 0; +} + + +/* + * ip_vs_app unregistration routine + * We are sure there are no app incarnations attached to services + */ +void unregister_ip_vs_app(struct ip_vs_app *app) +{ + struct ip_vs_app *inc, *nxt; + + mutex_lock(&__ip_vs_app_mutex); + + list_for_each_entry_safe(inc, nxt, &app->incs_list, a_list) { + ip_vs_app_inc_release(inc); + } + + list_del(&app->a_list); + + mutex_unlock(&__ip_vs_app_mutex); + + /* decrease the module use count */ + ip_vs_use_count_dec(); +} + + +/* + * Bind ip_vs_conn to its ip_vs_app (called by cp constructor) + */ +int ip_vs_bind_app(struct ip_vs_conn *cp, struct ip_vs_protocol *pp) +{ + return pp->app_conn_bind(cp); +} + + +/* + * Unbind cp from application incarnation (called by cp destructor) + */ +void ip_vs_unbind_app(struct ip_vs_conn *cp) +{ + struct ip_vs_app *inc = cp->app; + + if (!inc) + return; + + if (inc->unbind_conn) + inc->unbind_conn(inc, cp); + if (inc->done_conn) + inc->done_conn(inc, cp); + ip_vs_app_inc_put(inc); + cp->app = NULL; +} + + +/* + * Fixes th->seq based on ip_vs_seq info. + */ +static inline void vs_fix_seq(const struct ip_vs_seq *vseq, struct tcphdr *th) +{ + __u32 seq = ntohl(th->seq); + + /* + * Adjust seq with delta-offset for all packets after + * the most recent resized pkt seq and with previous_delta offset + * for all packets before most recent resized pkt seq. + */ + if (vseq->delta || vseq->previous_delta) { + if(after(seq, vseq->init_seq)) { + th->seq = htonl(seq + vseq->delta); + IP_VS_DBG(9, "vs_fix_seq(): added delta (%d) to seq\n", + vseq->delta); + } else { + th->seq = htonl(seq + vseq->previous_delta); + IP_VS_DBG(9, "vs_fix_seq(): added previous_delta " + "(%d) to seq\n", vseq->previous_delta); + } + } +} + + +/* + * Fixes th->ack_seq based on ip_vs_seq info. + */ +static inline void +vs_fix_ack_seq(const struct ip_vs_seq *vseq, struct tcphdr *th) +{ + __u32 ack_seq = ntohl(th->ack_seq); + + /* + * Adjust ack_seq with delta-offset for + * the packets AFTER most recent resized pkt has caused a shift + * for packets before most recent resized pkt, use previous_delta + */ + if (vseq->delta || vseq->previous_delta) { + /* since ack_seq is the number of octet that is expected + to receive next, so compare it with init_seq+delta */ + if(after(ack_seq, vseq->init_seq+vseq->delta)) { + th->ack_seq = htonl(ack_seq - vseq->delta); + IP_VS_DBG(9, "vs_fix_ack_seq(): subtracted delta " + "(%d) from ack_seq\n", vseq->delta); + + } else { + th->ack_seq = htonl(ack_seq - vseq->previous_delta); + IP_VS_DBG(9, "vs_fix_ack_seq(): subtracted " + "previous_delta (%d) from ack_seq\n", + vseq->previous_delta); + } + } +} + + +/* + * Updates ip_vs_seq if pkt has been resized + * Assumes already checked proto==IPPROTO_TCP and diff!=0. + */ +static inline void vs_seq_update(struct ip_vs_conn *cp, struct ip_vs_seq *vseq, + unsigned flag, __u32 seq, int diff) +{ + /* spinlock is to keep updating cp->flags atomic */ + spin_lock(&cp->lock); + if (!(cp->flags & flag) || after(seq, vseq->init_seq)) { + vseq->previous_delta = vseq->delta; + vseq->delta += diff; + vseq->init_seq = seq; + cp->flags |= flag; + } + spin_unlock(&cp->lock); +} + +static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb, + struct ip_vs_app *app) +{ + int diff; + const unsigned int tcp_offset = ip_hdrlen(skb); + struct tcphdr *th; + __u32 seq; + + if (!skb_make_writable(skb, tcp_offset + sizeof(*th))) + return 0; + + th = (struct tcphdr *)(skb_network_header(skb) + tcp_offset); + + /* + * Remember seq number in case this pkt gets resized + */ + seq = ntohl(th->seq); + + /* + * Fix seq stuff if flagged as so. + */ + if (cp->flags & IP_VS_CONN_F_OUT_SEQ) + vs_fix_seq(&cp->out_seq, th); + if (cp->flags & IP_VS_CONN_F_IN_SEQ) + vs_fix_ack_seq(&cp->in_seq, th); + + /* + * Call private output hook function + */ + if (app->pkt_out == NULL) + return 1; + + if (!app->pkt_out(app, cp, skb, &diff)) + return 0; + + /* + * Update ip_vs seq stuff if len has changed. + */ + if (diff != 0) + vs_seq_update(cp, &cp->out_seq, + IP_VS_CONN_F_OUT_SEQ, seq, diff); + + return 1; +} + +/* + * Output pkt hook. Will call bound ip_vs_app specific function + * called by ipvs packet handler, assumes previously checked cp!=NULL + * returns false if it can't handle packet (oom) + */ +int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb) +{ + struct ip_vs_app *app; + + /* + * check if application module is bound to + * this ip_vs_conn. + */ + if ((app = cp->app) == NULL) + return 1; + + /* TCP is complicated */ + if (cp->protocol == IPPROTO_TCP) + return app_tcp_pkt_out(cp, skb, app); + + /* + * Call private output hook function + */ + if (app->pkt_out == NULL) + return 1; + + return app->pkt_out(app, cp, skb, NULL); +} + + +static inline int app_tcp_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb, + struct ip_vs_app *app) +{ + int diff; + const unsigned int tcp_offset = ip_hdrlen(skb); + struct tcphdr *th; + __u32 seq; + + if (!skb_make_writable(skb, tcp_offset + sizeof(*th))) + return 0; + + th = (struct tcphdr *)(skb_network_header(skb) + tcp_offset); + + /* + * Remember seq number in case this pkt gets resized + */ + seq = ntohl(th->seq); + + /* + * Fix seq stuff if flagged as so. + */ + if (cp->flags & IP_VS_CONN_F_IN_SEQ) + vs_fix_seq(&cp->in_seq, th); + if (cp->flags & IP_VS_CONN_F_OUT_SEQ) + vs_fix_ack_seq(&cp->out_seq, th); + + /* + * Call private input hook function + */ + if (app->pkt_in == NULL) + return 1; + + if (!app->pkt_in(app, cp, skb, &diff)) + return 0; + + /* + * Update ip_vs seq stuff if len has changed. + */ + if (diff != 0) + vs_seq_update(cp, &cp->in_seq, + IP_VS_CONN_F_IN_SEQ, seq, diff); + + return 1; +} + +/* + * Input pkt hook. Will call bound ip_vs_app specific function + * called by ipvs packet handler, assumes previously checked cp!=NULL. + * returns false if can't handle packet (oom). + */ +int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb) +{ + struct ip_vs_app *app; + + /* + * check if application module is bound to + * this ip_vs_conn. + */ + if ((app = cp->app) == NULL) + return 1; + + /* TCP is complicated */ + if (cp->protocol == IPPROTO_TCP) + return app_tcp_pkt_in(cp, skb, app); + + /* + * Call private input hook function + */ + if (app->pkt_in == NULL) + return 1; + + return app->pkt_in(app, cp, skb, NULL); +} + + +#ifdef CONFIG_PROC_FS +/* + * /proc/net/ip_vs_app entry function + */ + +static struct ip_vs_app *ip_vs_app_idx(loff_t pos) +{ + struct ip_vs_app *app, *inc; + + list_for_each_entry(app, &ip_vs_app_list, a_list) { + list_for_each_entry(inc, &app->incs_list, a_list) { + if (pos-- == 0) + return inc; + } + } + return NULL; + +} + +static void *ip_vs_app_seq_start(struct seq_file *seq, loff_t *pos) +{ + mutex_lock(&__ip_vs_app_mutex); + + return *pos ? ip_vs_app_idx(*pos - 1) : SEQ_START_TOKEN; +} + +static void *ip_vs_app_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct ip_vs_app *inc, *app; + struct list_head *e; + + ++*pos; + if (v == SEQ_START_TOKEN) + return ip_vs_app_idx(0); + + inc = v; + app = inc->app; + + if ((e = inc->a_list.next) != &app->incs_list) + return list_entry(e, struct ip_vs_app, a_list); + + /* go on to next application */ + for (e = app->a_list.next; e != &ip_vs_app_list; e = e->next) { + app = list_entry(e, struct ip_vs_app, a_list); + list_for_each_entry(inc, &app->incs_list, a_list) { + return inc; + } + } + return NULL; +} + +static void ip_vs_app_seq_stop(struct seq_file *seq, void *v) +{ + mutex_unlock(&__ip_vs_app_mutex); +} + +static int ip_vs_app_seq_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) + seq_puts(seq, "prot port usecnt name\n"); + else { + const struct ip_vs_app *inc = v; + + seq_printf(seq, "%-3s %-7u %-6d %-17s\n", + ip_vs_proto_name(inc->protocol), + ntohs(inc->port), + atomic_read(&inc->usecnt), + inc->name); + } + return 0; +} + +static const struct seq_operations ip_vs_app_seq_ops = { + .start = ip_vs_app_seq_start, + .next = ip_vs_app_seq_next, + .stop = ip_vs_app_seq_stop, + .show = ip_vs_app_seq_show, +}; + +static int ip_vs_app_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &ip_vs_app_seq_ops); +} + +static const struct file_operations ip_vs_app_fops = { + .owner = THIS_MODULE, + .open = ip_vs_app_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; +#endif + + +/* + * Replace a segment of data with a new segment + */ +int ip_vs_skb_replace(struct sk_buff *skb, gfp_t pri, + char *o_buf, int o_len, char *n_buf, int n_len) +{ + int diff; + int o_offset; + int o_left; + + EnterFunction(9); + + diff = n_len - o_len; + o_offset = o_buf - (char *)skb->data; + /* The length of left data after o_buf+o_len in the skb data */ + o_left = skb->len - (o_offset + o_len); + + if (diff <= 0) { + memmove(o_buf + n_len, o_buf + o_len, o_left); + memcpy(o_buf, n_buf, n_len); + skb_trim(skb, skb->len + diff); + } else if (diff <= skb_tailroom(skb)) { + skb_put(skb, diff); + memmove(o_buf + n_len, o_buf + o_len, o_left); + memcpy(o_buf, n_buf, n_len); + } else { + if (pskb_expand_head(skb, skb_headroom(skb), diff, pri)) + return -ENOMEM; + skb_put(skb, diff); + memmove(skb->data + o_offset + n_len, + skb->data + o_offset + o_len, o_left); + skb_copy_to_linear_data_offset(skb, o_offset, n_buf, n_len); + } + + /* must update the iph total length here */ + ip_hdr(skb)->tot_len = htons(skb->len); + + LeaveFunction(9); + return 0; +} + + +int __init ip_vs_app_init(void) +{ + /* we will replace it with proc_net_ipvs_create() soon */ + proc_net_fops_create(&init_net, "ip_vs_app", 0, &ip_vs_app_fops); + return 0; +} + + +void ip_vs_app_cleanup(void) +{ + proc_net_remove(&init_net, "ip_vs_app"); +} diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c new file mode 100644 index 00000000000..9a24332fbed --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -0,0 +1,1110 @@ +/* + * IPVS An implementation of the IP virtual server support for the + * LINUX operating system. IPVS is now implemented as a module + * over the Netfilter framework. IPVS can be used to build a + * high-performance and highly available server based on a + * cluster of servers. + * + * Authors: Wensong Zhang + * Peter Kese + * Julian Anastasov + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese, + * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms + * and others. Many code here is taken from IP MASQ code of kernel 2.2. + * + * Changes: + * + */ + +#include +#include +#include +#include +#include +#include +#include /* for proc_net_* */ +#include +#include +#include + +#include +#include + + +/* + * Connection hash table: for input and output packets lookups of IPVS + */ +static struct list_head *ip_vs_conn_tab; + +/* SLAB cache for IPVS connections */ +static struct kmem_cache *ip_vs_conn_cachep __read_mostly; + +/* counter for current IPVS connections */ +static atomic_t ip_vs_conn_count = ATOMIC_INIT(0); + +/* counter for no client port connections */ +static atomic_t ip_vs_conn_no_cport_cnt = ATOMIC_INIT(0); + +/* random value for IPVS connection hash */ +static unsigned int ip_vs_conn_rnd; + +/* + * Fine locking granularity for big connection hash table + */ +#define CT_LOCKARRAY_BITS 4 +#define CT_LOCKARRAY_SIZE (1<ip, (__force u32)port, proto, + ip_vs_conn_rnd) + & IP_VS_CONN_TAB_MASK; +} + + +/* + * Hashes ip_vs_conn in ip_vs_conn_tab by proto,addr,port. + * returns bool success. + */ +static inline int ip_vs_conn_hash(struct ip_vs_conn *cp) +{ + unsigned hash; + int ret; + + /* Hash by protocol, client address and port */ + hash = ip_vs_conn_hashkey(cp->af, cp->protocol, &cp->caddr, cp->cport); + + ct_write_lock(hash); + + if (!(cp->flags & IP_VS_CONN_F_HASHED)) { + list_add(&cp->c_list, &ip_vs_conn_tab[hash]); + cp->flags |= IP_VS_CONN_F_HASHED; + atomic_inc(&cp->refcnt); + ret = 1; + } else { + IP_VS_ERR("ip_vs_conn_hash(): request for already hashed, " + "called from %p\n", __builtin_return_address(0)); + ret = 0; + } + + ct_write_unlock(hash); + + return ret; +} + + +/* + * UNhashes ip_vs_conn from ip_vs_conn_tab. + * returns bool success. + */ +static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp) +{ + unsigned hash; + int ret; + + /* unhash it and decrease its reference counter */ + hash = ip_vs_conn_hashkey(cp->af, cp->protocol, &cp->caddr, cp->cport); + + ct_write_lock(hash); + + if (cp->flags & IP_VS_CONN_F_HASHED) { + list_del(&cp->c_list); + cp->flags &= ~IP_VS_CONN_F_HASHED; + atomic_dec(&cp->refcnt); + ret = 1; + } else + ret = 0; + + ct_write_unlock(hash); + + return ret; +} + + +/* + * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab. + * Called for pkts coming from OUTside-to-INside. + * s_addr, s_port: pkt source address (foreign host) + * d_addr, d_port: pkt dest address (load balancer) + */ +static inline struct ip_vs_conn *__ip_vs_conn_in_get +(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port, + const union nf_inet_addr *d_addr, __be16 d_port) +{ + unsigned hash; + struct ip_vs_conn *cp; + + hash = ip_vs_conn_hashkey(af, protocol, s_addr, s_port); + + ct_read_lock(hash); + + list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { + if (cp->af == af && + ip_vs_addr_equal(af, s_addr, &cp->caddr) && + ip_vs_addr_equal(af, d_addr, &cp->vaddr) && + s_port == cp->cport && d_port == cp->vport && + ((!s_port) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) && + protocol == cp->protocol) { + /* HIT */ + atomic_inc(&cp->refcnt); + ct_read_unlock(hash); + return cp; + } + } + + ct_read_unlock(hash); + + return NULL; +} + +struct ip_vs_conn *ip_vs_conn_in_get +(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port, + const union nf_inet_addr *d_addr, __be16 d_port) +{ + struct ip_vs_conn *cp; + + cp = __ip_vs_conn_in_get(af, protocol, s_addr, s_port, d_addr, d_port); + if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt)) + cp = __ip_vs_conn_in_get(af, protocol, s_addr, 0, d_addr, + d_port); + + IP_VS_DBG_BUF(9, "lookup/in %s %s:%d->%s:%d %s\n", + ip_vs_proto_name(protocol), + IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port), + IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port), + cp ? "hit" : "not hit"); + + return cp; +} + +/* Get reference to connection template */ +struct ip_vs_conn *ip_vs_ct_in_get +(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port, + const union nf_inet_addr *d_addr, __be16 d_port) +{ + unsigned hash; + struct ip_vs_conn *cp; + + hash = ip_vs_conn_hashkey(af, protocol, s_addr, s_port); + + ct_read_lock(hash); + + list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { + if (cp->af == af && + ip_vs_addr_equal(af, s_addr, &cp->caddr) && + ip_vs_addr_equal(af, d_addr, &cp->vaddr) && + s_port == cp->cport && d_port == cp->vport && + cp->flags & IP_VS_CONN_F_TEMPLATE && + protocol == cp->protocol) { + /* HIT */ + atomic_inc(&cp->refcnt); + goto out; + } + } + cp = NULL; + + out: + ct_read_unlock(hash); + + IP_VS_DBG_BUF(9, "template lookup/in %s %s:%d->%s:%d %s\n", + ip_vs_proto_name(protocol), + IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port), + IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port), + cp ? "hit" : "not hit"); + + return cp; +} + +/* + * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab. + * Called for pkts coming from inside-to-OUTside. + * s_addr, s_port: pkt source address (inside host) + * d_addr, d_port: pkt dest address (foreign host) + */ +struct ip_vs_conn *ip_vs_conn_out_get +(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port, + const union nf_inet_addr *d_addr, __be16 d_port) +{ + unsigned hash; + struct ip_vs_conn *cp, *ret=NULL; + + /* + * Check for "full" addressed entries + */ + hash = ip_vs_conn_hashkey(af, protocol, d_addr, d_port); + + ct_read_lock(hash); + + list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { + if (cp->af == af && + ip_vs_addr_equal(af, d_addr, &cp->caddr) && + ip_vs_addr_equal(af, s_addr, &cp->daddr) && + d_port == cp->cport && s_port == cp->dport && + protocol == cp->protocol) { + /* HIT */ + atomic_inc(&cp->refcnt); + ret = cp; + break; + } + } + + ct_read_unlock(hash); + + IP_VS_DBG_BUF(9, "lookup/out %s %s:%d->%s:%d %s\n", + ip_vs_proto_name(protocol), + IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port), + IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port), + ret ? "hit" : "not hit"); + + return ret; +} + + +/* + * Put back the conn and restart its timer with its timeout + */ +void ip_vs_conn_put(struct ip_vs_conn *cp) +{ + /* reset it expire in its timeout */ + mod_timer(&cp->timer, jiffies+cp->timeout); + + __ip_vs_conn_put(cp); +} + + +/* + * Fill a no_client_port connection with a client port number + */ +void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport) +{ + if (ip_vs_conn_unhash(cp)) { + spin_lock(&cp->lock); + if (cp->flags & IP_VS_CONN_F_NO_CPORT) { + atomic_dec(&ip_vs_conn_no_cport_cnt); + cp->flags &= ~IP_VS_CONN_F_NO_CPORT; + cp->cport = cport; + } + spin_unlock(&cp->lock); + + /* hash on new dport */ + ip_vs_conn_hash(cp); + } +} + + +/* + * Bind a connection entry with the corresponding packet_xmit. + * Called by ip_vs_conn_new. + */ +static inline void ip_vs_bind_xmit(struct ip_vs_conn *cp) +{ + switch (IP_VS_FWD_METHOD(cp)) { + case IP_VS_CONN_F_MASQ: + cp->packet_xmit = ip_vs_nat_xmit; + break; + + case IP_VS_CONN_F_TUNNEL: + cp->packet_xmit = ip_vs_tunnel_xmit; + break; + + case IP_VS_CONN_F_DROUTE: + cp->packet_xmit = ip_vs_dr_xmit; + break; + + case IP_VS_CONN_F_LOCALNODE: + cp->packet_xmit = ip_vs_null_xmit; + break; + + case IP_VS_CONN_F_BYPASS: + cp->packet_xmit = ip_vs_bypass_xmit; + break; + } +} + +#ifdef CONFIG_IP_VS_IPV6 +static inline void ip_vs_bind_xmit_v6(struct ip_vs_conn *cp) +{ + switch (IP_VS_FWD_METHOD(cp)) { + case IP_VS_CONN_F_MASQ: + cp->packet_xmit = ip_vs_nat_xmit_v6; + break; + + case IP_VS_CONN_F_TUNNEL: + cp->packet_xmit = ip_vs_tunnel_xmit_v6; + break; + + case IP_VS_CONN_F_DROUTE: + cp->packet_xmit = ip_vs_dr_xmit_v6; + break; + + case IP_VS_CONN_F_LOCALNODE: + cp->packet_xmit = ip_vs_null_xmit; + break; + + case IP_VS_CONN_F_BYPASS: + cp->packet_xmit = ip_vs_bypass_xmit_v6; + break; + } +} +#endif + + +static inline int ip_vs_dest_totalconns(struct ip_vs_dest *dest) +{ + return atomic_read(&dest->activeconns) + + atomic_read(&dest->inactconns); +} + +/* + * Bind a connection entry with a virtual service destination + * Called just after a new connection entry is created. + */ +static inline void +ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest) +{ + /* if dest is NULL, then return directly */ + if (!dest) + return; + + /* Increase the refcnt counter of the dest */ + atomic_inc(&dest->refcnt); + + /* Bind with the destination and its corresponding transmitter */ + if ((cp->flags & IP_VS_CONN_F_SYNC) && + (!(cp->flags & IP_VS_CONN_F_TEMPLATE))) + /* if the connection is not template and is created + * by sync, preserve the activity flag. + */ + cp->flags |= atomic_read(&dest->conn_flags) & + (~IP_VS_CONN_F_INACTIVE); + else + cp->flags |= atomic_read(&dest->conn_flags); + cp->dest = dest; + + IP_VS_DBG_BUF(7, "Bind-dest %s c:%s:%d v:%s:%d " + "d:%s:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d " + "dest->refcnt:%d\n", + ip_vs_proto_name(cp->protocol), + IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), + IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), + IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport), + ip_vs_fwd_tag(cp), cp->state, + cp->flags, atomic_read(&cp->refcnt), + atomic_read(&dest->refcnt)); + + /* Update the connection counters */ + if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) { + /* It is a normal connection, so increase the inactive + connection counter because it is in TCP SYNRECV + state (inactive) or other protocol inacive state */ + if ((cp->flags & IP_VS_CONN_F_SYNC) && + (!(cp->flags & IP_VS_CONN_F_INACTIVE))) + atomic_inc(&dest->activeconns); + else + atomic_inc(&dest->inactconns); + } else { + /* It is a persistent connection/template, so increase + the peristent connection counter */ + atomic_inc(&dest->persistconns); + } + + if (dest->u_threshold != 0 && + ip_vs_dest_totalconns(dest) >= dest->u_threshold) + dest->flags |= IP_VS_DEST_F_OVERLOAD; +} + + +/* + * Check if there is a destination for the connection, if so + * bind the connection to the destination. + */ +struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp) +{ + struct ip_vs_dest *dest; + + if ((cp) && (!cp->dest)) { + dest = ip_vs_find_dest(cp->af, &cp->daddr, cp->dport, + &cp->vaddr, cp->vport, + cp->protocol); + ip_vs_bind_dest(cp, dest); + return dest; + } else + return NULL; +} + + +/* + * Unbind a connection entry with its VS destination + * Called by the ip_vs_conn_expire function. + */ +static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp) +{ + struct ip_vs_dest *dest = cp->dest; + + if (!dest) + return; + + IP_VS_DBG_BUF(7, "Unbind-dest %s c:%s:%d v:%s:%d " + "d:%s:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d " + "dest->refcnt:%d\n", + ip_vs_proto_name(cp->protocol), + IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), + IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), + IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport), + ip_vs_fwd_tag(cp), cp->state, + cp->flags, atomic_read(&cp->refcnt), + atomic_read(&dest->refcnt)); + + /* Update the connection counters */ + if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) { + /* It is a normal connection, so decrease the inactconns + or activeconns counter */ + if (cp->flags & IP_VS_CONN_F_INACTIVE) { + atomic_dec(&dest->inactconns); + } else { + atomic_dec(&dest->activeconns); + } + } else { + /* It is a persistent connection/template, so decrease + the peristent connection counter */ + atomic_dec(&dest->persistconns); + } + + if (dest->l_threshold != 0) { + if (ip_vs_dest_totalconns(dest) < dest->l_threshold) + dest->flags &= ~IP_VS_DEST_F_OVERLOAD; + } else if (dest->u_threshold != 0) { + if (ip_vs_dest_totalconns(dest) * 4 < dest->u_threshold * 3) + dest->flags &= ~IP_VS_DEST_F_OVERLOAD; + } else { + if (dest->flags & IP_VS_DEST_F_OVERLOAD) + dest->flags &= ~IP_VS_DEST_F_OVERLOAD; + } + + /* + * Simply decrease the refcnt of the dest, because the + * dest will be either in service's destination list + * or in the trash. + */ + atomic_dec(&dest->refcnt); +} + + +/* + * Checking if the destination of a connection template is available. + * If available, return 1, otherwise invalidate this connection + * template and return 0. + */ +int ip_vs_check_template(struct ip_vs_conn *ct) +{ + struct ip_vs_dest *dest = ct->dest; + + /* + * Checking the dest server status. + */ + if ((dest == NULL) || + !(dest->flags & IP_VS_DEST_F_AVAILABLE) || + (sysctl_ip_vs_expire_quiescent_template && + (atomic_read(&dest->weight) == 0))) { + IP_VS_DBG_BUF(9, "check_template: dest not available for " + "protocol %s s:%s:%d v:%s:%d " + "-> d:%s:%d\n", + ip_vs_proto_name(ct->protocol), + IP_VS_DBG_ADDR(ct->af, &ct->caddr), + ntohs(ct->cport), + IP_VS_DBG_ADDR(ct->af, &ct->vaddr), + ntohs(ct->vport), + IP_VS_DBG_ADDR(ct->af, &ct->daddr), + ntohs(ct->dport)); + + /* + * Invalidate the connection template + */ + if (ct->vport != htons(0xffff)) { + if (ip_vs_conn_unhash(ct)) { + ct->dport = htons(0xffff); + ct->vport = htons(0xffff); + ct->cport = 0; + ip_vs_conn_hash(ct); + } + } + + /* + * Simply decrease the refcnt of the template, + * don't restart its timer. + */ + atomic_dec(&ct->refcnt); + return 0; + } + return 1; +} + +static void ip_vs_conn_expire(unsigned long data) +{ + struct ip_vs_conn *cp = (struct ip_vs_conn *)data; + + cp->timeout = 60*HZ; + + /* + * hey, I'm using it + */ + atomic_inc(&cp->refcnt); + + /* + * do I control anybody? + */ + if (atomic_read(&cp->n_control)) + goto expire_later; + + /* + * unhash it if it is hashed in the conn table + */ + if (!ip_vs_conn_unhash(cp)) + goto expire_later; + + /* + * refcnt==1 implies I'm the only one referrer + */ + if (likely(atomic_read(&cp->refcnt) == 1)) { + /* delete the timer if it is activated by other users */ + if (timer_pending(&cp->timer)) + del_timer(&cp->timer); + + /* does anybody control me? */ + if (cp->control) + ip_vs_control_del(cp); + + if (unlikely(cp->app != NULL)) + ip_vs_unbind_app(cp); + ip_vs_unbind_dest(cp); + if (cp->flags & IP_VS_CONN_F_NO_CPORT) + atomic_dec(&ip_vs_conn_no_cport_cnt); + atomic_dec(&ip_vs_conn_count); + + kmem_cache_free(ip_vs_conn_cachep, cp); + return; + } + + /* hash it back to the table */ + ip_vs_conn_hash(cp); + + expire_later: + IP_VS_DBG(7, "delayed: conn->refcnt-1=%d conn->n_control=%d\n", + atomic_read(&cp->refcnt)-1, + atomic_read(&cp->n_control)); + + ip_vs_conn_put(cp); +} + + +void ip_vs_conn_expire_now(struct ip_vs_conn *cp) +{ + if (del_timer(&cp->timer)) + mod_timer(&cp->timer, jiffies); +} + + +/* + * Create a new connection entry and hash it into the ip_vs_conn_tab + */ +struct ip_vs_conn * +ip_vs_conn_new(int af, int proto, const union nf_inet_addr *caddr, __be16 cport, + const union nf_inet_addr *vaddr, __be16 vport, + const union nf_inet_addr *daddr, __be16 dport, unsigned flags, + struct ip_vs_dest *dest) +{ + struct ip_vs_conn *cp; + struct ip_vs_protocol *pp = ip_vs_proto_get(proto); + + cp = kmem_cache_zalloc(ip_vs_conn_cachep, GFP_ATOMIC); + if (cp == NULL) { + IP_VS_ERR_RL("ip_vs_conn_new: no memory available.\n"); + return NULL; + } + + INIT_LIST_HEAD(&cp->c_list); + setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp); + cp->af = af; + cp->protocol = proto; + ip_vs_addr_copy(af, &cp->caddr, caddr); + cp->cport = cport; + ip_vs_addr_copy(af, &cp->vaddr, vaddr); + cp->vport = vport; + ip_vs_addr_copy(af, &cp->daddr, daddr); + cp->dport = dport; + cp->flags = flags; + spin_lock_init(&cp->lock); + + /* + * Set the entry is referenced by the current thread before hashing + * it in the table, so that other thread run ip_vs_random_dropentry + * but cannot drop this entry. + */ + atomic_set(&cp->refcnt, 1); + + atomic_set(&cp->n_control, 0); + atomic_set(&cp->in_pkts, 0); + + atomic_inc(&ip_vs_conn_count); + if (flags & IP_VS_CONN_F_NO_CPORT) + atomic_inc(&ip_vs_conn_no_cport_cnt); + + /* Bind the connection with a destination server */ + ip_vs_bind_dest(cp, dest); + + /* Set its state and timeout */ + cp->state = 0; + cp->timeout = 3*HZ; + + /* Bind its packet transmitter */ +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + ip_vs_bind_xmit_v6(cp); + else +#endif + ip_vs_bind_xmit(cp); + + if (unlikely(pp && atomic_read(&pp->appcnt))) + ip_vs_bind_app(cp, pp); + + /* Hash it in the ip_vs_conn_tab finally */ + ip_vs_conn_hash(cp); + + return cp; +} + + +/* + * /proc/net/ip_vs_conn entries + */ +#ifdef CONFIG_PROC_FS + +static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos) +{ + int idx; + struct ip_vs_conn *cp; + + for(idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) { + ct_read_lock_bh(idx); + list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) { + if (pos-- == 0) { + seq->private = &ip_vs_conn_tab[idx]; + return cp; + } + } + ct_read_unlock_bh(idx); + } + + return NULL; +} + +static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos) +{ + seq->private = NULL; + return *pos ? ip_vs_conn_array(seq, *pos - 1) :SEQ_START_TOKEN; +} + +static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct ip_vs_conn *cp = v; + struct list_head *e, *l = seq->private; + int idx; + + ++*pos; + if (v == SEQ_START_TOKEN) + return ip_vs_conn_array(seq, 0); + + /* more on same hash chain? */ + if ((e = cp->c_list.next) != l) + return list_entry(e, struct ip_vs_conn, c_list); + + idx = l - ip_vs_conn_tab; + ct_read_unlock_bh(idx); + + while (++idx < IP_VS_CONN_TAB_SIZE) { + ct_read_lock_bh(idx); + list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) { + seq->private = &ip_vs_conn_tab[idx]; + return cp; + } + ct_read_unlock_bh(idx); + } + seq->private = NULL; + return NULL; +} + +static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v) +{ + struct list_head *l = seq->private; + + if (l) + ct_read_unlock_bh(l - ip_vs_conn_tab); +} + +static int ip_vs_conn_seq_show(struct seq_file *seq, void *v) +{ + + if (v == SEQ_START_TOKEN) + seq_puts(seq, + "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Expires\n"); + else { + const struct ip_vs_conn *cp = v; + +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + seq_printf(seq, + "%-3s " NIP6_FMT " %04X " NIP6_FMT + " %04X " NIP6_FMT " %04X %-11s %7lu\n", + ip_vs_proto_name(cp->protocol), + NIP6(cp->caddr.in6), ntohs(cp->cport), + NIP6(cp->vaddr.in6), ntohs(cp->vport), + NIP6(cp->daddr.in6), ntohs(cp->dport), + ip_vs_state_name(cp->protocol, cp->state), + (cp->timer.expires-jiffies)/HZ); + else +#endif + seq_printf(seq, + "%-3s %08X %04X %08X %04X" + " %08X %04X %-11s %7lu\n", + ip_vs_proto_name(cp->protocol), + ntohl(cp->caddr.ip), ntohs(cp->cport), + ntohl(cp->vaddr.ip), ntohs(cp->vport), + ntohl(cp->daddr.ip), ntohs(cp->dport), + ip_vs_state_name(cp->protocol, cp->state), + (cp->timer.expires-jiffies)/HZ); + } + return 0; +} + +static const struct seq_operations ip_vs_conn_seq_ops = { + .start = ip_vs_conn_seq_start, + .next = ip_vs_conn_seq_next, + .stop = ip_vs_conn_seq_stop, + .show = ip_vs_conn_seq_show, +}; + +static int ip_vs_conn_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &ip_vs_conn_seq_ops); +} + +static const struct file_operations ip_vs_conn_fops = { + .owner = THIS_MODULE, + .open = ip_vs_conn_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static const char *ip_vs_origin_name(unsigned flags) +{ + if (flags & IP_VS_CONN_F_SYNC) + return "SYNC"; + else + return "LOCAL"; +} + +static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v) +{ + + if (v == SEQ_START_TOKEN) + seq_puts(seq, + "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Origin Expires\n"); + else { + const struct ip_vs_conn *cp = v; + +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + seq_printf(seq, + "%-3s " NIP6_FMT " %04X " NIP6_FMT + " %04X " NIP6_FMT " %04X %-11s %-6s %7lu\n", + ip_vs_proto_name(cp->protocol), + NIP6(cp->caddr.in6), ntohs(cp->cport), + NIP6(cp->vaddr.in6), ntohs(cp->vport), + NIP6(cp->daddr.in6), ntohs(cp->dport), + ip_vs_state_name(cp->protocol, cp->state), + ip_vs_origin_name(cp->flags), + (cp->timer.expires-jiffies)/HZ); + else +#endif + seq_printf(seq, + "%-3s %08X %04X %08X %04X " + "%08X %04X %-11s %-6s %7lu\n", + ip_vs_proto_name(cp->protocol), + ntohl(cp->caddr.ip), ntohs(cp->cport), + ntohl(cp->vaddr.ip), ntohs(cp->vport), + ntohl(cp->daddr.ip), ntohs(cp->dport), + ip_vs_state_name(cp->protocol, cp->state), + ip_vs_origin_name(cp->flags), + (cp->timer.expires-jiffies)/HZ); + } + return 0; +} + +static const struct seq_operations ip_vs_conn_sync_seq_ops = { + .start = ip_vs_conn_seq_start, + .next = ip_vs_conn_seq_next, + .stop = ip_vs_conn_seq_stop, + .show = ip_vs_conn_sync_seq_show, +}; + +static int ip_vs_conn_sync_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &ip_vs_conn_sync_seq_ops); +} + +static const struct file_operations ip_vs_conn_sync_fops = { + .owner = THIS_MODULE, + .open = ip_vs_conn_sync_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +#endif + + +/* + * Randomly drop connection entries before running out of memory + */ +static inline int todrop_entry(struct ip_vs_conn *cp) +{ + /* + * The drop rate array needs tuning for real environments. + * Called from timer bh only => no locking + */ + static const char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8}; + static char todrop_counter[9] = {0}; + int i; + + /* if the conn entry hasn't lasted for 60 seconds, don't drop it. + This will leave enough time for normal connection to get + through. */ + if (time_before(cp->timeout + jiffies, cp->timer.expires + 60*HZ)) + return 0; + + /* Don't drop the entry if its number of incoming packets is not + located in [0, 8] */ + i = atomic_read(&cp->in_pkts); + if (i > 8 || i < 0) return 0; + + if (!todrop_rate[i]) return 0; + if (--todrop_counter[i] > 0) return 0; + + todrop_counter[i] = todrop_rate[i]; + return 1; +} + +/* Called from keventd and must protect itself from softirqs */ +void ip_vs_random_dropentry(void) +{ + int idx; + struct ip_vs_conn *cp; + + /* + * Randomly scan 1/32 of the whole table every second + */ + for (idx = 0; idx < (IP_VS_CONN_TAB_SIZE>>5); idx++) { + unsigned hash = net_random() & IP_VS_CONN_TAB_MASK; + + /* + * Lock is actually needed in this loop. + */ + ct_write_lock_bh(hash); + + list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { + if (cp->flags & IP_VS_CONN_F_TEMPLATE) + /* connection template */ + continue; + + if (cp->protocol == IPPROTO_TCP) { + switch(cp->state) { + case IP_VS_TCP_S_SYN_RECV: + case IP_VS_TCP_S_SYNACK: + break; + + case IP_VS_TCP_S_ESTABLISHED: + if (todrop_entry(cp)) + break; + continue; + + default: + continue; + } + } else { + if (!todrop_entry(cp)) + continue; + } + + IP_VS_DBG(4, "del connection\n"); + ip_vs_conn_expire_now(cp); + if (cp->control) { + IP_VS_DBG(4, "del conn template\n"); + ip_vs_conn_expire_now(cp->control); + } + } + ct_write_unlock_bh(hash); + } +} + + +/* + * Flush all the connection entries in the ip_vs_conn_tab + */ +static void ip_vs_conn_flush(void) +{ + int idx; + struct ip_vs_conn *cp; + + flush_again: + for (idx=0; idxcontrol) { + IP_VS_DBG(4, "del conn template\n"); + ip_vs_conn_expire_now(cp->control); + } + } + ct_write_unlock_bh(idx); + } + + /* the counter may be not NULL, because maybe some conn entries + are run by slow timer handler or unhashed but still referred */ + if (atomic_read(&ip_vs_conn_count) != 0) { + schedule(); + goto flush_again; + } +} + + +int __init ip_vs_conn_init(void) +{ + int idx; + + /* + * Allocate the connection hash table and initialize its list heads + */ + ip_vs_conn_tab = vmalloc(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head)); + if (!ip_vs_conn_tab) + return -ENOMEM; + + /* Allocate ip_vs_conn slab cache */ + ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn", + sizeof(struct ip_vs_conn), 0, + SLAB_HWCACHE_ALIGN, NULL); + if (!ip_vs_conn_cachep) { + vfree(ip_vs_conn_tab); + return -ENOMEM; + } + + IP_VS_INFO("Connection hash table configured " + "(size=%d, memory=%ldKbytes)\n", + IP_VS_CONN_TAB_SIZE, + (long)(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head))/1024); + IP_VS_DBG(0, "Each connection entry needs %Zd bytes at least\n", + sizeof(struct ip_vs_conn)); + + for (idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) { + INIT_LIST_HEAD(&ip_vs_conn_tab[idx]); + } + + for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++) { + rwlock_init(&__ip_vs_conntbl_lock_array[idx].l); + } + + proc_net_fops_create(&init_net, "ip_vs_conn", 0, &ip_vs_conn_fops); + proc_net_fops_create(&init_net, "ip_vs_conn_sync", 0, &ip_vs_conn_sync_fops); + + /* calculate the random value for connection hash */ + get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd)); + + return 0; +} + + +void ip_vs_conn_cleanup(void) +{ + /* flush all the connection entries first */ + ip_vs_conn_flush(); + + /* Release the empty cache */ + kmem_cache_destroy(ip_vs_conn_cachep); + proc_net_remove(&init_net, "ip_vs_conn"); + proc_net_remove(&init_net, "ip_vs_conn_sync"); + vfree(ip_vs_conn_tab); +} diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c new file mode 100644 index 00000000000..958abf3e5f8 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -0,0 +1,1542 @@ +/* + * IPVS An implementation of the IP virtual server support for the + * LINUX operating system. IPVS is now implemented as a module + * over the Netfilter framework. IPVS can be used to build a + * high-performance and highly available server based on a + * cluster of servers. + * + * Authors: Wensong Zhang + * Peter Kese + * Julian Anastasov + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese, + * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms + * and others. + * + * Changes: + * Paul `Rusty' Russell properly handle non-linear skbs + * Harald Welte don't use nfcache + * + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include /* for icmp_send */ +#include + +#include +#include + +#ifdef CONFIG_IP_VS_IPV6 +#include +#include +#endif + +#include + + +EXPORT_SYMBOL(register_ip_vs_scheduler); +EXPORT_SYMBOL(unregister_ip_vs_scheduler); +EXPORT_SYMBOL(ip_vs_skb_replace); +EXPORT_SYMBOL(ip_vs_proto_name); +EXPORT_SYMBOL(ip_vs_conn_new); +EXPORT_SYMBOL(ip_vs_conn_in_get); +EXPORT_SYMBOL(ip_vs_conn_out_get); +#ifdef CONFIG_IP_VS_PROTO_TCP +EXPORT_SYMBOL(ip_vs_tcp_conn_listen); +#endif +EXPORT_SYMBOL(ip_vs_conn_put); +#ifdef CONFIG_IP_VS_DEBUG +EXPORT_SYMBOL(ip_vs_get_debug_level); +#endif + + +/* ID used in ICMP lookups */ +#define icmp_id(icmph) (((icmph)->un).echo.id) +#define icmpv6_id(icmph) (icmph->icmp6_dataun.u_echo.identifier) + +const char *ip_vs_proto_name(unsigned proto) +{ + static char buf[20]; + + switch (proto) { + case IPPROTO_IP: + return "IP"; + case IPPROTO_UDP: + return "UDP"; + case IPPROTO_TCP: + return "TCP"; + case IPPROTO_ICMP: + return "ICMP"; +#ifdef CONFIG_IP_VS_IPV6 + case IPPROTO_ICMPV6: + return "ICMPv6"; +#endif + default: + sprintf(buf, "IP_%d", proto); + return buf; + } +} + +void ip_vs_init_hash_table(struct list_head *table, int rows) +{ + while (--rows >= 0) + INIT_LIST_HEAD(&table[rows]); +} + +static inline void +ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb) +{ + struct ip_vs_dest *dest = cp->dest; + if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { + spin_lock(&dest->stats.lock); + dest->stats.ustats.inpkts++; + dest->stats.ustats.inbytes += skb->len; + spin_unlock(&dest->stats.lock); + + spin_lock(&dest->svc->stats.lock); + dest->svc->stats.ustats.inpkts++; + dest->svc->stats.ustats.inbytes += skb->len; + spin_unlock(&dest->svc->stats.lock); + + spin_lock(&ip_vs_stats.lock); + ip_vs_stats.ustats.inpkts++; + ip_vs_stats.ustats.inbytes += skb->len; + spin_unlock(&ip_vs_stats.lock); + } +} + + +static inline void +ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb) +{ + struct ip_vs_dest *dest = cp->dest; + if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { + spin_lock(&dest->stats.lock); + dest->stats.ustats.outpkts++; + dest->stats.ustats.outbytes += skb->len; + spin_unlock(&dest->stats.lock); + + spin_lock(&dest->svc->stats.lock); + dest->svc->stats.ustats.outpkts++; + dest->svc->stats.ustats.outbytes += skb->len; + spin_unlock(&dest->svc->stats.lock); + + spin_lock(&ip_vs_stats.lock); + ip_vs_stats.ustats.outpkts++; + ip_vs_stats.ustats.outbytes += skb->len; + spin_unlock(&ip_vs_stats.lock); + } +} + + +static inline void +ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc) +{ + spin_lock(&cp->dest->stats.lock); + cp->dest->stats.ustats.conns++; + spin_unlock(&cp->dest->stats.lock); + + spin_lock(&svc->stats.lock); + svc->stats.ustats.conns++; + spin_unlock(&svc->stats.lock); + + spin_lock(&ip_vs_stats.lock); + ip_vs_stats.ustats.conns++; + spin_unlock(&ip_vs_stats.lock); +} + + +static inline int +ip_vs_set_state(struct ip_vs_conn *cp, int direction, + const struct sk_buff *skb, + struct ip_vs_protocol *pp) +{ + if (unlikely(!pp->state_transition)) + return 0; + return pp->state_transition(cp, direction, skb, pp); +} + + +/* + * IPVS persistent scheduling function + * It creates a connection entry according to its template if exists, + * or selects a server and creates a connection entry plus a template. + * Locking: we are svc user (svc->refcnt), so we hold all dests too + * Protocols supported: TCP, UDP + */ +static struct ip_vs_conn * +ip_vs_sched_persist(struct ip_vs_service *svc, + const struct sk_buff *skb, + __be16 ports[2]) +{ + struct ip_vs_conn *cp = NULL; + struct ip_vs_iphdr iph; + struct ip_vs_dest *dest; + struct ip_vs_conn *ct; + __be16 dport; /* destination port to forward */ + union nf_inet_addr snet; /* source network of the client, + after masking */ + + ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph); + + /* Mask saddr with the netmask to adjust template granularity */ +#ifdef CONFIG_IP_VS_IPV6 + if (svc->af == AF_INET6) + ipv6_addr_prefix(&snet.in6, &iph.saddr.in6, svc->netmask); + else +#endif + snet.ip = iph.saddr.ip & svc->netmask; + + IP_VS_DBG_BUF(6, "p-schedule: src %s:%u dest %s:%u " + "mnet %s\n", + IP_VS_DBG_ADDR(svc->af, &iph.saddr), ntohs(ports[0]), + IP_VS_DBG_ADDR(svc->af, &iph.daddr), ntohs(ports[1]), + IP_VS_DBG_ADDR(svc->af, &snet)); + + /* + * As far as we know, FTP is a very complicated network protocol, and + * it uses control connection and data connections. For active FTP, + * FTP server initialize data connection to the client, its source port + * is often 20. For passive FTP, FTP server tells the clients the port + * that it passively listens to, and the client issues the data + * connection. In the tunneling or direct routing mode, the load + * balancer is on the client-to-server half of connection, the port + * number is unknown to the load balancer. So, a conn template like + * is created for persistent FTP + * service, and a template like + * is created for other persistent services. + */ + if (ports[1] == svc->port) { + /* Check if a template already exists */ + if (svc->port != FTPPORT) + ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0, + &iph.daddr, ports[1]); + else + ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0, + &iph.daddr, 0); + + if (!ct || !ip_vs_check_template(ct)) { + /* + * No template found or the dest of the connection + * template is not available. + */ + dest = svc->scheduler->schedule(svc, skb); + if (dest == NULL) { + IP_VS_DBG(1, "p-schedule: no dest found.\n"); + return NULL; + } + + /* + * Create a template like for non-ftp service, + * and + * for ftp service. + */ + if (svc->port != FTPPORT) + ct = ip_vs_conn_new(svc->af, iph.protocol, + &snet, 0, + &iph.daddr, + ports[1], + &dest->addr, dest->port, + IP_VS_CONN_F_TEMPLATE, + dest); + else + ct = ip_vs_conn_new(svc->af, iph.protocol, + &snet, 0, + &iph.daddr, 0, + &dest->addr, 0, + IP_VS_CONN_F_TEMPLATE, + dest); + if (ct == NULL) + return NULL; + + ct->timeout = svc->timeout; + } else { + /* set destination with the found template */ + dest = ct->dest; + } + dport = dest->port; + } else { + /* + * Note: persistent fwmark-based services and persistent + * port zero service are handled here. + * fwmark template: + * port zero template: + */ + if (svc->fwmark) { + union nf_inet_addr fwmark = { + .all = { 0, 0, 0, htonl(svc->fwmark) } + }; + + ct = ip_vs_ct_in_get(svc->af, IPPROTO_IP, &snet, 0, + &fwmark, 0); + } else + ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0, + &iph.daddr, 0); + + if (!ct || !ip_vs_check_template(ct)) { + /* + * If it is not persistent port zero, return NULL, + * otherwise create a connection template. + */ + if (svc->port) + return NULL; + + dest = svc->scheduler->schedule(svc, skb); + if (dest == NULL) { + IP_VS_DBG(1, "p-schedule: no dest found.\n"); + return NULL; + } + + /* + * Create a template according to the service + */ + if (svc->fwmark) { + union nf_inet_addr fwmark = { + .all = { 0, 0, 0, htonl(svc->fwmark) } + }; + + ct = ip_vs_conn_new(svc->af, IPPROTO_IP, + &snet, 0, + &fwmark, 0, + &dest->addr, 0, + IP_VS_CONN_F_TEMPLATE, + dest); + } else + ct = ip_vs_conn_new(svc->af, iph.protocol, + &snet, 0, + &iph.daddr, 0, + &dest->addr, 0, + IP_VS_CONN_F_TEMPLATE, + dest); + if (ct == NULL) + return NULL; + + ct->timeout = svc->timeout; + } else { + /* set destination with the found template */ + dest = ct->dest; + } + dport = ports[1]; + } + + /* + * Create a new connection according to the template + */ + cp = ip_vs_conn_new(svc->af, iph.protocol, + &iph.saddr, ports[0], + &iph.daddr, ports[1], + &dest->addr, dport, + 0, + dest); + if (cp == NULL) { + ip_vs_conn_put(ct); + return NULL; + } + + /* + * Add its control + */ + ip_vs_control_add(cp, ct); + ip_vs_conn_put(ct); + + ip_vs_conn_stats(cp, svc); + return cp; +} + + +/* + * IPVS main scheduling function + * It selects a server according to the virtual service, and + * creates a connection entry. + * Protocols supported: TCP, UDP + */ +struct ip_vs_conn * +ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +{ + struct ip_vs_conn *cp = NULL; + struct ip_vs_iphdr iph; + struct ip_vs_dest *dest; + __be16 _ports[2], *pptr; + + ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph); + pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports); + if (pptr == NULL) + return NULL; + + /* + * Persistent service + */ + if (svc->flags & IP_VS_SVC_F_PERSISTENT) + return ip_vs_sched_persist(svc, skb, pptr); + + /* + * Non-persistent service + */ + if (!svc->fwmark && pptr[1] != svc->port) { + if (!svc->port) + IP_VS_ERR("Schedule: port zero only supported " + "in persistent services, " + "check your ipvs configuration\n"); + return NULL; + } + + dest = svc->scheduler->schedule(svc, skb); + if (dest == NULL) { + IP_VS_DBG(1, "Schedule: no dest found.\n"); + return NULL; + } + + /* + * Create a connection entry. + */ + cp = ip_vs_conn_new(svc->af, iph.protocol, + &iph.saddr, pptr[0], + &iph.daddr, pptr[1], + &dest->addr, dest->port ? dest->port : pptr[1], + 0, + dest); + if (cp == NULL) + return NULL; + + IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u " + "d:%s:%u conn->flags:%X conn->refcnt:%d\n", + ip_vs_fwd_tag(cp), + IP_VS_DBG_ADDR(svc->af, &cp->caddr), ntohs(cp->cport), + IP_VS_DBG_ADDR(svc->af, &cp->vaddr), ntohs(cp->vport), + IP_VS_DBG_ADDR(svc->af, &cp->daddr), ntohs(cp->dport), + cp->flags, atomic_read(&cp->refcnt)); + + ip_vs_conn_stats(cp, svc); + return cp; +} + + +/* + * Pass or drop the packet. + * Called by ip_vs_in, when the virtual service is available but + * no destination is available for a new connection. + */ +int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, + struct ip_vs_protocol *pp) +{ + __be16 _ports[2], *pptr; + struct ip_vs_iphdr iph; + int unicast; + ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph); + + pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports); + if (pptr == NULL) { + ip_vs_service_put(svc); + return NF_DROP; + } + +#ifdef CONFIG_IP_VS_IPV6 + if (svc->af == AF_INET6) + unicast = ipv6_addr_type(&iph.daddr.in6) & IPV6_ADDR_UNICAST; + else +#endif + unicast = (inet_addr_type(&init_net, iph.daddr.ip) == RTN_UNICAST); + + /* if it is fwmark-based service, the cache_bypass sysctl is up + and the destination is a non-local unicast, then create + a cache_bypass connection entry */ + if (sysctl_ip_vs_cache_bypass && svc->fwmark && unicast) { + int ret, cs; + struct ip_vs_conn *cp; + union nf_inet_addr daddr = { .all = { 0, 0, 0, 0 } }; + + ip_vs_service_put(svc); + + /* create a new connection entry */ + IP_VS_DBG(6, "ip_vs_leave: create a cache_bypass entry\n"); + cp = ip_vs_conn_new(svc->af, iph.protocol, + &iph.saddr, pptr[0], + &iph.daddr, pptr[1], + &daddr, 0, + IP_VS_CONN_F_BYPASS, + NULL); + if (cp == NULL) + return NF_DROP; + + /* statistics */ + ip_vs_in_stats(cp, skb); + + /* set state */ + cs = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp); + + /* transmit the first SYN packet */ + ret = cp->packet_xmit(skb, cp, pp); + /* do not touch skb anymore */ + + atomic_inc(&cp->in_pkts); + ip_vs_conn_put(cp); + return ret; + } + + /* + * When the virtual ftp service is presented, packets destined + * for other services on the VIP may get here (except services + * listed in the ipvs table), pass the packets, because it is + * not ipvs job to decide to drop the packets. + */ + if ((svc->port == FTPPORT) && (pptr[1] != FTPPORT)) { + ip_vs_service_put(svc); + return NF_ACCEPT; + } + + ip_vs_service_put(svc); + + /* + * Notify the client that the destination is unreachable, and + * release the socket buffer. + * Since it is in IP layer, the TCP socket is not actually + * created, the TCP RST packet cannot be sent, instead that + * ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP. --WZ + */ +#ifdef CONFIG_IP_VS_IPV6 + if (svc->af == AF_INET6) + icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0, + skb->dev); + else +#endif + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); + + return NF_DROP; +} + + +/* + * It is hooked before NF_IP_PRI_NAT_SRC at the NF_INET_POST_ROUTING + * chain, and is used for VS/NAT. + * It detects packets for VS/NAT connections and sends the packets + * immediately. This can avoid that iptable_nat mangles the packets + * for VS/NAT. + */ +static unsigned int ip_vs_post_routing(unsigned int hooknum, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + if (!skb->ipvs_property) + return NF_ACCEPT; + /* The packet was sent from IPVS, exit this chain */ + return NF_STOP; +} + +__sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset) +{ + return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0)); +} + +static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user) +{ + int err = ip_defrag(skb, user); + + if (!err) + ip_send_check(ip_hdr(skb)); + + return err; +} + +#ifdef CONFIG_IP_VS_IPV6 +static inline int ip_vs_gather_frags_v6(struct sk_buff *skb, u_int32_t user) +{ + /* TODO IPv6: Find out what to do here for IPv6 */ + return 0; +} +#endif + +/* + * Packet has been made sufficiently writable in caller + * - inout: 1=in->out, 0=out->in + */ +void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp, + struct ip_vs_conn *cp, int inout) +{ + struct iphdr *iph = ip_hdr(skb); + unsigned int icmp_offset = iph->ihl*4; + struct icmphdr *icmph = (struct icmphdr *)(skb_network_header(skb) + + icmp_offset); + struct iphdr *ciph = (struct iphdr *)(icmph + 1); + + if (inout) { + iph->saddr = cp->vaddr.ip; + ip_send_check(iph); + ciph->daddr = cp->vaddr.ip; + ip_send_check(ciph); + } else { + iph->daddr = cp->daddr.ip; + ip_send_check(iph); + ciph->saddr = cp->daddr.ip; + ip_send_check(ciph); + } + + /* the TCP/UDP port */ + if (IPPROTO_TCP == ciph->protocol || IPPROTO_UDP == ciph->protocol) { + __be16 *ports = (void *)ciph + ciph->ihl*4; + + if (inout) + ports[1] = cp->vport; + else + ports[0] = cp->dport; + } + + /* And finally the ICMP checksum */ + icmph->checksum = 0; + icmph->checksum = ip_vs_checksum_complete(skb, icmp_offset); + skb->ip_summed = CHECKSUM_UNNECESSARY; + + if (inout) + IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph, + "Forwarding altered outgoing ICMP"); + else + IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph, + "Forwarding altered incoming ICMP"); +} + +#ifdef CONFIG_IP_VS_IPV6 +void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp, + struct ip_vs_conn *cp, int inout) +{ + struct ipv6hdr *iph = ipv6_hdr(skb); + unsigned int icmp_offset = sizeof(struct ipv6hdr); + struct icmp6hdr *icmph = (struct icmp6hdr *)(skb_network_header(skb) + + icmp_offset); + struct ipv6hdr *ciph = (struct ipv6hdr *)(icmph + 1); + + if (inout) { + iph->saddr = cp->vaddr.in6; + ciph->daddr = cp->vaddr.in6; + } else { + iph->daddr = cp->daddr.in6; + ciph->saddr = cp->daddr.in6; + } + + /* the TCP/UDP port */ + if (IPPROTO_TCP == ciph->nexthdr || IPPROTO_UDP == ciph->nexthdr) { + __be16 *ports = (void *)ciph + sizeof(struct ipv6hdr); + + if (inout) + ports[1] = cp->vport; + else + ports[0] = cp->dport; + } + + /* And finally the ICMP checksum */ + icmph->icmp6_cksum = 0; + /* TODO IPv6: is this correct for ICMPv6? */ + ip_vs_checksum_complete(skb, icmp_offset); + skb->ip_summed = CHECKSUM_UNNECESSARY; + + if (inout) + IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph, + "Forwarding altered outgoing ICMPv6"); + else + IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph, + "Forwarding altered incoming ICMPv6"); +} +#endif + +/* Handle relevant response ICMP messages - forward to the right + * destination host. Used for NAT and local client. + */ +static int handle_response_icmp(int af, struct sk_buff *skb, + union nf_inet_addr *snet, + __u8 protocol, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp, + unsigned int offset, unsigned int ihl) +{ + unsigned int verdict = NF_DROP; + + if (IP_VS_FWD_METHOD(cp) != 0) { + IP_VS_ERR("shouldn't reach here, because the box is on the " + "half connection in the tun/dr module.\n"); + } + + /* Ensure the checksum is correct */ + if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) { + /* Failed checksum! */ + IP_VS_DBG_BUF(1, "Forward ICMP: failed checksum from %s!\n", + IP_VS_DBG_ADDR(af, snet)); + goto out; + } + + if (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol) + offset += 2 * sizeof(__u16); + if (!skb_make_writable(skb, offset)) + goto out; + +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + ip_vs_nat_icmp_v6(skb, pp, cp, 1); + else +#endif + ip_vs_nat_icmp(skb, pp, cp, 1); + + /* do the statistics and put it back */ + ip_vs_out_stats(cp, skb); + + skb->ipvs_property = 1; + verdict = NF_ACCEPT; + +out: + __ip_vs_conn_put(cp); + + return verdict; +} + +/* + * Handle ICMP messages in the inside-to-outside direction (outgoing). + * Find any that might be relevant, check against existing connections. + * Currently handles error types - unreachable, quench, ttl exceeded. + */ +static int ip_vs_out_icmp(struct sk_buff *skb, int *related) +{ + struct iphdr *iph; + struct icmphdr _icmph, *ic; + struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */ + struct ip_vs_iphdr ciph; + struct ip_vs_conn *cp; + struct ip_vs_protocol *pp; + unsigned int offset, ihl; + union nf_inet_addr snet; + + *related = 1; + + /* reassemble IP fragments */ + if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) { + if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT)) + return NF_STOLEN; + } + + iph = ip_hdr(skb); + offset = ihl = iph->ihl * 4; + ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); + if (ic == NULL) + return NF_DROP; + + IP_VS_DBG(12, "Outgoing ICMP (%d,%d) %u.%u.%u.%u->%u.%u.%u.%u\n", + ic->type, ntohs(icmp_id(ic)), + NIPQUAD(iph->saddr), NIPQUAD(iph->daddr)); + + /* + * Work through seeing if this is for us. + * These checks are supposed to be in an order that means easy + * things are checked first to speed up processing.... however + * this means that some packets will manage to get a long way + * down this stack and then be rejected, but that's life. + */ + if ((ic->type != ICMP_DEST_UNREACH) && + (ic->type != ICMP_SOURCE_QUENCH) && + (ic->type != ICMP_TIME_EXCEEDED)) { + *related = 0; + return NF_ACCEPT; + } + + /* Now find the contained IP header */ + offset += sizeof(_icmph); + cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); + if (cih == NULL) + return NF_ACCEPT; /* The packet looks wrong, ignore */ + + pp = ip_vs_proto_get(cih->protocol); + if (!pp) + return NF_ACCEPT; + + /* Is the embedded protocol header present? */ + if (unlikely(cih->frag_off & htons(IP_OFFSET) && + pp->dont_defrag)) + return NF_ACCEPT; + + IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMP for"); + + offset += cih->ihl * 4; + + ip_vs_fill_iphdr(AF_INET, cih, &ciph); + /* The embedded headers contain source and dest in reverse order */ + cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1); + if (!cp) + return NF_ACCEPT; + + snet.ip = iph->saddr; + return handle_response_icmp(AF_INET, skb, &snet, cih->protocol, cp, + pp, offset, ihl); +} + +#ifdef CONFIG_IP_VS_IPV6 +static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related) +{ + struct ipv6hdr *iph; + struct icmp6hdr _icmph, *ic; + struct ipv6hdr _ciph, *cih; /* The ip header contained + within the ICMP */ + struct ip_vs_iphdr ciph; + struct ip_vs_conn *cp; + struct ip_vs_protocol *pp; + unsigned int offset; + union nf_inet_addr snet; + + *related = 1; + + /* reassemble IP fragments */ + if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) { + if (ip_vs_gather_frags_v6(skb, IP_DEFRAG_VS_OUT)) + return NF_STOLEN; + } + + iph = ipv6_hdr(skb); + offset = sizeof(struct ipv6hdr); + ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); + if (ic == NULL) + return NF_DROP; + + IP_VS_DBG(12, "Outgoing ICMPv6 (%d,%d) " NIP6_FMT "->" NIP6_FMT "\n", + ic->icmp6_type, ntohs(icmpv6_id(ic)), + NIP6(iph->saddr), NIP6(iph->daddr)); + + /* + * Work through seeing if this is for us. + * These checks are supposed to be in an order that means easy + * things are checked first to speed up processing.... however + * this means that some packets will manage to get a long way + * down this stack and then be rejected, but that's life. + */ + if ((ic->icmp6_type != ICMPV6_DEST_UNREACH) && + (ic->icmp6_type != ICMPV6_PKT_TOOBIG) && + (ic->icmp6_type != ICMPV6_TIME_EXCEED)) { + *related = 0; + return NF_ACCEPT; + } + + /* Now find the contained IP header */ + offset += sizeof(_icmph); + cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); + if (cih == NULL) + return NF_ACCEPT; /* The packet looks wrong, ignore */ + + pp = ip_vs_proto_get(cih->nexthdr); + if (!pp) + return NF_ACCEPT; + + /* Is the embedded protocol header present? */ + /* TODO: we don't support fragmentation at the moment anyways */ + if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag)) + return NF_ACCEPT; + + IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMPv6 for"); + + offset += sizeof(struct ipv6hdr); + + ip_vs_fill_iphdr(AF_INET6, cih, &ciph); + /* The embedded headers contain source and dest in reverse order */ + cp = pp->conn_out_get(AF_INET6, skb, pp, &ciph, offset, 1); + if (!cp) + return NF_ACCEPT; + + ipv6_addr_copy(&snet.in6, &iph->saddr); + return handle_response_icmp(AF_INET6, skb, &snet, cih->nexthdr, cp, + pp, offset, sizeof(struct ipv6hdr)); +} +#endif + +static inline int is_tcp_reset(const struct sk_buff *skb, int nh_len) +{ + struct tcphdr _tcph, *th; + + th = skb_header_pointer(skb, nh_len, sizeof(_tcph), &_tcph); + if (th == NULL) + return 0; + return th->rst; +} + +/* Handle response packets: rewrite addresses and send away... + * Used for NAT and local client. + */ +static unsigned int +handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, + struct ip_vs_conn *cp, int ihl) +{ + IP_VS_DBG_PKT(11, pp, skb, 0, "Outgoing packet"); + + if (!skb_make_writable(skb, ihl)) + goto drop; + + /* mangle the packet */ + if (pp->snat_handler && !pp->snat_handler(skb, pp, cp)) + goto drop; + +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + ipv6_hdr(skb)->saddr = cp->vaddr.in6; + else +#endif + { + ip_hdr(skb)->saddr = cp->vaddr.ip; + ip_send_check(ip_hdr(skb)); + } + + /* For policy routing, packets originating from this + * machine itself may be routed differently to packets + * passing through. We want this packet to be routed as + * if it came from this machine itself. So re-compute + * the routing information. + */ +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) { + if (ip6_route_me_harder(skb) != 0) + goto drop; + } else +#endif + if (ip_route_me_harder(skb, RTN_LOCAL) != 0) + goto drop; + + IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT"); + + ip_vs_out_stats(cp, skb); + ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp); + ip_vs_conn_put(cp); + + skb->ipvs_property = 1; + + LeaveFunction(11); + return NF_ACCEPT; + +drop: + ip_vs_conn_put(cp); + kfree_skb(skb); + return NF_STOLEN; +} + +/* + * It is hooked at the NF_INET_FORWARD chain, used only for VS/NAT. + * Check if outgoing packet belongs to the established ip_vs_conn. + */ +static unsigned int +ip_vs_out(unsigned int hooknum, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct ip_vs_iphdr iph; + struct ip_vs_protocol *pp; + struct ip_vs_conn *cp; + int af; + + EnterFunction(11); + + af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6; + + if (skb->ipvs_property) + return NF_ACCEPT; + + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) { + if (unlikely(iph.protocol == IPPROTO_ICMPV6)) { + int related, verdict = ip_vs_out_icmp_v6(skb, &related); + + if (related) + return verdict; + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); + } + } else +#endif + if (unlikely(iph.protocol == IPPROTO_ICMP)) { + int related, verdict = ip_vs_out_icmp(skb, &related); + + if (related) + return verdict; + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); + } + + pp = ip_vs_proto_get(iph.protocol); + if (unlikely(!pp)) + return NF_ACCEPT; + + /* reassemble IP fragments */ +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) { + if (unlikely(iph.protocol == IPPROTO_ICMPV6)) { + int related, verdict = ip_vs_out_icmp_v6(skb, &related); + + if (related) + return verdict; + + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); + } + } else +#endif + if (unlikely(ip_hdr(skb)->frag_off & htons(IP_MF|IP_OFFSET) && + !pp->dont_defrag)) { + if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT)) + return NF_STOLEN; + + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); + } + + /* + * Check if the packet belongs to an existing entry + */ + cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0); + + if (unlikely(!cp)) { + if (sysctl_ip_vs_nat_icmp_send && + (pp->protocol == IPPROTO_TCP || + pp->protocol == IPPROTO_UDP)) { + __be16 _ports[2], *pptr; + + pptr = skb_header_pointer(skb, iph.len, + sizeof(_ports), _ports); + if (pptr == NULL) + return NF_ACCEPT; /* Not for me */ + if (ip_vs_lookup_real_service(af, iph.protocol, + &iph.saddr, + pptr[0])) { + /* + * Notify the real server: there is no + * existing entry if it is not RST + * packet or not TCP packet. + */ + if (iph.protocol != IPPROTO_TCP + || !is_tcp_reset(skb, iph.len)) { +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + icmpv6_send(skb, + ICMPV6_DEST_UNREACH, + ICMPV6_PORT_UNREACH, + 0, skb->dev); + else +#endif + icmp_send(skb, + ICMP_DEST_UNREACH, + ICMP_PORT_UNREACH, 0); + return NF_DROP; + } + } + } + IP_VS_DBG_PKT(12, pp, skb, 0, + "packet continues traversal as normal"); + return NF_ACCEPT; + } + + return handle_response(af, skb, pp, cp, iph.len); +} + + +/* + * Handle ICMP messages in the outside-to-inside direction (incoming). + * Find any that might be relevant, check against existing connections, + * forward to the right destination host if relevant. + * Currently handles error types - unreachable, quench, ttl exceeded. + */ +static int +ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) +{ + struct iphdr *iph; + struct icmphdr _icmph, *ic; + struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */ + struct ip_vs_iphdr ciph; + struct ip_vs_conn *cp; + struct ip_vs_protocol *pp; + unsigned int offset, ihl, verdict; + union nf_inet_addr snet; + + *related = 1; + + /* reassemble IP fragments */ + if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) { + if (ip_vs_gather_frags(skb, hooknum == NF_INET_LOCAL_IN ? + IP_DEFRAG_VS_IN : IP_DEFRAG_VS_FWD)) + return NF_STOLEN; + } + + iph = ip_hdr(skb); + offset = ihl = iph->ihl * 4; + ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); + if (ic == NULL) + return NF_DROP; + + IP_VS_DBG(12, "Incoming ICMP (%d,%d) %u.%u.%u.%u->%u.%u.%u.%u\n", + ic->type, ntohs(icmp_id(ic)), + NIPQUAD(iph->saddr), NIPQUAD(iph->daddr)); + + /* + * Work through seeing if this is for us. + * These checks are supposed to be in an order that means easy + * things are checked first to speed up processing.... however + * this means that some packets will manage to get a long way + * down this stack and then be rejected, but that's life. + */ + if ((ic->type != ICMP_DEST_UNREACH) && + (ic->type != ICMP_SOURCE_QUENCH) && + (ic->type != ICMP_TIME_EXCEEDED)) { + *related = 0; + return NF_ACCEPT; + } + + /* Now find the contained IP header */ + offset += sizeof(_icmph); + cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); + if (cih == NULL) + return NF_ACCEPT; /* The packet looks wrong, ignore */ + + pp = ip_vs_proto_get(cih->protocol); + if (!pp) + return NF_ACCEPT; + + /* Is the embedded protocol header present? */ + if (unlikely(cih->frag_off & htons(IP_OFFSET) && + pp->dont_defrag)) + return NF_ACCEPT; + + IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMP for"); + + offset += cih->ihl * 4; + + ip_vs_fill_iphdr(AF_INET, cih, &ciph); + /* The embedded headers contain source and dest in reverse order */ + cp = pp->conn_in_get(AF_INET, skb, pp, &ciph, offset, 1); + if (!cp) { + /* The packet could also belong to a local client */ + cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1); + if (cp) { + snet.ip = iph->saddr; + return handle_response_icmp(AF_INET, skb, &snet, + cih->protocol, cp, pp, + offset, ihl); + } + return NF_ACCEPT; + } + + verdict = NF_DROP; + + /* Ensure the checksum is correct */ + if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) { + /* Failed checksum! */ + IP_VS_DBG(1, "Incoming ICMP: failed checksum from %d.%d.%d.%d!\n", + NIPQUAD(iph->saddr)); + goto out; + } + + /* do the statistics and put it back */ + ip_vs_in_stats(cp, skb); + if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol) + offset += 2 * sizeof(__u16); + verdict = ip_vs_icmp_xmit(skb, cp, pp, offset); + /* do not touch skb anymore */ + + out: + __ip_vs_conn_put(cp); + + return verdict; +} + +#ifdef CONFIG_IP_VS_IPV6 +static int +ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum) +{ + struct ipv6hdr *iph; + struct icmp6hdr _icmph, *ic; + struct ipv6hdr _ciph, *cih; /* The ip header contained + within the ICMP */ + struct ip_vs_iphdr ciph; + struct ip_vs_conn *cp; + struct ip_vs_protocol *pp; + unsigned int offset, verdict; + union nf_inet_addr snet; + + *related = 1; + + /* reassemble IP fragments */ + if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) { + if (ip_vs_gather_frags_v6(skb, hooknum == NF_INET_LOCAL_IN ? + IP_DEFRAG_VS_IN : + IP_DEFRAG_VS_FWD)) + return NF_STOLEN; + } + + iph = ipv6_hdr(skb); + offset = sizeof(struct ipv6hdr); + ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); + if (ic == NULL) + return NF_DROP; + + IP_VS_DBG(12, "Incoming ICMPv6 (%d,%d) " NIP6_FMT "->" NIP6_FMT "\n", + ic->icmp6_type, ntohs(icmpv6_id(ic)), + NIP6(iph->saddr), NIP6(iph->daddr)); + + /* + * Work through seeing if this is for us. + * These checks are supposed to be in an order that means easy + * things are checked first to speed up processing.... however + * this means that some packets will manage to get a long way + * down this stack and then be rejected, but that's life. + */ + if ((ic->icmp6_type != ICMPV6_DEST_UNREACH) && + (ic->icmp6_type != ICMPV6_PKT_TOOBIG) && + (ic->icmp6_type != ICMPV6_TIME_EXCEED)) { + *related = 0; + return NF_ACCEPT; + } + + /* Now find the contained IP header */ + offset += sizeof(_icmph); + cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); + if (cih == NULL) + return NF_ACCEPT; /* The packet looks wrong, ignore */ + + pp = ip_vs_proto_get(cih->nexthdr); + if (!pp) + return NF_ACCEPT; + + /* Is the embedded protocol header present? */ + /* TODO: we don't support fragmentation at the moment anyways */ + if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag)) + return NF_ACCEPT; + + IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMPv6 for"); + + offset += sizeof(struct ipv6hdr); + + ip_vs_fill_iphdr(AF_INET6, cih, &ciph); + /* The embedded headers contain source and dest in reverse order */ + cp = pp->conn_in_get(AF_INET6, skb, pp, &ciph, offset, 1); + if (!cp) { + /* The packet could also belong to a local client */ + cp = pp->conn_out_get(AF_INET6, skb, pp, &ciph, offset, 1); + if (cp) { + ipv6_addr_copy(&snet.in6, &iph->saddr); + return handle_response_icmp(AF_INET6, skb, &snet, + cih->nexthdr, + cp, pp, offset, + sizeof(struct ipv6hdr)); + } + return NF_ACCEPT; + } + + verdict = NF_DROP; + + /* do the statistics and put it back */ + ip_vs_in_stats(cp, skb); + if (IPPROTO_TCP == cih->nexthdr || IPPROTO_UDP == cih->nexthdr) + offset += 2 * sizeof(__u16); + verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, offset); + /* do not touch skb anymore */ + + __ip_vs_conn_put(cp); + + return verdict; +} +#endif + + +/* + * Check if it's for virtual services, look it up, + * and send it on its way... + */ +static unsigned int +ip_vs_in(unsigned int hooknum, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct ip_vs_iphdr iph; + struct ip_vs_protocol *pp; + struct ip_vs_conn *cp; + int ret, restart, af; + + af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6; + + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); + + /* + * Big tappo: only PACKET_HOST, including loopback for local client + * Don't handle local packets on IPv6 for now + */ + if (unlikely(skb->pkt_type != PACKET_HOST)) { + IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s ignored\n", + skb->pkt_type, + iph.protocol, + IP_VS_DBG_ADDR(af, &iph.daddr)); + return NF_ACCEPT; + } + + if (unlikely(iph.protocol == IPPROTO_ICMP)) { + int related, verdict = ip_vs_in_icmp(skb, &related, hooknum); + + if (related) + return verdict; + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); + } + + /* Protocol supported? */ + pp = ip_vs_proto_get(iph.protocol); + if (unlikely(!pp)) + return NF_ACCEPT; + + /* + * Check if the packet belongs to an existing connection entry + */ + cp = pp->conn_in_get(af, skb, pp, &iph, iph.len, 0); + + if (unlikely(!cp)) { + int v; + + /* For local client packets, it could be a response */ + cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0); + if (cp) + return handle_response(af, skb, pp, cp, iph.len); + + if (!pp->conn_schedule(af, skb, pp, &v, &cp)) + return v; + } + + if (unlikely(!cp)) { + /* sorry, all this trouble for a no-hit :) */ + IP_VS_DBG_PKT(12, pp, skb, 0, + "packet continues traversal as normal"); + return NF_ACCEPT; + } + + IP_VS_DBG_PKT(11, pp, skb, 0, "Incoming packet"); + + /* Check the server status */ + if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) { + /* the destination server is not available */ + + if (sysctl_ip_vs_expire_nodest_conn) { + /* try to expire the connection immediately */ + ip_vs_conn_expire_now(cp); + } + /* don't restart its timer, and silently + drop the packet. */ + __ip_vs_conn_put(cp); + return NF_DROP; + } + + ip_vs_in_stats(cp, skb); + restart = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp); + if (cp->packet_xmit) + ret = cp->packet_xmit(skb, cp, pp); + /* do not touch skb anymore */ + else { + IP_VS_DBG_RL("warning: packet_xmit is null"); + ret = NF_ACCEPT; + } + + /* Increase its packet counter and check if it is needed + * to be synchronized + * + * Sync connection if it is about to close to + * encorage the standby servers to update the connections timeout + */ + atomic_inc(&cp->in_pkts); + if (af == AF_INET && + (ip_vs_sync_state & IP_VS_STATE_MASTER) && + (((cp->protocol != IPPROTO_TCP || + cp->state == IP_VS_TCP_S_ESTABLISHED) && + (atomic_read(&cp->in_pkts) % sysctl_ip_vs_sync_threshold[1] + == sysctl_ip_vs_sync_threshold[0])) || + ((cp->protocol == IPPROTO_TCP) && (cp->old_state != cp->state) && + ((cp->state == IP_VS_TCP_S_FIN_WAIT) || + (cp->state == IP_VS_TCP_S_CLOSE_WAIT) || + (cp->state == IP_VS_TCP_S_TIME_WAIT))))) + ip_vs_sync_conn(cp); + cp->old_state = cp->state; + + ip_vs_conn_put(cp); + return ret; +} + + +/* + * It is hooked at the NF_INET_FORWARD chain, in order to catch ICMP + * related packets destined for 0.0.0.0/0. + * When fwmark-based virtual service is used, such as transparent + * cache cluster, TCP packets can be marked and routed to ip_vs_in, + * but ICMP destined for 0.0.0.0/0 cannot not be easily marked and + * sent to ip_vs_in_icmp. So, catch them at the NF_INET_FORWARD chain + * and send them to ip_vs_in_icmp. + */ +static unsigned int +ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + int r; + + if (ip_hdr(skb)->protocol != IPPROTO_ICMP) + return NF_ACCEPT; + + return ip_vs_in_icmp(skb, &r, hooknum); +} + +#ifdef CONFIG_IP_VS_IPV6 +static unsigned int +ip_vs_forward_icmp_v6(unsigned int hooknum, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + int r; + + if (ipv6_hdr(skb)->nexthdr != IPPROTO_ICMPV6) + return NF_ACCEPT; + + return ip_vs_in_icmp_v6(skb, &r, hooknum); +} +#endif + + +static struct nf_hook_ops ip_vs_ops[] __read_mostly = { + /* After packet filtering, forward packet through VS/DR, VS/TUN, + * or VS/NAT(change destination), so that filtering rules can be + * applied to IPVS. */ + { + .hook = ip_vs_in, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_INET_LOCAL_IN, + .priority = 100, + }, + /* After packet filtering, change source only for VS/NAT */ + { + .hook = ip_vs_out, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_INET_FORWARD, + .priority = 100, + }, + /* After packet filtering (but before ip_vs_out_icmp), catch icmp + * destined for 0.0.0.0/0, which is for incoming IPVS connections */ + { + .hook = ip_vs_forward_icmp, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_INET_FORWARD, + .priority = 99, + }, + /* Before the netfilter connection tracking, exit from POST_ROUTING */ + { + .hook = ip_vs_post_routing, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_INET_POST_ROUTING, + .priority = NF_IP_PRI_NAT_SRC-1, + }, +#ifdef CONFIG_IP_VS_IPV6 + /* After packet filtering, forward packet through VS/DR, VS/TUN, + * or VS/NAT(change destination), so that filtering rules can be + * applied to IPVS. */ + { + .hook = ip_vs_in, + .owner = THIS_MODULE, + .pf = PF_INET6, + .hooknum = NF_INET_LOCAL_IN, + .priority = 100, + }, + /* After packet filtering, change source only for VS/NAT */ + { + .hook = ip_vs_out, + .owner = THIS_MODULE, + .pf = PF_INET6, + .hooknum = NF_INET_FORWARD, + .priority = 100, + }, + /* After packet filtering (but before ip_vs_out_icmp), catch icmp + * destined for 0.0.0.0/0, which is for incoming IPVS connections */ + { + .hook = ip_vs_forward_icmp_v6, + .owner = THIS_MODULE, + .pf = PF_INET6, + .hooknum = NF_INET_FORWARD, + .priority = 99, + }, + /* Before the netfilter connection tracking, exit from POST_ROUTING */ + { + .hook = ip_vs_post_routing, + .owner = THIS_MODULE, + .pf = PF_INET6, + .hooknum = NF_INET_POST_ROUTING, + .priority = NF_IP6_PRI_NAT_SRC-1, + }, +#endif +}; + + +/* + * Initialize IP Virtual Server + */ +static int __init ip_vs_init(void) +{ + int ret; + + ip_vs_estimator_init(); + + ret = ip_vs_control_init(); + if (ret < 0) { + IP_VS_ERR("can't setup control.\n"); + goto cleanup_estimator; + } + + ip_vs_protocol_init(); + + ret = ip_vs_app_init(); + if (ret < 0) { + IP_VS_ERR("can't setup application helper.\n"); + goto cleanup_protocol; + } + + ret = ip_vs_conn_init(); + if (ret < 0) { + IP_VS_ERR("can't setup connection table.\n"); + goto cleanup_app; + } + + ret = nf_register_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); + if (ret < 0) { + IP_VS_ERR("can't register hooks.\n"); + goto cleanup_conn; + } + + IP_VS_INFO("ipvs loaded.\n"); + return ret; + + cleanup_conn: + ip_vs_conn_cleanup(); + cleanup_app: + ip_vs_app_cleanup(); + cleanup_protocol: + ip_vs_protocol_cleanup(); + ip_vs_control_cleanup(); + cleanup_estimator: + ip_vs_estimator_cleanup(); + return ret; +} + +static void __exit ip_vs_cleanup(void) +{ + nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); + ip_vs_conn_cleanup(); + ip_vs_app_cleanup(); + ip_vs_protocol_cleanup(); + ip_vs_control_cleanup(); + ip_vs_estimator_cleanup(); + IP_VS_INFO("ipvs unloaded.\n"); +} + +module_init(ip_vs_init); +module_exit(ip_vs_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c new file mode 100644 index 00000000000..0302cf3e503 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -0,0 +1,3443 @@ +/* + * IPVS An implementation of the IP virtual server support for the + * LINUX operating system. IPVS is now implemented as a module + * over the NetFilter framework. IPVS can be used to build a + * high-performance and highly available server based on a + * cluster of servers. + * + * Authors: Wensong Zhang + * Peter Kese + * Julian Anastasov + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#ifdef CONFIG_IP_VS_IPV6 +#include +#include +#endif +#include +#include +#include + +#include + +#include + +/* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. */ +static DEFINE_MUTEX(__ip_vs_mutex); + +/* lock for service table */ +static DEFINE_RWLOCK(__ip_vs_svc_lock); + +/* lock for table with the real services */ +static DEFINE_RWLOCK(__ip_vs_rs_lock); + +/* lock for state and timeout tables */ +static DEFINE_RWLOCK(__ip_vs_securetcp_lock); + +/* lock for drop entry handling */ +static DEFINE_SPINLOCK(__ip_vs_dropentry_lock); + +/* lock for drop packet handling */ +static DEFINE_SPINLOCK(__ip_vs_droppacket_lock); + +/* 1/rate drop and drop-entry variables */ +int ip_vs_drop_rate = 0; +int ip_vs_drop_counter = 0; +static atomic_t ip_vs_dropentry = ATOMIC_INIT(0); + +/* number of virtual services */ +static int ip_vs_num_services = 0; + +/* sysctl variables */ +static int sysctl_ip_vs_drop_entry = 0; +static int sysctl_ip_vs_drop_packet = 0; +static int sysctl_ip_vs_secure_tcp = 0; +static int sysctl_ip_vs_amemthresh = 1024; +static int sysctl_ip_vs_am_droprate = 10; +int sysctl_ip_vs_cache_bypass = 0; +int sysctl_ip_vs_expire_nodest_conn = 0; +int sysctl_ip_vs_expire_quiescent_template = 0; +int sysctl_ip_vs_sync_threshold[2] = { 3, 50 }; +int sysctl_ip_vs_nat_icmp_send = 0; + + +#ifdef CONFIG_IP_VS_DEBUG +static int sysctl_ip_vs_debug_level = 0; + +int ip_vs_get_debug_level(void) +{ + return sysctl_ip_vs_debug_level; +} +#endif + +#ifdef CONFIG_IP_VS_IPV6 +/* Taken from rt6_fill_node() in net/ipv6/route.c, is there a better way? */ +static int __ip_vs_addr_is_local_v6(const struct in6_addr *addr) +{ + struct rt6_info *rt; + struct flowi fl = { + .oif = 0, + .nl_u = { + .ip6_u = { + .daddr = *addr, + .saddr = { .s6_addr32 = {0, 0, 0, 0} }, } }, + }; + + rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl); + if (rt && rt->rt6i_dev && (rt->rt6i_dev->flags & IFF_LOOPBACK)) + return 1; + + return 0; +} +#endif +/* + * update_defense_level is called from keventd and from sysctl, + * so it needs to protect itself from softirqs + */ +static void update_defense_level(void) +{ + struct sysinfo i; + static int old_secure_tcp = 0; + int availmem; + int nomem; + int to_change = -1; + + /* we only count free and buffered memory (in pages) */ + si_meminfo(&i); + availmem = i.freeram + i.bufferram; + /* however in linux 2.5 the i.bufferram is total page cache size, + we need adjust it */ + /* si_swapinfo(&i); */ + /* availmem = availmem - (i.totalswap - i.freeswap); */ + + nomem = (availmem < sysctl_ip_vs_amemthresh); + + local_bh_disable(); + + /* drop_entry */ + spin_lock(&__ip_vs_dropentry_lock); + switch (sysctl_ip_vs_drop_entry) { + case 0: + atomic_set(&ip_vs_dropentry, 0); + break; + case 1: + if (nomem) { + atomic_set(&ip_vs_dropentry, 1); + sysctl_ip_vs_drop_entry = 2; + } else { + atomic_set(&ip_vs_dropentry, 0); + } + break; + case 2: + if (nomem) { + atomic_set(&ip_vs_dropentry, 1); + } else { + atomic_set(&ip_vs_dropentry, 0); + sysctl_ip_vs_drop_entry = 1; + }; + break; + case 3: + atomic_set(&ip_vs_dropentry, 1); + break; + } + spin_unlock(&__ip_vs_dropentry_lock); + + /* drop_packet */ + spin_lock(&__ip_vs_droppacket_lock); + switch (sysctl_ip_vs_drop_packet) { + case 0: + ip_vs_drop_rate = 0; + break; + case 1: + if (nomem) { + ip_vs_drop_rate = ip_vs_drop_counter + = sysctl_ip_vs_amemthresh / + (sysctl_ip_vs_amemthresh-availmem); + sysctl_ip_vs_drop_packet = 2; + } else { + ip_vs_drop_rate = 0; + } + break; + case 2: + if (nomem) { + ip_vs_drop_rate = ip_vs_drop_counter + = sysctl_ip_vs_amemthresh / + (sysctl_ip_vs_amemthresh-availmem); + } else { + ip_vs_drop_rate = 0; + sysctl_ip_vs_drop_packet = 1; + } + break; + case 3: + ip_vs_drop_rate = sysctl_ip_vs_am_droprate; + break; + } + spin_unlock(&__ip_vs_droppacket_lock); + + /* secure_tcp */ + write_lock(&__ip_vs_securetcp_lock); + switch (sysctl_ip_vs_secure_tcp) { + case 0: + if (old_secure_tcp >= 2) + to_change = 0; + break; + case 1: + if (nomem) { + if (old_secure_tcp < 2) + to_change = 1; + sysctl_ip_vs_secure_tcp = 2; + } else { + if (old_secure_tcp >= 2) + to_change = 0; + } + break; + case 2: + if (nomem) { + if (old_secure_tcp < 2) + to_change = 1; + } else { + if (old_secure_tcp >= 2) + to_change = 0; + sysctl_ip_vs_secure_tcp = 1; + } + break; + case 3: + if (old_secure_tcp < 2) + to_change = 1; + break; + } + old_secure_tcp = sysctl_ip_vs_secure_tcp; + if (to_change >= 0) + ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1); + write_unlock(&__ip_vs_securetcp_lock); + + local_bh_enable(); +} + + +/* + * Timer for checking the defense + */ +#define DEFENSE_TIMER_PERIOD 1*HZ +static void defense_work_handler(struct work_struct *work); +static DECLARE_DELAYED_WORK(defense_work, defense_work_handler); + +static void defense_work_handler(struct work_struct *work) +{ + update_defense_level(); + if (atomic_read(&ip_vs_dropentry)) + ip_vs_random_dropentry(); + + schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD); +} + +int +ip_vs_use_count_inc(void) +{ + return try_module_get(THIS_MODULE); +} + +void +ip_vs_use_count_dec(void) +{ + module_put(THIS_MODULE); +} + + +/* + * Hash table: for virtual service lookups + */ +#define IP_VS_SVC_TAB_BITS 8 +#define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS) +#define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1) + +/* the service table hashed by */ +static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE]; +/* the service table hashed by fwmark */ +static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE]; + +/* + * Hash table: for real service lookups + */ +#define IP_VS_RTAB_BITS 4 +#define IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS) +#define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1) + +static struct list_head ip_vs_rtable[IP_VS_RTAB_SIZE]; + +/* + * Trash for destinations + */ +static LIST_HEAD(ip_vs_dest_trash); + +/* + * FTP & NULL virtual service counters + */ +static atomic_t ip_vs_ftpsvc_counter = ATOMIC_INIT(0); +static atomic_t ip_vs_nullsvc_counter = ATOMIC_INIT(0); + + +/* + * Returns hash value for virtual service + */ +static __inline__ unsigned +ip_vs_svc_hashkey(int af, unsigned proto, const union nf_inet_addr *addr, + __be16 port) +{ + register unsigned porth = ntohs(port); + __be32 addr_fold = addr->ip; + +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + addr_fold = addr->ip6[0]^addr->ip6[1]^ + addr->ip6[2]^addr->ip6[3]; +#endif + + return (proto^ntohl(addr_fold)^(porth>>IP_VS_SVC_TAB_BITS)^porth) + & IP_VS_SVC_TAB_MASK; +} + +/* + * Returns hash value of fwmark for virtual service lookup + */ +static __inline__ unsigned ip_vs_svc_fwm_hashkey(__u32 fwmark) +{ + return fwmark & IP_VS_SVC_TAB_MASK; +} + +/* + * Hashes a service in the ip_vs_svc_table by + * or in the ip_vs_svc_fwm_table by fwmark. + * Should be called with locked tables. + */ +static int ip_vs_svc_hash(struct ip_vs_service *svc) +{ + unsigned hash; + + if (svc->flags & IP_VS_SVC_F_HASHED) { + IP_VS_ERR("ip_vs_svc_hash(): request for already hashed, " + "called from %p\n", __builtin_return_address(0)); + return 0; + } + + if (svc->fwmark == 0) { + /* + * Hash it by in ip_vs_svc_table + */ + hash = ip_vs_svc_hashkey(svc->af, svc->protocol, &svc->addr, + svc->port); + list_add(&svc->s_list, &ip_vs_svc_table[hash]); + } else { + /* + * Hash it by fwmark in ip_vs_svc_fwm_table + */ + hash = ip_vs_svc_fwm_hashkey(svc->fwmark); + list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]); + } + + svc->flags |= IP_VS_SVC_F_HASHED; + /* increase its refcnt because it is referenced by the svc table */ + atomic_inc(&svc->refcnt); + return 1; +} + + +/* + * Unhashes a service from ip_vs_svc_table/ip_vs_svc_fwm_table. + * Should be called with locked tables. + */ +static int ip_vs_svc_unhash(struct ip_vs_service *svc) +{ + if (!(svc->flags & IP_VS_SVC_F_HASHED)) { + IP_VS_ERR("ip_vs_svc_unhash(): request for unhash flagged, " + "called from %p\n", __builtin_return_address(0)); + return 0; + } + + if (svc->fwmark == 0) { + /* Remove it from the ip_vs_svc_table table */ + list_del(&svc->s_list); + } else { + /* Remove it from the ip_vs_svc_fwm_table table */ + list_del(&svc->f_list); + } + + svc->flags &= ~IP_VS_SVC_F_HASHED; + atomic_dec(&svc->refcnt); + return 1; +} + + +/* + * Get service by {proto,addr,port} in the service table. + */ +static inline struct ip_vs_service * +__ip_vs_service_get(int af, __u16 protocol, const union nf_inet_addr *vaddr, + __be16 vport) +{ + unsigned hash; + struct ip_vs_service *svc; + + /* Check for "full" addressed entries */ + hash = ip_vs_svc_hashkey(af, protocol, vaddr, vport); + + list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){ + if ((svc->af == af) + && ip_vs_addr_equal(af, &svc->addr, vaddr) + && (svc->port == vport) + && (svc->protocol == protocol)) { + /* HIT */ + atomic_inc(&svc->usecnt); + return svc; + } + } + + return NULL; +} + + +/* + * Get service by {fwmark} in the service table. + */ +static inline struct ip_vs_service * +__ip_vs_svc_fwm_get(int af, __u32 fwmark) +{ + unsigned hash; + struct ip_vs_service *svc; + + /* Check for fwmark addressed entries */ + hash = ip_vs_svc_fwm_hashkey(fwmark); + + list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) { + if (svc->fwmark == fwmark && svc->af == af) { + /* HIT */ + atomic_inc(&svc->usecnt); + return svc; + } + } + + return NULL; +} + +struct ip_vs_service * +ip_vs_service_get(int af, __u32 fwmark, __u16 protocol, + const union nf_inet_addr *vaddr, __be16 vport) +{ + struct ip_vs_service *svc; + + read_lock(&__ip_vs_svc_lock); + + /* + * Check the table hashed by fwmark first + */ + if (fwmark && (svc = __ip_vs_svc_fwm_get(af, fwmark))) + goto out; + + /* + * Check the table hashed by + * for "full" addressed entries + */ + svc = __ip_vs_service_get(af, protocol, vaddr, vport); + + if (svc == NULL + && protocol == IPPROTO_TCP + && atomic_read(&ip_vs_ftpsvc_counter) + && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) { + /* + * Check if ftp service entry exists, the packet + * might belong to FTP data connections. + */ + svc = __ip_vs_service_get(af, protocol, vaddr, FTPPORT); + } + + if (svc == NULL + && atomic_read(&ip_vs_nullsvc_counter)) { + /* + * Check if the catch-all port (port zero) exists + */ + svc = __ip_vs_service_get(af, protocol, vaddr, 0); + } + + out: + read_unlock(&__ip_vs_svc_lock); + + IP_VS_DBG_BUF(9, "lookup service: fwm %u %s %s:%u %s\n", + fwmark, ip_vs_proto_name(protocol), + IP_VS_DBG_ADDR(af, vaddr), ntohs(vport), + svc ? "hit" : "not hit"); + + return svc; +} + + +static inline void +__ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc) +{ + atomic_inc(&svc->refcnt); + dest->svc = svc; +} + +static inline void +__ip_vs_unbind_svc(struct ip_vs_dest *dest) +{ + struct ip_vs_service *svc = dest->svc; + + dest->svc = NULL; + if (atomic_dec_and_test(&svc->refcnt)) + kfree(svc); +} + + +/* + * Returns hash value for real service + */ +static inline unsigned ip_vs_rs_hashkey(int af, + const union nf_inet_addr *addr, + __be16 port) +{ + register unsigned porth = ntohs(port); + __be32 addr_fold = addr->ip; + +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + addr_fold = addr->ip6[0]^addr->ip6[1]^ + addr->ip6[2]^addr->ip6[3]; +#endif + + return (ntohl(addr_fold)^(porth>>IP_VS_RTAB_BITS)^porth) + & IP_VS_RTAB_MASK; +} + +/* + * Hashes ip_vs_dest in ip_vs_rtable by . + * should be called with locked tables. + */ +static int ip_vs_rs_hash(struct ip_vs_dest *dest) +{ + unsigned hash; + + if (!list_empty(&dest->d_list)) { + return 0; + } + + /* + * Hash by proto,addr,port, + * which are the parameters of the real service. + */ + hash = ip_vs_rs_hashkey(dest->af, &dest->addr, dest->port); + + list_add(&dest->d_list, &ip_vs_rtable[hash]); + + return 1; +} + +/* + * UNhashes ip_vs_dest from ip_vs_rtable. + * should be called with locked tables. + */ +static int ip_vs_rs_unhash(struct ip_vs_dest *dest) +{ + /* + * Remove it from the ip_vs_rtable table. + */ + if (!list_empty(&dest->d_list)) { + list_del(&dest->d_list); + INIT_LIST_HEAD(&dest->d_list); + } + + return 1; +} + +/* + * Lookup real service by in the real service table. + */ +struct ip_vs_dest * +ip_vs_lookup_real_service(int af, __u16 protocol, + const union nf_inet_addr *daddr, + __be16 dport) +{ + unsigned hash; + struct ip_vs_dest *dest; + + /* + * Check for "full" addressed entries + * Return the first found entry + */ + hash = ip_vs_rs_hashkey(af, daddr, dport); + + read_lock(&__ip_vs_rs_lock); + list_for_each_entry(dest, &ip_vs_rtable[hash], d_list) { + if ((dest->af == af) + && ip_vs_addr_equal(af, &dest->addr, daddr) + && (dest->port == dport) + && ((dest->protocol == protocol) || + dest->vfwmark)) { + /* HIT */ + read_unlock(&__ip_vs_rs_lock); + return dest; + } + } + read_unlock(&__ip_vs_rs_lock); + + return NULL; +} + +/* + * Lookup destination by {addr,port} in the given service + */ +static struct ip_vs_dest * +ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr, + __be16 dport) +{ + struct ip_vs_dest *dest; + + /* + * Find the destination for the given service + */ + list_for_each_entry(dest, &svc->destinations, n_list) { + if ((dest->af == svc->af) + && ip_vs_addr_equal(svc->af, &dest->addr, daddr) + && (dest->port == dport)) { + /* HIT */ + return dest; + } + } + + return NULL; +} + +/* + * Find destination by {daddr,dport,vaddr,protocol} + * Cretaed to be used in ip_vs_process_message() in + * the backup synchronization daemon. It finds the + * destination to be bound to the received connection + * on the backup. + * + * ip_vs_lookup_real_service() looked promissing, but + * seems not working as expected. + */ +struct ip_vs_dest *ip_vs_find_dest(int af, const union nf_inet_addr *daddr, + __be16 dport, + const union nf_inet_addr *vaddr, + __be16 vport, __u16 protocol) +{ + struct ip_vs_dest *dest; + struct ip_vs_service *svc; + + svc = ip_vs_service_get(af, 0, protocol, vaddr, vport); + if (!svc) + return NULL; + dest = ip_vs_lookup_dest(svc, daddr, dport); + if (dest) + atomic_inc(&dest->refcnt); + ip_vs_service_put(svc); + return dest; +} + +/* + * Lookup dest by {svc,addr,port} in the destination trash. + * The destination trash is used to hold the destinations that are removed + * from the service table but are still referenced by some conn entries. + * The reason to add the destination trash is when the dest is temporary + * down (either by administrator or by monitor program), the dest can be + * picked back from the trash, the remaining connections to the dest can + * continue, and the counting information of the dest is also useful for + * scheduling. + */ +static struct ip_vs_dest * +ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr, + __be16 dport) +{ + struct ip_vs_dest *dest, *nxt; + + /* + * Find the destination in trash + */ + list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) { + IP_VS_DBG_BUF(3, "Destination %u/%s:%u still in trash, " + "dest->refcnt=%d\n", + dest->vfwmark, + IP_VS_DBG_ADDR(svc->af, &dest->addr), + ntohs(dest->port), + atomic_read(&dest->refcnt)); + if (dest->af == svc->af && + ip_vs_addr_equal(svc->af, &dest->addr, daddr) && + dest->port == dport && + dest->vfwmark == svc->fwmark && + dest->protocol == svc->protocol && + (svc->fwmark || + (ip_vs_addr_equal(svc->af, &dest->vaddr, &svc->addr) && + dest->vport == svc->port))) { + /* HIT */ + return dest; + } + + /* + * Try to purge the destination from trash if not referenced + */ + if (atomic_read(&dest->refcnt) == 1) { + IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u " + "from trash\n", + dest->vfwmark, + IP_VS_DBG_ADDR(svc->af, &dest->addr), + ntohs(dest->port)); + list_del(&dest->n_list); + ip_vs_dst_reset(dest); + __ip_vs_unbind_svc(dest); + kfree(dest); + } + } + + return NULL; +} + + +/* + * Clean up all the destinations in the trash + * Called by the ip_vs_control_cleanup() + * + * When the ip_vs_control_clearup is activated by ipvs module exit, + * the service tables must have been flushed and all the connections + * are expired, and the refcnt of each destination in the trash must + * be 1, so we simply release them here. + */ +static void ip_vs_trash_cleanup(void) +{ + struct ip_vs_dest *dest, *nxt; + + list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) { + list_del(&dest->n_list); + ip_vs_dst_reset(dest); + __ip_vs_unbind_svc(dest); + kfree(dest); + } +} + + +static void +ip_vs_zero_stats(struct ip_vs_stats *stats) +{ + spin_lock_bh(&stats->lock); + + memset(&stats->ustats, 0, sizeof(stats->ustats)); + ip_vs_zero_estimator(stats); + + spin_unlock_bh(&stats->lock); +} + +/* + * Update a destination in the given service + */ +static void +__ip_vs_update_dest(struct ip_vs_service *svc, + struct ip_vs_dest *dest, struct ip_vs_dest_user_kern *udest) +{ + int conn_flags; + + /* set the weight and the flags */ + atomic_set(&dest->weight, udest->weight); + conn_flags = udest->conn_flags | IP_VS_CONN_F_INACTIVE; + + /* check if local node and update the flags */ +#ifdef CONFIG_IP_VS_IPV6 + if (svc->af == AF_INET6) { + if (__ip_vs_addr_is_local_v6(&udest->addr.in6)) { + conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK) + | IP_VS_CONN_F_LOCALNODE; + } + } else +#endif + if (inet_addr_type(&init_net, udest->addr.ip) == RTN_LOCAL) { + conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK) + | IP_VS_CONN_F_LOCALNODE; + } + + /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */ + if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != 0) { + conn_flags |= IP_VS_CONN_F_NOOUTPUT; + } else { + /* + * Put the real service in ip_vs_rtable if not present. + * For now only for NAT! + */ + write_lock_bh(&__ip_vs_rs_lock); + ip_vs_rs_hash(dest); + write_unlock_bh(&__ip_vs_rs_lock); + } + atomic_set(&dest->conn_flags, conn_flags); + + /* bind the service */ + if (!dest->svc) { + __ip_vs_bind_svc(dest, svc); + } else { + if (dest->svc != svc) { + __ip_vs_unbind_svc(dest); + ip_vs_zero_stats(&dest->stats); + __ip_vs_bind_svc(dest, svc); + } + } + + /* set the dest status flags */ + dest->flags |= IP_VS_DEST_F_AVAILABLE; + + if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold) + dest->flags &= ~IP_VS_DEST_F_OVERLOAD; + dest->u_threshold = udest->u_threshold; + dest->l_threshold = udest->l_threshold; +} + + +/* + * Create a destination for the given service + */ +static int +ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest, + struct ip_vs_dest **dest_p) +{ + struct ip_vs_dest *dest; + unsigned atype; + + EnterFunction(2); + +#ifdef CONFIG_IP_VS_IPV6 + if (svc->af == AF_INET6) { + atype = ipv6_addr_type(&udest->addr.in6); + if ((!(atype & IPV6_ADDR_UNICAST) || + atype & IPV6_ADDR_LINKLOCAL) && + !__ip_vs_addr_is_local_v6(&udest->addr.in6)) + return -EINVAL; + } else +#endif + { + atype = inet_addr_type(&init_net, udest->addr.ip); + if (atype != RTN_LOCAL && atype != RTN_UNICAST) + return -EINVAL; + } + + dest = kzalloc(sizeof(struct ip_vs_dest), GFP_ATOMIC); + if (dest == NULL) { + IP_VS_ERR("ip_vs_new_dest: kmalloc failed.\n"); + return -ENOMEM; + } + + dest->af = svc->af; + dest->protocol = svc->protocol; + dest->vaddr = svc->addr; + dest->vport = svc->port; + dest->vfwmark = svc->fwmark; + ip_vs_addr_copy(svc->af, &dest->addr, &udest->addr); + dest->port = udest->port; + + atomic_set(&dest->activeconns, 0); + atomic_set(&dest->inactconns, 0); + atomic_set(&dest->persistconns, 0); + atomic_set(&dest->refcnt, 0); + + INIT_LIST_HEAD(&dest->d_list); + spin_lock_init(&dest->dst_lock); + spin_lock_init(&dest->stats.lock); + __ip_vs_update_dest(svc, dest, udest); + ip_vs_new_estimator(&dest->stats); + + *dest_p = dest; + + LeaveFunction(2); + return 0; +} + + +/* + * Add a destination into an existing service + */ +static int +ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) +{ + struct ip_vs_dest *dest; + union nf_inet_addr daddr; + __be16 dport = udest->port; + int ret; + + EnterFunction(2); + + if (udest->weight < 0) { + IP_VS_ERR("ip_vs_add_dest(): server weight less than zero\n"); + return -ERANGE; + } + + if (udest->l_threshold > udest->u_threshold) { + IP_VS_ERR("ip_vs_add_dest(): lower threshold is higher than " + "upper threshold\n"); + return -ERANGE; + } + + ip_vs_addr_copy(svc->af, &daddr, &udest->addr); + + /* + * Check if the dest already exists in the list + */ + dest = ip_vs_lookup_dest(svc, &daddr, dport); + + if (dest != NULL) { + IP_VS_DBG(1, "ip_vs_add_dest(): dest already exists\n"); + return -EEXIST; + } + + /* + * Check if the dest already exists in the trash and + * is from the same service + */ + dest = ip_vs_trash_get_dest(svc, &daddr, dport); + + if (dest != NULL) { + IP_VS_DBG_BUF(3, "Get destination %s:%u from trash, " + "dest->refcnt=%d, service %u/%s:%u\n", + IP_VS_DBG_ADDR(svc->af, &daddr), ntohs(dport), + atomic_read(&dest->refcnt), + dest->vfwmark, + IP_VS_DBG_ADDR(svc->af, &dest->vaddr), + ntohs(dest->vport)); + + __ip_vs_update_dest(svc, dest, udest); + + /* + * Get the destination from the trash + */ + list_del(&dest->n_list); + + ip_vs_new_estimator(&dest->stats); + + write_lock_bh(&__ip_vs_svc_lock); + + /* + * Wait until all other svc users go away. + */ + IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); + + list_add(&dest->n_list, &svc->destinations); + svc->num_dests++; + + /* call the update_service function of its scheduler */ + if (svc->scheduler->update_service) + svc->scheduler->update_service(svc); + + write_unlock_bh(&__ip_vs_svc_lock); + return 0; + } + + /* + * Allocate and initialize the dest structure + */ + ret = ip_vs_new_dest(svc, udest, &dest); + if (ret) { + return ret; + } + + /* + * Add the dest entry into the list + */ + atomic_inc(&dest->refcnt); + + write_lock_bh(&__ip_vs_svc_lock); + + /* + * Wait until all other svc users go away. + */ + IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); + + list_add(&dest->n_list, &svc->destinations); + svc->num_dests++; + + /* call the update_service function of its scheduler */ + if (svc->scheduler->update_service) + svc->scheduler->update_service(svc); + + write_unlock_bh(&__ip_vs_svc_lock); + + LeaveFunction(2); + + return 0; +} + + +/* + * Edit a destination in the given service + */ +static int +ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) +{ + struct ip_vs_dest *dest; + union nf_inet_addr daddr; + __be16 dport = udest->port; + + EnterFunction(2); + + if (udest->weight < 0) { + IP_VS_ERR("ip_vs_edit_dest(): server weight less than zero\n"); + return -ERANGE; + } + + if (udest->l_threshold > udest->u_threshold) { + IP_VS_ERR("ip_vs_edit_dest(): lower threshold is higher than " + "upper threshold\n"); + return -ERANGE; + } + + ip_vs_addr_copy(svc->af, &daddr, &udest->addr); + + /* + * Lookup the destination list + */ + dest = ip_vs_lookup_dest(svc, &daddr, dport); + + if (dest == NULL) { + IP_VS_DBG(1, "ip_vs_edit_dest(): dest doesn't exist\n"); + return -ENOENT; + } + + __ip_vs_update_dest(svc, dest, udest); + + write_lock_bh(&__ip_vs_svc_lock); + + /* Wait until all other svc users go away */ + IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); + + /* call the update_service, because server weight may be changed */ + if (svc->scheduler->update_service) + svc->scheduler->update_service(svc); + + write_unlock_bh(&__ip_vs_svc_lock); + + LeaveFunction(2); + + return 0; +} + + +/* + * Delete a destination (must be already unlinked from the service) + */ +static void __ip_vs_del_dest(struct ip_vs_dest *dest) +{ + ip_vs_kill_estimator(&dest->stats); + + /* + * Remove it from the d-linked list with the real services. + */ + write_lock_bh(&__ip_vs_rs_lock); + ip_vs_rs_unhash(dest); + write_unlock_bh(&__ip_vs_rs_lock); + + /* + * Decrease the refcnt of the dest, and free the dest + * if nobody refers to it (refcnt=0). Otherwise, throw + * the destination into the trash. + */ + if (atomic_dec_and_test(&dest->refcnt)) { + ip_vs_dst_reset(dest); + /* simply decrease svc->refcnt here, let the caller check + and release the service if nobody refers to it. + Only user context can release destination and service, + and only one user context can update virtual service at a + time, so the operation here is OK */ + atomic_dec(&dest->svc->refcnt); + kfree(dest); + } else { + IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, " + "dest->refcnt=%d\n", + IP_VS_DBG_ADDR(dest->af, &dest->addr), + ntohs(dest->port), + atomic_read(&dest->refcnt)); + list_add(&dest->n_list, &ip_vs_dest_trash); + atomic_inc(&dest->refcnt); + } +} + + +/* + * Unlink a destination from the given service + */ +static void __ip_vs_unlink_dest(struct ip_vs_service *svc, + struct ip_vs_dest *dest, + int svcupd) +{ + dest->flags &= ~IP_VS_DEST_F_AVAILABLE; + + /* + * Remove it from the d-linked destination list. + */ + list_del(&dest->n_list); + svc->num_dests--; + + /* + * Call the update_service function of its scheduler + */ + if (svcupd && svc->scheduler->update_service) + svc->scheduler->update_service(svc); +} + + +/* + * Delete a destination server in the given service + */ +static int +ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) +{ + struct ip_vs_dest *dest; + __be16 dport = udest->port; + + EnterFunction(2); + + dest = ip_vs_lookup_dest(svc, &udest->addr, dport); + + if (dest == NULL) { + IP_VS_DBG(1, "ip_vs_del_dest(): destination not found!\n"); + return -ENOENT; + } + + write_lock_bh(&__ip_vs_svc_lock); + + /* + * Wait until all other svc users go away. + */ + IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); + + /* + * Unlink dest from the service + */ + __ip_vs_unlink_dest(svc, dest, 1); + + write_unlock_bh(&__ip_vs_svc_lock); + + /* + * Delete the destination + */ + __ip_vs_del_dest(dest); + + LeaveFunction(2); + + return 0; +} + + +/* + * Add a service into the service hash table + */ +static int +ip_vs_add_service(struct ip_vs_service_user_kern *u, + struct ip_vs_service **svc_p) +{ + int ret = 0; + struct ip_vs_scheduler *sched = NULL; + struct ip_vs_service *svc = NULL; + + /* increase the module use count */ + ip_vs_use_count_inc(); + + /* Lookup the scheduler by 'u->sched_name' */ + sched = ip_vs_scheduler_get(u->sched_name); + if (sched == NULL) { + IP_VS_INFO("Scheduler module ip_vs_%s not found\n", + u->sched_name); + ret = -ENOENT; + goto out_mod_dec; + } + +#ifdef CONFIG_IP_VS_IPV6 + if (u->af == AF_INET6) { + if (!sched->supports_ipv6) { + ret = -EAFNOSUPPORT; + goto out_err; + } + if ((u->netmask < 1) || (u->netmask > 128)) { + ret = -EINVAL; + goto out_err; + } + } +#endif + + svc = kzalloc(sizeof(struct ip_vs_service), GFP_ATOMIC); + if (svc == NULL) { + IP_VS_DBG(1, "ip_vs_add_service: kmalloc failed.\n"); + ret = -ENOMEM; + goto out_err; + } + + /* I'm the first user of the service */ + atomic_set(&svc->usecnt, 1); + atomic_set(&svc->refcnt, 0); + + svc->af = u->af; + svc->protocol = u->protocol; + ip_vs_addr_copy(svc->af, &svc->addr, &u->addr); + svc->port = u->port; + svc->fwmark = u->fwmark; + svc->flags = u->flags; + svc->timeout = u->timeout * HZ; + svc->netmask = u->netmask; + + INIT_LIST_HEAD(&svc->destinations); + rwlock_init(&svc->sched_lock); + spin_lock_init(&svc->stats.lock); + + /* Bind the scheduler */ + ret = ip_vs_bind_scheduler(svc, sched); + if (ret) + goto out_err; + sched = NULL; + + /* Update the virtual service counters */ + if (svc->port == FTPPORT) + atomic_inc(&ip_vs_ftpsvc_counter); + else if (svc->port == 0) + atomic_inc(&ip_vs_nullsvc_counter); + + ip_vs_new_estimator(&svc->stats); + + /* Count only IPv4 services for old get/setsockopt interface */ + if (svc->af == AF_INET) + ip_vs_num_services++; + + /* Hash the service into the service table */ + write_lock_bh(&__ip_vs_svc_lock); + ip_vs_svc_hash(svc); + write_unlock_bh(&__ip_vs_svc_lock); + + *svc_p = svc; + return 0; + + out_err: + if (svc != NULL) { + if (svc->scheduler) + ip_vs_unbind_scheduler(svc); + if (svc->inc) { + local_bh_disable(); + ip_vs_app_inc_put(svc->inc); + local_bh_enable(); + } + kfree(svc); + } + ip_vs_scheduler_put(sched); + + out_mod_dec: + /* decrease the module use count */ + ip_vs_use_count_dec(); + + return ret; +} + + +/* + * Edit a service and bind it with a new scheduler + */ +static int +ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u) +{ + struct ip_vs_scheduler *sched, *old_sched; + int ret = 0; + + /* + * Lookup the scheduler, by 'u->sched_name' + */ + sched = ip_vs_scheduler_get(u->sched_name); + if (sched == NULL) { + IP_VS_INFO("Scheduler module ip_vs_%s not found\n", + u->sched_name); + return -ENOENT; + } + old_sched = sched; + +#ifdef CONFIG_IP_VS_IPV6 + if (u->af == AF_INET6) { + if (!sched->supports_ipv6) { + ret = -EAFNOSUPPORT; + goto out; + } + if ((u->netmask < 1) || (u->netmask > 128)) { + ret = -EINVAL; + goto out; + } + } +#endif + + write_lock_bh(&__ip_vs_svc_lock); + + /* + * Wait until all other svc users go away. + */ + IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); + + /* + * Set the flags and timeout value + */ + svc->flags = u->flags | IP_VS_SVC_F_HASHED; + svc->timeout = u->timeout * HZ; + svc->netmask = u->netmask; + + old_sched = svc->scheduler; + if (sched != old_sched) { + /* + * Unbind the old scheduler + */ + if ((ret = ip_vs_unbind_scheduler(svc))) { + old_sched = sched; + goto out_unlock; + } + + /* + * Bind the new scheduler + */ + if ((ret = ip_vs_bind_scheduler(svc, sched))) { + /* + * If ip_vs_bind_scheduler fails, restore the old + * scheduler. + * The main reason of failure is out of memory. + * + * The question is if the old scheduler can be + * restored all the time. TODO: if it cannot be + * restored some time, we must delete the service, + * otherwise the system may crash. + */ + ip_vs_bind_scheduler(svc, old_sched); + old_sched = sched; + goto out_unlock; + } + } + + out_unlock: + write_unlock_bh(&__ip_vs_svc_lock); +#ifdef CONFIG_IP_VS_IPV6 + out: +#endif + + if (old_sched) + ip_vs_scheduler_put(old_sched); + + return ret; +} + + +/* + * Delete a service from the service list + * - The service must be unlinked, unlocked and not referenced! + * - We are called under _bh lock + */ +static void __ip_vs_del_service(struct ip_vs_service *svc) +{ + struct ip_vs_dest *dest, *nxt; + struct ip_vs_scheduler *old_sched; + + /* Count only IPv4 services for old get/setsockopt interface */ + if (svc->af == AF_INET) + ip_vs_num_services--; + + ip_vs_kill_estimator(&svc->stats); + + /* Unbind scheduler */ + old_sched = svc->scheduler; + ip_vs_unbind_scheduler(svc); + if (old_sched) + ip_vs_scheduler_put(old_sched); + + /* Unbind app inc */ + if (svc->inc) { + ip_vs_app_inc_put(svc->inc); + svc->inc = NULL; + } + + /* + * Unlink the whole destination list + */ + list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) { + __ip_vs_unlink_dest(svc, dest, 0); + __ip_vs_del_dest(dest); + } + + /* + * Update the virtual service counters + */ + if (svc->port == FTPPORT) + atomic_dec(&ip_vs_ftpsvc_counter); + else if (svc->port == 0) + atomic_dec(&ip_vs_nullsvc_counter); + + /* + * Free the service if nobody refers to it + */ + if (atomic_read(&svc->refcnt) == 0) + kfree(svc); + + /* decrease the module use count */ + ip_vs_use_count_dec(); +} + +/* + * Delete a service from the service list + */ +static int ip_vs_del_service(struct ip_vs_service *svc) +{ + if (svc == NULL) + return -EEXIST; + + /* + * Unhash it from the service table + */ + write_lock_bh(&__ip_vs_svc_lock); + + ip_vs_svc_unhash(svc); + + /* + * Wait until all the svc users go away. + */ + IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); + + __ip_vs_del_service(svc); + + write_unlock_bh(&__ip_vs_svc_lock); + + return 0; +} + + +/* + * Flush all the virtual services + */ +static int ip_vs_flush(void) +{ + int idx; + struct ip_vs_service *svc, *nxt; + + /* + * Flush the service table hashed by + */ + for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx], s_list) { + write_lock_bh(&__ip_vs_svc_lock); + ip_vs_svc_unhash(svc); + /* + * Wait until all the svc users go away. + */ + IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0); + __ip_vs_del_service(svc); + write_unlock_bh(&__ip_vs_svc_lock); + } + } + + /* + * Flush the service table hashed by fwmark + */ + for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + list_for_each_entry_safe(svc, nxt, + &ip_vs_svc_fwm_table[idx], f_list) { + write_lock_bh(&__ip_vs_svc_lock); + ip_vs_svc_unhash(svc); + /* + * Wait until all the svc users go away. + */ + IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0); + __ip_vs_del_service(svc); + write_unlock_bh(&__ip_vs_svc_lock); + } + } + + return 0; +} + + +/* + * Zero counters in a service or all services + */ +static int ip_vs_zero_service(struct ip_vs_service *svc) +{ + struct ip_vs_dest *dest; + + write_lock_bh(&__ip_vs_svc_lock); + list_for_each_entry(dest, &svc->destinations, n_list) { + ip_vs_zero_stats(&dest->stats); + } + ip_vs_zero_stats(&svc->stats); + write_unlock_bh(&__ip_vs_svc_lock); + return 0; +} + +static int ip_vs_zero_all(void) +{ + int idx; + struct ip_vs_service *svc; + + for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { + ip_vs_zero_service(svc); + } + } + + for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { + ip_vs_zero_service(svc); + } + } + + ip_vs_zero_stats(&ip_vs_stats); + return 0; +} + + +static int +proc_do_defense_mode(ctl_table *table, int write, struct file * filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int *valp = table->data; + int val = *valp; + int rc; + + rc = proc_dointvec(table, write, filp, buffer, lenp, ppos); + if (write && (*valp != val)) { + if ((*valp < 0) || (*valp > 3)) { + /* Restore the correct value */ + *valp = val; + } else { + update_defense_level(); + } + } + return rc; +} + + +static int +proc_do_sync_threshold(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int *valp = table->data; + int val[2]; + int rc; + + /* backup the value first */ + memcpy(val, valp, sizeof(val)); + + rc = proc_dointvec(table, write, filp, buffer, lenp, ppos); + if (write && (valp[0] < 0 || valp[1] < 0 || valp[0] >= valp[1])) { + /* Restore the correct value */ + memcpy(valp, val, sizeof(val)); + } + return rc; +} + + +/* + * IPVS sysctl table (under the /proc/sys/net/ipv4/vs/) + */ + +static struct ctl_table vs_vars[] = { + { + .procname = "amemthresh", + .data = &sysctl_ip_vs_amemthresh, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#ifdef CONFIG_IP_VS_DEBUG + { + .procname = "debug_level", + .data = &sysctl_ip_vs_debug_level, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif + { + .procname = "am_droprate", + .data = &sysctl_ip_vs_am_droprate, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .procname = "drop_entry", + .data = &sysctl_ip_vs_drop_entry, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_do_defense_mode, + }, + { + .procname = "drop_packet", + .data = &sysctl_ip_vs_drop_packet, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_do_defense_mode, + }, + { + .procname = "secure_tcp", + .data = &sysctl_ip_vs_secure_tcp, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_do_defense_mode, + }, +#if 0 + { + .procname = "timeout_established", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_ESTABLISHED], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .procname = "timeout_synsent", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_SENT], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .procname = "timeout_synrecv", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_RECV], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .procname = "timeout_finwait", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_FIN_WAIT], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .procname = "timeout_timewait", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_TIME_WAIT], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .procname = "timeout_close", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .procname = "timeout_closewait", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE_WAIT], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .procname = "timeout_lastack", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_LAST_ACK], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .procname = "timeout_listen", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_LISTEN], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .procname = "timeout_synack", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYNACK], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .procname = "timeout_udp", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_UDP], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .procname = "timeout_icmp", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_ICMP], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, +#endif + { + .procname = "cache_bypass", + .data = &sysctl_ip_vs_cache_bypass, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .procname = "expire_nodest_conn", + .data = &sysctl_ip_vs_expire_nodest_conn, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .procname = "expire_quiescent_template", + .data = &sysctl_ip_vs_expire_quiescent_template, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .procname = "sync_threshold", + .data = &sysctl_ip_vs_sync_threshold, + .maxlen = sizeof(sysctl_ip_vs_sync_threshold), + .mode = 0644, + .proc_handler = &proc_do_sync_threshold, + }, + { + .procname = "nat_icmp_send", + .data = &sysctl_ip_vs_nat_icmp_send, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { .ctl_name = 0 } +}; + +const struct ctl_path net_vs_ctl_path[] = { + { .procname = "net", .ctl_name = CTL_NET, }, + { .procname = "ipv4", .ctl_name = NET_IPV4, }, + { .procname = "vs", }, + { } +}; +EXPORT_SYMBOL_GPL(net_vs_ctl_path); + +static struct ctl_table_header * sysctl_header; + +#ifdef CONFIG_PROC_FS + +struct ip_vs_iter { + struct list_head *table; + int bucket; +}; + +/* + * Write the contents of the VS rule table to a PROCfs file. + * (It is kept just for backward compatibility) + */ +static inline const char *ip_vs_fwd_name(unsigned flags) +{ + switch (flags & IP_VS_CONN_F_FWD_MASK) { + case IP_VS_CONN_F_LOCALNODE: + return "Local"; + case IP_VS_CONN_F_TUNNEL: + return "Tunnel"; + case IP_VS_CONN_F_DROUTE: + return "Route"; + default: + return "Masq"; + } +} + + +/* Get the Nth entry in the two lists */ +static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos) +{ + struct ip_vs_iter *iter = seq->private; + int idx; + struct ip_vs_service *svc; + + /* look in hash by protocol */ + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { + if (pos-- == 0){ + iter->table = ip_vs_svc_table; + iter->bucket = idx; + return svc; + } + } + } + + /* keep looking in fwmark */ + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { + if (pos-- == 0) { + iter->table = ip_vs_svc_fwm_table; + iter->bucket = idx; + return svc; + } + } + } + + return NULL; +} + +static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos) +__acquires(__ip_vs_svc_lock) +{ + + read_lock_bh(&__ip_vs_svc_lock); + return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN; +} + + +static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct list_head *e; + struct ip_vs_iter *iter; + struct ip_vs_service *svc; + + ++*pos; + if (v == SEQ_START_TOKEN) + return ip_vs_info_array(seq,0); + + svc = v; + iter = seq->private; + + if (iter->table == ip_vs_svc_table) { + /* next service in table hashed by protocol */ + if ((e = svc->s_list.next) != &ip_vs_svc_table[iter->bucket]) + return list_entry(e, struct ip_vs_service, s_list); + + + while (++iter->bucket < IP_VS_SVC_TAB_SIZE) { + list_for_each_entry(svc,&ip_vs_svc_table[iter->bucket], + s_list) { + return svc; + } + } + + iter->table = ip_vs_svc_fwm_table; + iter->bucket = -1; + goto scan_fwmark; + } + + /* next service in hashed by fwmark */ + if ((e = svc->f_list.next) != &ip_vs_svc_fwm_table[iter->bucket]) + return list_entry(e, struct ip_vs_service, f_list); + + scan_fwmark: + while (++iter->bucket < IP_VS_SVC_TAB_SIZE) { + list_for_each_entry(svc, &ip_vs_svc_fwm_table[iter->bucket], + f_list) + return svc; + } + + return NULL; +} + +static void ip_vs_info_seq_stop(struct seq_file *seq, void *v) +__releases(__ip_vs_svc_lock) +{ + read_unlock_bh(&__ip_vs_svc_lock); +} + + +static int ip_vs_info_seq_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) { + seq_printf(seq, + "IP Virtual Server version %d.%d.%d (size=%d)\n", + NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE); + seq_puts(seq, + "Prot LocalAddress:Port Scheduler Flags\n"); + seq_puts(seq, + " -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n"); + } else { + const struct ip_vs_service *svc = v; + const struct ip_vs_iter *iter = seq->private; + const struct ip_vs_dest *dest; + + if (iter->table == ip_vs_svc_table) { +#ifdef CONFIG_IP_VS_IPV6 + if (svc->af == AF_INET6) + seq_printf(seq, "%s [" NIP6_FMT "]:%04X %s ", + ip_vs_proto_name(svc->protocol), + NIP6(svc->addr.in6), + ntohs(svc->port), + svc->scheduler->name); + else +#endif + seq_printf(seq, "%s %08X:%04X %s ", + ip_vs_proto_name(svc->protocol), + ntohl(svc->addr.ip), + ntohs(svc->port), + svc->scheduler->name); + } else { + seq_printf(seq, "FWM %08X %s ", + svc->fwmark, svc->scheduler->name); + } + + if (svc->flags & IP_VS_SVC_F_PERSISTENT) + seq_printf(seq, "persistent %d %08X\n", + svc->timeout, + ntohl(svc->netmask)); + else + seq_putc(seq, '\n'); + + list_for_each_entry(dest, &svc->destinations, n_list) { +#ifdef CONFIG_IP_VS_IPV6 + if (dest->af == AF_INET6) + seq_printf(seq, + " -> [" NIP6_FMT "]:%04X" + " %-7s %-6d %-10d %-10d\n", + NIP6(dest->addr.in6), + ntohs(dest->port), + ip_vs_fwd_name(atomic_read(&dest->conn_flags)), + atomic_read(&dest->weight), + atomic_read(&dest->activeconns), + atomic_read(&dest->inactconns)); + else +#endif + seq_printf(seq, + " -> %08X:%04X " + "%-7s %-6d %-10d %-10d\n", + ntohl(dest->addr.ip), + ntohs(dest->port), + ip_vs_fwd_name(atomic_read(&dest->conn_flags)), + atomic_read(&dest->weight), + atomic_read(&dest->activeconns), + atomic_read(&dest->inactconns)); + + } + } + return 0; +} + +static const struct seq_operations ip_vs_info_seq_ops = { + .start = ip_vs_info_seq_start, + .next = ip_vs_info_seq_next, + .stop = ip_vs_info_seq_stop, + .show = ip_vs_info_seq_show, +}; + +static int ip_vs_info_open(struct inode *inode, struct file *file) +{ + return seq_open_private(file, &ip_vs_info_seq_ops, + sizeof(struct ip_vs_iter)); +} + +static const struct file_operations ip_vs_info_fops = { + .owner = THIS_MODULE, + .open = ip_vs_info_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +#endif + +struct ip_vs_stats ip_vs_stats = { + .lock = __SPIN_LOCK_UNLOCKED(ip_vs_stats.lock), +}; + +#ifdef CONFIG_PROC_FS +static int ip_vs_stats_show(struct seq_file *seq, void *v) +{ + +/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ + seq_puts(seq, + " Total Incoming Outgoing Incoming Outgoing\n"); + seq_printf(seq, + " Conns Packets Packets Bytes Bytes\n"); + + spin_lock_bh(&ip_vs_stats.lock); + seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", ip_vs_stats.ustats.conns, + ip_vs_stats.ustats.inpkts, ip_vs_stats.ustats.outpkts, + (unsigned long long) ip_vs_stats.ustats.inbytes, + (unsigned long long) ip_vs_stats.ustats.outbytes); + +/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ + seq_puts(seq, + " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n"); + seq_printf(seq,"%8X %8X %8X %16X %16X\n", + ip_vs_stats.ustats.cps, + ip_vs_stats.ustats.inpps, + ip_vs_stats.ustats.outpps, + ip_vs_stats.ustats.inbps, + ip_vs_stats.ustats.outbps); + spin_unlock_bh(&ip_vs_stats.lock); + + return 0; +} + +static int ip_vs_stats_seq_open(struct inode *inode, struct file *file) +{ + return single_open(file, ip_vs_stats_show, NULL); +} + +static const struct file_operations ip_vs_stats_fops = { + .owner = THIS_MODULE, + .open = ip_vs_stats_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +#endif + +/* + * Set timeout values for tcp tcpfin udp in the timeout_table. + */ +static int ip_vs_set_timeout(struct ip_vs_timeout_user *u) +{ + IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n", + u->tcp_timeout, + u->tcp_fin_timeout, + u->udp_timeout); + +#ifdef CONFIG_IP_VS_PROTO_TCP + if (u->tcp_timeout) { + ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED] + = u->tcp_timeout * HZ; + } + + if (u->tcp_fin_timeout) { + ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT] + = u->tcp_fin_timeout * HZ; + } +#endif + +#ifdef CONFIG_IP_VS_PROTO_UDP + if (u->udp_timeout) { + ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL] + = u->udp_timeout * HZ; + } +#endif + return 0; +} + + +#define SET_CMDID(cmd) (cmd - IP_VS_BASE_CTL) +#define SERVICE_ARG_LEN (sizeof(struct ip_vs_service_user)) +#define SVCDEST_ARG_LEN (sizeof(struct ip_vs_service_user) + \ + sizeof(struct ip_vs_dest_user)) +#define TIMEOUT_ARG_LEN (sizeof(struct ip_vs_timeout_user)) +#define DAEMON_ARG_LEN (sizeof(struct ip_vs_daemon_user)) +#define MAX_ARG_LEN SVCDEST_ARG_LEN + +static const unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = { + [SET_CMDID(IP_VS_SO_SET_ADD)] = SERVICE_ARG_LEN, + [SET_CMDID(IP_VS_SO_SET_EDIT)] = SERVICE_ARG_LEN, + [SET_CMDID(IP_VS_SO_SET_DEL)] = SERVICE_ARG_LEN, + [SET_CMDID(IP_VS_SO_SET_FLUSH)] = 0, + [SET_CMDID(IP_VS_SO_SET_ADDDEST)] = SVCDEST_ARG_LEN, + [SET_CMDID(IP_VS_SO_SET_DELDEST)] = SVCDEST_ARG_LEN, + [SET_CMDID(IP_VS_SO_SET_EDITDEST)] = SVCDEST_ARG_LEN, + [SET_CMDID(IP_VS_SO_SET_TIMEOUT)] = TIMEOUT_ARG_LEN, + [SET_CMDID(IP_VS_SO_SET_STARTDAEMON)] = DAEMON_ARG_LEN, + [SET_CMDID(IP_VS_SO_SET_STOPDAEMON)] = DAEMON_ARG_LEN, + [SET_CMDID(IP_VS_SO_SET_ZERO)] = SERVICE_ARG_LEN, +}; + +static void ip_vs_copy_usvc_compat(struct ip_vs_service_user_kern *usvc, + struct ip_vs_service_user *usvc_compat) +{ + usvc->af = AF_INET; + usvc->protocol = usvc_compat->protocol; + usvc->addr.ip = usvc_compat->addr; + usvc->port = usvc_compat->port; + usvc->fwmark = usvc_compat->fwmark; + + /* Deep copy of sched_name is not needed here */ + usvc->sched_name = usvc_compat->sched_name; + + usvc->flags = usvc_compat->flags; + usvc->timeout = usvc_compat->timeout; + usvc->netmask = usvc_compat->netmask; +} + +static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest, + struct ip_vs_dest_user *udest_compat) +{ + udest->addr.ip = udest_compat->addr; + udest->port = udest_compat->port; + udest->conn_flags = udest_compat->conn_flags; + udest->weight = udest_compat->weight; + udest->u_threshold = udest_compat->u_threshold; + udest->l_threshold = udest_compat->l_threshold; +} + +static int +do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) +{ + int ret; + unsigned char arg[MAX_ARG_LEN]; + struct ip_vs_service_user *usvc_compat; + struct ip_vs_service_user_kern usvc; + struct ip_vs_service *svc; + struct ip_vs_dest_user *udest_compat; + struct ip_vs_dest_user_kern udest; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (len != set_arglen[SET_CMDID(cmd)]) { + IP_VS_ERR("set_ctl: len %u != %u\n", + len, set_arglen[SET_CMDID(cmd)]); + return -EINVAL; + } + + if (copy_from_user(arg, user, len) != 0) + return -EFAULT; + + /* increase the module use count */ + ip_vs_use_count_inc(); + + if (mutex_lock_interruptible(&__ip_vs_mutex)) { + ret = -ERESTARTSYS; + goto out_dec; + } + + if (cmd == IP_VS_SO_SET_FLUSH) { + /* Flush the virtual service */ + ret = ip_vs_flush(); + goto out_unlock; + } else if (cmd == IP_VS_SO_SET_TIMEOUT) { + /* Set timeout values for (tcp tcpfin udp) */ + ret = ip_vs_set_timeout((struct ip_vs_timeout_user *)arg); + goto out_unlock; + } else if (cmd == IP_VS_SO_SET_STARTDAEMON) { + struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg; + ret = start_sync_thread(dm->state, dm->mcast_ifn, dm->syncid); + goto out_unlock; + } else if (cmd == IP_VS_SO_SET_STOPDAEMON) { + struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg; + ret = stop_sync_thread(dm->state); + goto out_unlock; + } + + usvc_compat = (struct ip_vs_service_user *)arg; + udest_compat = (struct ip_vs_dest_user *)(usvc_compat + 1); + + /* We only use the new structs internally, so copy userspace compat + * structs to extended internal versions */ + ip_vs_copy_usvc_compat(&usvc, usvc_compat); + ip_vs_copy_udest_compat(&udest, udest_compat); + + if (cmd == IP_VS_SO_SET_ZERO) { + /* if no service address is set, zero counters in all */ + if (!usvc.fwmark && !usvc.addr.ip && !usvc.port) { + ret = ip_vs_zero_all(); + goto out_unlock; + } + } + + /* Check for valid protocol: TCP or UDP, even for fwmark!=0 */ + if (usvc.protocol != IPPROTO_TCP && usvc.protocol != IPPROTO_UDP) { + IP_VS_ERR("set_ctl: invalid protocol: %d %d.%d.%d.%d:%d %s\n", + usvc.protocol, NIPQUAD(usvc.addr.ip), + ntohs(usvc.port), usvc.sched_name); + ret = -EFAULT; + goto out_unlock; + } + + /* Lookup the exact service by or fwmark */ + if (usvc.fwmark == 0) + svc = __ip_vs_service_get(usvc.af, usvc.protocol, + &usvc.addr, usvc.port); + else + svc = __ip_vs_svc_fwm_get(usvc.af, usvc.fwmark); + + if (cmd != IP_VS_SO_SET_ADD + && (svc == NULL || svc->protocol != usvc.protocol)) { + ret = -ESRCH; + goto out_unlock; + } + + switch (cmd) { + case IP_VS_SO_SET_ADD: + if (svc != NULL) + ret = -EEXIST; + else + ret = ip_vs_add_service(&usvc, &svc); + break; + case IP_VS_SO_SET_EDIT: + ret = ip_vs_edit_service(svc, &usvc); + break; + case IP_VS_SO_SET_DEL: + ret = ip_vs_del_service(svc); + if (!ret) + goto out_unlock; + break; + case IP_VS_SO_SET_ZERO: + ret = ip_vs_zero_service(svc); + break; + case IP_VS_SO_SET_ADDDEST: + ret = ip_vs_add_dest(svc, &udest); + break; + case IP_VS_SO_SET_EDITDEST: + ret = ip_vs_edit_dest(svc, &udest); + break; + case IP_VS_SO_SET_DELDEST: + ret = ip_vs_del_dest(svc, &udest); + break; + default: + ret = -EINVAL; + } + + if (svc) + ip_vs_service_put(svc); + + out_unlock: + mutex_unlock(&__ip_vs_mutex); + out_dec: + /* decrease the module use count */ + ip_vs_use_count_dec(); + + return ret; +} + + +static void +ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src) +{ + spin_lock_bh(&src->lock); + memcpy(dst, &src->ustats, sizeof(*dst)); + spin_unlock_bh(&src->lock); +} + +static void +ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src) +{ + dst->protocol = src->protocol; + dst->addr = src->addr.ip; + dst->port = src->port; + dst->fwmark = src->fwmark; + strlcpy(dst->sched_name, src->scheduler->name, sizeof(dst->sched_name)); + dst->flags = src->flags; + dst->timeout = src->timeout / HZ; + dst->netmask = src->netmask; + dst->num_dests = src->num_dests; + ip_vs_copy_stats(&dst->stats, &src->stats); +} + +static inline int +__ip_vs_get_service_entries(const struct ip_vs_get_services *get, + struct ip_vs_get_services __user *uptr) +{ + int idx, count=0; + struct ip_vs_service *svc; + struct ip_vs_service_entry entry; + int ret = 0; + + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { + /* Only expose IPv4 entries to old interface */ + if (svc->af != AF_INET) + continue; + + if (count >= get->num_services) + goto out; + memset(&entry, 0, sizeof(entry)); + ip_vs_copy_service(&entry, svc); + if (copy_to_user(&uptr->entrytable[count], + &entry, sizeof(entry))) { + ret = -EFAULT; + goto out; + } + count++; + } + } + + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { + /* Only expose IPv4 entries to old interface */ + if (svc->af != AF_INET) + continue; + + if (count >= get->num_services) + goto out; + memset(&entry, 0, sizeof(entry)); + ip_vs_copy_service(&entry, svc); + if (copy_to_user(&uptr->entrytable[count], + &entry, sizeof(entry))) { + ret = -EFAULT; + goto out; + } + count++; + } + } + out: + return ret; +} + +static inline int +__ip_vs_get_dest_entries(const struct ip_vs_get_dests *get, + struct ip_vs_get_dests __user *uptr) +{ + struct ip_vs_service *svc; + union nf_inet_addr addr = { .ip = get->addr }; + int ret = 0; + + if (get->fwmark) + svc = __ip_vs_svc_fwm_get(AF_INET, get->fwmark); + else + svc = __ip_vs_service_get(AF_INET, get->protocol, &addr, + get->port); + + if (svc) { + int count = 0; + struct ip_vs_dest *dest; + struct ip_vs_dest_entry entry; + + list_for_each_entry(dest, &svc->destinations, n_list) { + if (count >= get->num_dests) + break; + + entry.addr = dest->addr.ip; + entry.port = dest->port; + entry.conn_flags = atomic_read(&dest->conn_flags); + entry.weight = atomic_read(&dest->weight); + entry.u_threshold = dest->u_threshold; + entry.l_threshold = dest->l_threshold; + entry.activeconns = atomic_read(&dest->activeconns); + entry.inactconns = atomic_read(&dest->inactconns); + entry.persistconns = atomic_read(&dest->persistconns); + ip_vs_copy_stats(&entry.stats, &dest->stats); + if (copy_to_user(&uptr->entrytable[count], + &entry, sizeof(entry))) { + ret = -EFAULT; + break; + } + count++; + } + ip_vs_service_put(svc); + } else + ret = -ESRCH; + return ret; +} + +static inline void +__ip_vs_get_timeouts(struct ip_vs_timeout_user *u) +{ +#ifdef CONFIG_IP_VS_PROTO_TCP + u->tcp_timeout = + ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ; + u->tcp_fin_timeout = + ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ; +#endif +#ifdef CONFIG_IP_VS_PROTO_UDP + u->udp_timeout = + ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL] / HZ; +#endif +} + + +#define GET_CMDID(cmd) (cmd - IP_VS_BASE_CTL) +#define GET_INFO_ARG_LEN (sizeof(struct ip_vs_getinfo)) +#define GET_SERVICES_ARG_LEN (sizeof(struct ip_vs_get_services)) +#define GET_SERVICE_ARG_LEN (sizeof(struct ip_vs_service_entry)) +#define GET_DESTS_ARG_LEN (sizeof(struct ip_vs_get_dests)) +#define GET_TIMEOUT_ARG_LEN (sizeof(struct ip_vs_timeout_user)) +#define GET_DAEMON_ARG_LEN (sizeof(struct ip_vs_daemon_user) * 2) + +static const unsigned char get_arglen[GET_CMDID(IP_VS_SO_GET_MAX)+1] = { + [GET_CMDID(IP_VS_SO_GET_VERSION)] = 64, + [GET_CMDID(IP_VS_SO_GET_INFO)] = GET_INFO_ARG_LEN, + [GET_CMDID(IP_VS_SO_GET_SERVICES)] = GET_SERVICES_ARG_LEN, + [GET_CMDID(IP_VS_SO_GET_SERVICE)] = GET_SERVICE_ARG_LEN, + [GET_CMDID(IP_VS_SO_GET_DESTS)] = GET_DESTS_ARG_LEN, + [GET_CMDID(IP_VS_SO_GET_TIMEOUT)] = GET_TIMEOUT_ARG_LEN, + [GET_CMDID(IP_VS_SO_GET_DAEMON)] = GET_DAEMON_ARG_LEN, +}; + +static int +do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) +{ + unsigned char arg[128]; + int ret = 0; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (*len < get_arglen[GET_CMDID(cmd)]) { + IP_VS_ERR("get_ctl: len %u < %u\n", + *len, get_arglen[GET_CMDID(cmd)]); + return -EINVAL; + } + + if (copy_from_user(arg, user, get_arglen[GET_CMDID(cmd)]) != 0) + return -EFAULT; + + if (mutex_lock_interruptible(&__ip_vs_mutex)) + return -ERESTARTSYS; + + switch (cmd) { + case IP_VS_SO_GET_VERSION: + { + char buf[64]; + + sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)", + NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE); + if (copy_to_user(user, buf, strlen(buf)+1) != 0) { + ret = -EFAULT; + goto out; + } + *len = strlen(buf)+1; + } + break; + + case IP_VS_SO_GET_INFO: + { + struct ip_vs_getinfo info; + info.version = IP_VS_VERSION_CODE; + info.size = IP_VS_CONN_TAB_SIZE; + info.num_services = ip_vs_num_services; + if (copy_to_user(user, &info, sizeof(info)) != 0) + ret = -EFAULT; + } + break; + + case IP_VS_SO_GET_SERVICES: + { + struct ip_vs_get_services *get; + int size; + + get = (struct ip_vs_get_services *)arg; + size = sizeof(*get) + + sizeof(struct ip_vs_service_entry) * get->num_services; + if (*len != size) { + IP_VS_ERR("length: %u != %u\n", *len, size); + ret = -EINVAL; + goto out; + } + ret = __ip_vs_get_service_entries(get, user); + } + break; + + case IP_VS_SO_GET_SERVICE: + { + struct ip_vs_service_entry *entry; + struct ip_vs_service *svc; + union nf_inet_addr addr; + + entry = (struct ip_vs_service_entry *)arg; + addr.ip = entry->addr; + if (entry->fwmark) + svc = __ip_vs_svc_fwm_get(AF_INET, entry->fwmark); + else + svc = __ip_vs_service_get(AF_INET, entry->protocol, + &addr, entry->port); + if (svc) { + ip_vs_copy_service(entry, svc); + if (copy_to_user(user, entry, sizeof(*entry)) != 0) + ret = -EFAULT; + ip_vs_service_put(svc); + } else + ret = -ESRCH; + } + break; + + case IP_VS_SO_GET_DESTS: + { + struct ip_vs_get_dests *get; + int size; + + get = (struct ip_vs_get_dests *)arg; + size = sizeof(*get) + + sizeof(struct ip_vs_dest_entry) * get->num_dests; + if (*len != size) { + IP_VS_ERR("length: %u != %u\n", *len, size); + ret = -EINVAL; + goto out; + } + ret = __ip_vs_get_dest_entries(get, user); + } + break; + + case IP_VS_SO_GET_TIMEOUT: + { + struct ip_vs_timeout_user t; + + __ip_vs_get_timeouts(&t); + if (copy_to_user(user, &t, sizeof(t)) != 0) + ret = -EFAULT; + } + break; + + case IP_VS_SO_GET_DAEMON: + { + struct ip_vs_daemon_user d[2]; + + memset(&d, 0, sizeof(d)); + if (ip_vs_sync_state & IP_VS_STATE_MASTER) { + d[0].state = IP_VS_STATE_MASTER; + strlcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn, sizeof(d[0].mcast_ifn)); + d[0].syncid = ip_vs_master_syncid; + } + if (ip_vs_sync_state & IP_VS_STATE_BACKUP) { + d[1].state = IP_VS_STATE_BACKUP; + strlcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn, sizeof(d[1].mcast_ifn)); + d[1].syncid = ip_vs_backup_syncid; + } + if (copy_to_user(user, &d, sizeof(d)) != 0) + ret = -EFAULT; + } + break; + + default: + ret = -EINVAL; + } + + out: + mutex_unlock(&__ip_vs_mutex); + return ret; +} + + +static struct nf_sockopt_ops ip_vs_sockopts = { + .pf = PF_INET, + .set_optmin = IP_VS_BASE_CTL, + .set_optmax = IP_VS_SO_SET_MAX+1, + .set = do_ip_vs_set_ctl, + .get_optmin = IP_VS_BASE_CTL, + .get_optmax = IP_VS_SO_GET_MAX+1, + .get = do_ip_vs_get_ctl, + .owner = THIS_MODULE, +}; + +/* + * Generic Netlink interface + */ + +/* IPVS genetlink family */ +static struct genl_family ip_vs_genl_family = { + .id = GENL_ID_GENERATE, + .hdrsize = 0, + .name = IPVS_GENL_NAME, + .version = IPVS_GENL_VERSION, + .maxattr = IPVS_CMD_MAX, +}; + +/* Policy used for first-level command attributes */ +static const struct nla_policy ip_vs_cmd_policy[IPVS_CMD_ATTR_MAX + 1] = { + [IPVS_CMD_ATTR_SERVICE] = { .type = NLA_NESTED }, + [IPVS_CMD_ATTR_DEST] = { .type = NLA_NESTED }, + [IPVS_CMD_ATTR_DAEMON] = { .type = NLA_NESTED }, + [IPVS_CMD_ATTR_TIMEOUT_TCP] = { .type = NLA_U32 }, + [IPVS_CMD_ATTR_TIMEOUT_TCP_FIN] = { .type = NLA_U32 }, + [IPVS_CMD_ATTR_TIMEOUT_UDP] = { .type = NLA_U32 }, +}; + +/* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DAEMON */ +static const struct nla_policy ip_vs_daemon_policy[IPVS_DAEMON_ATTR_MAX + 1] = { + [IPVS_DAEMON_ATTR_STATE] = { .type = NLA_U32 }, + [IPVS_DAEMON_ATTR_MCAST_IFN] = { .type = NLA_NUL_STRING, + .len = IP_VS_IFNAME_MAXLEN }, + [IPVS_DAEMON_ATTR_SYNC_ID] = { .type = NLA_U32 }, +}; + +/* Policy used for attributes in nested attribute IPVS_CMD_ATTR_SERVICE */ +static const struct nla_policy ip_vs_svc_policy[IPVS_SVC_ATTR_MAX + 1] = { + [IPVS_SVC_ATTR_AF] = { .type = NLA_U16 }, + [IPVS_SVC_ATTR_PROTOCOL] = { .type = NLA_U16 }, + [IPVS_SVC_ATTR_ADDR] = { .type = NLA_BINARY, + .len = sizeof(union nf_inet_addr) }, + [IPVS_SVC_ATTR_PORT] = { .type = NLA_U16 }, + [IPVS_SVC_ATTR_FWMARK] = { .type = NLA_U32 }, + [IPVS_SVC_ATTR_SCHED_NAME] = { .type = NLA_NUL_STRING, + .len = IP_VS_SCHEDNAME_MAXLEN }, + [IPVS_SVC_ATTR_FLAGS] = { .type = NLA_BINARY, + .len = sizeof(struct ip_vs_flags) }, + [IPVS_SVC_ATTR_TIMEOUT] = { .type = NLA_U32 }, + [IPVS_SVC_ATTR_NETMASK] = { .type = NLA_U32 }, + [IPVS_SVC_ATTR_STATS] = { .type = NLA_NESTED }, +}; + +/* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DEST */ +static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 1] = { + [IPVS_DEST_ATTR_ADDR] = { .type = NLA_BINARY, + .len = sizeof(union nf_inet_addr) }, + [IPVS_DEST_ATTR_PORT] = { .type = NLA_U16 }, + [IPVS_DEST_ATTR_FWD_METHOD] = { .type = NLA_U32 }, + [IPVS_DEST_ATTR_WEIGHT] = { .type = NLA_U32 }, + [IPVS_DEST_ATTR_U_THRESH] = { .type = NLA_U32 }, + [IPVS_DEST_ATTR_L_THRESH] = { .type = NLA_U32 }, + [IPVS_DEST_ATTR_ACTIVE_CONNS] = { .type = NLA_U32 }, + [IPVS_DEST_ATTR_INACT_CONNS] = { .type = NLA_U32 }, + [IPVS_DEST_ATTR_PERSIST_CONNS] = { .type = NLA_U32 }, + [IPVS_DEST_ATTR_STATS] = { .type = NLA_NESTED }, +}; + +static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type, + struct ip_vs_stats *stats) +{ + struct nlattr *nl_stats = nla_nest_start(skb, container_type); + if (!nl_stats) + return -EMSGSIZE; + + spin_lock_bh(&stats->lock); + + NLA_PUT_U32(skb, IPVS_STATS_ATTR_CONNS, stats->ustats.conns); + NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPKTS, stats->ustats.inpkts); + NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPKTS, stats->ustats.outpkts); + NLA_PUT_U64(skb, IPVS_STATS_ATTR_INBYTES, stats->ustats.inbytes); + NLA_PUT_U64(skb, IPVS_STATS_ATTR_OUTBYTES, stats->ustats.outbytes); + NLA_PUT_U32(skb, IPVS_STATS_ATTR_CPS, stats->ustats.cps); + NLA_PUT_U32(skb, IPVS_STATS_ATTR_INPPS, stats->ustats.inpps); + NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTPPS, stats->ustats.outpps); + NLA_PUT_U32(skb, IPVS_STATS_ATTR_INBPS, stats->ustats.inbps); + NLA_PUT_U32(skb, IPVS_STATS_ATTR_OUTBPS, stats->ustats.outbps); + + spin_unlock_bh(&stats->lock); + + nla_nest_end(skb, nl_stats); + + return 0; + +nla_put_failure: + spin_unlock_bh(&stats->lock); + nla_nest_cancel(skb, nl_stats); + return -EMSGSIZE; +} + +static int ip_vs_genl_fill_service(struct sk_buff *skb, + struct ip_vs_service *svc) +{ + struct nlattr *nl_service; + struct ip_vs_flags flags = { .flags = svc->flags, + .mask = ~0 }; + + nl_service = nla_nest_start(skb, IPVS_CMD_ATTR_SERVICE); + if (!nl_service) + return -EMSGSIZE; + + NLA_PUT_U16(skb, IPVS_SVC_ATTR_AF, svc->af); + + if (svc->fwmark) { + NLA_PUT_U32(skb, IPVS_SVC_ATTR_FWMARK, svc->fwmark); + } else { + NLA_PUT_U16(skb, IPVS_SVC_ATTR_PROTOCOL, svc->protocol); + NLA_PUT(skb, IPVS_SVC_ATTR_ADDR, sizeof(svc->addr), &svc->addr); + NLA_PUT_U16(skb, IPVS_SVC_ATTR_PORT, svc->port); + } + + NLA_PUT_STRING(skb, IPVS_SVC_ATTR_SCHED_NAME, svc->scheduler->name); + NLA_PUT(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags); + NLA_PUT_U32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ); + NLA_PUT_U32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask); + + if (ip_vs_genl_fill_stats(skb, IPVS_SVC_ATTR_STATS, &svc->stats)) + goto nla_put_failure; + + nla_nest_end(skb, nl_service); + + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nl_service); + return -EMSGSIZE; +} + +static int ip_vs_genl_dump_service(struct sk_buff *skb, + struct ip_vs_service *svc, + struct netlink_callback *cb) +{ + void *hdr; + + hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, + &ip_vs_genl_family, NLM_F_MULTI, + IPVS_CMD_NEW_SERVICE); + if (!hdr) + return -EMSGSIZE; + + if (ip_vs_genl_fill_service(skb, svc) < 0) + goto nla_put_failure; + + return genlmsg_end(skb, hdr); + +nla_put_failure: + genlmsg_cancel(skb, hdr); + return -EMSGSIZE; +} + +static int ip_vs_genl_dump_services(struct sk_buff *skb, + struct netlink_callback *cb) +{ + int idx = 0, i; + int start = cb->args[0]; + struct ip_vs_service *svc; + + mutex_lock(&__ip_vs_mutex); + for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) { + list_for_each_entry(svc, &ip_vs_svc_table[i], s_list) { + if (++idx <= start) + continue; + if (ip_vs_genl_dump_service(skb, svc, cb) < 0) { + idx--; + goto nla_put_failure; + } + } + } + + for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) { + list_for_each_entry(svc, &ip_vs_svc_fwm_table[i], f_list) { + if (++idx <= start) + continue; + if (ip_vs_genl_dump_service(skb, svc, cb) < 0) { + idx--; + goto nla_put_failure; + } + } + } + +nla_put_failure: + mutex_unlock(&__ip_vs_mutex); + cb->args[0] = idx; + + return skb->len; +} + +static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc, + struct nlattr *nla, int full_entry) +{ + struct nlattr *attrs[IPVS_SVC_ATTR_MAX + 1]; + struct nlattr *nla_af, *nla_port, *nla_fwmark, *nla_protocol, *nla_addr; + + /* Parse mandatory identifying service fields first */ + if (nla == NULL || + nla_parse_nested(attrs, IPVS_SVC_ATTR_MAX, nla, ip_vs_svc_policy)) + return -EINVAL; + + nla_af = attrs[IPVS_SVC_ATTR_AF]; + nla_protocol = attrs[IPVS_SVC_ATTR_PROTOCOL]; + nla_addr = attrs[IPVS_SVC_ATTR_ADDR]; + nla_port = attrs[IPVS_SVC_ATTR_PORT]; + nla_fwmark = attrs[IPVS_SVC_ATTR_FWMARK]; + + if (!(nla_af && (nla_fwmark || (nla_port && nla_protocol && nla_addr)))) + return -EINVAL; + + usvc->af = nla_get_u16(nla_af); +#ifdef CONFIG_IP_VS_IPV6 + if (usvc->af != AF_INET && usvc->af != AF_INET6) +#else + if (usvc->af != AF_INET) +#endif + return -EAFNOSUPPORT; + + if (nla_fwmark) { + usvc->protocol = IPPROTO_TCP; + usvc->fwmark = nla_get_u32(nla_fwmark); + } else { + usvc->protocol = nla_get_u16(nla_protocol); + nla_memcpy(&usvc->addr, nla_addr, sizeof(usvc->addr)); + usvc->port = nla_get_u16(nla_port); + usvc->fwmark = 0; + } + + /* If a full entry was requested, check for the additional fields */ + if (full_entry) { + struct nlattr *nla_sched, *nla_flags, *nla_timeout, + *nla_netmask; + struct ip_vs_flags flags; + struct ip_vs_service *svc; + + nla_sched = attrs[IPVS_SVC_ATTR_SCHED_NAME]; + nla_flags = attrs[IPVS_SVC_ATTR_FLAGS]; + nla_timeout = attrs[IPVS_SVC_ATTR_TIMEOUT]; + nla_netmask = attrs[IPVS_SVC_ATTR_NETMASK]; + + if (!(nla_sched && nla_flags && nla_timeout && nla_netmask)) + return -EINVAL; + + nla_memcpy(&flags, nla_flags, sizeof(flags)); + + /* prefill flags from service if it already exists */ + if (usvc->fwmark) + svc = __ip_vs_svc_fwm_get(usvc->af, usvc->fwmark); + else + svc = __ip_vs_service_get(usvc->af, usvc->protocol, + &usvc->addr, usvc->port); + if (svc) { + usvc->flags = svc->flags; + ip_vs_service_put(svc); + } else + usvc->flags = 0; + + /* set new flags from userland */ + usvc->flags = (usvc->flags & ~flags.mask) | + (flags.flags & flags.mask); + usvc->sched_name = nla_data(nla_sched); + usvc->timeout = nla_get_u32(nla_timeout); + usvc->netmask = nla_get_u32(nla_netmask); + } + + return 0; +} + +static struct ip_vs_service *ip_vs_genl_find_service(struct nlattr *nla) +{ + struct ip_vs_service_user_kern usvc; + int ret; + + ret = ip_vs_genl_parse_service(&usvc, nla, 0); + if (ret) + return ERR_PTR(ret); + + if (usvc.fwmark) + return __ip_vs_svc_fwm_get(usvc.af, usvc.fwmark); + else + return __ip_vs_service_get(usvc.af, usvc.protocol, + &usvc.addr, usvc.port); +} + +static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest) +{ + struct nlattr *nl_dest; + + nl_dest = nla_nest_start(skb, IPVS_CMD_ATTR_DEST); + if (!nl_dest) + return -EMSGSIZE; + + NLA_PUT(skb, IPVS_DEST_ATTR_ADDR, sizeof(dest->addr), &dest->addr); + NLA_PUT_U16(skb, IPVS_DEST_ATTR_PORT, dest->port); + + NLA_PUT_U32(skb, IPVS_DEST_ATTR_FWD_METHOD, + atomic_read(&dest->conn_flags) & IP_VS_CONN_F_FWD_MASK); + NLA_PUT_U32(skb, IPVS_DEST_ATTR_WEIGHT, atomic_read(&dest->weight)); + NLA_PUT_U32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold); + NLA_PUT_U32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold); + NLA_PUT_U32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS, + atomic_read(&dest->activeconns)); + NLA_PUT_U32(skb, IPVS_DEST_ATTR_INACT_CONNS, + atomic_read(&dest->inactconns)); + NLA_PUT_U32(skb, IPVS_DEST_ATTR_PERSIST_CONNS, + atomic_read(&dest->persistconns)); + + if (ip_vs_genl_fill_stats(skb, IPVS_DEST_ATTR_STATS, &dest->stats)) + goto nla_put_failure; + + nla_nest_end(skb, nl_dest); + + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nl_dest); + return -EMSGSIZE; +} + +static int ip_vs_genl_dump_dest(struct sk_buff *skb, struct ip_vs_dest *dest, + struct netlink_callback *cb) +{ + void *hdr; + + hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, + &ip_vs_genl_family, NLM_F_MULTI, + IPVS_CMD_NEW_DEST); + if (!hdr) + return -EMSGSIZE; + + if (ip_vs_genl_fill_dest(skb, dest) < 0) + goto nla_put_failure; + + return genlmsg_end(skb, hdr); + +nla_put_failure: + genlmsg_cancel(skb, hdr); + return -EMSGSIZE; +} + +static int ip_vs_genl_dump_dests(struct sk_buff *skb, + struct netlink_callback *cb) +{ + int idx = 0; + int start = cb->args[0]; + struct ip_vs_service *svc; + struct ip_vs_dest *dest; + struct nlattr *attrs[IPVS_CMD_ATTR_MAX + 1]; + + mutex_lock(&__ip_vs_mutex); + + /* Try to find the service for which to dump destinations */ + if (nlmsg_parse(cb->nlh, GENL_HDRLEN, attrs, + IPVS_CMD_ATTR_MAX, ip_vs_cmd_policy)) + goto out_err; + + svc = ip_vs_genl_find_service(attrs[IPVS_CMD_ATTR_SERVICE]); + if (IS_ERR(svc) || svc == NULL) + goto out_err; + + /* Dump the destinations */ + list_for_each_entry(dest, &svc->destinations, n_list) { + if (++idx <= start) + continue; + if (ip_vs_genl_dump_dest(skb, dest, cb) < 0) { + idx--; + goto nla_put_failure; + } + } + +nla_put_failure: + cb->args[0] = idx; + ip_vs_service_put(svc); + +out_err: + mutex_unlock(&__ip_vs_mutex); + + return skb->len; +} + +static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest, + struct nlattr *nla, int full_entry) +{ + struct nlattr *attrs[IPVS_DEST_ATTR_MAX + 1]; + struct nlattr *nla_addr, *nla_port; + + /* Parse mandatory identifying destination fields first */ + if (nla == NULL || + nla_parse_nested(attrs, IPVS_DEST_ATTR_MAX, nla, ip_vs_dest_policy)) + return -EINVAL; + + nla_addr = attrs[IPVS_DEST_ATTR_ADDR]; + nla_port = attrs[IPVS_DEST_ATTR_PORT]; + + if (!(nla_addr && nla_port)) + return -EINVAL; + + nla_memcpy(&udest->addr, nla_addr, sizeof(udest->addr)); + udest->port = nla_get_u16(nla_port); + + /* If a full entry was requested, check for the additional fields */ + if (full_entry) { + struct nlattr *nla_fwd, *nla_weight, *nla_u_thresh, + *nla_l_thresh; + + nla_fwd = attrs[IPVS_DEST_ATTR_FWD_METHOD]; + nla_weight = attrs[IPVS_DEST_ATTR_WEIGHT]; + nla_u_thresh = attrs[IPVS_DEST_ATTR_U_THRESH]; + nla_l_thresh = attrs[IPVS_DEST_ATTR_L_THRESH]; + + if (!(nla_fwd && nla_weight && nla_u_thresh && nla_l_thresh)) + return -EINVAL; + + udest->conn_flags = nla_get_u32(nla_fwd) + & IP_VS_CONN_F_FWD_MASK; + udest->weight = nla_get_u32(nla_weight); + udest->u_threshold = nla_get_u32(nla_u_thresh); + udest->l_threshold = nla_get_u32(nla_l_thresh); + } + + return 0; +} + +static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __be32 state, + const char *mcast_ifn, __be32 syncid) +{ + struct nlattr *nl_daemon; + + nl_daemon = nla_nest_start(skb, IPVS_CMD_ATTR_DAEMON); + if (!nl_daemon) + return -EMSGSIZE; + + NLA_PUT_U32(skb, IPVS_DAEMON_ATTR_STATE, state); + NLA_PUT_STRING(skb, IPVS_DAEMON_ATTR_MCAST_IFN, mcast_ifn); + NLA_PUT_U32(skb, IPVS_DAEMON_ATTR_SYNC_ID, syncid); + + nla_nest_end(skb, nl_daemon); + + return 0; + +nla_put_failure: + nla_nest_cancel(skb, nl_daemon); + return -EMSGSIZE; +} + +static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __be32 state, + const char *mcast_ifn, __be32 syncid, + struct netlink_callback *cb) +{ + void *hdr; + hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, + &ip_vs_genl_family, NLM_F_MULTI, + IPVS_CMD_NEW_DAEMON); + if (!hdr) + return -EMSGSIZE; + + if (ip_vs_genl_fill_daemon(skb, state, mcast_ifn, syncid)) + goto nla_put_failure; + + return genlmsg_end(skb, hdr); + +nla_put_failure: + genlmsg_cancel(skb, hdr); + return -EMSGSIZE; +} + +static int ip_vs_genl_dump_daemons(struct sk_buff *skb, + struct netlink_callback *cb) +{ + mutex_lock(&__ip_vs_mutex); + if ((ip_vs_sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) { + if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_MASTER, + ip_vs_master_mcast_ifn, + ip_vs_master_syncid, cb) < 0) + goto nla_put_failure; + + cb->args[0] = 1; + } + + if ((ip_vs_sync_state & IP_VS_STATE_BACKUP) && !cb->args[1]) { + if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_BACKUP, + ip_vs_backup_mcast_ifn, + ip_vs_backup_syncid, cb) < 0) + goto nla_put_failure; + + cb->args[1] = 1; + } + +nla_put_failure: + mutex_unlock(&__ip_vs_mutex); + + return skb->len; +} + +static int ip_vs_genl_new_daemon(struct nlattr **attrs) +{ + if (!(attrs[IPVS_DAEMON_ATTR_STATE] && + attrs[IPVS_DAEMON_ATTR_MCAST_IFN] && + attrs[IPVS_DAEMON_ATTR_SYNC_ID])) + return -EINVAL; + + return start_sync_thread(nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]), + nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]), + nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID])); +} + +static int ip_vs_genl_del_daemon(struct nlattr **attrs) +{ + if (!attrs[IPVS_DAEMON_ATTR_STATE]) + return -EINVAL; + + return stop_sync_thread(nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE])); +} + +static int ip_vs_genl_set_config(struct nlattr **attrs) +{ + struct ip_vs_timeout_user t; + + __ip_vs_get_timeouts(&t); + + if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]) + t.tcp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]); + + if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN]) + t.tcp_fin_timeout = + nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN]); + + if (attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]) + t.udp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]); + + return ip_vs_set_timeout(&t); +} + +static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) +{ + struct ip_vs_service *svc = NULL; + struct ip_vs_service_user_kern usvc; + struct ip_vs_dest_user_kern udest; + int ret = 0, cmd; + int need_full_svc = 0, need_full_dest = 0; + + cmd = info->genlhdr->cmd; + + mutex_lock(&__ip_vs_mutex); + + if (cmd == IPVS_CMD_FLUSH) { + ret = ip_vs_flush(); + goto out; + } else if (cmd == IPVS_CMD_SET_CONFIG) { + ret = ip_vs_genl_set_config(info->attrs); + goto out; + } else if (cmd == IPVS_CMD_NEW_DAEMON || + cmd == IPVS_CMD_DEL_DAEMON) { + + struct nlattr *daemon_attrs[IPVS_DAEMON_ATTR_MAX + 1]; + + if (!info->attrs[IPVS_CMD_ATTR_DAEMON] || + nla_parse_nested(daemon_attrs, IPVS_DAEMON_ATTR_MAX, + info->attrs[IPVS_CMD_ATTR_DAEMON], + ip_vs_daemon_policy)) { + ret = -EINVAL; + goto out; + } + + if (cmd == IPVS_CMD_NEW_DAEMON) + ret = ip_vs_genl_new_daemon(daemon_attrs); + else + ret = ip_vs_genl_del_daemon(daemon_attrs); + goto out; + } else if (cmd == IPVS_CMD_ZERO && + !info->attrs[IPVS_CMD_ATTR_SERVICE]) { + ret = ip_vs_zero_all(); + goto out; + } + + /* All following commands require a service argument, so check if we + * received a valid one. We need a full service specification when + * adding / editing a service. Only identifying members otherwise. */ + if (cmd == IPVS_CMD_NEW_SERVICE || cmd == IPVS_CMD_SET_SERVICE) + need_full_svc = 1; + + ret = ip_vs_genl_parse_service(&usvc, + info->attrs[IPVS_CMD_ATTR_SERVICE], + need_full_svc); + if (ret) + goto out; + + /* Lookup the exact service by or fwmark */ + if (usvc.fwmark == 0) + svc = __ip_vs_service_get(usvc.af, usvc.protocol, + &usvc.addr, usvc.port); + else + svc = __ip_vs_svc_fwm_get(usvc.af, usvc.fwmark); + + /* Unless we're adding a new service, the service must already exist */ + if ((cmd != IPVS_CMD_NEW_SERVICE) && (svc == NULL)) { + ret = -ESRCH; + goto out; + } + + /* Destination commands require a valid destination argument. For + * adding / editing a destination, we need a full destination + * specification. */ + if (cmd == IPVS_CMD_NEW_DEST || cmd == IPVS_CMD_SET_DEST || + cmd == IPVS_CMD_DEL_DEST) { + if (cmd != IPVS_CMD_DEL_DEST) + need_full_dest = 1; + + ret = ip_vs_genl_parse_dest(&udest, + info->attrs[IPVS_CMD_ATTR_DEST], + need_full_dest); + if (ret) + goto out; + } + + switch (cmd) { + case IPVS_CMD_NEW_SERVICE: + if (svc == NULL) + ret = ip_vs_add_service(&usvc, &svc); + else + ret = -EEXIST; + break; + case IPVS_CMD_SET_SERVICE: + ret = ip_vs_edit_service(svc, &usvc); + break; + case IPVS_CMD_DEL_SERVICE: + ret = ip_vs_del_service(svc); + break; + case IPVS_CMD_NEW_DEST: + ret = ip_vs_add_dest(svc, &udest); + break; + case IPVS_CMD_SET_DEST: + ret = ip_vs_edit_dest(svc, &udest); + break; + case IPVS_CMD_DEL_DEST: + ret = ip_vs_del_dest(svc, &udest); + break; + case IPVS_CMD_ZERO: + ret = ip_vs_zero_service(svc); + break; + default: + ret = -EINVAL; + } + +out: + if (svc) + ip_vs_service_put(svc); + mutex_unlock(&__ip_vs_mutex); + + return ret; +} + +static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info) +{ + struct sk_buff *msg; + void *reply; + int ret, cmd, reply_cmd; + + cmd = info->genlhdr->cmd; + + if (cmd == IPVS_CMD_GET_SERVICE) + reply_cmd = IPVS_CMD_NEW_SERVICE; + else if (cmd == IPVS_CMD_GET_INFO) + reply_cmd = IPVS_CMD_SET_INFO; + else if (cmd == IPVS_CMD_GET_CONFIG) + reply_cmd = IPVS_CMD_SET_CONFIG; + else { + IP_VS_ERR("unknown Generic Netlink command\n"); + return -EINVAL; + } + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + mutex_lock(&__ip_vs_mutex); + + reply = genlmsg_put_reply(msg, info, &ip_vs_genl_family, 0, reply_cmd); + if (reply == NULL) + goto nla_put_failure; + + switch (cmd) { + case IPVS_CMD_GET_SERVICE: + { + struct ip_vs_service *svc; + + svc = ip_vs_genl_find_service(info->attrs[IPVS_CMD_ATTR_SERVICE]); + if (IS_ERR(svc)) { + ret = PTR_ERR(svc); + goto out_err; + } else if (svc) { + ret = ip_vs_genl_fill_service(msg, svc); + ip_vs_service_put(svc); + if (ret) + goto nla_put_failure; + } else { + ret = -ESRCH; + goto out_err; + } + + break; + } + + case IPVS_CMD_GET_CONFIG: + { + struct ip_vs_timeout_user t; + + __ip_vs_get_timeouts(&t); +#ifdef CONFIG_IP_VS_PROTO_TCP + NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP, t.tcp_timeout); + NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP_FIN, + t.tcp_fin_timeout); +#endif +#ifdef CONFIG_IP_VS_PROTO_UDP + NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_UDP, t.udp_timeout); +#endif + + break; + } + + case IPVS_CMD_GET_INFO: + NLA_PUT_U32(msg, IPVS_INFO_ATTR_VERSION, IP_VS_VERSION_CODE); + NLA_PUT_U32(msg, IPVS_INFO_ATTR_CONN_TAB_SIZE, + IP_VS_CONN_TAB_SIZE); + break; + } + + genlmsg_end(msg, reply); + ret = genlmsg_unicast(msg, info->snd_pid); + goto out; + +nla_put_failure: + IP_VS_ERR("not enough space in Netlink message\n"); + ret = -EMSGSIZE; + +out_err: + nlmsg_free(msg); +out: + mutex_unlock(&__ip_vs_mutex); + + return ret; +} + + +static struct genl_ops ip_vs_genl_ops[] __read_mostly = { + { + .cmd = IPVS_CMD_NEW_SERVICE, + .flags = GENL_ADMIN_PERM, + .policy = ip_vs_cmd_policy, + .doit = ip_vs_genl_set_cmd, + }, + { + .cmd = IPVS_CMD_SET_SERVICE, + .flags = GENL_ADMIN_PERM, + .policy = ip_vs_cmd_policy, + .doit = ip_vs_genl_set_cmd, + }, + { + .cmd = IPVS_CMD_DEL_SERVICE, + .flags = GENL_ADMIN_PERM, + .policy = ip_vs_cmd_policy, + .doit = ip_vs_genl_set_cmd, + }, + { + .cmd = IPVS_CMD_GET_SERVICE, + .flags = GENL_ADMIN_PERM, + .doit = ip_vs_genl_get_cmd, + .dumpit = ip_vs_genl_dump_services, + .policy = ip_vs_cmd_policy, + }, + { + .cmd = IPVS_CMD_NEW_DEST, + .flags = GENL_ADMIN_PERM, + .policy = ip_vs_cmd_policy, + .doit = ip_vs_genl_set_cmd, + }, + { + .cmd = IPVS_CMD_SET_DEST, + .flags = GENL_ADMIN_PERM, + .policy = ip_vs_cmd_policy, + .doit = ip_vs_genl_set_cmd, + }, + { + .cmd = IPVS_CMD_DEL_DEST, + .flags = GENL_ADMIN_PERM, + .policy = ip_vs_cmd_policy, + .doit = ip_vs_genl_set_cmd, + }, + { + .cmd = IPVS_CMD_GET_DEST, + .flags = GENL_ADMIN_PERM, + .policy = ip_vs_cmd_policy, + .dumpit = ip_vs_genl_dump_dests, + }, + { + .cmd = IPVS_CMD_NEW_DAEMON, + .flags = GENL_ADMIN_PERM, + .policy = ip_vs_cmd_policy, + .doit = ip_vs_genl_set_cmd, + }, + { + .cmd = IPVS_CMD_DEL_DAEMON, + .flags = GENL_ADMIN_PERM, + .policy = ip_vs_cmd_policy, + .doit = ip_vs_genl_set_cmd, + }, + { + .cmd = IPVS_CMD_GET_DAEMON, + .flags = GENL_ADMIN_PERM, + .dumpit = ip_vs_genl_dump_daemons, + }, + { + .cmd = IPVS_CMD_SET_CONFIG, + .flags = GENL_ADMIN_PERM, + .policy = ip_vs_cmd_policy, + .doit = ip_vs_genl_set_cmd, + }, + { + .cmd = IPVS_CMD_GET_CONFIG, + .flags = GENL_ADMIN_PERM, + .doit = ip_vs_genl_get_cmd, + }, + { + .cmd = IPVS_CMD_GET_INFO, + .flags = GENL_ADMIN_PERM, + .doit = ip_vs_genl_get_cmd, + }, + { + .cmd = IPVS_CMD_ZERO, + .flags = GENL_ADMIN_PERM, + .policy = ip_vs_cmd_policy, + .doit = ip_vs_genl_set_cmd, + }, + { + .cmd = IPVS_CMD_FLUSH, + .flags = GENL_ADMIN_PERM, + .doit = ip_vs_genl_set_cmd, + }, +}; + +static int __init ip_vs_genl_register(void) +{ + int ret, i; + + ret = genl_register_family(&ip_vs_genl_family); + if (ret) + return ret; + + for (i = 0; i < ARRAY_SIZE(ip_vs_genl_ops); i++) { + ret = genl_register_ops(&ip_vs_genl_family, &ip_vs_genl_ops[i]); + if (ret) + goto err_out; + } + return 0; + +err_out: + genl_unregister_family(&ip_vs_genl_family); + return ret; +} + +static void ip_vs_genl_unregister(void) +{ + genl_unregister_family(&ip_vs_genl_family); +} + +/* End of Generic Netlink interface definitions */ + + +int __init ip_vs_control_init(void) +{ + int ret; + int idx; + + EnterFunction(2); + + ret = nf_register_sockopt(&ip_vs_sockopts); + if (ret) { + IP_VS_ERR("cannot register sockopt.\n"); + return ret; + } + + ret = ip_vs_genl_register(); + if (ret) { + IP_VS_ERR("cannot register Generic Netlink interface.\n"); + nf_unregister_sockopt(&ip_vs_sockopts); + return ret; + } + + proc_net_fops_create(&init_net, "ip_vs", 0, &ip_vs_info_fops); + proc_net_fops_create(&init_net, "ip_vs_stats",0, &ip_vs_stats_fops); + + sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars); + + /* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */ + for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + INIT_LIST_HEAD(&ip_vs_svc_table[idx]); + INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]); + } + for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++) { + INIT_LIST_HEAD(&ip_vs_rtable[idx]); + } + + ip_vs_new_estimator(&ip_vs_stats); + + /* Hook the defense timer */ + schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD); + + LeaveFunction(2); + return 0; +} + + +void ip_vs_control_cleanup(void) +{ + EnterFunction(2); + ip_vs_trash_cleanup(); + cancel_rearming_delayed_work(&defense_work); + cancel_work_sync(&defense_work.work); + ip_vs_kill_estimator(&ip_vs_stats); + unregister_sysctl_table(sysctl_header); + proc_net_remove(&init_net, "ip_vs_stats"); + proc_net_remove(&init_net, "ip_vs"); + ip_vs_genl_unregister(); + nf_unregister_sockopt(&ip_vs_sockopts); + LeaveFunction(2); +} diff --git a/net/netfilter/ipvs/ip_vs_dh.c b/net/netfilter/ipvs/ip_vs_dh.c new file mode 100644 index 00000000000..a16943fd72f --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_dh.c @@ -0,0 +1,261 @@ +/* + * IPVS: Destination Hashing scheduling module + * + * Authors: Wensong Zhang + * + * Inspired by the consistent hashing scheduler patch from + * Thomas Proell + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * + */ + +/* + * The dh algorithm is to select server by the hash key of destination IP + * address. The pseudo code is as follows: + * + * n <- servernode[dest_ip]; + * if (n is dead) OR + * (n is overloaded) OR (n.weight <= 0) then + * return NULL; + * + * return n; + * + * Notes that servernode is a 256-bucket hash table that maps the hash + * index derived from packet destination IP address to the current server + * array. If the dh scheduler is used in cache cluster, it is good to + * combine it with cache_bypass feature. When the statically assigned + * server is dead or overloaded, the load balancer can bypass the cache + * server and send requests to the original server directly. + * + */ + +#include +#include +#include +#include + +#include + + +/* + * IPVS DH bucket + */ +struct ip_vs_dh_bucket { + struct ip_vs_dest *dest; /* real server (cache) */ +}; + +/* + * for IPVS DH entry hash table + */ +#ifndef CONFIG_IP_VS_DH_TAB_BITS +#define CONFIG_IP_VS_DH_TAB_BITS 8 +#endif +#define IP_VS_DH_TAB_BITS CONFIG_IP_VS_DH_TAB_BITS +#define IP_VS_DH_TAB_SIZE (1 << IP_VS_DH_TAB_BITS) +#define IP_VS_DH_TAB_MASK (IP_VS_DH_TAB_SIZE - 1) + + +/* + * Returns hash value for IPVS DH entry + */ +static inline unsigned ip_vs_dh_hashkey(__be32 addr) +{ + return (ntohl(addr)*2654435761UL) & IP_VS_DH_TAB_MASK; +} + + +/* + * Get ip_vs_dest associated with supplied parameters. + */ +static inline struct ip_vs_dest * +ip_vs_dh_get(struct ip_vs_dh_bucket *tbl, __be32 addr) +{ + return (tbl[ip_vs_dh_hashkey(addr)]).dest; +} + + +/* + * Assign all the hash buckets of the specified table with the service. + */ +static int +ip_vs_dh_assign(struct ip_vs_dh_bucket *tbl, struct ip_vs_service *svc) +{ + int i; + struct ip_vs_dh_bucket *b; + struct list_head *p; + struct ip_vs_dest *dest; + + b = tbl; + p = &svc->destinations; + for (i=0; idest = NULL; + } else { + if (p == &svc->destinations) + p = p->next; + + dest = list_entry(p, struct ip_vs_dest, n_list); + atomic_inc(&dest->refcnt); + b->dest = dest; + + p = p->next; + } + b++; + } + return 0; +} + + +/* + * Flush all the hash buckets of the specified table. + */ +static void ip_vs_dh_flush(struct ip_vs_dh_bucket *tbl) +{ + int i; + struct ip_vs_dh_bucket *b; + + b = tbl; + for (i=0; idest) { + atomic_dec(&b->dest->refcnt); + b->dest = NULL; + } + b++; + } +} + + +static int ip_vs_dh_init_svc(struct ip_vs_service *svc) +{ + struct ip_vs_dh_bucket *tbl; + + /* allocate the DH table for this service */ + tbl = kmalloc(sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE, + GFP_ATOMIC); + if (tbl == NULL) { + IP_VS_ERR("ip_vs_dh_init_svc(): no memory\n"); + return -ENOMEM; + } + svc->sched_data = tbl; + IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) allocated for " + "current service\n", + sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE); + + /* assign the hash buckets with the updated service */ + ip_vs_dh_assign(tbl, svc); + + return 0; +} + + +static int ip_vs_dh_done_svc(struct ip_vs_service *svc) +{ + struct ip_vs_dh_bucket *tbl = svc->sched_data; + + /* got to clean up hash buckets here */ + ip_vs_dh_flush(tbl); + + /* release the table itself */ + kfree(svc->sched_data); + IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) released\n", + sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE); + + return 0; +} + + +static int ip_vs_dh_update_svc(struct ip_vs_service *svc) +{ + struct ip_vs_dh_bucket *tbl = svc->sched_data; + + /* got to clean up hash buckets here */ + ip_vs_dh_flush(tbl); + + /* assign the hash buckets with the updated service */ + ip_vs_dh_assign(tbl, svc); + + return 0; +} + + +/* + * If the dest flags is set with IP_VS_DEST_F_OVERLOAD, + * consider that the server is overloaded here. + */ +static inline int is_overloaded(struct ip_vs_dest *dest) +{ + return dest->flags & IP_VS_DEST_F_OVERLOAD; +} + + +/* + * Destination hashing scheduling + */ +static struct ip_vs_dest * +ip_vs_dh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +{ + struct ip_vs_dest *dest; + struct ip_vs_dh_bucket *tbl; + struct iphdr *iph = ip_hdr(skb); + + IP_VS_DBG(6, "ip_vs_dh_schedule(): Scheduling...\n"); + + tbl = (struct ip_vs_dh_bucket *)svc->sched_data; + dest = ip_vs_dh_get(tbl, iph->daddr); + if (!dest + || !(dest->flags & IP_VS_DEST_F_AVAILABLE) + || atomic_read(&dest->weight) <= 0 + || is_overloaded(dest)) { + return NULL; + } + + IP_VS_DBG(6, "DH: destination IP address %u.%u.%u.%u " + "--> server %u.%u.%u.%u:%d\n", + NIPQUAD(iph->daddr), + NIPQUAD(dest->addr.ip), + ntohs(dest->port)); + + return dest; +} + + +/* + * IPVS DH Scheduler structure + */ +static struct ip_vs_scheduler ip_vs_dh_scheduler = +{ + .name = "dh", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_dh_scheduler.n_list), +#ifdef CONFIG_IP_VS_IPV6 + .supports_ipv6 = 0, +#endif + .init_service = ip_vs_dh_init_svc, + .done_service = ip_vs_dh_done_svc, + .update_service = ip_vs_dh_update_svc, + .schedule = ip_vs_dh_schedule, +}; + + +static int __init ip_vs_dh_init(void) +{ + return register_ip_vs_scheduler(&ip_vs_dh_scheduler); +} + + +static void __exit ip_vs_dh_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_dh_scheduler); +} + + +module_init(ip_vs_dh_init); +module_exit(ip_vs_dh_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c new file mode 100644 index 00000000000..2eb2860dabb --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_est.c @@ -0,0 +1,166 @@ +/* + * ip_vs_est.c: simple rate estimator for IPVS + * + * Authors: Wensong Zhang + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * + */ +#include +#include +#include +#include +#include +#include +#include + +#include + +/* + This code is to estimate rate in a shorter interval (such as 8 + seconds) for virtual services and real servers. For measure rate in a + long interval, it is easy to implement a user level daemon which + periodically reads those statistical counters and measure rate. + + Currently, the measurement is activated by slow timer handler. Hope + this measurement will not introduce too much load. + + We measure rate during the last 8 seconds every 2 seconds: + + avgrate = avgrate*(1-W) + rate*W + + where W = 2^(-2) + + NOTES. + + * The stored value for average bps is scaled by 2^5, so that maximal + rate is ~2.15Gbits/s, average pps and cps are scaled by 2^10. + + * A lot code is taken from net/sched/estimator.c + */ + + +static void estimation_timer(unsigned long arg); + +static LIST_HEAD(est_list); +static DEFINE_SPINLOCK(est_lock); +static DEFINE_TIMER(est_timer, estimation_timer, 0, 0); + +static void estimation_timer(unsigned long arg) +{ + struct ip_vs_estimator *e; + struct ip_vs_stats *s; + u32 n_conns; + u32 n_inpkts, n_outpkts; + u64 n_inbytes, n_outbytes; + u32 rate; + + spin_lock(&est_lock); + list_for_each_entry(e, &est_list, list) { + s = container_of(e, struct ip_vs_stats, est); + + spin_lock(&s->lock); + n_conns = s->ustats.conns; + n_inpkts = s->ustats.inpkts; + n_outpkts = s->ustats.outpkts; + n_inbytes = s->ustats.inbytes; + n_outbytes = s->ustats.outbytes; + + /* scaled by 2^10, but divided 2 seconds */ + rate = (n_conns - e->last_conns)<<9; + e->last_conns = n_conns; + e->cps += ((long)rate - (long)e->cps)>>2; + s->ustats.cps = (e->cps+0x1FF)>>10; + + rate = (n_inpkts - e->last_inpkts)<<9; + e->last_inpkts = n_inpkts; + e->inpps += ((long)rate - (long)e->inpps)>>2; + s->ustats.inpps = (e->inpps+0x1FF)>>10; + + rate = (n_outpkts - e->last_outpkts)<<9; + e->last_outpkts = n_outpkts; + e->outpps += ((long)rate - (long)e->outpps)>>2; + s->ustats.outpps = (e->outpps+0x1FF)>>10; + + rate = (n_inbytes - e->last_inbytes)<<4; + e->last_inbytes = n_inbytes; + e->inbps += ((long)rate - (long)e->inbps)>>2; + s->ustats.inbps = (e->inbps+0xF)>>5; + + rate = (n_outbytes - e->last_outbytes)<<4; + e->last_outbytes = n_outbytes; + e->outbps += ((long)rate - (long)e->outbps)>>2; + s->ustats.outbps = (e->outbps+0xF)>>5; + spin_unlock(&s->lock); + } + spin_unlock(&est_lock); + mod_timer(&est_timer, jiffies + 2*HZ); +} + +void ip_vs_new_estimator(struct ip_vs_stats *stats) +{ + struct ip_vs_estimator *est = &stats->est; + + INIT_LIST_HEAD(&est->list); + + est->last_conns = stats->ustats.conns; + est->cps = stats->ustats.cps<<10; + + est->last_inpkts = stats->ustats.inpkts; + est->inpps = stats->ustats.inpps<<10; + + est->last_outpkts = stats->ustats.outpkts; + est->outpps = stats->ustats.outpps<<10; + + est->last_inbytes = stats->ustats.inbytes; + est->inbps = stats->ustats.inbps<<5; + + est->last_outbytes = stats->ustats.outbytes; + est->outbps = stats->ustats.outbps<<5; + + spin_lock_bh(&est_lock); + list_add(&est->list, &est_list); + spin_unlock_bh(&est_lock); +} + +void ip_vs_kill_estimator(struct ip_vs_stats *stats) +{ + struct ip_vs_estimator *est = &stats->est; + + spin_lock_bh(&est_lock); + list_del(&est->list); + spin_unlock_bh(&est_lock); +} + +void ip_vs_zero_estimator(struct ip_vs_stats *stats) +{ + struct ip_vs_estimator *est = &stats->est; + + /* set counters zero, caller must hold the stats->lock lock */ + est->last_inbytes = 0; + est->last_outbytes = 0; + est->last_conns = 0; + est->last_inpkts = 0; + est->last_outpkts = 0; + est->cps = 0; + est->inpps = 0; + est->outpps = 0; + est->inbps = 0; + est->outbps = 0; +} + +int __init ip_vs_estimator_init(void) +{ + mod_timer(&est_timer, jiffies + 2 * HZ); + return 0; +} + +void ip_vs_estimator_cleanup(void) +{ + del_timer_sync(&est_timer); +} diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c new file mode 100644 index 00000000000..2e7dbd8b73a --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_ftp.c @@ -0,0 +1,410 @@ +/* + * ip_vs_ftp.c: IPVS ftp application module + * + * Authors: Wensong Zhang + * + * Changes: + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Most code here is taken from ip_masq_ftp.c in kernel 2.2. The difference + * is that ip_vs_ftp module handles the reverse direction to ip_masq_ftp. + * + * IP_MASQ_FTP ftp masquerading module + * + * Version: @(#)ip_masq_ftp.c 0.04 02/05/96 + * + * Author: Wouter Gadeyne + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + + +#define SERVER_STRING "227 Entering Passive Mode (" +#define CLIENT_STRING "PORT " + + +/* + * List of ports (up to IP_VS_APP_MAX_PORTS) to be handled by helper + * First port is set to the default port. + */ +static unsigned short ports[IP_VS_APP_MAX_PORTS] = {21, 0}; +module_param_array(ports, ushort, NULL, 0); +MODULE_PARM_DESC(ports, "Ports to monitor for FTP control commands"); + + +/* Dummy variable */ +static int ip_vs_ftp_pasv; + + +static int +ip_vs_ftp_init_conn(struct ip_vs_app *app, struct ip_vs_conn *cp) +{ + return 0; +} + + +static int +ip_vs_ftp_done_conn(struct ip_vs_app *app, struct ip_vs_conn *cp) +{ + return 0; +} + + +/* + * Get from the string "xxx.xxx.xxx.xxx,ppp,ppp", started + * with the "pattern" and terminated with the "term" character. + * is in network order. + */ +static int ip_vs_ftp_get_addrport(char *data, char *data_limit, + const char *pattern, size_t plen, char term, + __be32 *addr, __be16 *port, + char **start, char **end) +{ + unsigned char p[6]; + int i = 0; + + if (data_limit - data < plen) { + /* check if there is partial match */ + if (strnicmp(data, pattern, data_limit - data) == 0) + return -1; + else + return 0; + } + + if (strnicmp(data, pattern, plen) != 0) { + return 0; + } + *start = data + plen; + + for (data = *start; *data != term; data++) { + if (data == data_limit) + return -1; + } + *end = data; + + memset(p, 0, sizeof(p)); + for (data = *start; data != *end; data++) { + if (*data >= '0' && *data <= '9') { + p[i] = p[i]*10 + *data - '0'; + } else if (*data == ',' && i < 5) { + i++; + } else { + /* unexpected character */ + return -1; + } + } + + if (i != 5) + return -1; + + *addr = get_unaligned((__be32 *)p); + *port = get_unaligned((__be16 *)(p + 4)); + return 1; +} + + +/* + * Look at outgoing ftp packets to catch the response to a PASV command + * from the server (inside-to-outside). + * When we see one, we build a connection entry with the client address, + * client port 0 (unknown at the moment), the server address and the + * server port. Mark the current connection entry as a control channel + * of the new entry. All this work is just to make the data connection + * can be scheduled to the right server later. + * + * The outgoing packet should be something like + * "227 Entering Passive Mode (xxx,xxx,xxx,xxx,ppp,ppp)". + * xxx,xxx,xxx,xxx is the server address, ppp,ppp is the server port number. + */ +static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, + struct sk_buff *skb, int *diff) +{ + struct iphdr *iph; + struct tcphdr *th; + char *data, *data_limit; + char *start, *end; + union nf_inet_addr from; + __be16 port; + struct ip_vs_conn *n_cp; + char buf[24]; /* xxx.xxx.xxx.xxx,ppp,ppp\000 */ + unsigned buf_len; + int ret; + +#ifdef CONFIG_IP_VS_IPV6 + /* This application helper doesn't work with IPv6 yet, + * so turn this into a no-op for IPv6 packets + */ + if (cp->af == AF_INET6) + return 1; +#endif + + *diff = 0; + + /* Only useful for established sessions */ + if (cp->state != IP_VS_TCP_S_ESTABLISHED) + return 1; + + /* Linear packets are much easier to deal with. */ + if (!skb_make_writable(skb, skb->len)) + return 0; + + if (cp->app_data == &ip_vs_ftp_pasv) { + iph = ip_hdr(skb); + th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]); + data = (char *)th + (th->doff << 2); + data_limit = skb_tail_pointer(skb); + + if (ip_vs_ftp_get_addrport(data, data_limit, + SERVER_STRING, + sizeof(SERVER_STRING)-1, ')', + &from.ip, &port, + &start, &end) != 1) + return 1; + + IP_VS_DBG(7, "PASV response (%u.%u.%u.%u:%d) -> " + "%u.%u.%u.%u:%d detected\n", + NIPQUAD(from.ip), ntohs(port), + NIPQUAD(cp->caddr.ip), 0); + + /* + * Now update or create an connection entry for it + */ + n_cp = ip_vs_conn_out_get(AF_INET, iph->protocol, &from, port, + &cp->caddr, 0); + if (!n_cp) { + n_cp = ip_vs_conn_new(AF_INET, IPPROTO_TCP, + &cp->caddr, 0, + &cp->vaddr, port, + &from, port, + IP_VS_CONN_F_NO_CPORT, + cp->dest); + if (!n_cp) + return 0; + + /* add its controller */ + ip_vs_control_add(n_cp, cp); + } + + /* + * Replace the old passive address with the new one + */ + from.ip = n_cp->vaddr.ip; + port = n_cp->vport; + sprintf(buf, "%d,%d,%d,%d,%d,%d", NIPQUAD(from.ip), + (ntohs(port)>>8)&255, ntohs(port)&255); + buf_len = strlen(buf); + + /* + * Calculate required delta-offset to keep TCP happy + */ + *diff = buf_len - (end-start); + + if (*diff == 0) { + /* simply replace it with new passive address */ + memcpy(start, buf, buf_len); + ret = 1; + } else { + ret = !ip_vs_skb_replace(skb, GFP_ATOMIC, start, + end-start, buf, buf_len); + } + + cp->app_data = NULL; + ip_vs_tcp_conn_listen(n_cp); + ip_vs_conn_put(n_cp); + return ret; + } + return 1; +} + + +/* + * Look at incoming ftp packets to catch the PASV/PORT command + * (outside-to-inside). + * + * The incoming packet having the PORT command should be something like + * "PORT xxx,xxx,xxx,xxx,ppp,ppp\n". + * xxx,xxx,xxx,xxx is the client address, ppp,ppp is the client port number. + * In this case, we create a connection entry using the client address and + * port, so that the active ftp data connection from the server can reach + * the client. + */ +static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp, + struct sk_buff *skb, int *diff) +{ + struct iphdr *iph; + struct tcphdr *th; + char *data, *data_start, *data_limit; + char *start, *end; + union nf_inet_addr to; + __be16 port; + struct ip_vs_conn *n_cp; + +#ifdef CONFIG_IP_VS_IPV6 + /* This application helper doesn't work with IPv6 yet, + * so turn this into a no-op for IPv6 packets + */ + if (cp->af == AF_INET6) + return 1; +#endif + + /* no diff required for incoming packets */ + *diff = 0; + + /* Only useful for established sessions */ + if (cp->state != IP_VS_TCP_S_ESTABLISHED) + return 1; + + /* Linear packets are much easier to deal with. */ + if (!skb_make_writable(skb, skb->len)) + return 0; + + /* + * Detecting whether it is passive + */ + iph = ip_hdr(skb); + th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]); + + /* Since there may be OPTIONS in the TCP packet and the HLEN is + the length of the header in 32-bit multiples, it is accurate + to calculate data address by th+HLEN*4 */ + data = data_start = (char *)th + (th->doff << 2); + data_limit = skb_tail_pointer(skb); + + while (data <= data_limit - 6) { + if (strnicmp(data, "PASV\r\n", 6) == 0) { + /* Passive mode on */ + IP_VS_DBG(7, "got PASV at %td of %td\n", + data - data_start, + data_limit - data_start); + cp->app_data = &ip_vs_ftp_pasv; + return 1; + } + data++; + } + + /* + * To support virtual FTP server, the scenerio is as follows: + * FTP client ----> Load Balancer ----> FTP server + * First detect the port number in the application data, + * then create a new connection entry for the coming data + * connection. + */ + if (ip_vs_ftp_get_addrport(data_start, data_limit, + CLIENT_STRING, sizeof(CLIENT_STRING)-1, + '\r', &to.ip, &port, + &start, &end) != 1) + return 1; + + IP_VS_DBG(7, "PORT %u.%u.%u.%u:%d detected\n", + NIPQUAD(to.ip), ntohs(port)); + + /* Passive mode off */ + cp->app_data = NULL; + + /* + * Now update or create a connection entry for it + */ + IP_VS_DBG(7, "protocol %s %u.%u.%u.%u:%d %u.%u.%u.%u:%d\n", + ip_vs_proto_name(iph->protocol), + NIPQUAD(to.ip), ntohs(port), NIPQUAD(cp->vaddr.ip), 0); + + n_cp = ip_vs_conn_in_get(AF_INET, iph->protocol, + &to, port, + &cp->vaddr, htons(ntohs(cp->vport)-1)); + if (!n_cp) { + n_cp = ip_vs_conn_new(AF_INET, IPPROTO_TCP, + &to, port, + &cp->vaddr, htons(ntohs(cp->vport)-1), + &cp->daddr, htons(ntohs(cp->dport)-1), + 0, + cp->dest); + if (!n_cp) + return 0; + + /* add its controller */ + ip_vs_control_add(n_cp, cp); + } + + /* + * Move tunnel to listen state + */ + ip_vs_tcp_conn_listen(n_cp); + ip_vs_conn_put(n_cp); + + return 1; +} + + +static struct ip_vs_app ip_vs_ftp = { + .name = "ftp", + .type = IP_VS_APP_TYPE_FTP, + .protocol = IPPROTO_TCP, + .module = THIS_MODULE, + .incs_list = LIST_HEAD_INIT(ip_vs_ftp.incs_list), + .init_conn = ip_vs_ftp_init_conn, + .done_conn = ip_vs_ftp_done_conn, + .bind_conn = NULL, + .unbind_conn = NULL, + .pkt_out = ip_vs_ftp_out, + .pkt_in = ip_vs_ftp_in, +}; + + +/* + * ip_vs_ftp initialization + */ +static int __init ip_vs_ftp_init(void) +{ + int i, ret; + struct ip_vs_app *app = &ip_vs_ftp; + + ret = register_ip_vs_app(app); + if (ret) + return ret; + + for (i=0; iprotocol, ports[i]); + if (ret) + break; + IP_VS_INFO("%s: loaded support on port[%d] = %d\n", + app->name, i, ports[i]); + } + + if (ret) + unregister_ip_vs_app(app); + + return ret; +} + + +/* + * ip_vs_ftp finish. + */ +static void __exit ip_vs_ftp_exit(void) +{ + unregister_ip_vs_app(&ip_vs_ftp); +} + + +module_init(ip_vs_ftp_init); +module_exit(ip_vs_ftp_exit); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c new file mode 100644 index 00000000000..6ecef3518ca --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_lblc.c @@ -0,0 +1,555 @@ +/* + * IPVS: Locality-Based Least-Connection scheduling module + * + * Authors: Wensong Zhang + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * Martin Hamilton : fixed the terrible locking bugs + * *lock(tbl->lock) ==> *lock(&tbl->lock) + * Wensong Zhang : fixed the uninitilized tbl->lock bug + * Wensong Zhang : added doing full expiration check to + * collect stale entries of 24+ hours when + * no partial expire check in a half hour + * Julian Anastasov : replaced del_timer call with del_timer_sync + * to avoid the possible race between timer + * handler and del_timer thread in SMP + * + */ + +/* + * The lblc algorithm is as follows (pseudo code): + * + * if cachenode[dest_ip] is null then + * n, cachenode[dest_ip] <- {weighted least-conn node}; + * else + * n <- cachenode[dest_ip]; + * if (n is dead) OR + * (n.conns>n.weight AND + * there is a node m with m.conns +#include +#include +#include +#include + +/* for sysctl */ +#include +#include + +#include + + +/* + * It is for garbage collection of stale IPVS lblc entries, + * when the table is full. + */ +#define CHECK_EXPIRE_INTERVAL (60*HZ) +#define ENTRY_TIMEOUT (6*60*HZ) + +/* + * It is for full expiration check. + * When there is no partial expiration check (garbage collection) + * in a half hour, do a full expiration check to collect stale + * entries that haven't been touched for a day. + */ +#define COUNT_FOR_FULL_EXPIRATION 30 +static int sysctl_ip_vs_lblc_expiration = 24*60*60*HZ; + + +/* + * for IPVS lblc entry hash table + */ +#ifndef CONFIG_IP_VS_LBLC_TAB_BITS +#define CONFIG_IP_VS_LBLC_TAB_BITS 10 +#endif +#define IP_VS_LBLC_TAB_BITS CONFIG_IP_VS_LBLC_TAB_BITS +#define IP_VS_LBLC_TAB_SIZE (1 << IP_VS_LBLC_TAB_BITS) +#define IP_VS_LBLC_TAB_MASK (IP_VS_LBLC_TAB_SIZE - 1) + + +/* + * IPVS lblc entry represents an association between destination + * IP address and its destination server + */ +struct ip_vs_lblc_entry { + struct list_head list; + __be32 addr; /* destination IP address */ + struct ip_vs_dest *dest; /* real server (cache) */ + unsigned long lastuse; /* last used time */ +}; + + +/* + * IPVS lblc hash table + */ +struct ip_vs_lblc_table { + struct list_head bucket[IP_VS_LBLC_TAB_SIZE]; /* hash bucket */ + atomic_t entries; /* number of entries */ + int max_size; /* maximum size of entries */ + struct timer_list periodic_timer; /* collect stale entries */ + int rover; /* rover for expire check */ + int counter; /* counter for no expire */ +}; + + +/* + * IPVS LBLC sysctl table + */ + +static ctl_table vs_vars_table[] = { + { + .procname = "lblc_expiration", + .data = &sysctl_ip_vs_lblc_expiration, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { .ctl_name = 0 } +}; + +static struct ctl_table_header * sysctl_header; + +static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en) +{ + list_del(&en->list); + /* + * We don't kfree dest because it is refered either by its service + * or the trash dest list. + */ + atomic_dec(&en->dest->refcnt); + kfree(en); +} + + +/* + * Returns hash value for IPVS LBLC entry + */ +static inline unsigned ip_vs_lblc_hashkey(__be32 addr) +{ + return (ntohl(addr)*2654435761UL) & IP_VS_LBLC_TAB_MASK; +} + + +/* + * Hash an entry in the ip_vs_lblc_table. + * returns bool success. + */ +static void +ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en) +{ + unsigned hash = ip_vs_lblc_hashkey(en->addr); + + list_add(&en->list, &tbl->bucket[hash]); + atomic_inc(&tbl->entries); +} + + +/* + * Get ip_vs_lblc_entry associated with supplied parameters. Called under read + * lock + */ +static inline struct ip_vs_lblc_entry * +ip_vs_lblc_get(struct ip_vs_lblc_table *tbl, __be32 addr) +{ + unsigned hash = ip_vs_lblc_hashkey(addr); + struct ip_vs_lblc_entry *en; + + list_for_each_entry(en, &tbl->bucket[hash], list) + if (en->addr == addr) + return en; + + return NULL; +} + + +/* + * Create or update an ip_vs_lblc_entry, which is a mapping of a destination IP + * address to a server. Called under write lock. + */ +static inline struct ip_vs_lblc_entry * +ip_vs_lblc_new(struct ip_vs_lblc_table *tbl, __be32 daddr, + struct ip_vs_dest *dest) +{ + struct ip_vs_lblc_entry *en; + + en = ip_vs_lblc_get(tbl, daddr); + if (!en) { + en = kmalloc(sizeof(*en), GFP_ATOMIC); + if (!en) { + IP_VS_ERR("ip_vs_lblc_new(): no memory\n"); + return NULL; + } + + en->addr = daddr; + en->lastuse = jiffies; + + atomic_inc(&dest->refcnt); + en->dest = dest; + + ip_vs_lblc_hash(tbl, en); + } else if (en->dest != dest) { + atomic_dec(&en->dest->refcnt); + atomic_inc(&dest->refcnt); + en->dest = dest; + } + + return en; +} + + +/* + * Flush all the entries of the specified table. + */ +static void ip_vs_lblc_flush(struct ip_vs_lblc_table *tbl) +{ + struct ip_vs_lblc_entry *en, *nxt; + int i; + + for (i=0; ibucket[i], list) { + ip_vs_lblc_free(en); + atomic_dec(&tbl->entries); + } + } +} + + +static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc) +{ + struct ip_vs_lblc_table *tbl = svc->sched_data; + struct ip_vs_lblc_entry *en, *nxt; + unsigned long now = jiffies; + int i, j; + + for (i=0, j=tbl->rover; isched_lock); + list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { + if (time_before(now, + en->lastuse + sysctl_ip_vs_lblc_expiration)) + continue; + + ip_vs_lblc_free(en); + atomic_dec(&tbl->entries); + } + write_unlock(&svc->sched_lock); + } + tbl->rover = j; +} + + +/* + * Periodical timer handler for IPVS lblc table + * It is used to collect stale entries when the number of entries + * exceeds the maximum size of the table. + * + * Fixme: we probably need more complicated algorithm to collect + * entries that have not been used for a long time even + * if the number of entries doesn't exceed the maximum size + * of the table. + * The full expiration check is for this purpose now. + */ +static void ip_vs_lblc_check_expire(unsigned long data) +{ + struct ip_vs_service *svc = (struct ip_vs_service *) data; + struct ip_vs_lblc_table *tbl = svc->sched_data; + unsigned long now = jiffies; + int goal; + int i, j; + struct ip_vs_lblc_entry *en, *nxt; + + if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) { + /* do full expiration check */ + ip_vs_lblc_full_check(svc); + tbl->counter = 1; + goto out; + } + + if (atomic_read(&tbl->entries) <= tbl->max_size) { + tbl->counter++; + goto out; + } + + goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3; + if (goal > tbl->max_size/2) + goal = tbl->max_size/2; + + for (i=0, j=tbl->rover; isched_lock); + list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { + if (time_before(now, en->lastuse + ENTRY_TIMEOUT)) + continue; + + ip_vs_lblc_free(en); + atomic_dec(&tbl->entries); + goal--; + } + write_unlock(&svc->sched_lock); + if (goal <= 0) + break; + } + tbl->rover = j; + + out: + mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL); +} + + +static int ip_vs_lblc_init_svc(struct ip_vs_service *svc) +{ + int i; + struct ip_vs_lblc_table *tbl; + + /* + * Allocate the ip_vs_lblc_table for this service + */ + tbl = kmalloc(sizeof(*tbl), GFP_ATOMIC); + if (tbl == NULL) { + IP_VS_ERR("ip_vs_lblc_init_svc(): no memory\n"); + return -ENOMEM; + } + svc->sched_data = tbl; + IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) allocated for " + "current service\n", sizeof(*tbl)); + + /* + * Initialize the hash buckets + */ + for (i=0; ibucket[i]); + } + tbl->max_size = IP_VS_LBLC_TAB_SIZE*16; + tbl->rover = 0; + tbl->counter = 1; + + /* + * Hook periodic timer for garbage collection + */ + setup_timer(&tbl->periodic_timer, ip_vs_lblc_check_expire, + (unsigned long)svc); + mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL); + + return 0; +} + + +static int ip_vs_lblc_done_svc(struct ip_vs_service *svc) +{ + struct ip_vs_lblc_table *tbl = svc->sched_data; + + /* remove periodic timer */ + del_timer_sync(&tbl->periodic_timer); + + /* got to clean up table entries here */ + ip_vs_lblc_flush(tbl); + + /* release the table itself */ + kfree(tbl); + IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) released\n", + sizeof(*tbl)); + + return 0; +} + + +static inline struct ip_vs_dest * +__ip_vs_lblc_schedule(struct ip_vs_service *svc, struct iphdr *iph) +{ + struct ip_vs_dest *dest, *least; + int loh, doh; + + /* + * We think the overhead of processing active connections is fifty + * times higher than that of inactive connections in average. (This + * fifty times might not be accurate, we will change it later.) We + * use the following formula to estimate the overhead: + * dest->activeconns*50 + dest->inactconns + * and the load: + * (dest overhead) / dest->weight + * + * Remember -- no floats in kernel mode!!! + * The comparison of h1*w2 > h2*w1 is equivalent to that of + * h1/w1 > h2/w2 + * if every weight is larger than zero. + * + * The server with weight=0 is quiesced and will not receive any + * new connection. + */ + list_for_each_entry(dest, &svc->destinations, n_list) { + if (dest->flags & IP_VS_DEST_F_OVERLOAD) + continue; + if (atomic_read(&dest->weight) > 0) { + least = dest; + loh = atomic_read(&least->activeconns) * 50 + + atomic_read(&least->inactconns); + goto nextstage; + } + } + return NULL; + + /* + * Find the destination with the least load. + */ + nextstage: + list_for_each_entry_continue(dest, &svc->destinations, n_list) { + if (dest->flags & IP_VS_DEST_F_OVERLOAD) + continue; + + doh = atomic_read(&dest->activeconns) * 50 + + atomic_read(&dest->inactconns); + if (loh * atomic_read(&dest->weight) > + doh * atomic_read(&least->weight)) { + least = dest; + loh = doh; + } + } + + IP_VS_DBG(6, "LBLC: server %d.%d.%d.%d:%d " + "activeconns %d refcnt %d weight %d overhead %d\n", + NIPQUAD(least->addr.ip), ntohs(least->port), + atomic_read(&least->activeconns), + atomic_read(&least->refcnt), + atomic_read(&least->weight), loh); + + return least; +} + + +/* + * If this destination server is overloaded and there is a less loaded + * server, then return true. + */ +static inline int +is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc) +{ + if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) { + struct ip_vs_dest *d; + + list_for_each_entry(d, &svc->destinations, n_list) { + if (atomic_read(&d->activeconns)*2 + < atomic_read(&d->weight)) { + return 1; + } + } + } + return 0; +} + + +/* + * Locality-Based (weighted) Least-Connection scheduling + */ +static struct ip_vs_dest * +ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +{ + struct ip_vs_lblc_table *tbl = svc->sched_data; + struct iphdr *iph = ip_hdr(skb); + struct ip_vs_dest *dest = NULL; + struct ip_vs_lblc_entry *en; + + IP_VS_DBG(6, "ip_vs_lblc_schedule(): Scheduling...\n"); + + /* First look in our cache */ + read_lock(&svc->sched_lock); + en = ip_vs_lblc_get(tbl, iph->daddr); + if (en) { + /* We only hold a read lock, but this is atomic */ + en->lastuse = jiffies; + + /* + * If the destination is not available, i.e. it's in the trash, + * we must ignore it, as it may be removed from under our feet, + * if someone drops our reference count. Our caller only makes + * sure that destinations, that are not in the trash, are not + * moved to the trash, while we are scheduling. But anyone can + * free up entries from the trash at any time. + */ + + if (en->dest->flags & IP_VS_DEST_F_AVAILABLE) + dest = en->dest; + } + read_unlock(&svc->sched_lock); + + /* If the destination has a weight and is not overloaded, use it */ + if (dest && atomic_read(&dest->weight) > 0 && !is_overloaded(dest, svc)) + goto out; + + /* No cache entry or it is invalid, time to schedule */ + dest = __ip_vs_lblc_schedule(svc, iph); + if (!dest) { + IP_VS_DBG(1, "no destination available\n"); + return NULL; + } + + /* If we fail to create a cache entry, we'll just use the valid dest */ + write_lock(&svc->sched_lock); + ip_vs_lblc_new(tbl, iph->daddr, dest); + write_unlock(&svc->sched_lock); + +out: + IP_VS_DBG(6, "LBLC: destination IP address %u.%u.%u.%u " + "--> server %u.%u.%u.%u:%d\n", + NIPQUAD(iph->daddr), + NIPQUAD(dest->addr.ip), + ntohs(dest->port)); + + return dest; +} + + +/* + * IPVS LBLC Scheduler structure + */ +static struct ip_vs_scheduler ip_vs_lblc_scheduler = +{ + .name = "lblc", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_lblc_scheduler.n_list), +#ifdef CONFIG_IP_VS_IPV6 + .supports_ipv6 = 0, +#endif + .init_service = ip_vs_lblc_init_svc, + .done_service = ip_vs_lblc_done_svc, + .schedule = ip_vs_lblc_schedule, +}; + + +static int __init ip_vs_lblc_init(void) +{ + int ret; + + sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars_table); + ret = register_ip_vs_scheduler(&ip_vs_lblc_scheduler); + if (ret) + unregister_sysctl_table(sysctl_header); + return ret; +} + + +static void __exit ip_vs_lblc_cleanup(void) +{ + unregister_sysctl_table(sysctl_header); + unregister_ip_vs_scheduler(&ip_vs_lblc_scheduler); +} + + +module_init(ip_vs_lblc_init); +module_exit(ip_vs_lblc_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c new file mode 100644 index 00000000000..1f75ea83bcf --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_lblcr.c @@ -0,0 +1,755 @@ +/* + * IPVS: Locality-Based Least-Connection with Replication scheduler + * + * Authors: Wensong Zhang + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * Julian Anastasov : Added the missing (dest->weight>0) + * condition in the ip_vs_dest_set_max. + * + */ + +/* + * The lblc/r algorithm is as follows (pseudo code): + * + * if serverSet[dest_ip] is null then + * n, serverSet[dest_ip] <- {weighted least-conn node}; + * else + * n <- {least-conn (alive) node in serverSet[dest_ip]}; + * if (n is null) OR + * (n.conns>n.weight AND + * there is a node m with m.conns 1 AND + * now - serverSet[dest_ip].lastMod > T then + * m <- {most conn node in serverSet[dest_ip]}; + * remove m from serverSet[dest_ip]; + * if serverSet[dest_ip] changed then + * serverSet[dest_ip].lastMod <- now; + * + * return n; + * + */ + +#include +#include +#include +#include +#include + +/* for sysctl */ +#include +#include +#include + +#include + + +/* + * It is for garbage collection of stale IPVS lblcr entries, + * when the table is full. + */ +#define CHECK_EXPIRE_INTERVAL (60*HZ) +#define ENTRY_TIMEOUT (6*60*HZ) + +/* + * It is for full expiration check. + * When there is no partial expiration check (garbage collection) + * in a half hour, do a full expiration check to collect stale + * entries that haven't been touched for a day. + */ +#define COUNT_FOR_FULL_EXPIRATION 30 +static int sysctl_ip_vs_lblcr_expiration = 24*60*60*HZ; + + +/* + * for IPVS lblcr entry hash table + */ +#ifndef CONFIG_IP_VS_LBLCR_TAB_BITS +#define CONFIG_IP_VS_LBLCR_TAB_BITS 10 +#endif +#define IP_VS_LBLCR_TAB_BITS CONFIG_IP_VS_LBLCR_TAB_BITS +#define IP_VS_LBLCR_TAB_SIZE (1 << IP_VS_LBLCR_TAB_BITS) +#define IP_VS_LBLCR_TAB_MASK (IP_VS_LBLCR_TAB_SIZE - 1) + + +/* + * IPVS destination set structure and operations + */ +struct ip_vs_dest_list { + struct ip_vs_dest_list *next; /* list link */ + struct ip_vs_dest *dest; /* destination server */ +}; + +struct ip_vs_dest_set { + atomic_t size; /* set size */ + unsigned long lastmod; /* last modified time */ + struct ip_vs_dest_list *list; /* destination list */ + rwlock_t lock; /* lock for this list */ +}; + + +static struct ip_vs_dest_list * +ip_vs_dest_set_insert(struct ip_vs_dest_set *set, struct ip_vs_dest *dest) +{ + struct ip_vs_dest_list *e; + + for (e=set->list; e!=NULL; e=e->next) { + if (e->dest == dest) + /* already existed */ + return NULL; + } + + e = kmalloc(sizeof(*e), GFP_ATOMIC); + if (e == NULL) { + IP_VS_ERR("ip_vs_dest_set_insert(): no memory\n"); + return NULL; + } + + atomic_inc(&dest->refcnt); + e->dest = dest; + + /* link it to the list */ + e->next = set->list; + set->list = e; + atomic_inc(&set->size); + + set->lastmod = jiffies; + return e; +} + +static void +ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest) +{ + struct ip_vs_dest_list *e, **ep; + + for (ep=&set->list, e=*ep; e!=NULL; e=*ep) { + if (e->dest == dest) { + /* HIT */ + *ep = e->next; + atomic_dec(&set->size); + set->lastmod = jiffies; + atomic_dec(&e->dest->refcnt); + kfree(e); + break; + } + ep = &e->next; + } +} + +static void ip_vs_dest_set_eraseall(struct ip_vs_dest_set *set) +{ + struct ip_vs_dest_list *e, **ep; + + write_lock(&set->lock); + for (ep=&set->list, e=*ep; e!=NULL; e=*ep) { + *ep = e->next; + /* + * We don't kfree dest because it is refered either + * by its service or by the trash dest list. + */ + atomic_dec(&e->dest->refcnt); + kfree(e); + } + write_unlock(&set->lock); +} + +/* get weighted least-connection node in the destination set */ +static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set) +{ + register struct ip_vs_dest_list *e; + struct ip_vs_dest *dest, *least; + int loh, doh; + + if (set == NULL) + return NULL; + + /* select the first destination server, whose weight > 0 */ + for (e=set->list; e!=NULL; e=e->next) { + least = e->dest; + if (least->flags & IP_VS_DEST_F_OVERLOAD) + continue; + + if ((atomic_read(&least->weight) > 0) + && (least->flags & IP_VS_DEST_F_AVAILABLE)) { + loh = atomic_read(&least->activeconns) * 50 + + atomic_read(&least->inactconns); + goto nextstage; + } + } + return NULL; + + /* find the destination with the weighted least load */ + nextstage: + for (e=e->next; e!=NULL; e=e->next) { + dest = e->dest; + if (dest->flags & IP_VS_DEST_F_OVERLOAD) + continue; + + doh = atomic_read(&dest->activeconns) * 50 + + atomic_read(&dest->inactconns); + if ((loh * atomic_read(&dest->weight) > + doh * atomic_read(&least->weight)) + && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { + least = dest; + loh = doh; + } + } + + IP_VS_DBG(6, "ip_vs_dest_set_min: server %d.%d.%d.%d:%d " + "activeconns %d refcnt %d weight %d overhead %d\n", + NIPQUAD(least->addr.ip), ntohs(least->port), + atomic_read(&least->activeconns), + atomic_read(&least->refcnt), + atomic_read(&least->weight), loh); + return least; +} + + +/* get weighted most-connection node in the destination set */ +static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set) +{ + register struct ip_vs_dest_list *e; + struct ip_vs_dest *dest, *most; + int moh, doh; + + if (set == NULL) + return NULL; + + /* select the first destination server, whose weight > 0 */ + for (e=set->list; e!=NULL; e=e->next) { + most = e->dest; + if (atomic_read(&most->weight) > 0) { + moh = atomic_read(&most->activeconns) * 50 + + atomic_read(&most->inactconns); + goto nextstage; + } + } + return NULL; + + /* find the destination with the weighted most load */ + nextstage: + for (e=e->next; e!=NULL; e=e->next) { + dest = e->dest; + doh = atomic_read(&dest->activeconns) * 50 + + atomic_read(&dest->inactconns); + /* moh/mw < doh/dw ==> moh*dw < doh*mw, where mw,dw>0 */ + if ((moh * atomic_read(&dest->weight) < + doh * atomic_read(&most->weight)) + && (atomic_read(&dest->weight) > 0)) { + most = dest; + moh = doh; + } + } + + IP_VS_DBG(6, "ip_vs_dest_set_max: server %d.%d.%d.%d:%d " + "activeconns %d refcnt %d weight %d overhead %d\n", + NIPQUAD(most->addr.ip), ntohs(most->port), + atomic_read(&most->activeconns), + atomic_read(&most->refcnt), + atomic_read(&most->weight), moh); + return most; +} + + +/* + * IPVS lblcr entry represents an association between destination + * IP address and its destination server set + */ +struct ip_vs_lblcr_entry { + struct list_head list; + __be32 addr; /* destination IP address */ + struct ip_vs_dest_set set; /* destination server set */ + unsigned long lastuse; /* last used time */ +}; + + +/* + * IPVS lblcr hash table + */ +struct ip_vs_lblcr_table { + struct list_head bucket[IP_VS_LBLCR_TAB_SIZE]; /* hash bucket */ + atomic_t entries; /* number of entries */ + int max_size; /* maximum size of entries */ + struct timer_list periodic_timer; /* collect stale entries */ + int rover; /* rover for expire check */ + int counter; /* counter for no expire */ +}; + + +/* + * IPVS LBLCR sysctl table + */ + +static ctl_table vs_vars_table[] = { + { + .procname = "lblcr_expiration", + .data = &sysctl_ip_vs_lblcr_expiration, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { .ctl_name = 0 } +}; + +static struct ctl_table_header * sysctl_header; + +static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en) +{ + list_del(&en->list); + ip_vs_dest_set_eraseall(&en->set); + kfree(en); +} + + +/* + * Returns hash value for IPVS LBLCR entry + */ +static inline unsigned ip_vs_lblcr_hashkey(__be32 addr) +{ + return (ntohl(addr)*2654435761UL) & IP_VS_LBLCR_TAB_MASK; +} + + +/* + * Hash an entry in the ip_vs_lblcr_table. + * returns bool success. + */ +static void +ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en) +{ + unsigned hash = ip_vs_lblcr_hashkey(en->addr); + + list_add(&en->list, &tbl->bucket[hash]); + atomic_inc(&tbl->entries); +} + + +/* + * Get ip_vs_lblcr_entry associated with supplied parameters. Called under + * read lock. + */ +static inline struct ip_vs_lblcr_entry * +ip_vs_lblcr_get(struct ip_vs_lblcr_table *tbl, __be32 addr) +{ + unsigned hash = ip_vs_lblcr_hashkey(addr); + struct ip_vs_lblcr_entry *en; + + list_for_each_entry(en, &tbl->bucket[hash], list) + if (en->addr == addr) + return en; + + return NULL; +} + + +/* + * Create or update an ip_vs_lblcr_entry, which is a mapping of a destination + * IP address to a server. Called under write lock. + */ +static inline struct ip_vs_lblcr_entry * +ip_vs_lblcr_new(struct ip_vs_lblcr_table *tbl, __be32 daddr, + struct ip_vs_dest *dest) +{ + struct ip_vs_lblcr_entry *en; + + en = ip_vs_lblcr_get(tbl, daddr); + if (!en) { + en = kmalloc(sizeof(*en), GFP_ATOMIC); + if (!en) { + IP_VS_ERR("ip_vs_lblcr_new(): no memory\n"); + return NULL; + } + + en->addr = daddr; + en->lastuse = jiffies; + + /* initilize its dest set */ + atomic_set(&(en->set.size), 0); + en->set.list = NULL; + rwlock_init(&en->set.lock); + + ip_vs_lblcr_hash(tbl, en); + } + + write_lock(&en->set.lock); + ip_vs_dest_set_insert(&en->set, dest); + write_unlock(&en->set.lock); + + return en; +} + + +/* + * Flush all the entries of the specified table. + */ +static void ip_vs_lblcr_flush(struct ip_vs_lblcr_table *tbl) +{ + int i; + struct ip_vs_lblcr_entry *en, *nxt; + + /* No locking required, only called during cleanup. */ + for (i=0; ibucket[i], list) { + ip_vs_lblcr_free(en); + } + } +} + + +static inline void ip_vs_lblcr_full_check(struct ip_vs_service *svc) +{ + struct ip_vs_lblcr_table *tbl = svc->sched_data; + unsigned long now = jiffies; + int i, j; + struct ip_vs_lblcr_entry *en, *nxt; + + for (i=0, j=tbl->rover; isched_lock); + list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { + if (time_after(en->lastuse+sysctl_ip_vs_lblcr_expiration, + now)) + continue; + + ip_vs_lblcr_free(en); + atomic_dec(&tbl->entries); + } + write_unlock(&svc->sched_lock); + } + tbl->rover = j; +} + + +/* + * Periodical timer handler for IPVS lblcr table + * It is used to collect stale entries when the number of entries + * exceeds the maximum size of the table. + * + * Fixme: we probably need more complicated algorithm to collect + * entries that have not been used for a long time even + * if the number of entries doesn't exceed the maximum size + * of the table. + * The full expiration check is for this purpose now. + */ +static void ip_vs_lblcr_check_expire(unsigned long data) +{ + struct ip_vs_service *svc = (struct ip_vs_service *) data; + struct ip_vs_lblcr_table *tbl = svc->sched_data; + unsigned long now = jiffies; + int goal; + int i, j; + struct ip_vs_lblcr_entry *en, *nxt; + + if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) { + /* do full expiration check */ + ip_vs_lblcr_full_check(svc); + tbl->counter = 1; + goto out; + } + + if (atomic_read(&tbl->entries) <= tbl->max_size) { + tbl->counter++; + goto out; + } + + goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3; + if (goal > tbl->max_size/2) + goal = tbl->max_size/2; + + for (i=0, j=tbl->rover; isched_lock); + list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { + if (time_before(now, en->lastuse+ENTRY_TIMEOUT)) + continue; + + ip_vs_lblcr_free(en); + atomic_dec(&tbl->entries); + goal--; + } + write_unlock(&svc->sched_lock); + if (goal <= 0) + break; + } + tbl->rover = j; + + out: + mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL); +} + +static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc) +{ + int i; + struct ip_vs_lblcr_table *tbl; + + /* + * Allocate the ip_vs_lblcr_table for this service + */ + tbl = kmalloc(sizeof(*tbl), GFP_ATOMIC); + if (tbl == NULL) { + IP_VS_ERR("ip_vs_lblcr_init_svc(): no memory\n"); + return -ENOMEM; + } + svc->sched_data = tbl; + IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) allocated for " + "current service\n", sizeof(*tbl)); + + /* + * Initialize the hash buckets + */ + for (i=0; ibucket[i]); + } + tbl->max_size = IP_VS_LBLCR_TAB_SIZE*16; + tbl->rover = 0; + tbl->counter = 1; + + /* + * Hook periodic timer for garbage collection + */ + setup_timer(&tbl->periodic_timer, ip_vs_lblcr_check_expire, + (unsigned long)svc); + mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL); + + return 0; +} + + +static int ip_vs_lblcr_done_svc(struct ip_vs_service *svc) +{ + struct ip_vs_lblcr_table *tbl = svc->sched_data; + + /* remove periodic timer */ + del_timer_sync(&tbl->periodic_timer); + + /* got to clean up table entries here */ + ip_vs_lblcr_flush(tbl); + + /* release the table itself */ + kfree(tbl); + IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) released\n", + sizeof(*tbl)); + + return 0; +} + + +static inline struct ip_vs_dest * +__ip_vs_lblcr_schedule(struct ip_vs_service *svc, struct iphdr *iph) +{ + struct ip_vs_dest *dest, *least; + int loh, doh; + + /* + * We think the overhead of processing active connections is fifty + * times higher than that of inactive connections in average. (This + * fifty times might not be accurate, we will change it later.) We + * use the following formula to estimate the overhead: + * dest->activeconns*50 + dest->inactconns + * and the load: + * (dest overhead) / dest->weight + * + * Remember -- no floats in kernel mode!!! + * The comparison of h1*w2 > h2*w1 is equivalent to that of + * h1/w1 > h2/w2 + * if every weight is larger than zero. + * + * The server with weight=0 is quiesced and will not receive any + * new connection. + */ + list_for_each_entry(dest, &svc->destinations, n_list) { + if (dest->flags & IP_VS_DEST_F_OVERLOAD) + continue; + + if (atomic_read(&dest->weight) > 0) { + least = dest; + loh = atomic_read(&least->activeconns) * 50 + + atomic_read(&least->inactconns); + goto nextstage; + } + } + return NULL; + + /* + * Find the destination with the least load. + */ + nextstage: + list_for_each_entry_continue(dest, &svc->destinations, n_list) { + if (dest->flags & IP_VS_DEST_F_OVERLOAD) + continue; + + doh = atomic_read(&dest->activeconns) * 50 + + atomic_read(&dest->inactconns); + if (loh * atomic_read(&dest->weight) > + doh * atomic_read(&least->weight)) { + least = dest; + loh = doh; + } + } + + IP_VS_DBG(6, "LBLCR: server %d.%d.%d.%d:%d " + "activeconns %d refcnt %d weight %d overhead %d\n", + NIPQUAD(least->addr.ip), ntohs(least->port), + atomic_read(&least->activeconns), + atomic_read(&least->refcnt), + atomic_read(&least->weight), loh); + + return least; +} + + +/* + * If this destination server is overloaded and there is a less loaded + * server, then return true. + */ +static inline int +is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc) +{ + if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) { + struct ip_vs_dest *d; + + list_for_each_entry(d, &svc->destinations, n_list) { + if (atomic_read(&d->activeconns)*2 + < atomic_read(&d->weight)) { + return 1; + } + } + } + return 0; +} + + +/* + * Locality-Based (weighted) Least-Connection scheduling + */ +static struct ip_vs_dest * +ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +{ + struct ip_vs_lblcr_table *tbl = svc->sched_data; + struct iphdr *iph = ip_hdr(skb); + struct ip_vs_dest *dest = NULL; + struct ip_vs_lblcr_entry *en; + + IP_VS_DBG(6, "ip_vs_lblcr_schedule(): Scheduling...\n"); + + /* First look in our cache */ + read_lock(&svc->sched_lock); + en = ip_vs_lblcr_get(tbl, iph->daddr); + if (en) { + /* We only hold a read lock, but this is atomic */ + en->lastuse = jiffies; + + /* Get the least loaded destination */ + read_lock(&en->set.lock); + dest = ip_vs_dest_set_min(&en->set); + read_unlock(&en->set.lock); + + /* More than one destination + enough time passed by, cleanup */ + if (atomic_read(&en->set.size) > 1 && + time_after(jiffies, en->set.lastmod + + sysctl_ip_vs_lblcr_expiration)) { + struct ip_vs_dest *m; + + write_lock(&en->set.lock); + m = ip_vs_dest_set_max(&en->set); + if (m) + ip_vs_dest_set_erase(&en->set, m); + write_unlock(&en->set.lock); + } + + /* If the destination is not overloaded, use it */ + if (dest && !is_overloaded(dest, svc)) { + read_unlock(&svc->sched_lock); + goto out; + } + + /* The cache entry is invalid, time to schedule */ + dest = __ip_vs_lblcr_schedule(svc, iph); + if (!dest) { + IP_VS_DBG(1, "no destination available\n"); + read_unlock(&svc->sched_lock); + return NULL; + } + + /* Update our cache entry */ + write_lock(&en->set.lock); + ip_vs_dest_set_insert(&en->set, dest); + write_unlock(&en->set.lock); + } + read_unlock(&svc->sched_lock); + + if (dest) + goto out; + + /* No cache entry, time to schedule */ + dest = __ip_vs_lblcr_schedule(svc, iph); + if (!dest) { + IP_VS_DBG(1, "no destination available\n"); + return NULL; + } + + /* If we fail to create a cache entry, we'll just use the valid dest */ + write_lock(&svc->sched_lock); + ip_vs_lblcr_new(tbl, iph->daddr, dest); + write_unlock(&svc->sched_lock); + +out: + IP_VS_DBG(6, "LBLCR: destination IP address %u.%u.%u.%u " + "--> server %u.%u.%u.%u:%d\n", + NIPQUAD(iph->daddr), + NIPQUAD(dest->addr.ip), + ntohs(dest->port)); + + return dest; +} + + +/* + * IPVS LBLCR Scheduler structure + */ +static struct ip_vs_scheduler ip_vs_lblcr_scheduler = +{ + .name = "lblcr", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_lblcr_scheduler.n_list), +#ifdef CONFIG_IP_VS_IPV6 + .supports_ipv6 = 0, +#endif + .init_service = ip_vs_lblcr_init_svc, + .done_service = ip_vs_lblcr_done_svc, + .schedule = ip_vs_lblcr_schedule, +}; + + +static int __init ip_vs_lblcr_init(void) +{ + int ret; + + sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars_table); + ret = register_ip_vs_scheduler(&ip_vs_lblcr_scheduler); + if (ret) + unregister_sysctl_table(sysctl_header); + return ret; +} + + +static void __exit ip_vs_lblcr_cleanup(void) +{ + unregister_sysctl_table(sysctl_header); + unregister_ip_vs_scheduler(&ip_vs_lblcr_scheduler); +} + + +module_init(ip_vs_lblcr_init); +module_exit(ip_vs_lblcr_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_lc.c b/net/netfilter/ipvs/ip_vs_lc.c new file mode 100644 index 00000000000..b69f808ac46 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_lc.c @@ -0,0 +1,103 @@ +/* + * IPVS: Least-Connection Scheduling module + * + * Authors: Wensong Zhang + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * Wensong Zhang : added the ip_vs_lc_update_svc + * Wensong Zhang : added any dest with weight=0 is quiesced + * + */ + +#include +#include + +#include + + +static inline unsigned int +ip_vs_lc_dest_overhead(struct ip_vs_dest *dest) +{ + /* + * We think the overhead of processing active connections is 256 + * times higher than that of inactive connections in average. (This + * 256 times might not be accurate, we will change it later) We + * use the following formula to estimate the overhead now: + * dest->activeconns*256 + dest->inactconns + */ + return (atomic_read(&dest->activeconns) << 8) + + atomic_read(&dest->inactconns); +} + + +/* + * Least Connection scheduling + */ +static struct ip_vs_dest * +ip_vs_lc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +{ + struct ip_vs_dest *dest, *least = NULL; + unsigned int loh = 0, doh; + + IP_VS_DBG(6, "ip_vs_lc_schedule(): Scheduling...\n"); + + /* + * Simply select the server with the least number of + * (activeconns<<5) + inactconns + * Except whose weight is equal to zero. + * If the weight is equal to zero, it means that the server is + * quiesced, the existing connections to the server still get + * served, but no new connection is assigned to the server. + */ + + list_for_each_entry(dest, &svc->destinations, n_list) { + if ((dest->flags & IP_VS_DEST_F_OVERLOAD) || + atomic_read(&dest->weight) == 0) + continue; + doh = ip_vs_lc_dest_overhead(dest); + if (!least || doh < loh) { + least = dest; + loh = doh; + } + } + + if (least) + IP_VS_DBG_BUF(6, "LC: server %s:%u activeconns %d inactconns %d\n", + IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port), + atomic_read(&least->activeconns), + atomic_read(&least->inactconns)); + + return least; +} + + +static struct ip_vs_scheduler ip_vs_lc_scheduler = { + .name = "lc", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_lc_scheduler.n_list), +#ifdef CONFIG_IP_VS_IPV6 + .supports_ipv6 = 1, +#endif + .schedule = ip_vs_lc_schedule, +}; + + +static int __init ip_vs_lc_init(void) +{ + return register_ip_vs_scheduler(&ip_vs_lc_scheduler) ; +} + +static void __exit ip_vs_lc_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_lc_scheduler); +} + +module_init(ip_vs_lc_init); +module_exit(ip_vs_lc_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_nq.c b/net/netfilter/ipvs/ip_vs_nq.c new file mode 100644 index 00000000000..9a2d8033f08 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_nq.c @@ -0,0 +1,138 @@ +/* + * IPVS: Never Queue scheduling module + * + * Authors: Wensong Zhang + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * + */ + +/* + * The NQ algorithm adopts a two-speed model. When there is an idle server + * available, the job will be sent to the idle server, instead of waiting + * for a fast one. When there is no idle server available, the job will be + * sent to the server that minimize its expected delay (The Shortest + * Expected Delay scheduling algorithm). + * + * See the following paper for more information: + * A. Weinrib and S. Shenker, Greed is not enough: Adaptive load sharing + * in large heterogeneous systems. In Proceedings IEEE INFOCOM'88, + * pages 986-994, 1988. + * + * Thanks must go to Marko Buuri for talking NQ to me. + * + * The difference between NQ and SED is that NQ can improve overall + * system utilization. + * + */ + +#include +#include + +#include + + +static inline unsigned int +ip_vs_nq_dest_overhead(struct ip_vs_dest *dest) +{ + /* + * We only use the active connection number in the cost + * calculation here. + */ + return atomic_read(&dest->activeconns) + 1; +} + + +/* + * Weighted Least Connection scheduling + */ +static struct ip_vs_dest * +ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +{ + struct ip_vs_dest *dest, *least = NULL; + unsigned int loh = 0, doh; + + IP_VS_DBG(6, "ip_vs_nq_schedule(): Scheduling...\n"); + + /* + * We calculate the load of each dest server as follows: + * (server expected overhead) / dest->weight + * + * Remember -- no floats in kernel mode!!! + * The comparison of h1*w2 > h2*w1 is equivalent to that of + * h1/w1 > h2/w2 + * if every weight is larger than zero. + * + * The server with weight=0 is quiesced and will not receive any + * new connections. + */ + + list_for_each_entry(dest, &svc->destinations, n_list) { + + if (dest->flags & IP_VS_DEST_F_OVERLOAD || + !atomic_read(&dest->weight)) + continue; + + doh = ip_vs_nq_dest_overhead(dest); + + /* return the server directly if it is idle */ + if (atomic_read(&dest->activeconns) == 0) { + least = dest; + loh = doh; + goto out; + } + + if (!least || + (loh * atomic_read(&dest->weight) > + doh * atomic_read(&least->weight))) { + least = dest; + loh = doh; + } + } + + if (!least) + return NULL; + + out: + IP_VS_DBG_BUF(6, "NQ: server %s:%u " + "activeconns %d refcnt %d weight %d overhead %d\n", + IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port), + atomic_read(&least->activeconns), + atomic_read(&least->refcnt), + atomic_read(&least->weight), loh); + + return least; +} + + +static struct ip_vs_scheduler ip_vs_nq_scheduler = +{ + .name = "nq", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_nq_scheduler.n_list), +#ifdef CONFIG_IP_VS_IPV6 + .supports_ipv6 = 1, +#endif + .schedule = ip_vs_nq_schedule, +}; + + +static int __init ip_vs_nq_init(void) +{ + return register_ip_vs_scheduler(&ip_vs_nq_scheduler); +} + +static void __exit ip_vs_nq_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_nq_scheduler); +} + +module_init(ip_vs_nq_init); +module_exit(ip_vs_nq_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c new file mode 100644 index 00000000000..0791f9e08fe --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_proto.c @@ -0,0 +1,288 @@ +/* + * ip_vs_proto.c: transport protocol load balancing support for IPVS + * + * Authors: Wensong Zhang + * Julian Anastasov + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + + +/* + * IPVS protocols can only be registered/unregistered when the ipvs + * module is loaded/unloaded, so no lock is needed in accessing the + * ipvs protocol table. + */ + +#define IP_VS_PROTO_TAB_SIZE 32 /* must be power of 2 */ +#define IP_VS_PROTO_HASH(proto) ((proto) & (IP_VS_PROTO_TAB_SIZE-1)) + +static struct ip_vs_protocol *ip_vs_proto_table[IP_VS_PROTO_TAB_SIZE]; + + +/* + * register an ipvs protocol + */ +static int __used __init register_ip_vs_protocol(struct ip_vs_protocol *pp) +{ + unsigned hash = IP_VS_PROTO_HASH(pp->protocol); + + pp->next = ip_vs_proto_table[hash]; + ip_vs_proto_table[hash] = pp; + + if (pp->init != NULL) + pp->init(pp); + + return 0; +} + + +/* + * unregister an ipvs protocol + */ +static int unregister_ip_vs_protocol(struct ip_vs_protocol *pp) +{ + struct ip_vs_protocol **pp_p; + unsigned hash = IP_VS_PROTO_HASH(pp->protocol); + + pp_p = &ip_vs_proto_table[hash]; + for (; *pp_p; pp_p = &(*pp_p)->next) { + if (*pp_p == pp) { + *pp_p = pp->next; + if (pp->exit != NULL) + pp->exit(pp); + return 0; + } + } + + return -ESRCH; +} + + +/* + * get ip_vs_protocol object by its proto. + */ +struct ip_vs_protocol * ip_vs_proto_get(unsigned short proto) +{ + struct ip_vs_protocol *pp; + unsigned hash = IP_VS_PROTO_HASH(proto); + + for (pp = ip_vs_proto_table[hash]; pp; pp = pp->next) { + if (pp->protocol == proto) + return pp; + } + + return NULL; +} + + +/* + * Propagate event for state change to all protocols + */ +void ip_vs_protocol_timeout_change(int flags) +{ + struct ip_vs_protocol *pp; + int i; + + for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) { + for (pp = ip_vs_proto_table[i]; pp; pp = pp->next) { + if (pp->timeout_change) + pp->timeout_change(pp, flags); + } + } +} + + +int * +ip_vs_create_timeout_table(int *table, int size) +{ + return kmemdup(table, size, GFP_ATOMIC); +} + + +/* + * Set timeout value for state specified by name + */ +int +ip_vs_set_state_timeout(int *table, int num, char **names, char *name, int to) +{ + int i; + + if (!table || !name || !to) + return -EINVAL; + + for (i = 0; i < num; i++) { + if (strcmp(names[i], name)) + continue; + table[i] = to * HZ; + return 0; + } + return -ENOENT; +} + + +const char * ip_vs_state_name(__u16 proto, int state) +{ + struct ip_vs_protocol *pp = ip_vs_proto_get(proto); + + if (pp == NULL || pp->state_name == NULL) + return (IPPROTO_IP == proto) ? "NONE" : "ERR!"; + return pp->state_name(state); +} + + +static void +ip_vs_tcpudp_debug_packet_v4(struct ip_vs_protocol *pp, + const struct sk_buff *skb, + int offset, + const char *msg) +{ + char buf[128]; + struct iphdr _iph, *ih; + + ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); + if (ih == NULL) + sprintf(buf, "%s TRUNCATED", pp->name); + else if (ih->frag_off & htons(IP_OFFSET)) + sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u frag", + pp->name, NIPQUAD(ih->saddr), + NIPQUAD(ih->daddr)); + else { + __be16 _ports[2], *pptr +; + pptr = skb_header_pointer(skb, offset + ih->ihl*4, + sizeof(_ports), _ports); + if (pptr == NULL) + sprintf(buf, "%s TRUNCATED %u.%u.%u.%u->%u.%u.%u.%u", + pp->name, + NIPQUAD(ih->saddr), + NIPQUAD(ih->daddr)); + else + sprintf(buf, "%s %u.%u.%u.%u:%u->%u.%u.%u.%u:%u", + pp->name, + NIPQUAD(ih->saddr), + ntohs(pptr[0]), + NIPQUAD(ih->daddr), + ntohs(pptr[1])); + } + + printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf); +} + +#ifdef CONFIG_IP_VS_IPV6 +static void +ip_vs_tcpudp_debug_packet_v6(struct ip_vs_protocol *pp, + const struct sk_buff *skb, + int offset, + const char *msg) +{ + char buf[192]; + struct ipv6hdr _iph, *ih; + + ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); + if (ih == NULL) + sprintf(buf, "%s TRUNCATED", pp->name); + else if (ih->nexthdr == IPPROTO_FRAGMENT) + sprintf(buf, "%s " NIP6_FMT "->" NIP6_FMT " frag", + pp->name, NIP6(ih->saddr), + NIP6(ih->daddr)); + else { + __be16 _ports[2], *pptr; + + pptr = skb_header_pointer(skb, offset + sizeof(struct ipv6hdr), + sizeof(_ports), _ports); + if (pptr == NULL) + sprintf(buf, "%s TRUNCATED " NIP6_FMT "->" NIP6_FMT, + pp->name, + NIP6(ih->saddr), + NIP6(ih->daddr)); + else + sprintf(buf, "%s " NIP6_FMT ":%u->" NIP6_FMT ":%u", + pp->name, + NIP6(ih->saddr), + ntohs(pptr[0]), + NIP6(ih->daddr), + ntohs(pptr[1])); + } + + printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf); +} +#endif + + +void +ip_vs_tcpudp_debug_packet(struct ip_vs_protocol *pp, + const struct sk_buff *skb, + int offset, + const char *msg) +{ +#ifdef CONFIG_IP_VS_IPV6 + if (skb->protocol == htons(ETH_P_IPV6)) + ip_vs_tcpudp_debug_packet_v6(pp, skb, offset, msg); + else +#endif + ip_vs_tcpudp_debug_packet_v4(pp, skb, offset, msg); +} + + +int __init ip_vs_protocol_init(void) +{ + char protocols[64]; +#define REGISTER_PROTOCOL(p) \ + do { \ + register_ip_vs_protocol(p); \ + strcat(protocols, ", "); \ + strcat(protocols, (p)->name); \ + } while (0) + + protocols[0] = '\0'; + protocols[2] = '\0'; +#ifdef CONFIG_IP_VS_PROTO_TCP + REGISTER_PROTOCOL(&ip_vs_protocol_tcp); +#endif +#ifdef CONFIG_IP_VS_PROTO_UDP + REGISTER_PROTOCOL(&ip_vs_protocol_udp); +#endif +#ifdef CONFIG_IP_VS_PROTO_AH + REGISTER_PROTOCOL(&ip_vs_protocol_ah); +#endif +#ifdef CONFIG_IP_VS_PROTO_ESP + REGISTER_PROTOCOL(&ip_vs_protocol_esp); +#endif + IP_VS_INFO("Registered protocols (%s)\n", &protocols[2]); + + return 0; +} + + +void ip_vs_protocol_cleanup(void) +{ + struct ip_vs_protocol *pp; + int i; + + /* unregister all the ipvs protocols */ + for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) { + while ((pp = ip_vs_proto_table[i]) != NULL) + unregister_ip_vs_protocol(pp); + } +} diff --git a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c new file mode 100644 index 00000000000..80ab0c8e5b4 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c @@ -0,0 +1,235 @@ +/* + * ip_vs_proto_ah_esp.c: AH/ESP IPSec load balancing support for IPVS + * + * Authors: Julian Anastasov , February 2002 + * Wensong Zhang + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation; + * + */ + +#include +#include +#include +#include +#include +#include + +#include + + +/* TODO: + +struct isakmp_hdr { + __u8 icookie[8]; + __u8 rcookie[8]; + __u8 np; + __u8 version; + __u8 xchgtype; + __u8 flags; + __u32 msgid; + __u32 length; +}; + +*/ + +#define PORT_ISAKMP 500 + + +static struct ip_vs_conn * +ah_esp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, + const struct ip_vs_iphdr *iph, unsigned int proto_off, + int inverse) +{ + struct ip_vs_conn *cp; + + if (likely(!inverse)) { + cp = ip_vs_conn_in_get(af, IPPROTO_UDP, + &iph->saddr, + htons(PORT_ISAKMP), + &iph->daddr, + htons(PORT_ISAKMP)); + } else { + cp = ip_vs_conn_in_get(af, IPPROTO_UDP, + &iph->daddr, + htons(PORT_ISAKMP), + &iph->saddr, + htons(PORT_ISAKMP)); + } + + if (!cp) { + /* + * We are not sure if the packet is from our + * service, so our conn_schedule hook should return NF_ACCEPT + */ + IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for outin packet " + "%s%s %s->%s\n", + inverse ? "ICMP+" : "", + pp->name, + IP_VS_DBG_ADDR(af, &iph->saddr), + IP_VS_DBG_ADDR(af, &iph->daddr)); + } + + return cp; +} + + +static struct ip_vs_conn * +ah_esp_conn_out_get(int af, const struct sk_buff *skb, + struct ip_vs_protocol *pp, + const struct ip_vs_iphdr *iph, + unsigned int proto_off, + int inverse) +{ + struct ip_vs_conn *cp; + + if (likely(!inverse)) { + cp = ip_vs_conn_out_get(af, IPPROTO_UDP, + &iph->saddr, + htons(PORT_ISAKMP), + &iph->daddr, + htons(PORT_ISAKMP)); + } else { + cp = ip_vs_conn_out_get(af, IPPROTO_UDP, + &iph->daddr, + htons(PORT_ISAKMP), + &iph->saddr, + htons(PORT_ISAKMP)); + } + + if (!cp) { + IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for inout packet " + "%s%s %s->%s\n", + inverse ? "ICMP+" : "", + pp->name, + IP_VS_DBG_ADDR(af, &iph->saddr), + IP_VS_DBG_ADDR(af, &iph->daddr)); + } + + return cp; +} + + +static int +ah_esp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, + int *verdict, struct ip_vs_conn **cpp) +{ + /* + * AH/ESP is only related traffic. Pass the packet to IP stack. + */ + *verdict = NF_ACCEPT; + return 0; +} + + +static void +ah_esp_debug_packet_v4(struct ip_vs_protocol *pp, const struct sk_buff *skb, + int offset, const char *msg) +{ + char buf[256]; + struct iphdr _iph, *ih; + + ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); + if (ih == NULL) + sprintf(buf, "%s TRUNCATED", pp->name); + else + sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u", + pp->name, NIPQUAD(ih->saddr), + NIPQUAD(ih->daddr)); + + printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf); +} + +#ifdef CONFIG_IP_VS_IPV6 +static void +ah_esp_debug_packet_v6(struct ip_vs_protocol *pp, const struct sk_buff *skb, + int offset, const char *msg) +{ + char buf[256]; + struct ipv6hdr _iph, *ih; + + ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); + if (ih == NULL) + sprintf(buf, "%s TRUNCATED", pp->name); + else + sprintf(buf, "%s " NIP6_FMT "->" NIP6_FMT, + pp->name, NIP6(ih->saddr), + NIP6(ih->daddr)); + + printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf); +} +#endif + +static void +ah_esp_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb, + int offset, const char *msg) +{ +#ifdef CONFIG_IP_VS_IPV6 + if (skb->protocol == htons(ETH_P_IPV6)) + ah_esp_debug_packet_v6(pp, skb, offset, msg); + else +#endif + ah_esp_debug_packet_v4(pp, skb, offset, msg); +} + + +static void ah_esp_init(struct ip_vs_protocol *pp) +{ + /* nothing to do now */ +} + + +static void ah_esp_exit(struct ip_vs_protocol *pp) +{ + /* nothing to do now */ +} + + +#ifdef CONFIG_IP_VS_PROTO_AH +struct ip_vs_protocol ip_vs_protocol_ah = { + .name = "AH", + .protocol = IPPROTO_AH, + .num_states = 1, + .dont_defrag = 1, + .init = ah_esp_init, + .exit = ah_esp_exit, + .conn_schedule = ah_esp_conn_schedule, + .conn_in_get = ah_esp_conn_in_get, + .conn_out_get = ah_esp_conn_out_get, + .snat_handler = NULL, + .dnat_handler = NULL, + .csum_check = NULL, + .state_transition = NULL, + .register_app = NULL, + .unregister_app = NULL, + .app_conn_bind = NULL, + .debug_packet = ah_esp_debug_packet, + .timeout_change = NULL, /* ISAKMP */ + .set_state_timeout = NULL, +}; +#endif + +#ifdef CONFIG_IP_VS_PROTO_ESP +struct ip_vs_protocol ip_vs_protocol_esp = { + .name = "ESP", + .protocol = IPPROTO_ESP, + .num_states = 1, + .dont_defrag = 1, + .init = ah_esp_init, + .exit = ah_esp_exit, + .conn_schedule = ah_esp_conn_schedule, + .conn_in_get = ah_esp_conn_in_get, + .conn_out_get = ah_esp_conn_out_get, + .snat_handler = NULL, + .dnat_handler = NULL, + .csum_check = NULL, + .state_transition = NULL, + .register_app = NULL, + .unregister_app = NULL, + .app_conn_bind = NULL, + .debug_packet = ah_esp_debug_packet, + .timeout_change = NULL, /* ISAKMP */ +}; +#endif diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c new file mode 100644 index 00000000000..dd4566ea2bf --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c @@ -0,0 +1,732 @@ +/* + * ip_vs_proto_tcp.c: TCP load balancing support for IPVS + * + * Authors: Wensong Zhang + * Julian Anastasov + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * + */ + +#include +#include +#include /* for tcphdr */ +#include +#include /* for csum_tcpudp_magic */ +#include +#include +#include + +#include + + +static struct ip_vs_conn * +tcp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, + const struct ip_vs_iphdr *iph, unsigned int proto_off, + int inverse) +{ + __be16 _ports[2], *pptr; + + pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); + if (pptr == NULL) + return NULL; + + if (likely(!inverse)) { + return ip_vs_conn_in_get(af, iph->protocol, + &iph->saddr, pptr[0], + &iph->daddr, pptr[1]); + } else { + return ip_vs_conn_in_get(af, iph->protocol, + &iph->daddr, pptr[1], + &iph->saddr, pptr[0]); + } +} + +static struct ip_vs_conn * +tcp_conn_out_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, + const struct ip_vs_iphdr *iph, unsigned int proto_off, + int inverse) +{ + __be16 _ports[2], *pptr; + + pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); + if (pptr == NULL) + return NULL; + + if (likely(!inverse)) { + return ip_vs_conn_out_get(af, iph->protocol, + &iph->saddr, pptr[0], + &iph->daddr, pptr[1]); + } else { + return ip_vs_conn_out_get(af, iph->protocol, + &iph->daddr, pptr[1], + &iph->saddr, pptr[0]); + } +} + + +static int +tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, + int *verdict, struct ip_vs_conn **cpp) +{ + struct ip_vs_service *svc; + struct tcphdr _tcph, *th; + struct ip_vs_iphdr iph; + + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); + + th = skb_header_pointer(skb, iph.len, sizeof(_tcph), &_tcph); + if (th == NULL) { + *verdict = NF_DROP; + return 0; + } + + if (th->syn && + (svc = ip_vs_service_get(af, skb->mark, iph.protocol, &iph.daddr, + th->dest))) { + if (ip_vs_todrop()) { + /* + * It seems that we are very loaded. + * We have to drop this packet :( + */ + ip_vs_service_put(svc); + *verdict = NF_DROP; + return 0; + } + + /* + * Let the virtual server select a real server for the + * incoming connection, and create a connection entry. + */ + *cpp = ip_vs_schedule(svc, skb); + if (!*cpp) { + *verdict = ip_vs_leave(svc, skb, pp); + return 0; + } + ip_vs_service_put(svc); + } + return 1; +} + + +static inline void +tcp_fast_csum_update(int af, struct tcphdr *tcph, + const union nf_inet_addr *oldip, + const union nf_inet_addr *newip, + __be16 oldport, __be16 newport) +{ +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + tcph->check = + csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6, + ip_vs_check_diff2(oldport, newport, + ~csum_unfold(tcph->check)))); + else +#endif + tcph->check = + csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, + ip_vs_check_diff2(oldport, newport, + ~csum_unfold(tcph->check)))); +} + + +static inline void +tcp_partial_csum_update(int af, struct tcphdr *tcph, + const union nf_inet_addr *oldip, + const union nf_inet_addr *newip, + __be16 oldlen, __be16 newlen) +{ +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + tcph->check = + csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6, + ip_vs_check_diff2(oldlen, newlen, + ~csum_unfold(tcph->check)))); + else +#endif + tcph->check = + csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, + ip_vs_check_diff2(oldlen, newlen, + ~csum_unfold(tcph->check)))); +} + + +static int +tcp_snat_handler(struct sk_buff *skb, + struct ip_vs_protocol *pp, struct ip_vs_conn *cp) +{ + struct tcphdr *tcph; + unsigned int tcphoff; + int oldlen; + +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + tcphoff = sizeof(struct ipv6hdr); + else +#endif + tcphoff = ip_hdrlen(skb); + oldlen = skb->len - tcphoff; + + /* csum_check requires unshared skb */ + if (!skb_make_writable(skb, tcphoff+sizeof(*tcph))) + return 0; + + if (unlikely(cp->app != NULL)) { + /* Some checks before mangling */ + if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) + return 0; + + /* Call application helper if needed */ + if (!ip_vs_app_pkt_out(cp, skb)) + return 0; + } + + tcph = (void *)skb_network_header(skb) + tcphoff; + tcph->source = cp->vport; + + /* Adjust TCP checksums */ + if (skb->ip_summed == CHECKSUM_PARTIAL) { + tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr, + htonl(oldlen), + htonl(skb->len - tcphoff)); + } else if (!cp->app) { + /* Only port and addr are changed, do fast csum update */ + tcp_fast_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr, + cp->dport, cp->vport); + if (skb->ip_summed == CHECKSUM_COMPLETE) + skb->ip_summed = CHECKSUM_NONE; + } else { + /* full checksum calculation */ + tcph->check = 0; + skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0); +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + tcph->check = csum_ipv6_magic(&cp->vaddr.in6, + &cp->caddr.in6, + skb->len - tcphoff, + cp->protocol, skb->csum); + else +#endif + tcph->check = csum_tcpudp_magic(cp->vaddr.ip, + cp->caddr.ip, + skb->len - tcphoff, + cp->protocol, + skb->csum); + + IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n", + pp->name, tcph->check, + (char*)&(tcph->check) - (char*)tcph); + } + return 1; +} + + +static int +tcp_dnat_handler(struct sk_buff *skb, + struct ip_vs_protocol *pp, struct ip_vs_conn *cp) +{ + struct tcphdr *tcph; + unsigned int tcphoff; + int oldlen; + +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + tcphoff = sizeof(struct ipv6hdr); + else +#endif + tcphoff = ip_hdrlen(skb); + oldlen = skb->len - tcphoff; + + /* csum_check requires unshared skb */ + if (!skb_make_writable(skb, tcphoff+sizeof(*tcph))) + return 0; + + if (unlikely(cp->app != NULL)) { + /* Some checks before mangling */ + if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) + return 0; + + /* + * Attempt ip_vs_app call. + * It will fix ip_vs_conn and iph ack_seq stuff + */ + if (!ip_vs_app_pkt_in(cp, skb)) + return 0; + } + + tcph = (void *)skb_network_header(skb) + tcphoff; + tcph->dest = cp->dport; + + /* + * Adjust TCP checksums + */ + if (skb->ip_summed == CHECKSUM_PARTIAL) { + tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr, + htonl(oldlen), + htonl(skb->len - tcphoff)); + } else if (!cp->app) { + /* Only port and addr are changed, do fast csum update */ + tcp_fast_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr, + cp->vport, cp->dport); + if (skb->ip_summed == CHECKSUM_COMPLETE) + skb->ip_summed = CHECKSUM_NONE; + } else { + /* full checksum calculation */ + tcph->check = 0; + skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0); +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + tcph->check = csum_ipv6_magic(&cp->caddr.in6, + &cp->daddr.in6, + skb->len - tcphoff, + cp->protocol, skb->csum); + else +#endif + tcph->check = csum_tcpudp_magic(cp->caddr.ip, + cp->daddr.ip, + skb->len - tcphoff, + cp->protocol, + skb->csum); + skb->ip_summed = CHECKSUM_UNNECESSARY; + } + return 1; +} + + +static int +tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) +{ + unsigned int tcphoff; + +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + tcphoff = sizeof(struct ipv6hdr); + else +#endif + tcphoff = ip_hdrlen(skb); + + switch (skb->ip_summed) { + case CHECKSUM_NONE: + skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0); + case CHECKSUM_COMPLETE: +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) { + if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr, + &ipv6_hdr(skb)->daddr, + skb->len - tcphoff, + ipv6_hdr(skb)->nexthdr, + skb->csum)) { + IP_VS_DBG_RL_PKT(0, pp, skb, 0, + "Failed checksum for"); + return 0; + } + } else +#endif + if (csum_tcpudp_magic(ip_hdr(skb)->saddr, + ip_hdr(skb)->daddr, + skb->len - tcphoff, + ip_hdr(skb)->protocol, + skb->csum)) { + IP_VS_DBG_RL_PKT(0, pp, skb, 0, + "Failed checksum for"); + return 0; + } + break; + default: + /* No need to checksum. */ + break; + } + + return 1; +} + + +#define TCP_DIR_INPUT 0 +#define TCP_DIR_OUTPUT 4 +#define TCP_DIR_INPUT_ONLY 8 + +static const int tcp_state_off[IP_VS_DIR_LAST] = { + [IP_VS_DIR_INPUT] = TCP_DIR_INPUT, + [IP_VS_DIR_OUTPUT] = TCP_DIR_OUTPUT, + [IP_VS_DIR_INPUT_ONLY] = TCP_DIR_INPUT_ONLY, +}; + +/* + * Timeout table[state] + */ +static int tcp_timeouts[IP_VS_TCP_S_LAST+1] = { + [IP_VS_TCP_S_NONE] = 2*HZ, + [IP_VS_TCP_S_ESTABLISHED] = 15*60*HZ, + [IP_VS_TCP_S_SYN_SENT] = 2*60*HZ, + [IP_VS_TCP_S_SYN_RECV] = 1*60*HZ, + [IP_VS_TCP_S_FIN_WAIT] = 2*60*HZ, + [IP_VS_TCP_S_TIME_WAIT] = 2*60*HZ, + [IP_VS_TCP_S_CLOSE] = 10*HZ, + [IP_VS_TCP_S_CLOSE_WAIT] = 60*HZ, + [IP_VS_TCP_S_LAST_ACK] = 30*HZ, + [IP_VS_TCP_S_LISTEN] = 2*60*HZ, + [IP_VS_TCP_S_SYNACK] = 120*HZ, + [IP_VS_TCP_S_LAST] = 2*HZ, +}; + +static char * tcp_state_name_table[IP_VS_TCP_S_LAST+1] = { + [IP_VS_TCP_S_NONE] = "NONE", + [IP_VS_TCP_S_ESTABLISHED] = "ESTABLISHED", + [IP_VS_TCP_S_SYN_SENT] = "SYN_SENT", + [IP_VS_TCP_S_SYN_RECV] = "SYN_RECV", + [IP_VS_TCP_S_FIN_WAIT] = "FIN_WAIT", + [IP_VS_TCP_S_TIME_WAIT] = "TIME_WAIT", + [IP_VS_TCP_S_CLOSE] = "CLOSE", + [IP_VS_TCP_S_CLOSE_WAIT] = "CLOSE_WAIT", + [IP_VS_TCP_S_LAST_ACK] = "LAST_ACK", + [IP_VS_TCP_S_LISTEN] = "LISTEN", + [IP_VS_TCP_S_SYNACK] = "SYNACK", + [IP_VS_TCP_S_LAST] = "BUG!", +}; + +#define sNO IP_VS_TCP_S_NONE +#define sES IP_VS_TCP_S_ESTABLISHED +#define sSS IP_VS_TCP_S_SYN_SENT +#define sSR IP_VS_TCP_S_SYN_RECV +#define sFW IP_VS_TCP_S_FIN_WAIT +#define sTW IP_VS_TCP_S_TIME_WAIT +#define sCL IP_VS_TCP_S_CLOSE +#define sCW IP_VS_TCP_S_CLOSE_WAIT +#define sLA IP_VS_TCP_S_LAST_ACK +#define sLI IP_VS_TCP_S_LISTEN +#define sSA IP_VS_TCP_S_SYNACK + +struct tcp_states_t { + int next_state[IP_VS_TCP_S_LAST]; +}; + +static const char * tcp_state_name(int state) +{ + if (state >= IP_VS_TCP_S_LAST) + return "ERR!"; + return tcp_state_name_table[state] ? tcp_state_name_table[state] : "?"; +} + +static struct tcp_states_t tcp_states [] = { +/* INPUT */ +/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ +/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }}, +/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }}, +/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, +/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }}, + +/* OUTPUT */ +/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ +/*syn*/ {{sSS, sES, sSS, sSR, sSS, sSS, sSS, sSS, sSS, sLI, sSR }}, +/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }}, +/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }}, +/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }}, + +/* INPUT-ONLY */ +/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ +/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }}, +/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }}, +/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, +/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, +}; + +static struct tcp_states_t tcp_states_dos [] = { +/* INPUT */ +/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ +/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }}, +/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sSA }}, +/*ack*/ {{sCL, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }}, +/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, + +/* OUTPUT */ +/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ +/*syn*/ {{sSS, sES, sSS, sSA, sSS, sSS, sSS, sSS, sSS, sLI, sSA }}, +/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }}, +/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }}, +/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }}, + +/* INPUT-ONLY */ +/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ +/*syn*/ {{sSA, sES, sES, sSR, sSA, sSA, sSA, sSA, sSA, sSA, sSA }}, +/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }}, +/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, +/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, +}; + +static struct tcp_states_t *tcp_state_table = tcp_states; + + +static void tcp_timeout_change(struct ip_vs_protocol *pp, int flags) +{ + int on = (flags & 1); /* secure_tcp */ + + /* + ** FIXME: change secure_tcp to independent sysctl var + ** or make it per-service or per-app because it is valid + ** for most if not for all of the applications. Something + ** like "capabilities" (flags) for each object. + */ + tcp_state_table = (on? tcp_states_dos : tcp_states); +} + +static int +tcp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to) +{ + return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_TCP_S_LAST, + tcp_state_name_table, sname, to); +} + +static inline int tcp_state_idx(struct tcphdr *th) +{ + if (th->rst) + return 3; + if (th->syn) + return 0; + if (th->fin) + return 1; + if (th->ack) + return 2; + return -1; +} + +static inline void +set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp, + int direction, struct tcphdr *th) +{ + int state_idx; + int new_state = IP_VS_TCP_S_CLOSE; + int state_off = tcp_state_off[direction]; + + /* + * Update state offset to INPUT_ONLY if necessary + * or delete NO_OUTPUT flag if output packet detected + */ + if (cp->flags & IP_VS_CONN_F_NOOUTPUT) { + if (state_off == TCP_DIR_OUTPUT) + cp->flags &= ~IP_VS_CONN_F_NOOUTPUT; + else + state_off = TCP_DIR_INPUT_ONLY; + } + + if ((state_idx = tcp_state_idx(th)) < 0) { + IP_VS_DBG(8, "tcp_state_idx=%d!!!\n", state_idx); + goto tcp_state_out; + } + + new_state = tcp_state_table[state_off+state_idx].next_state[cp->state]; + + tcp_state_out: + if (new_state != cp->state) { + struct ip_vs_dest *dest = cp->dest; + + IP_VS_DBG_BUF(8, "%s %s [%c%c%c%c] %s:%d->" + "%s:%d state: %s->%s conn->refcnt:%d\n", + pp->name, + ((state_off == TCP_DIR_OUTPUT) ? + "output " : "input "), + th->syn ? 'S' : '.', + th->fin ? 'F' : '.', + th->ack ? 'A' : '.', + th->rst ? 'R' : '.', + IP_VS_DBG_ADDR(cp->af, &cp->daddr), + ntohs(cp->dport), + IP_VS_DBG_ADDR(cp->af, &cp->caddr), + ntohs(cp->cport), + tcp_state_name(cp->state), + tcp_state_name(new_state), + atomic_read(&cp->refcnt)); + + if (dest) { + if (!(cp->flags & IP_VS_CONN_F_INACTIVE) && + (new_state != IP_VS_TCP_S_ESTABLISHED)) { + atomic_dec(&dest->activeconns); + atomic_inc(&dest->inactconns); + cp->flags |= IP_VS_CONN_F_INACTIVE; + } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) && + (new_state == IP_VS_TCP_S_ESTABLISHED)) { + atomic_inc(&dest->activeconns); + atomic_dec(&dest->inactconns); + cp->flags &= ~IP_VS_CONN_F_INACTIVE; + } + } + } + + cp->timeout = pp->timeout_table[cp->state = new_state]; +} + + +/* + * Handle state transitions + */ +static int +tcp_state_transition(struct ip_vs_conn *cp, int direction, + const struct sk_buff *skb, + struct ip_vs_protocol *pp) +{ + struct tcphdr _tcph, *th; + +#ifdef CONFIG_IP_VS_IPV6 + int ihl = cp->af == AF_INET ? ip_hdrlen(skb) : sizeof(struct ipv6hdr); +#else + int ihl = ip_hdrlen(skb); +#endif + + th = skb_header_pointer(skb, ihl, sizeof(_tcph), &_tcph); + if (th == NULL) + return 0; + + spin_lock(&cp->lock); + set_tcp_state(pp, cp, direction, th); + spin_unlock(&cp->lock); + + return 1; +} + + +/* + * Hash table for TCP application incarnations + */ +#define TCP_APP_TAB_BITS 4 +#define TCP_APP_TAB_SIZE (1 << TCP_APP_TAB_BITS) +#define TCP_APP_TAB_MASK (TCP_APP_TAB_SIZE - 1) + +static struct list_head tcp_apps[TCP_APP_TAB_SIZE]; +static DEFINE_SPINLOCK(tcp_app_lock); + +static inline __u16 tcp_app_hashkey(__be16 port) +{ + return (((__force u16)port >> TCP_APP_TAB_BITS) ^ (__force u16)port) + & TCP_APP_TAB_MASK; +} + + +static int tcp_register_app(struct ip_vs_app *inc) +{ + struct ip_vs_app *i; + __u16 hash; + __be16 port = inc->port; + int ret = 0; + + hash = tcp_app_hashkey(port); + + spin_lock_bh(&tcp_app_lock); + list_for_each_entry(i, &tcp_apps[hash], p_list) { + if (i->port == port) { + ret = -EEXIST; + goto out; + } + } + list_add(&inc->p_list, &tcp_apps[hash]); + atomic_inc(&ip_vs_protocol_tcp.appcnt); + + out: + spin_unlock_bh(&tcp_app_lock); + return ret; +} + + +static void +tcp_unregister_app(struct ip_vs_app *inc) +{ + spin_lock_bh(&tcp_app_lock); + atomic_dec(&ip_vs_protocol_tcp.appcnt); + list_del(&inc->p_list); + spin_unlock_bh(&tcp_app_lock); +} + + +static int +tcp_app_conn_bind(struct ip_vs_conn *cp) +{ + int hash; + struct ip_vs_app *inc; + int result = 0; + + /* Default binding: bind app only for NAT */ + if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) + return 0; + + /* Lookup application incarnations and bind the right one */ + hash = tcp_app_hashkey(cp->vport); + + spin_lock(&tcp_app_lock); + list_for_each_entry(inc, &tcp_apps[hash], p_list) { + if (inc->port == cp->vport) { + if (unlikely(!ip_vs_app_inc_get(inc))) + break; + spin_unlock(&tcp_app_lock); + + IP_VS_DBG_BUF(9, "%s: Binding conn %s:%u->" + "%s:%u to app %s on port %u\n", + __func__, + IP_VS_DBG_ADDR(cp->af, &cp->caddr), + ntohs(cp->cport), + IP_VS_DBG_ADDR(cp->af, &cp->vaddr), + ntohs(cp->vport), + inc->name, ntohs(inc->port)); + + cp->app = inc; + if (inc->init_conn) + result = inc->init_conn(inc, cp); + goto out; + } + } + spin_unlock(&tcp_app_lock); + + out: + return result; +} + + +/* + * Set LISTEN timeout. (ip_vs_conn_put will setup timer) + */ +void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp) +{ + spin_lock(&cp->lock); + cp->state = IP_VS_TCP_S_LISTEN; + cp->timeout = ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_LISTEN]; + spin_unlock(&cp->lock); +} + + +static void ip_vs_tcp_init(struct ip_vs_protocol *pp) +{ + IP_VS_INIT_HASH_TABLE(tcp_apps); + pp->timeout_table = tcp_timeouts; +} + + +static void ip_vs_tcp_exit(struct ip_vs_protocol *pp) +{ +} + + +struct ip_vs_protocol ip_vs_protocol_tcp = { + .name = "TCP", + .protocol = IPPROTO_TCP, + .num_states = IP_VS_TCP_S_LAST, + .dont_defrag = 0, + .appcnt = ATOMIC_INIT(0), + .init = ip_vs_tcp_init, + .exit = ip_vs_tcp_exit, + .register_app = tcp_register_app, + .unregister_app = tcp_unregister_app, + .conn_schedule = tcp_conn_schedule, + .conn_in_get = tcp_conn_in_get, + .conn_out_get = tcp_conn_out_get, + .snat_handler = tcp_snat_handler, + .dnat_handler = tcp_dnat_handler, + .csum_check = tcp_csum_check, + .state_name = tcp_state_name, + .state_transition = tcp_state_transition, + .app_conn_bind = tcp_app_conn_bind, + .debug_packet = ip_vs_tcpudp_debug_packet, + .timeout_change = tcp_timeout_change, + .set_state_timeout = tcp_set_state_timeout, +}; diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c new file mode 100644 index 00000000000..6eb6039d634 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_proto_udp.c @@ -0,0 +1,533 @@ +/* + * ip_vs_proto_udp.c: UDP load balancing support for IPVS + * + * Authors: Wensong Zhang + * Julian Anastasov + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +static struct ip_vs_conn * +udp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, + const struct ip_vs_iphdr *iph, unsigned int proto_off, + int inverse) +{ + struct ip_vs_conn *cp; + __be16 _ports[2], *pptr; + + pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); + if (pptr == NULL) + return NULL; + + if (likely(!inverse)) { + cp = ip_vs_conn_in_get(af, iph->protocol, + &iph->saddr, pptr[0], + &iph->daddr, pptr[1]); + } else { + cp = ip_vs_conn_in_get(af, iph->protocol, + &iph->daddr, pptr[1], + &iph->saddr, pptr[0]); + } + + return cp; +} + + +static struct ip_vs_conn * +udp_conn_out_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, + const struct ip_vs_iphdr *iph, unsigned int proto_off, + int inverse) +{ + struct ip_vs_conn *cp; + __be16 _ports[2], *pptr; + + pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); + if (pptr == NULL) + return NULL; + + if (likely(!inverse)) { + cp = ip_vs_conn_out_get(af, iph->protocol, + &iph->saddr, pptr[0], + &iph->daddr, pptr[1]); + } else { + cp = ip_vs_conn_out_get(af, iph->protocol, + &iph->daddr, pptr[1], + &iph->saddr, pptr[0]); + } + + return cp; +} + + +static int +udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, + int *verdict, struct ip_vs_conn **cpp) +{ + struct ip_vs_service *svc; + struct udphdr _udph, *uh; + struct ip_vs_iphdr iph; + + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); + + uh = skb_header_pointer(skb, iph.len, sizeof(_udph), &_udph); + if (uh == NULL) { + *verdict = NF_DROP; + return 0; + } + + svc = ip_vs_service_get(af, skb->mark, iph.protocol, + &iph.daddr, uh->dest); + if (svc) { + if (ip_vs_todrop()) { + /* + * It seems that we are very loaded. + * We have to drop this packet :( + */ + ip_vs_service_put(svc); + *verdict = NF_DROP; + return 0; + } + + /* + * Let the virtual server select a real server for the + * incoming connection, and create a connection entry. + */ + *cpp = ip_vs_schedule(svc, skb); + if (!*cpp) { + *verdict = ip_vs_leave(svc, skb, pp); + return 0; + } + ip_vs_service_put(svc); + } + return 1; +} + + +static inline void +udp_fast_csum_update(int af, struct udphdr *uhdr, + const union nf_inet_addr *oldip, + const union nf_inet_addr *newip, + __be16 oldport, __be16 newport) +{ +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + uhdr->check = + csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6, + ip_vs_check_diff2(oldport, newport, + ~csum_unfold(uhdr->check)))); + else +#endif + uhdr->check = + csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, + ip_vs_check_diff2(oldport, newport, + ~csum_unfold(uhdr->check)))); + if (!uhdr->check) + uhdr->check = CSUM_MANGLED_0; +} + +static inline void +udp_partial_csum_update(int af, struct udphdr *uhdr, + const union nf_inet_addr *oldip, + const union nf_inet_addr *newip, + __be16 oldlen, __be16 newlen) +{ +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + uhdr->check = + csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6, + ip_vs_check_diff2(oldlen, newlen, + ~csum_unfold(uhdr->check)))); + else +#endif + uhdr->check = + csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, + ip_vs_check_diff2(oldlen, newlen, + ~csum_unfold(uhdr->check)))); +} + + +static int +udp_snat_handler(struct sk_buff *skb, + struct ip_vs_protocol *pp, struct ip_vs_conn *cp) +{ + struct udphdr *udph; + unsigned int udphoff; + int oldlen; + +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + udphoff = sizeof(struct ipv6hdr); + else +#endif + udphoff = ip_hdrlen(skb); + oldlen = skb->len - udphoff; + + /* csum_check requires unshared skb */ + if (!skb_make_writable(skb, udphoff+sizeof(*udph))) + return 0; + + if (unlikely(cp->app != NULL)) { + /* Some checks before mangling */ + if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) + return 0; + + /* + * Call application helper if needed + */ + if (!ip_vs_app_pkt_out(cp, skb)) + return 0; + } + + udph = (void *)skb_network_header(skb) + udphoff; + udph->source = cp->vport; + + /* + * Adjust UDP checksums + */ + if (skb->ip_summed == CHECKSUM_PARTIAL) { + udp_partial_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr, + htonl(oldlen), + htonl(skb->len - udphoff)); + } else if (!cp->app && (udph->check != 0)) { + /* Only port and addr are changed, do fast csum update */ + udp_fast_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr, + cp->dport, cp->vport); + if (skb->ip_summed == CHECKSUM_COMPLETE) + skb->ip_summed = CHECKSUM_NONE; + } else { + /* full checksum calculation */ + udph->check = 0; + skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0); +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + udph->check = csum_ipv6_magic(&cp->vaddr.in6, + &cp->caddr.in6, + skb->len - udphoff, + cp->protocol, skb->csum); + else +#endif + udph->check = csum_tcpudp_magic(cp->vaddr.ip, + cp->caddr.ip, + skb->len - udphoff, + cp->protocol, + skb->csum); + if (udph->check == 0) + udph->check = CSUM_MANGLED_0; + IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n", + pp->name, udph->check, + (char*)&(udph->check) - (char*)udph); + } + return 1; +} + + +static int +udp_dnat_handler(struct sk_buff *skb, + struct ip_vs_protocol *pp, struct ip_vs_conn *cp) +{ + struct udphdr *udph; + unsigned int udphoff; + int oldlen; + +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + udphoff = sizeof(struct ipv6hdr); + else +#endif + udphoff = ip_hdrlen(skb); + oldlen = skb->len - udphoff; + + /* csum_check requires unshared skb */ + if (!skb_make_writable(skb, udphoff+sizeof(*udph))) + return 0; + + if (unlikely(cp->app != NULL)) { + /* Some checks before mangling */ + if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) + return 0; + + /* + * Attempt ip_vs_app call. + * It will fix ip_vs_conn + */ + if (!ip_vs_app_pkt_in(cp, skb)) + return 0; + } + + udph = (void *)skb_network_header(skb) + udphoff; + udph->dest = cp->dport; + + /* + * Adjust UDP checksums + */ + if (skb->ip_summed == CHECKSUM_PARTIAL) { + udp_partial_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr, + htonl(oldlen), + htonl(skb->len - udphoff)); + } else if (!cp->app && (udph->check != 0)) { + /* Only port and addr are changed, do fast csum update */ + udp_fast_csum_update(cp->af, udph, &cp->vaddr, &cp->daddr, + cp->vport, cp->dport); + if (skb->ip_summed == CHECKSUM_COMPLETE) + skb->ip_summed = CHECKSUM_NONE; + } else { + /* full checksum calculation */ + udph->check = 0; + skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0); +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6) + udph->check = csum_ipv6_magic(&cp->caddr.in6, + &cp->daddr.in6, + skb->len - udphoff, + cp->protocol, skb->csum); + else +#endif + udph->check = csum_tcpudp_magic(cp->caddr.ip, + cp->daddr.ip, + skb->len - udphoff, + cp->protocol, + skb->csum); + if (udph->check == 0) + udph->check = CSUM_MANGLED_0; + skb->ip_summed = CHECKSUM_UNNECESSARY; + } + return 1; +} + + +static int +udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) +{ + struct udphdr _udph, *uh; + unsigned int udphoff; + +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) + udphoff = sizeof(struct ipv6hdr); + else +#endif + udphoff = ip_hdrlen(skb); + + uh = skb_header_pointer(skb, udphoff, sizeof(_udph), &_udph); + if (uh == NULL) + return 0; + + if (uh->check != 0) { + switch (skb->ip_summed) { + case CHECKSUM_NONE: + skb->csum = skb_checksum(skb, udphoff, + skb->len - udphoff, 0); + case CHECKSUM_COMPLETE: +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) { + if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr, + &ipv6_hdr(skb)->daddr, + skb->len - udphoff, + ipv6_hdr(skb)->nexthdr, + skb->csum)) { + IP_VS_DBG_RL_PKT(0, pp, skb, 0, + "Failed checksum for"); + return 0; + } + } else +#endif + if (csum_tcpudp_magic(ip_hdr(skb)->saddr, + ip_hdr(skb)->daddr, + skb->len - udphoff, + ip_hdr(skb)->protocol, + skb->csum)) { + IP_VS_DBG_RL_PKT(0, pp, skb, 0, + "Failed checksum for"); + return 0; + } + break; + default: + /* No need to checksum. */ + break; + } + } + return 1; +} + + +/* + * Note: the caller guarantees that only one of register_app, + * unregister_app or app_conn_bind is called each time. + */ + +#define UDP_APP_TAB_BITS 4 +#define UDP_APP_TAB_SIZE (1 << UDP_APP_TAB_BITS) +#define UDP_APP_TAB_MASK (UDP_APP_TAB_SIZE - 1) + +static struct list_head udp_apps[UDP_APP_TAB_SIZE]; +static DEFINE_SPINLOCK(udp_app_lock); + +static inline __u16 udp_app_hashkey(__be16 port) +{ + return (((__force u16)port >> UDP_APP_TAB_BITS) ^ (__force u16)port) + & UDP_APP_TAB_MASK; +} + + +static int udp_register_app(struct ip_vs_app *inc) +{ + struct ip_vs_app *i; + __u16 hash; + __be16 port = inc->port; + int ret = 0; + + hash = udp_app_hashkey(port); + + + spin_lock_bh(&udp_app_lock); + list_for_each_entry(i, &udp_apps[hash], p_list) { + if (i->port == port) { + ret = -EEXIST; + goto out; + } + } + list_add(&inc->p_list, &udp_apps[hash]); + atomic_inc(&ip_vs_protocol_udp.appcnt); + + out: + spin_unlock_bh(&udp_app_lock); + return ret; +} + + +static void +udp_unregister_app(struct ip_vs_app *inc) +{ + spin_lock_bh(&udp_app_lock); + atomic_dec(&ip_vs_protocol_udp.appcnt); + list_del(&inc->p_list); + spin_unlock_bh(&udp_app_lock); +} + + +static int udp_app_conn_bind(struct ip_vs_conn *cp) +{ + int hash; + struct ip_vs_app *inc; + int result = 0; + + /* Default binding: bind app only for NAT */ + if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) + return 0; + + /* Lookup application incarnations and bind the right one */ + hash = udp_app_hashkey(cp->vport); + + spin_lock(&udp_app_lock); + list_for_each_entry(inc, &udp_apps[hash], p_list) { + if (inc->port == cp->vport) { + if (unlikely(!ip_vs_app_inc_get(inc))) + break; + spin_unlock(&udp_app_lock); + + IP_VS_DBG_BUF(9, "%s: Binding conn %s:%u->" + "%s:%u to app %s on port %u\n", + __func__, + IP_VS_DBG_ADDR(cp->af, &cp->caddr), + ntohs(cp->cport), + IP_VS_DBG_ADDR(cp->af, &cp->vaddr), + ntohs(cp->vport), + inc->name, ntohs(inc->port)); + + cp->app = inc; + if (inc->init_conn) + result = inc->init_conn(inc, cp); + goto out; + } + } + spin_unlock(&udp_app_lock); + + out: + return result; +} + + +static int udp_timeouts[IP_VS_UDP_S_LAST+1] = { + [IP_VS_UDP_S_NORMAL] = 5*60*HZ, + [IP_VS_UDP_S_LAST] = 2*HZ, +}; + +static char * udp_state_name_table[IP_VS_UDP_S_LAST+1] = { + [IP_VS_UDP_S_NORMAL] = "UDP", + [IP_VS_UDP_S_LAST] = "BUG!", +}; + + +static int +udp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to) +{ + return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_UDP_S_LAST, + udp_state_name_table, sname, to); +} + +static const char * udp_state_name(int state) +{ + if (state >= IP_VS_UDP_S_LAST) + return "ERR!"; + return udp_state_name_table[state] ? udp_state_name_table[state] : "?"; +} + +static int +udp_state_transition(struct ip_vs_conn *cp, int direction, + const struct sk_buff *skb, + struct ip_vs_protocol *pp) +{ + cp->timeout = pp->timeout_table[IP_VS_UDP_S_NORMAL]; + return 1; +} + +static void udp_init(struct ip_vs_protocol *pp) +{ + IP_VS_INIT_HASH_TABLE(udp_apps); + pp->timeout_table = udp_timeouts; +} + +static void udp_exit(struct ip_vs_protocol *pp) +{ +} + + +struct ip_vs_protocol ip_vs_protocol_udp = { + .name = "UDP", + .protocol = IPPROTO_UDP, + .num_states = IP_VS_UDP_S_LAST, + .dont_defrag = 0, + .init = udp_init, + .exit = udp_exit, + .conn_schedule = udp_conn_schedule, + .conn_in_get = udp_conn_in_get, + .conn_out_get = udp_conn_out_get, + .snat_handler = udp_snat_handler, + .dnat_handler = udp_dnat_handler, + .csum_check = udp_csum_check, + .state_transition = udp_state_transition, + .state_name = udp_state_name, + .register_app = udp_register_app, + .unregister_app = udp_unregister_app, + .app_conn_bind = udp_app_conn_bind, + .debug_packet = ip_vs_tcpudp_debug_packet, + .timeout_change = NULL, + .set_state_timeout = udp_set_state_timeout, +}; diff --git a/net/netfilter/ipvs/ip_vs_rr.c b/net/netfilter/ipvs/ip_vs_rr.c new file mode 100644 index 00000000000..a22195f68ac --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_rr.c @@ -0,0 +1,112 @@ +/* + * IPVS: Round-Robin Scheduling module + * + * Authors: Wensong Zhang + * Peter Kese + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Fixes/Changes: + * Wensong Zhang : changed the ip_vs_rr_schedule to return dest + * Julian Anastasov : fixed the NULL pointer access bug in debugging + * Wensong Zhang : changed some comestics things for debugging + * Wensong Zhang : changed for the d-linked destination list + * Wensong Zhang : added the ip_vs_rr_update_svc + * Wensong Zhang : added any dest with weight=0 is quiesced + * + */ + +#include +#include + +#include + + +static int ip_vs_rr_init_svc(struct ip_vs_service *svc) +{ + svc->sched_data = &svc->destinations; + return 0; +} + + +static int ip_vs_rr_update_svc(struct ip_vs_service *svc) +{ + svc->sched_data = &svc->destinations; + return 0; +} + + +/* + * Round-Robin Scheduling + */ +static struct ip_vs_dest * +ip_vs_rr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +{ + struct list_head *p, *q; + struct ip_vs_dest *dest; + + IP_VS_DBG(6, "ip_vs_rr_schedule(): Scheduling...\n"); + + write_lock(&svc->sched_lock); + p = (struct list_head *)svc->sched_data; + p = p->next; + q = p; + do { + /* skip list head */ + if (q == &svc->destinations) { + q = q->next; + continue; + } + + dest = list_entry(q, struct ip_vs_dest, n_list); + if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && + atomic_read(&dest->weight) > 0) + /* HIT */ + goto out; + q = q->next; + } while (q != p); + write_unlock(&svc->sched_lock); + return NULL; + + out: + svc->sched_data = q; + write_unlock(&svc->sched_lock); + IP_VS_DBG_BUF(6, "RR: server %s:%u " + "activeconns %d refcnt %d weight %d\n", + IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port), + atomic_read(&dest->activeconns), + atomic_read(&dest->refcnt), atomic_read(&dest->weight)); + + return dest; +} + + +static struct ip_vs_scheduler ip_vs_rr_scheduler = { + .name = "rr", /* name */ + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_rr_scheduler.n_list), +#ifdef CONFIG_IP_VS_IPV6 + .supports_ipv6 = 1, +#endif + .init_service = ip_vs_rr_init_svc, + .update_service = ip_vs_rr_update_svc, + .schedule = ip_vs_rr_schedule, +}; + +static int __init ip_vs_rr_init(void) +{ + return register_ip_vs_scheduler(&ip_vs_rr_scheduler); +} + +static void __exit ip_vs_rr_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_rr_scheduler); +} + +module_init(ip_vs_rr_init); +module_exit(ip_vs_rr_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_sched.c b/net/netfilter/ipvs/ip_vs_sched.c new file mode 100644 index 00000000000..a46ad9e3501 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_sched.c @@ -0,0 +1,251 @@ +/* + * IPVS An implementation of the IP virtual server support for the + * LINUX operating system. IPVS is now implemented as a module + * over the Netfilter framework. IPVS can be used to build a + * high-performance and highly available server based on a + * cluster of servers. + * + * Authors: Wensong Zhang + * Peter Kese + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * + */ + +#include +#include +#include +#include +#include +#include + +#include + +/* + * IPVS scheduler list + */ +static LIST_HEAD(ip_vs_schedulers); + +/* lock for service table */ +static DEFINE_RWLOCK(__ip_vs_sched_lock); + + +/* + * Bind a service with a scheduler + */ +int ip_vs_bind_scheduler(struct ip_vs_service *svc, + struct ip_vs_scheduler *scheduler) +{ + int ret; + + if (svc == NULL) { + IP_VS_ERR("ip_vs_bind_scheduler(): svc arg NULL\n"); + return -EINVAL; + } + if (scheduler == NULL) { + IP_VS_ERR("ip_vs_bind_scheduler(): scheduler arg NULL\n"); + return -EINVAL; + } + + svc->scheduler = scheduler; + + if (scheduler->init_service) { + ret = scheduler->init_service(svc); + if (ret) { + IP_VS_ERR("ip_vs_bind_scheduler(): init error\n"); + return ret; + } + } + + return 0; +} + + +/* + * Unbind a service with its scheduler + */ +int ip_vs_unbind_scheduler(struct ip_vs_service *svc) +{ + struct ip_vs_scheduler *sched; + + if (svc == NULL) { + IP_VS_ERR("ip_vs_unbind_scheduler(): svc arg NULL\n"); + return -EINVAL; + } + + sched = svc->scheduler; + if (sched == NULL) { + IP_VS_ERR("ip_vs_unbind_scheduler(): svc isn't bound\n"); + return -EINVAL; + } + + if (sched->done_service) { + if (sched->done_service(svc) != 0) { + IP_VS_ERR("ip_vs_unbind_scheduler(): done error\n"); + return -EINVAL; + } + } + + svc->scheduler = NULL; + return 0; +} + + +/* + * Get scheduler in the scheduler list by name + */ +static struct ip_vs_scheduler *ip_vs_sched_getbyname(const char *sched_name) +{ + struct ip_vs_scheduler *sched; + + IP_VS_DBG(2, "ip_vs_sched_getbyname(): sched_name \"%s\"\n", + sched_name); + + read_lock_bh(&__ip_vs_sched_lock); + + list_for_each_entry(sched, &ip_vs_schedulers, n_list) { + /* + * Test and get the modules atomically + */ + if (sched->module && !try_module_get(sched->module)) { + /* + * This scheduler is just deleted + */ + continue; + } + if (strcmp(sched_name, sched->name)==0) { + /* HIT */ + read_unlock_bh(&__ip_vs_sched_lock); + return sched; + } + if (sched->module) + module_put(sched->module); + } + + read_unlock_bh(&__ip_vs_sched_lock); + return NULL; +} + + +/* + * Lookup scheduler and try to load it if it doesn't exist + */ +struct ip_vs_scheduler *ip_vs_scheduler_get(const char *sched_name) +{ + struct ip_vs_scheduler *sched; + + /* + * Search for the scheduler by sched_name + */ + sched = ip_vs_sched_getbyname(sched_name); + + /* + * If scheduler not found, load the module and search again + */ + if (sched == NULL) { + request_module("ip_vs_%s", sched_name); + sched = ip_vs_sched_getbyname(sched_name); + } + + return sched; +} + +void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler) +{ + if (scheduler->module) + module_put(scheduler->module); +} + + +/* + * Register a scheduler in the scheduler list + */ +int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) +{ + struct ip_vs_scheduler *sched; + + if (!scheduler) { + IP_VS_ERR("register_ip_vs_scheduler(): NULL arg\n"); + return -EINVAL; + } + + if (!scheduler->name) { + IP_VS_ERR("register_ip_vs_scheduler(): NULL scheduler_name\n"); + return -EINVAL; + } + + /* increase the module use count */ + ip_vs_use_count_inc(); + + write_lock_bh(&__ip_vs_sched_lock); + + if (!list_empty(&scheduler->n_list)) { + write_unlock_bh(&__ip_vs_sched_lock); + ip_vs_use_count_dec(); + IP_VS_ERR("register_ip_vs_scheduler(): [%s] scheduler " + "already linked\n", scheduler->name); + return -EINVAL; + } + + /* + * Make sure that the scheduler with this name doesn't exist + * in the scheduler list. + */ + list_for_each_entry(sched, &ip_vs_schedulers, n_list) { + if (strcmp(scheduler->name, sched->name) == 0) { + write_unlock_bh(&__ip_vs_sched_lock); + ip_vs_use_count_dec(); + IP_VS_ERR("register_ip_vs_scheduler(): [%s] scheduler " + "already existed in the system\n", + scheduler->name); + return -EINVAL; + } + } + /* + * Add it into the d-linked scheduler list + */ + list_add(&scheduler->n_list, &ip_vs_schedulers); + write_unlock_bh(&__ip_vs_sched_lock); + + IP_VS_INFO("[%s] scheduler registered.\n", scheduler->name); + + return 0; +} + + +/* + * Unregister a scheduler from the scheduler list + */ +int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) +{ + if (!scheduler) { + IP_VS_ERR( "unregister_ip_vs_scheduler(): NULL arg\n"); + return -EINVAL; + } + + write_lock_bh(&__ip_vs_sched_lock); + if (list_empty(&scheduler->n_list)) { + write_unlock_bh(&__ip_vs_sched_lock); + IP_VS_ERR("unregister_ip_vs_scheduler(): [%s] scheduler " + "is not in the list. failed\n", scheduler->name); + return -EINVAL; + } + + /* + * Remove it from the d-linked scheduler list + */ + list_del(&scheduler->n_list); + write_unlock_bh(&__ip_vs_sched_lock); + + /* decrease the module use count */ + ip_vs_use_count_dec(); + + IP_VS_INFO("[%s] scheduler unregistered.\n", scheduler->name); + + return 0; +} diff --git a/net/netfilter/ipvs/ip_vs_sed.c b/net/netfilter/ipvs/ip_vs_sed.c new file mode 100644 index 00000000000..7d2f22f04b8 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_sed.c @@ -0,0 +1,140 @@ +/* + * IPVS: Shortest Expected Delay scheduling module + * + * Authors: Wensong Zhang + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * + */ + +/* + * The SED algorithm attempts to minimize each job's expected delay until + * completion. The expected delay that the job will experience is + * (Ci + 1) / Ui if sent to the ith server, in which Ci is the number of + * jobs on the ith server and Ui is the fixed service rate (weight) of + * the ith server. The SED algorithm adopts a greedy policy that each does + * what is in its own best interest, i.e. to join the queue which would + * minimize its expected delay of completion. + * + * See the following paper for more information: + * A. Weinrib and S. Shenker, Greed is not enough: Adaptive load sharing + * in large heterogeneous systems. In Proceedings IEEE INFOCOM'88, + * pages 986-994, 1988. + * + * Thanks must go to Marko Buuri for talking SED to me. + * + * The difference between SED and WLC is that SED includes the incoming + * job in the cost function (the increment of 1). SED may outperform + * WLC, while scheduling big jobs under larger heterogeneous systems + * (the server weight varies a lot). + * + */ + +#include +#include + +#include + + +static inline unsigned int +ip_vs_sed_dest_overhead(struct ip_vs_dest *dest) +{ + /* + * We only use the active connection number in the cost + * calculation here. + */ + return atomic_read(&dest->activeconns) + 1; +} + + +/* + * Weighted Least Connection scheduling + */ +static struct ip_vs_dest * +ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +{ + struct ip_vs_dest *dest, *least; + unsigned int loh, doh; + + IP_VS_DBG(6, "ip_vs_sed_schedule(): Scheduling...\n"); + + /* + * We calculate the load of each dest server as follows: + * (server expected overhead) / dest->weight + * + * Remember -- no floats in kernel mode!!! + * The comparison of h1*w2 > h2*w1 is equivalent to that of + * h1/w1 > h2/w2 + * if every weight is larger than zero. + * + * The server with weight=0 is quiesced and will not receive any + * new connections. + */ + + list_for_each_entry(dest, &svc->destinations, n_list) { + if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && + atomic_read(&dest->weight) > 0) { + least = dest; + loh = ip_vs_sed_dest_overhead(least); + goto nextstage; + } + } + return NULL; + + /* + * Find the destination with the least load. + */ + nextstage: + list_for_each_entry_continue(dest, &svc->destinations, n_list) { + if (dest->flags & IP_VS_DEST_F_OVERLOAD) + continue; + doh = ip_vs_sed_dest_overhead(dest); + if (loh * atomic_read(&dest->weight) > + doh * atomic_read(&least->weight)) { + least = dest; + loh = doh; + } + } + + IP_VS_DBG_BUF(6, "SED: server %s:%u " + "activeconns %d refcnt %d weight %d overhead %d\n", + IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port), + atomic_read(&least->activeconns), + atomic_read(&least->refcnt), + atomic_read(&least->weight), loh); + + return least; +} + + +static struct ip_vs_scheduler ip_vs_sed_scheduler = +{ + .name = "sed", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_sed_scheduler.n_list), +#ifdef CONFIG_IP_VS_IPV6 + .supports_ipv6 = 1, +#endif + .schedule = ip_vs_sed_schedule, +}; + + +static int __init ip_vs_sed_init(void) +{ + return register_ip_vs_scheduler(&ip_vs_sed_scheduler); +} + +static void __exit ip_vs_sed_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_sed_scheduler); +} + +module_init(ip_vs_sed_init); +module_exit(ip_vs_sed_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c new file mode 100644 index 00000000000..1d96de27fef --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_sh.c @@ -0,0 +1,258 @@ +/* + * IPVS: Source Hashing scheduling module + * + * Authors: Wensong Zhang + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * + */ + +/* + * The sh algorithm is to select server by the hash key of source IP + * address. The pseudo code is as follows: + * + * n <- servernode[src_ip]; + * if (n is dead) OR + * (n is overloaded) or (n.weight <= 0) then + * return NULL; + * + * return n; + * + * Notes that servernode is a 256-bucket hash table that maps the hash + * index derived from packet source IP address to the current server + * array. If the sh scheduler is used in cache cluster, it is good to + * combine it with cache_bypass feature. When the statically assigned + * server is dead or overloaded, the load balancer can bypass the cache + * server and send requests to the original server directly. + * + */ + +#include +#include +#include +#include + +#include + + +/* + * IPVS SH bucket + */ +struct ip_vs_sh_bucket { + struct ip_vs_dest *dest; /* real server (cache) */ +}; + +/* + * for IPVS SH entry hash table + */ +#ifndef CONFIG_IP_VS_SH_TAB_BITS +#define CONFIG_IP_VS_SH_TAB_BITS 8 +#endif +#define IP_VS_SH_TAB_BITS CONFIG_IP_VS_SH_TAB_BITS +#define IP_VS_SH_TAB_SIZE (1 << IP_VS_SH_TAB_BITS) +#define IP_VS_SH_TAB_MASK (IP_VS_SH_TAB_SIZE - 1) + + +/* + * Returns hash value for IPVS SH entry + */ +static inline unsigned ip_vs_sh_hashkey(__be32 addr) +{ + return (ntohl(addr)*2654435761UL) & IP_VS_SH_TAB_MASK; +} + + +/* + * Get ip_vs_dest associated with supplied parameters. + */ +static inline struct ip_vs_dest * +ip_vs_sh_get(struct ip_vs_sh_bucket *tbl, __be32 addr) +{ + return (tbl[ip_vs_sh_hashkey(addr)]).dest; +} + + +/* + * Assign all the hash buckets of the specified table with the service. + */ +static int +ip_vs_sh_assign(struct ip_vs_sh_bucket *tbl, struct ip_vs_service *svc) +{ + int i; + struct ip_vs_sh_bucket *b; + struct list_head *p; + struct ip_vs_dest *dest; + + b = tbl; + p = &svc->destinations; + for (i=0; idest = NULL; + } else { + if (p == &svc->destinations) + p = p->next; + + dest = list_entry(p, struct ip_vs_dest, n_list); + atomic_inc(&dest->refcnt); + b->dest = dest; + + p = p->next; + } + b++; + } + return 0; +} + + +/* + * Flush all the hash buckets of the specified table. + */ +static void ip_vs_sh_flush(struct ip_vs_sh_bucket *tbl) +{ + int i; + struct ip_vs_sh_bucket *b; + + b = tbl; + for (i=0; idest) { + atomic_dec(&b->dest->refcnt); + b->dest = NULL; + } + b++; + } +} + + +static int ip_vs_sh_init_svc(struct ip_vs_service *svc) +{ + struct ip_vs_sh_bucket *tbl; + + /* allocate the SH table for this service */ + tbl = kmalloc(sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE, + GFP_ATOMIC); + if (tbl == NULL) { + IP_VS_ERR("ip_vs_sh_init_svc(): no memory\n"); + return -ENOMEM; + } + svc->sched_data = tbl; + IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) allocated for " + "current service\n", + sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE); + + /* assign the hash buckets with the updated service */ + ip_vs_sh_assign(tbl, svc); + + return 0; +} + + +static int ip_vs_sh_done_svc(struct ip_vs_service *svc) +{ + struct ip_vs_sh_bucket *tbl = svc->sched_data; + + /* got to clean up hash buckets here */ + ip_vs_sh_flush(tbl); + + /* release the table itself */ + kfree(svc->sched_data); + IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) released\n", + sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE); + + return 0; +} + + +static int ip_vs_sh_update_svc(struct ip_vs_service *svc) +{ + struct ip_vs_sh_bucket *tbl = svc->sched_data; + + /* got to clean up hash buckets here */ + ip_vs_sh_flush(tbl); + + /* assign the hash buckets with the updated service */ + ip_vs_sh_assign(tbl, svc); + + return 0; +} + + +/* + * If the dest flags is set with IP_VS_DEST_F_OVERLOAD, + * consider that the server is overloaded here. + */ +static inline int is_overloaded(struct ip_vs_dest *dest) +{ + return dest->flags & IP_VS_DEST_F_OVERLOAD; +} + + +/* + * Source Hashing scheduling + */ +static struct ip_vs_dest * +ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +{ + struct ip_vs_dest *dest; + struct ip_vs_sh_bucket *tbl; + struct iphdr *iph = ip_hdr(skb); + + IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n"); + + tbl = (struct ip_vs_sh_bucket *)svc->sched_data; + dest = ip_vs_sh_get(tbl, iph->saddr); + if (!dest + || !(dest->flags & IP_VS_DEST_F_AVAILABLE) + || atomic_read(&dest->weight) <= 0 + || is_overloaded(dest)) { + return NULL; + } + + IP_VS_DBG(6, "SH: source IP address %u.%u.%u.%u " + "--> server %u.%u.%u.%u:%d\n", + NIPQUAD(iph->saddr), + NIPQUAD(dest->addr.ip), + ntohs(dest->port)); + + return dest; +} + + +/* + * IPVS SH Scheduler structure + */ +static struct ip_vs_scheduler ip_vs_sh_scheduler = +{ + .name = "sh", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_sh_scheduler.n_list), +#ifdef CONFIG_IP_VS_IPV6 + .supports_ipv6 = 0, +#endif + .init_service = ip_vs_sh_init_svc, + .done_service = ip_vs_sh_done_svc, + .update_service = ip_vs_sh_update_svc, + .schedule = ip_vs_sh_schedule, +}; + + +static int __init ip_vs_sh_init(void) +{ + return register_ip_vs_scheduler(&ip_vs_sh_scheduler); +} + + +static void __exit ip_vs_sh_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_sh_scheduler); +} + + +module_init(ip_vs_sh_init); +module_exit(ip_vs_sh_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c new file mode 100644 index 00000000000..de5e7e118ee --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -0,0 +1,942 @@ +/* + * IPVS An implementation of the IP virtual server support for the + * LINUX operating system. IPVS is now implemented as a module + * over the NetFilter framework. IPVS can be used to build a + * high-performance and highly available server based on a + * cluster of servers. + * + * Authors: Wensong Zhang + * + * ip_vs_sync: sync connection info from master load balancer to backups + * through multicast + * + * Changes: + * Alexandre Cassen : Added master & backup support at a time. + * Alexandre Cassen : Added SyncID support for incoming sync + * messages filtering. + * Justin Ossevoort : Fix endian problem on sync message size. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include /* for ip_mc_join_group */ +#include +#include +#include +#include +#include + +#include +#include + +#include + +#define IP_VS_SYNC_GROUP 0xe0000051 /* multicast addr - 224.0.0.81 */ +#define IP_VS_SYNC_PORT 8848 /* multicast port */ + + +/* + * IPVS sync connection entry + */ +struct ip_vs_sync_conn { + __u8 reserved; + + /* Protocol, addresses and port numbers */ + __u8 protocol; /* Which protocol (TCP/UDP) */ + __be16 cport; + __be16 vport; + __be16 dport; + __be32 caddr; /* client address */ + __be32 vaddr; /* virtual address */ + __be32 daddr; /* destination address */ + + /* Flags and state transition */ + __be16 flags; /* status flags */ + __be16 state; /* state info */ + + /* The sequence options start here */ +}; + +struct ip_vs_sync_conn_options { + struct ip_vs_seq in_seq; /* incoming seq. struct */ + struct ip_vs_seq out_seq; /* outgoing seq. struct */ +}; + +struct ip_vs_sync_thread_data { + struct socket *sock; + char *buf; +}; + +#define SIMPLE_CONN_SIZE (sizeof(struct ip_vs_sync_conn)) +#define FULL_CONN_SIZE \ +(sizeof(struct ip_vs_sync_conn) + sizeof(struct ip_vs_sync_conn_options)) + + +/* + The master mulitcasts messages to the backup load balancers in the + following format. + + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Count Conns | SyncID | Size | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | | + | IPVS Sync Connection (1) | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | . | + | . | + | . | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | | + | IPVS Sync Connection (n) | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +*/ + +#define SYNC_MESG_HEADER_LEN 4 +#define MAX_CONNS_PER_SYNCBUFF 255 /* nr_conns in ip_vs_sync_mesg is 8 bit */ + +struct ip_vs_sync_mesg { + __u8 nr_conns; + __u8 syncid; + __u16 size; + + /* ip_vs_sync_conn entries start here */ +}; + +/* the maximum length of sync (sending/receiving) message */ +static int sync_send_mesg_maxlen; +static int sync_recv_mesg_maxlen; + +struct ip_vs_sync_buff { + struct list_head list; + unsigned long firstuse; + + /* pointers for the message data */ + struct ip_vs_sync_mesg *mesg; + unsigned char *head; + unsigned char *end; +}; + + +/* the sync_buff list head and the lock */ +static LIST_HEAD(ip_vs_sync_queue); +static DEFINE_SPINLOCK(ip_vs_sync_lock); + +/* current sync_buff for accepting new conn entries */ +static struct ip_vs_sync_buff *curr_sb = NULL; +static DEFINE_SPINLOCK(curr_sb_lock); + +/* ipvs sync daemon state */ +volatile int ip_vs_sync_state = IP_VS_STATE_NONE; +volatile int ip_vs_master_syncid = 0; +volatile int ip_vs_backup_syncid = 0; + +/* multicast interface name */ +char ip_vs_master_mcast_ifn[IP_VS_IFNAME_MAXLEN]; +char ip_vs_backup_mcast_ifn[IP_VS_IFNAME_MAXLEN]; + +/* sync daemon tasks */ +static struct task_struct *sync_master_thread; +static struct task_struct *sync_backup_thread; + +/* multicast addr */ +static struct sockaddr_in mcast_addr = { + .sin_family = AF_INET, + .sin_port = __constant_htons(IP_VS_SYNC_PORT), + .sin_addr.s_addr = __constant_htonl(IP_VS_SYNC_GROUP), +}; + + +static inline struct ip_vs_sync_buff *sb_dequeue(void) +{ + struct ip_vs_sync_buff *sb; + + spin_lock_bh(&ip_vs_sync_lock); + if (list_empty(&ip_vs_sync_queue)) { + sb = NULL; + } else { + sb = list_entry(ip_vs_sync_queue.next, + struct ip_vs_sync_buff, + list); + list_del(&sb->list); + } + spin_unlock_bh(&ip_vs_sync_lock); + + return sb; +} + +static inline struct ip_vs_sync_buff * ip_vs_sync_buff_create(void) +{ + struct ip_vs_sync_buff *sb; + + if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC))) + return NULL; + + if (!(sb->mesg=kmalloc(sync_send_mesg_maxlen, GFP_ATOMIC))) { + kfree(sb); + return NULL; + } + sb->mesg->nr_conns = 0; + sb->mesg->syncid = ip_vs_master_syncid; + sb->mesg->size = 4; + sb->head = (unsigned char *)sb->mesg + 4; + sb->end = (unsigned char *)sb->mesg + sync_send_mesg_maxlen; + sb->firstuse = jiffies; + return sb; +} + +static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff *sb) +{ + kfree(sb->mesg); + kfree(sb); +} + +static inline void sb_queue_tail(struct ip_vs_sync_buff *sb) +{ + spin_lock(&ip_vs_sync_lock); + if (ip_vs_sync_state & IP_VS_STATE_MASTER) + list_add_tail(&sb->list, &ip_vs_sync_queue); + else + ip_vs_sync_buff_release(sb); + spin_unlock(&ip_vs_sync_lock); +} + +/* + * Get the current sync buffer if it has been created for more + * than the specified time or the specified time is zero. + */ +static inline struct ip_vs_sync_buff * +get_curr_sync_buff(unsigned long time) +{ + struct ip_vs_sync_buff *sb; + + spin_lock_bh(&curr_sb_lock); + if (curr_sb && (time == 0 || + time_before(jiffies - curr_sb->firstuse, time))) { + sb = curr_sb; + curr_sb = NULL; + } else + sb = NULL; + spin_unlock_bh(&curr_sb_lock); + return sb; +} + + +/* + * Add an ip_vs_conn information into the current sync_buff. + * Called by ip_vs_in. + */ +void ip_vs_sync_conn(struct ip_vs_conn *cp) +{ + struct ip_vs_sync_mesg *m; + struct ip_vs_sync_conn *s; + int len; + + spin_lock(&curr_sb_lock); + if (!curr_sb) { + if (!(curr_sb=ip_vs_sync_buff_create())) { + spin_unlock(&curr_sb_lock); + IP_VS_ERR("ip_vs_sync_buff_create failed.\n"); + return; + } + } + + len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE : + SIMPLE_CONN_SIZE; + m = curr_sb->mesg; + s = (struct ip_vs_sync_conn *)curr_sb->head; + + /* copy members */ + s->protocol = cp->protocol; + s->cport = cp->cport; + s->vport = cp->vport; + s->dport = cp->dport; + s->caddr = cp->caddr.ip; + s->vaddr = cp->vaddr.ip; + s->daddr = cp->daddr.ip; + s->flags = htons(cp->flags & ~IP_VS_CONN_F_HASHED); + s->state = htons(cp->state); + if (cp->flags & IP_VS_CONN_F_SEQ_MASK) { + struct ip_vs_sync_conn_options *opt = + (struct ip_vs_sync_conn_options *)&s[1]; + memcpy(opt, &cp->in_seq, sizeof(*opt)); + } + + m->nr_conns++; + m->size += len; + curr_sb->head += len; + + /* check if there is a space for next one */ + if (curr_sb->head+FULL_CONN_SIZE > curr_sb->end) { + sb_queue_tail(curr_sb); + curr_sb = NULL; + } + spin_unlock(&curr_sb_lock); + + /* synchronize its controller if it has */ + if (cp->control) + ip_vs_sync_conn(cp->control); +} + + +/* + * Process received multicast message and create the corresponding + * ip_vs_conn entries. + */ +static void ip_vs_process_message(const char *buffer, const size_t buflen) +{ + struct ip_vs_sync_mesg *m = (struct ip_vs_sync_mesg *)buffer; + struct ip_vs_sync_conn *s; + struct ip_vs_sync_conn_options *opt; + struct ip_vs_conn *cp; + struct ip_vs_protocol *pp; + struct ip_vs_dest *dest; + char *p; + int i; + + if (buflen < sizeof(struct ip_vs_sync_mesg)) { + IP_VS_ERR_RL("sync message header too short\n"); + return; + } + + /* Convert size back to host byte order */ + m->size = ntohs(m->size); + + if (buflen != m->size) { + IP_VS_ERR_RL("bogus sync message size\n"); + return; + } + + /* SyncID sanity check */ + if (ip_vs_backup_syncid != 0 && m->syncid != ip_vs_backup_syncid) { + IP_VS_DBG(7, "Ignoring incoming msg with syncid = %d\n", + m->syncid); + return; + } + + p = (char *)buffer + sizeof(struct ip_vs_sync_mesg); + for (i=0; inr_conns; i++) { + unsigned flags, state; + + if (p + SIMPLE_CONN_SIZE > buffer+buflen) { + IP_VS_ERR_RL("bogus conn in sync message\n"); + return; + } + s = (struct ip_vs_sync_conn *) p; + flags = ntohs(s->flags) | IP_VS_CONN_F_SYNC; + flags &= ~IP_VS_CONN_F_HASHED; + if (flags & IP_VS_CONN_F_SEQ_MASK) { + opt = (struct ip_vs_sync_conn_options *)&s[1]; + p += FULL_CONN_SIZE; + if (p > buffer+buflen) { + IP_VS_ERR_RL("bogus conn options in sync message\n"); + return; + } + } else { + opt = NULL; + p += SIMPLE_CONN_SIZE; + } + + state = ntohs(s->state); + if (!(flags & IP_VS_CONN_F_TEMPLATE)) { + pp = ip_vs_proto_get(s->protocol); + if (!pp) { + IP_VS_ERR_RL("Unsupported protocol %u in sync msg\n", + s->protocol); + continue; + } + if (state >= pp->num_states) { + IP_VS_DBG(2, "Invalid %s state %u in sync msg\n", + pp->name, state); + continue; + } + } else { + /* protocol in templates is not used for state/timeout */ + pp = NULL; + if (state > 0) { + IP_VS_DBG(2, "Invalid template state %u in sync msg\n", + state); + state = 0; + } + } + + if (!(flags & IP_VS_CONN_F_TEMPLATE)) + cp = ip_vs_conn_in_get(AF_INET, s->protocol, + (union nf_inet_addr *)&s->caddr, + s->cport, + (union nf_inet_addr *)&s->vaddr, + s->vport); + else + cp = ip_vs_ct_in_get(AF_INET, s->protocol, + (union nf_inet_addr *)&s->caddr, + s->cport, + (union nf_inet_addr *)&s->vaddr, + s->vport); + if (!cp) { + /* + * Find the appropriate destination for the connection. + * If it is not found the connection will remain unbound + * but still handled. + */ + dest = ip_vs_find_dest(AF_INET, + (union nf_inet_addr *)&s->daddr, + s->dport, + (union nf_inet_addr *)&s->vaddr, + s->vport, + s->protocol); + /* Set the approprite ativity flag */ + if (s->protocol == IPPROTO_TCP) { + if (state != IP_VS_TCP_S_ESTABLISHED) + flags |= IP_VS_CONN_F_INACTIVE; + else + flags &= ~IP_VS_CONN_F_INACTIVE; + } + cp = ip_vs_conn_new(AF_INET, s->protocol, + (union nf_inet_addr *)&s->caddr, + s->cport, + (union nf_inet_addr *)&s->vaddr, + s->vport, + (union nf_inet_addr *)&s->daddr, + s->dport, + flags, dest); + if (dest) + atomic_dec(&dest->refcnt); + if (!cp) { + IP_VS_ERR("ip_vs_conn_new failed\n"); + return; + } + } else if (!cp->dest) { + dest = ip_vs_try_bind_dest(cp); + if (dest) + atomic_dec(&dest->refcnt); + } else if ((cp->dest) && (cp->protocol == IPPROTO_TCP) && + (cp->state != state)) { + /* update active/inactive flag for the connection */ + dest = cp->dest; + if (!(cp->flags & IP_VS_CONN_F_INACTIVE) && + (state != IP_VS_TCP_S_ESTABLISHED)) { + atomic_dec(&dest->activeconns); + atomic_inc(&dest->inactconns); + cp->flags |= IP_VS_CONN_F_INACTIVE; + } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) && + (state == IP_VS_TCP_S_ESTABLISHED)) { + atomic_inc(&dest->activeconns); + atomic_dec(&dest->inactconns); + cp->flags &= ~IP_VS_CONN_F_INACTIVE; + } + } + + if (opt) + memcpy(&cp->in_seq, opt, sizeof(*opt)); + atomic_set(&cp->in_pkts, sysctl_ip_vs_sync_threshold[0]); + cp->state = state; + cp->old_state = cp->state; + /* + * We can not recover the right timeout for templates + * in all cases, we can not find the right fwmark + * virtual service. If needed, we can do it for + * non-fwmark persistent services. + */ + if (!(flags & IP_VS_CONN_F_TEMPLATE) && pp->timeout_table) + cp->timeout = pp->timeout_table[state]; + else + cp->timeout = (3*60*HZ); + ip_vs_conn_put(cp); + } +} + + +/* + * Setup loopback of outgoing multicasts on a sending socket + */ +static void set_mcast_loop(struct sock *sk, u_char loop) +{ + struct inet_sock *inet = inet_sk(sk); + + /* setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); */ + lock_sock(sk); + inet->mc_loop = loop ? 1 : 0; + release_sock(sk); +} + +/* + * Specify TTL for outgoing multicasts on a sending socket + */ +static void set_mcast_ttl(struct sock *sk, u_char ttl) +{ + struct inet_sock *inet = inet_sk(sk); + + /* setsockopt(sock, SOL_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)); */ + lock_sock(sk); + inet->mc_ttl = ttl; + release_sock(sk); +} + +/* + * Specifiy default interface for outgoing multicasts + */ +static int set_mcast_if(struct sock *sk, char *ifname) +{ + struct net_device *dev; + struct inet_sock *inet = inet_sk(sk); + + if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL) + return -ENODEV; + + if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) + return -EINVAL; + + lock_sock(sk); + inet->mc_index = dev->ifindex; + /* inet->mc_addr = 0; */ + release_sock(sk); + + return 0; +} + + +/* + * Set the maximum length of sync message according to the + * specified interface's MTU. + */ +static int set_sync_mesg_maxlen(int sync_state) +{ + struct net_device *dev; + int num; + + if (sync_state == IP_VS_STATE_MASTER) { + if ((dev = __dev_get_by_name(&init_net, ip_vs_master_mcast_ifn)) == NULL) + return -ENODEV; + + num = (dev->mtu - sizeof(struct iphdr) - + sizeof(struct udphdr) - + SYNC_MESG_HEADER_LEN - 20) / SIMPLE_CONN_SIZE; + sync_send_mesg_maxlen = SYNC_MESG_HEADER_LEN + + SIMPLE_CONN_SIZE * min(num, MAX_CONNS_PER_SYNCBUFF); + IP_VS_DBG(7, "setting the maximum length of sync sending " + "message %d.\n", sync_send_mesg_maxlen); + } else if (sync_state == IP_VS_STATE_BACKUP) { + if ((dev = __dev_get_by_name(&init_net, ip_vs_backup_mcast_ifn)) == NULL) + return -ENODEV; + + sync_recv_mesg_maxlen = dev->mtu - + sizeof(struct iphdr) - sizeof(struct udphdr); + IP_VS_DBG(7, "setting the maximum length of sync receiving " + "message %d.\n", sync_recv_mesg_maxlen); + } + + return 0; +} + + +/* + * Join a multicast group. + * the group is specified by a class D multicast address 224.0.0.0/8 + * in the in_addr structure passed in as a parameter. + */ +static int +join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname) +{ + struct ip_mreqn mreq; + struct net_device *dev; + int ret; + + memset(&mreq, 0, sizeof(mreq)); + memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr)); + + if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL) + return -ENODEV; + if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) + return -EINVAL; + + mreq.imr_ifindex = dev->ifindex; + + lock_sock(sk); + ret = ip_mc_join_group(sk, &mreq); + release_sock(sk); + + return ret; +} + + +static int bind_mcastif_addr(struct socket *sock, char *ifname) +{ + struct net_device *dev; + __be32 addr; + struct sockaddr_in sin; + + if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL) + return -ENODEV; + + addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE); + if (!addr) + IP_VS_ERR("You probably need to specify IP address on " + "multicast interface.\n"); + + IP_VS_DBG(7, "binding socket with (%s) %u.%u.%u.%u\n", + ifname, NIPQUAD(addr)); + + /* Now bind the socket with the address of multicast interface */ + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = addr; + sin.sin_port = 0; + + return sock->ops->bind(sock, (struct sockaddr*)&sin, sizeof(sin)); +} + +/* + * Set up sending multicast socket over UDP + */ +static struct socket * make_send_sock(void) +{ + struct socket *sock; + int result; + + /* First create a socket */ + result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); + if (result < 0) { + IP_VS_ERR("Error during creation of socket; terminating\n"); + return ERR_PTR(result); + } + + result = set_mcast_if(sock->sk, ip_vs_master_mcast_ifn); + if (result < 0) { + IP_VS_ERR("Error setting outbound mcast interface\n"); + goto error; + } + + set_mcast_loop(sock->sk, 0); + set_mcast_ttl(sock->sk, 1); + + result = bind_mcastif_addr(sock, ip_vs_master_mcast_ifn); + if (result < 0) { + IP_VS_ERR("Error binding address of the mcast interface\n"); + goto error; + } + + result = sock->ops->connect(sock, (struct sockaddr *) &mcast_addr, + sizeof(struct sockaddr), 0); + if (result < 0) { + IP_VS_ERR("Error connecting to the multicast addr\n"); + goto error; + } + + return sock; + + error: + sock_release(sock); + return ERR_PTR(result); +} + + +/* + * Set up receiving multicast socket over UDP + */ +static struct socket * make_receive_sock(void) +{ + struct socket *sock; + int result; + + /* First create a socket */ + result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); + if (result < 0) { + IP_VS_ERR("Error during creation of socket; terminating\n"); + return ERR_PTR(result); + } + + /* it is equivalent to the REUSEADDR option in user-space */ + sock->sk->sk_reuse = 1; + + result = sock->ops->bind(sock, (struct sockaddr *) &mcast_addr, + sizeof(struct sockaddr)); + if (result < 0) { + IP_VS_ERR("Error binding to the multicast addr\n"); + goto error; + } + + /* join the multicast group */ + result = join_mcast_group(sock->sk, + (struct in_addr *) &mcast_addr.sin_addr, + ip_vs_backup_mcast_ifn); + if (result < 0) { + IP_VS_ERR("Error joining to the multicast group\n"); + goto error; + } + + return sock; + + error: + sock_release(sock); + return ERR_PTR(result); +} + + +static int +ip_vs_send_async(struct socket *sock, const char *buffer, const size_t length) +{ + struct msghdr msg = {.msg_flags = MSG_DONTWAIT|MSG_NOSIGNAL}; + struct kvec iov; + int len; + + EnterFunction(7); + iov.iov_base = (void *)buffer; + iov.iov_len = length; + + len = kernel_sendmsg(sock, &msg, &iov, 1, (size_t)(length)); + + LeaveFunction(7); + return len; +} + +static void +ip_vs_send_sync_msg(struct socket *sock, struct ip_vs_sync_mesg *msg) +{ + int msize; + + msize = msg->size; + + /* Put size in network byte order */ + msg->size = htons(msg->size); + + if (ip_vs_send_async(sock, (char *)msg, msize) != msize) + IP_VS_ERR("ip_vs_send_async error\n"); +} + +static int +ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen) +{ + struct msghdr msg = {NULL,}; + struct kvec iov; + int len; + + EnterFunction(7); + + /* Receive a packet */ + iov.iov_base = buffer; + iov.iov_len = (size_t)buflen; + + len = kernel_recvmsg(sock, &msg, &iov, 1, buflen, 0); + + if (len < 0) + return -1; + + LeaveFunction(7); + return len; +} + + +static int sync_thread_master(void *data) +{ + struct ip_vs_sync_thread_data *tinfo = data; + struct ip_vs_sync_buff *sb; + + IP_VS_INFO("sync thread started: state = MASTER, mcast_ifn = %s, " + "syncid = %d\n", + ip_vs_master_mcast_ifn, ip_vs_master_syncid); + + while (!kthread_should_stop()) { + while ((sb = sb_dequeue())) { + ip_vs_send_sync_msg(tinfo->sock, sb->mesg); + ip_vs_sync_buff_release(sb); + } + + /* check if entries stay in curr_sb for 2 seconds */ + sb = get_curr_sync_buff(2 * HZ); + if (sb) { + ip_vs_send_sync_msg(tinfo->sock, sb->mesg); + ip_vs_sync_buff_release(sb); + } + + schedule_timeout_interruptible(HZ); + } + + /* clean up the sync_buff queue */ + while ((sb=sb_dequeue())) { + ip_vs_sync_buff_release(sb); + } + + /* clean up the current sync_buff */ + if ((sb = get_curr_sync_buff(0))) { + ip_vs_sync_buff_release(sb); + } + + /* release the sending multicast socket */ + sock_release(tinfo->sock); + kfree(tinfo); + + return 0; +} + + +static int sync_thread_backup(void *data) +{ + struct ip_vs_sync_thread_data *tinfo = data; + int len; + + IP_VS_INFO("sync thread started: state = BACKUP, mcast_ifn = %s, " + "syncid = %d\n", + ip_vs_backup_mcast_ifn, ip_vs_backup_syncid); + + while (!kthread_should_stop()) { + wait_event_interruptible(*tinfo->sock->sk->sk_sleep, + !skb_queue_empty(&tinfo->sock->sk->sk_receive_queue) + || kthread_should_stop()); + + /* do we have data now? */ + while (!skb_queue_empty(&(tinfo->sock->sk->sk_receive_queue))) { + len = ip_vs_receive(tinfo->sock, tinfo->buf, + sync_recv_mesg_maxlen); + if (len <= 0) { + IP_VS_ERR("receiving message error\n"); + break; + } + + /* disable bottom half, because it accesses the data + shared by softirq while getting/creating conns */ + local_bh_disable(); + ip_vs_process_message(tinfo->buf, len); + local_bh_enable(); + } + } + + /* release the sending multicast socket */ + sock_release(tinfo->sock); + kfree(tinfo->buf); + kfree(tinfo); + + return 0; +} + + +int start_sync_thread(int state, char *mcast_ifn, __u8 syncid) +{ + struct ip_vs_sync_thread_data *tinfo; + struct task_struct **realtask, *task; + struct socket *sock; + char *name, *buf = NULL; + int (*threadfn)(void *data); + int result = -ENOMEM; + + IP_VS_DBG(7, "%s: pid %d\n", __func__, task_pid_nr(current)); + IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %Zd bytes\n", + sizeof(struct ip_vs_sync_conn)); + + if (state == IP_VS_STATE_MASTER) { + if (sync_master_thread) + return -EEXIST; + + strlcpy(ip_vs_master_mcast_ifn, mcast_ifn, + sizeof(ip_vs_master_mcast_ifn)); + ip_vs_master_syncid = syncid; + realtask = &sync_master_thread; + name = "ipvs_syncmaster"; + threadfn = sync_thread_master; + sock = make_send_sock(); + } else if (state == IP_VS_STATE_BACKUP) { + if (sync_backup_thread) + return -EEXIST; + + strlcpy(ip_vs_backup_mcast_ifn, mcast_ifn, + sizeof(ip_vs_backup_mcast_ifn)); + ip_vs_backup_syncid = syncid; + realtask = &sync_backup_thread; + name = "ipvs_syncbackup"; + threadfn = sync_thread_backup; + sock = make_receive_sock(); + } else { + return -EINVAL; + } + + if (IS_ERR(sock)) { + result = PTR_ERR(sock); + goto out; + } + + set_sync_mesg_maxlen(state); + if (state == IP_VS_STATE_BACKUP) { + buf = kmalloc(sync_recv_mesg_maxlen, GFP_KERNEL); + if (!buf) + goto outsocket; + } + + tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL); + if (!tinfo) + goto outbuf; + + tinfo->sock = sock; + tinfo->buf = buf; + + task = kthread_run(threadfn, tinfo, name); + if (IS_ERR(task)) { + result = PTR_ERR(task); + goto outtinfo; + } + + /* mark as active */ + *realtask = task; + ip_vs_sync_state |= state; + + /* increase the module use count */ + ip_vs_use_count_inc(); + + return 0; + +outtinfo: + kfree(tinfo); +outbuf: + kfree(buf); +outsocket: + sock_release(sock); +out: + return result; +} + + +int stop_sync_thread(int state) +{ + IP_VS_DBG(7, "%s: pid %d\n", __func__, task_pid_nr(current)); + + if (state == IP_VS_STATE_MASTER) { + if (!sync_master_thread) + return -ESRCH; + + IP_VS_INFO("stopping master sync thread %d ...\n", + task_pid_nr(sync_master_thread)); + + /* + * The lock synchronizes with sb_queue_tail(), so that we don't + * add sync buffers to the queue, when we are already in + * progress of stopping the master sync daemon. + */ + + spin_lock_bh(&ip_vs_sync_lock); + ip_vs_sync_state &= ~IP_VS_STATE_MASTER; + spin_unlock_bh(&ip_vs_sync_lock); + kthread_stop(sync_master_thread); + sync_master_thread = NULL; + } else if (state == IP_VS_STATE_BACKUP) { + if (!sync_backup_thread) + return -ESRCH; + + IP_VS_INFO("stopping backup sync thread %d ...\n", + task_pid_nr(sync_backup_thread)); + + ip_vs_sync_state &= ~IP_VS_STATE_BACKUP; + kthread_stop(sync_backup_thread); + sync_backup_thread = NULL; + } else { + return -EINVAL; + } + + /* decrease the module use count */ + ip_vs_use_count_dec(); + + return 0; +} diff --git a/net/netfilter/ipvs/ip_vs_wlc.c b/net/netfilter/ipvs/ip_vs_wlc.c new file mode 100644 index 00000000000..8c596e71259 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_wlc.c @@ -0,0 +1,128 @@ +/* + * IPVS: Weighted Least-Connection Scheduling module + * + * Authors: Wensong Zhang + * Peter Kese + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * Wensong Zhang : changed the ip_vs_wlc_schedule to return dest + * Wensong Zhang : changed to use the inactconns in scheduling + * Wensong Zhang : changed some comestics things for debugging + * Wensong Zhang : changed for the d-linked destination list + * Wensong Zhang : added the ip_vs_wlc_update_svc + * Wensong Zhang : added any dest with weight=0 is quiesced + * + */ + +#include +#include + +#include + + +static inline unsigned int +ip_vs_wlc_dest_overhead(struct ip_vs_dest *dest) +{ + /* + * We think the overhead of processing active connections is 256 + * times higher than that of inactive connections in average. (This + * 256 times might not be accurate, we will change it later) We + * use the following formula to estimate the overhead now: + * dest->activeconns*256 + dest->inactconns + */ + return (atomic_read(&dest->activeconns) << 8) + + atomic_read(&dest->inactconns); +} + + +/* + * Weighted Least Connection scheduling + */ +static struct ip_vs_dest * +ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +{ + struct ip_vs_dest *dest, *least; + unsigned int loh, doh; + + IP_VS_DBG(6, "ip_vs_wlc_schedule(): Scheduling...\n"); + + /* + * We calculate the load of each dest server as follows: + * (dest overhead) / dest->weight + * + * Remember -- no floats in kernel mode!!! + * The comparison of h1*w2 > h2*w1 is equivalent to that of + * h1/w1 > h2/w2 + * if every weight is larger than zero. + * + * The server with weight=0 is quiesced and will not receive any + * new connections. + */ + + list_for_each_entry(dest, &svc->destinations, n_list) { + if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && + atomic_read(&dest->weight) > 0) { + least = dest; + loh = ip_vs_wlc_dest_overhead(least); + goto nextstage; + } + } + return NULL; + + /* + * Find the destination with the least load. + */ + nextstage: + list_for_each_entry_continue(dest, &svc->destinations, n_list) { + if (dest->flags & IP_VS_DEST_F_OVERLOAD) + continue; + doh = ip_vs_wlc_dest_overhead(dest); + if (loh * atomic_read(&dest->weight) > + doh * atomic_read(&least->weight)) { + least = dest; + loh = doh; + } + } + + IP_VS_DBG_BUF(6, "WLC: server %s:%u " + "activeconns %d refcnt %d weight %d overhead %d\n", + IP_VS_DBG_ADDR(svc->af, &least->addr), ntohs(least->port), + atomic_read(&least->activeconns), + atomic_read(&least->refcnt), + atomic_read(&least->weight), loh); + + return least; +} + + +static struct ip_vs_scheduler ip_vs_wlc_scheduler = +{ + .name = "wlc", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_wlc_scheduler.n_list), +#ifdef CONFIG_IP_VS_IPV6 + .supports_ipv6 = 1, +#endif + .schedule = ip_vs_wlc_schedule, +}; + + +static int __init ip_vs_wlc_init(void) +{ + return register_ip_vs_scheduler(&ip_vs_wlc_scheduler); +} + +static void __exit ip_vs_wlc_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_wlc_scheduler); +} + +module_init(ip_vs_wlc_init); +module_exit(ip_vs_wlc_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_wrr.c b/net/netfilter/ipvs/ip_vs_wrr.c new file mode 100644 index 00000000000..7ea92fed50b --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_wrr.c @@ -0,0 +1,237 @@ +/* + * IPVS: Weighted Round-Robin Scheduling module + * + * Authors: Wensong Zhang + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * Wensong Zhang : changed the ip_vs_wrr_schedule to return dest + * Wensong Zhang : changed some comestics things for debugging + * Wensong Zhang : changed for the d-linked destination list + * Wensong Zhang : added the ip_vs_wrr_update_svc + * Julian Anastasov : fixed the bug of returning destination + * with weight 0 when all weights are zero + * + */ + +#include +#include +#include + +#include + +/* + * current destination pointer for weighted round-robin scheduling + */ +struct ip_vs_wrr_mark { + struct list_head *cl; /* current list head */ + int cw; /* current weight */ + int mw; /* maximum weight */ + int di; /* decreasing interval */ +}; + + +/* + * Get the gcd of server weights + */ +static int gcd(int a, int b) +{ + int c; + + while ((c = a % b)) { + a = b; + b = c; + } + return b; +} + +static int ip_vs_wrr_gcd_weight(struct ip_vs_service *svc) +{ + struct ip_vs_dest *dest; + int weight; + int g = 0; + + list_for_each_entry(dest, &svc->destinations, n_list) { + weight = atomic_read(&dest->weight); + if (weight > 0) { + if (g > 0) + g = gcd(weight, g); + else + g = weight; + } + } + return g ? g : 1; +} + + +/* + * Get the maximum weight of the service destinations. + */ +static int ip_vs_wrr_max_weight(struct ip_vs_service *svc) +{ + struct ip_vs_dest *dest; + int weight = 0; + + list_for_each_entry(dest, &svc->destinations, n_list) { + if (atomic_read(&dest->weight) > weight) + weight = atomic_read(&dest->weight); + } + + return weight; +} + + +static int ip_vs_wrr_init_svc(struct ip_vs_service *svc) +{ + struct ip_vs_wrr_mark *mark; + + /* + * Allocate the mark variable for WRR scheduling + */ + mark = kmalloc(sizeof(struct ip_vs_wrr_mark), GFP_ATOMIC); + if (mark == NULL) { + IP_VS_ERR("ip_vs_wrr_init_svc(): no memory\n"); + return -ENOMEM; + } + mark->cl = &svc->destinations; + mark->cw = 0; + mark->mw = ip_vs_wrr_max_weight(svc); + mark->di = ip_vs_wrr_gcd_weight(svc); + svc->sched_data = mark; + + return 0; +} + + +static int ip_vs_wrr_done_svc(struct ip_vs_service *svc) +{ + /* + * Release the mark variable + */ + kfree(svc->sched_data); + + return 0; +} + + +static int ip_vs_wrr_update_svc(struct ip_vs_service *svc) +{ + struct ip_vs_wrr_mark *mark = svc->sched_data; + + mark->cl = &svc->destinations; + mark->mw = ip_vs_wrr_max_weight(svc); + mark->di = ip_vs_wrr_gcd_weight(svc); + if (mark->cw > mark->mw) + mark->cw = 0; + return 0; +} + + +/* + * Weighted Round-Robin Scheduling + */ +static struct ip_vs_dest * +ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +{ + struct ip_vs_dest *dest; + struct ip_vs_wrr_mark *mark = svc->sched_data; + struct list_head *p; + + IP_VS_DBG(6, "ip_vs_wrr_schedule(): Scheduling...\n"); + + /* + * This loop will always terminate, because mark->cw in (0, max_weight] + * and at least one server has its weight equal to max_weight. + */ + write_lock(&svc->sched_lock); + p = mark->cl; + while (1) { + if (mark->cl == &svc->destinations) { + /* it is at the head of the destination list */ + + if (mark->cl == mark->cl->next) { + /* no dest entry */ + dest = NULL; + goto out; + } + + mark->cl = svc->destinations.next; + mark->cw -= mark->di; + if (mark->cw <= 0) { + mark->cw = mark->mw; + /* + * Still zero, which means no available servers. + */ + if (mark->cw == 0) { + mark->cl = &svc->destinations; + IP_VS_ERR_RL("ip_vs_wrr_schedule(): " + "no available servers\n"); + dest = NULL; + goto out; + } + } + } else + mark->cl = mark->cl->next; + + if (mark->cl != &svc->destinations) { + /* not at the head of the list */ + dest = list_entry(mark->cl, struct ip_vs_dest, n_list); + if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && + atomic_read(&dest->weight) >= mark->cw) { + /* got it */ + break; + } + } + + if (mark->cl == p && mark->cw == mark->di) { + /* back to the start, and no dest is found. + It is only possible when all dests are OVERLOADED */ + dest = NULL; + goto out; + } + } + + IP_VS_DBG_BUF(6, "WRR: server %s:%u " + "activeconns %d refcnt %d weight %d\n", + IP_VS_DBG_ADDR(svc->af, &dest->addr), ntohs(dest->port), + atomic_read(&dest->activeconns), + atomic_read(&dest->refcnt), + atomic_read(&dest->weight)); + + out: + write_unlock(&svc->sched_lock); + return dest; +} + + +static struct ip_vs_scheduler ip_vs_wrr_scheduler = { + .name = "wrr", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_wrr_scheduler.n_list), +#ifdef CONFIG_IP_VS_IPV6 + .supports_ipv6 = 1, +#endif + .init_service = ip_vs_wrr_init_svc, + .done_service = ip_vs_wrr_done_svc, + .update_service = ip_vs_wrr_update_svc, + .schedule = ip_vs_wrr_schedule, +}; + +static int __init ip_vs_wrr_init(void) +{ + return register_ip_vs_scheduler(&ip_vs_wrr_scheduler) ; +} + +static void __exit ip_vs_wrr_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_wrr_scheduler); +} + +module_init(ip_vs_wrr_init); +module_exit(ip_vs_wrr_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c new file mode 100644 index 00000000000..02ddc2b3ce2 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -0,0 +1,1004 @@ +/* + * ip_vs_xmit.c: various packet transmitters for IPVS + * + * Authors: Wensong Zhang + * Julian Anastasov + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * + */ + +#include +#include /* for tcphdr */ +#include +#include /* for csum_tcpudp_magic */ +#include +#include /* for icmp_send */ +#include /* for ip_route_output */ +#include +#include +#include +#include +#include + +#include + + +/* + * Destination cache to speed up outgoing route lookup + */ +static inline void +__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst) +{ + struct dst_entry *old_dst; + + old_dst = dest->dst_cache; + dest->dst_cache = dst; + dest->dst_rtos = rtos; + dst_release(old_dst); +} + +static inline struct dst_entry * +__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos, u32 cookie) +{ + struct dst_entry *dst = dest->dst_cache; + + if (!dst) + return NULL; + if ((dst->obsolete + || (dest->af == AF_INET && rtos != dest->dst_rtos)) && + dst->ops->check(dst, cookie) == NULL) { + dest->dst_cache = NULL; + dst_release(dst); + return NULL; + } + dst_hold(dst); + return dst; +} + +static struct rtable * +__ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos) +{ + struct rtable *rt; /* Route to the other host */ + struct ip_vs_dest *dest = cp->dest; + + if (dest) { + spin_lock(&dest->dst_lock); + if (!(rt = (struct rtable *) + __ip_vs_dst_check(dest, rtos, 0))) { + struct flowi fl = { + .oif = 0, + .nl_u = { + .ip4_u = { + .daddr = dest->addr.ip, + .saddr = 0, + .tos = rtos, } }, + }; + + if (ip_route_output_key(&init_net, &rt, &fl)) { + spin_unlock(&dest->dst_lock); + IP_VS_DBG_RL("ip_route_output error, " + "dest: %u.%u.%u.%u\n", + NIPQUAD(dest->addr.ip)); + return NULL; + } + __ip_vs_dst_set(dest, rtos, dst_clone(&rt->u.dst)); + IP_VS_DBG(10, "new dst %u.%u.%u.%u, refcnt=%d, rtos=%X\n", + NIPQUAD(dest->addr.ip), + atomic_read(&rt->u.dst.__refcnt), rtos); + } + spin_unlock(&dest->dst_lock); + } else { + struct flowi fl = { + .oif = 0, + .nl_u = { + .ip4_u = { + .daddr = cp->daddr.ip, + .saddr = 0, + .tos = rtos, } }, + }; + + if (ip_route_output_key(&init_net, &rt, &fl)) { + IP_VS_DBG_RL("ip_route_output error, dest: " + "%u.%u.%u.%u\n", NIPQUAD(cp->daddr.ip)); + return NULL; + } + } + + return rt; +} + +#ifdef CONFIG_IP_VS_IPV6 +static struct rt6_info * +__ip_vs_get_out_rt_v6(struct ip_vs_conn *cp) +{ + struct rt6_info *rt; /* Route to the other host */ + struct ip_vs_dest *dest = cp->dest; + + if (dest) { + spin_lock(&dest->dst_lock); + rt = (struct rt6_info *)__ip_vs_dst_check(dest, 0, 0); + if (!rt) { + struct flowi fl = { + .oif = 0, + .nl_u = { + .ip6_u = { + .daddr = dest->addr.in6, + .saddr = { + .s6_addr32 = + { 0, 0, 0, 0 }, + }, + }, + }, + }; + + rt = (struct rt6_info *)ip6_route_output(&init_net, + NULL, &fl); + if (!rt) { + spin_unlock(&dest->dst_lock); + IP_VS_DBG_RL("ip6_route_output error, " + "dest: " NIP6_FMT "\n", + NIP6(dest->addr.in6)); + return NULL; + } + __ip_vs_dst_set(dest, 0, dst_clone(&rt->u.dst)); + IP_VS_DBG(10, "new dst " NIP6_FMT ", refcnt=%d\n", + NIP6(dest->addr.in6), + atomic_read(&rt->u.dst.__refcnt)); + } + spin_unlock(&dest->dst_lock); + } else { + struct flowi fl = { + .oif = 0, + .nl_u = { + .ip6_u = { + .daddr = cp->daddr.in6, + .saddr = { + .s6_addr32 = { 0, 0, 0, 0 }, + }, + }, + }, + }; + + rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl); + if (!rt) { + IP_VS_DBG_RL("ip6_route_output error, dest: " + NIP6_FMT "\n", NIP6(cp->daddr.in6)); + return NULL; + } + } + + return rt; +} +#endif + + +/* + * Release dest->dst_cache before a dest is removed + */ +void +ip_vs_dst_reset(struct ip_vs_dest *dest) +{ + struct dst_entry *old_dst; + + old_dst = dest->dst_cache; + dest->dst_cache = NULL; + dst_release(old_dst); +} + +#define IP_VS_XMIT(pf, skb, rt) \ +do { \ + (skb)->ipvs_property = 1; \ + skb_forward_csum(skb); \ + NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL, \ + (rt)->u.dst.dev, dst_output); \ +} while (0) + + +/* + * NULL transmitter (do nothing except return NF_ACCEPT) + */ +int +ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp) +{ + /* we do not touch skb and do not need pskb ptr */ + return NF_ACCEPT; +} + + +/* + * Bypass transmitter + * Let packets bypass the destination when the destination is not + * available, it may be only used in transparent cache cluster. + */ +int +ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp) +{ + struct rtable *rt; /* Route to the other host */ + struct iphdr *iph = ip_hdr(skb); + u8 tos = iph->tos; + int mtu; + struct flowi fl = { + .oif = 0, + .nl_u = { + .ip4_u = { + .daddr = iph->daddr, + .saddr = 0, + .tos = RT_TOS(tos), } }, + }; + + EnterFunction(10); + + if (ip_route_output_key(&init_net, &rt, &fl)) { + IP_VS_DBG_RL("ip_vs_bypass_xmit(): ip_route_output error, " + "dest: %u.%u.%u.%u\n", NIPQUAD(iph->daddr)); + goto tx_error_icmp; + } + + /* MTU checking */ + mtu = dst_mtu(&rt->u.dst); + if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) { + ip_rt_put(rt); + icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); + IP_VS_DBG_RL("ip_vs_bypass_xmit(): frag needed\n"); + goto tx_error; + } + + /* + * Call ip_send_check because we are not sure it is called + * after ip_defrag. Is copy-on-write needed? + */ + if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) { + ip_rt_put(rt); + return NF_STOLEN; + } + ip_send_check(ip_hdr(skb)); + + /* drop old route */ + dst_release(skb->dst); + skb->dst = &rt->u.dst; + + /* Another hack: avoid icmp_send in ip_fragment */ + skb->local_df = 1; + + IP_VS_XMIT(PF_INET, skb, rt); + + LeaveFunction(10); + return NF_STOLEN; + + tx_error_icmp: + dst_link_failure(skb); + tx_error: + kfree_skb(skb); + LeaveFunction(10); + return NF_STOLEN; +} + +#ifdef CONFIG_IP_VS_IPV6 +int +ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp) +{ + struct rt6_info *rt; /* Route to the other host */ + struct ipv6hdr *iph = ipv6_hdr(skb); + int mtu; + struct flowi fl = { + .oif = 0, + .nl_u = { + .ip6_u = { + .daddr = iph->daddr, + .saddr = { .s6_addr32 = {0, 0, 0, 0} }, } }, + }; + + EnterFunction(10); + + rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl); + if (!rt) { + IP_VS_DBG_RL("ip_vs_bypass_xmit_v6(): ip6_route_output error, " + "dest: " NIP6_FMT "\n", NIP6(iph->daddr)); + goto tx_error_icmp; + } + + /* MTU checking */ + mtu = dst_mtu(&rt->u.dst); + if (skb->len > mtu) { + dst_release(&rt->u.dst); + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev); + IP_VS_DBG_RL("ip_vs_bypass_xmit_v6(): frag needed\n"); + goto tx_error; + } + + /* + * Call ip_send_check because we are not sure it is called + * after ip_defrag. Is copy-on-write needed? + */ + skb = skb_share_check(skb, GFP_ATOMIC); + if (unlikely(skb == NULL)) { + dst_release(&rt->u.dst); + return NF_STOLEN; + } + + /* drop old route */ + dst_release(skb->dst); + skb->dst = &rt->u.dst; + + /* Another hack: avoid icmp_send in ip_fragment */ + skb->local_df = 1; + + IP_VS_XMIT(PF_INET6, skb, rt); + + LeaveFunction(10); + return NF_STOLEN; + + tx_error_icmp: + dst_link_failure(skb); + tx_error: + kfree_skb(skb); + LeaveFunction(10); + return NF_STOLEN; +} +#endif + +/* + * NAT transmitter (only for outside-to-inside nat forwarding) + * Not used for related ICMP + */ +int +ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp) +{ + struct rtable *rt; /* Route to the other host */ + int mtu; + struct iphdr *iph = ip_hdr(skb); + + EnterFunction(10); + + /* check if it is a connection of no-client-port */ + if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) { + __be16 _pt, *p; + p = skb_header_pointer(skb, iph->ihl*4, sizeof(_pt), &_pt); + if (p == NULL) + goto tx_error; + ip_vs_conn_fill_cport(cp, *p); + IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p)); + } + + if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos)))) + goto tx_error_icmp; + + /* MTU checking */ + mtu = dst_mtu(&rt->u.dst); + if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) { + ip_rt_put(rt); + icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); + IP_VS_DBG_RL_PKT(0, pp, skb, 0, "ip_vs_nat_xmit(): frag needed for"); + goto tx_error; + } + + /* copy-on-write the packet before mangling it */ + if (!skb_make_writable(skb, sizeof(struct iphdr))) + goto tx_error_put; + + if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) + goto tx_error_put; + + /* drop old route */ + dst_release(skb->dst); + skb->dst = &rt->u.dst; + + /* mangle the packet */ + if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp)) + goto tx_error; + ip_hdr(skb)->daddr = cp->daddr.ip; + ip_send_check(ip_hdr(skb)); + + IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT"); + + /* FIXME: when application helper enlarges the packet and the length + is larger than the MTU of outgoing device, there will be still + MTU problem. */ + + /* Another hack: avoid icmp_send in ip_fragment */ + skb->local_df = 1; + + IP_VS_XMIT(PF_INET, skb, rt); + + LeaveFunction(10); + return NF_STOLEN; + + tx_error_icmp: + dst_link_failure(skb); + tx_error: + LeaveFunction(10); + kfree_skb(skb); + return NF_STOLEN; + tx_error_put: + ip_rt_put(rt); + goto tx_error; +} + +#ifdef CONFIG_IP_VS_IPV6 +int +ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp) +{ + struct rt6_info *rt; /* Route to the other host */ + int mtu; + + EnterFunction(10); + + /* check if it is a connection of no-client-port */ + if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) { + __be16 _pt, *p; + p = skb_header_pointer(skb, sizeof(struct ipv6hdr), + sizeof(_pt), &_pt); + if (p == NULL) + goto tx_error; + ip_vs_conn_fill_cport(cp, *p); + IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p)); + } + + rt = __ip_vs_get_out_rt_v6(cp); + if (!rt) + goto tx_error_icmp; + + /* MTU checking */ + mtu = dst_mtu(&rt->u.dst); + if (skb->len > mtu) { + dst_release(&rt->u.dst); + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev); + IP_VS_DBG_RL_PKT(0, pp, skb, 0, + "ip_vs_nat_xmit_v6(): frag needed for"); + goto tx_error; + } + + /* copy-on-write the packet before mangling it */ + if (!skb_make_writable(skb, sizeof(struct ipv6hdr))) + goto tx_error_put; + + if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) + goto tx_error_put; + + /* drop old route */ + dst_release(skb->dst); + skb->dst = &rt->u.dst; + + /* mangle the packet */ + if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp)) + goto tx_error; + ipv6_hdr(skb)->daddr = cp->daddr.in6; + + IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT"); + + /* FIXME: when application helper enlarges the packet and the length + is larger than the MTU of outgoing device, there will be still + MTU problem. */ + + /* Another hack: avoid icmp_send in ip_fragment */ + skb->local_df = 1; + + IP_VS_XMIT(PF_INET6, skb, rt); + + LeaveFunction(10); + return NF_STOLEN; + +tx_error_icmp: + dst_link_failure(skb); +tx_error: + LeaveFunction(10); + kfree_skb(skb); + return NF_STOLEN; +tx_error_put: + dst_release(&rt->u.dst); + goto tx_error; +} +#endif + + +/* + * IP Tunneling transmitter + * + * This function encapsulates the packet in a new IP packet, its + * destination will be set to cp->daddr. Most code of this function + * is taken from ipip.c. + * + * It is used in VS/TUN cluster. The load balancer selects a real + * server from a cluster based on a scheduling algorithm, + * encapsulates the request packet and forwards it to the selected + * server. For example, all real servers are configured with + * "ifconfig tunl0 up". When the server receives + * the encapsulated packet, it will decapsulate the packet, processe + * the request and return the response packets directly to the client + * without passing the load balancer. This can greatly increase the + * scalability of virtual server. + * + * Used for ANY protocol + */ +int +ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp) +{ + struct rtable *rt; /* Route to the other host */ + struct net_device *tdev; /* Device to other host */ + struct iphdr *old_iph = ip_hdr(skb); + u8 tos = old_iph->tos; + __be16 df = old_iph->frag_off; + sk_buff_data_t old_transport_header = skb->transport_header; + struct iphdr *iph; /* Our new IP header */ + unsigned int max_headroom; /* The extra header space needed */ + int mtu; + + EnterFunction(10); + + if (skb->protocol != htons(ETH_P_IP)) { + IP_VS_DBG_RL("ip_vs_tunnel_xmit(): protocol error, " + "ETH_P_IP: %d, skb protocol: %d\n", + htons(ETH_P_IP), skb->protocol); + goto tx_error; + } + + if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(tos)))) + goto tx_error_icmp; + + tdev = rt->u.dst.dev; + + mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr); + if (mtu < 68) { + ip_rt_put(rt); + IP_VS_DBG_RL("ip_vs_tunnel_xmit(): mtu less than 68\n"); + goto tx_error; + } + if (skb->dst) + skb->dst->ops->update_pmtu(skb->dst, mtu); + + df |= (old_iph->frag_off & htons(IP_DF)); + + if ((old_iph->frag_off & htons(IP_DF)) + && mtu < ntohs(old_iph->tot_len)) { + icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); + ip_rt_put(rt); + IP_VS_DBG_RL("ip_vs_tunnel_xmit(): frag needed\n"); + goto tx_error; + } + + /* + * Okay, now see if we can stuff it in the buffer as-is. + */ + max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr); + + if (skb_headroom(skb) < max_headroom + || skb_cloned(skb) || skb_shared(skb)) { + struct sk_buff *new_skb = + skb_realloc_headroom(skb, max_headroom); + if (!new_skb) { + ip_rt_put(rt); + kfree_skb(skb); + IP_VS_ERR_RL("ip_vs_tunnel_xmit(): no memory\n"); + return NF_STOLEN; + } + kfree_skb(skb); + skb = new_skb; + old_iph = ip_hdr(skb); + } + + skb->transport_header = old_transport_header; + + /* fix old IP header checksum */ + ip_send_check(old_iph); + + skb_push(skb, sizeof(struct iphdr)); + skb_reset_network_header(skb); + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); + + /* drop old route */ + dst_release(skb->dst); + skb->dst = &rt->u.dst; + + /* + * Push down and install the IPIP header. + */ + iph = ip_hdr(skb); + iph->version = 4; + iph->ihl = sizeof(struct iphdr)>>2; + iph->frag_off = df; + iph->protocol = IPPROTO_IPIP; + iph->tos = tos; + iph->daddr = rt->rt_dst; + iph->saddr = rt->rt_src; + iph->ttl = old_iph->ttl; + ip_select_ident(iph, &rt->u.dst, NULL); + + /* Another hack: avoid icmp_send in ip_fragment */ + skb->local_df = 1; + + ip_local_out(skb); + + LeaveFunction(10); + + return NF_STOLEN; + + tx_error_icmp: + dst_link_failure(skb); + tx_error: + kfree_skb(skb); + LeaveFunction(10); + return NF_STOLEN; +} + +#ifdef CONFIG_IP_VS_IPV6 +int +ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp) +{ + struct rt6_info *rt; /* Route to the other host */ + struct net_device *tdev; /* Device to other host */ + struct ipv6hdr *old_iph = ipv6_hdr(skb); + sk_buff_data_t old_transport_header = skb->transport_header; + struct ipv6hdr *iph; /* Our new IP header */ + unsigned int max_headroom; /* The extra header space needed */ + int mtu; + + EnterFunction(10); + + if (skb->protocol != htons(ETH_P_IPV6)) { + IP_VS_DBG_RL("ip_vs_tunnel_xmit_v6(): protocol error, " + "ETH_P_IPV6: %d, skb protocol: %d\n", + htons(ETH_P_IPV6), skb->protocol); + goto tx_error; + } + + rt = __ip_vs_get_out_rt_v6(cp); + if (!rt) + goto tx_error_icmp; + + tdev = rt->u.dst.dev; + + mtu = dst_mtu(&rt->u.dst) - sizeof(struct ipv6hdr); + /* TODO IPv6: do we need this check in IPv6? */ + if (mtu < 1280) { + dst_release(&rt->u.dst); + IP_VS_DBG_RL("ip_vs_tunnel_xmit_v6(): mtu less than 1280\n"); + goto tx_error; + } + if (skb->dst) + skb->dst->ops->update_pmtu(skb->dst, mtu); + + if (mtu < ntohs(old_iph->payload_len) + sizeof(struct ipv6hdr)) { + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev); + dst_release(&rt->u.dst); + IP_VS_DBG_RL("ip_vs_tunnel_xmit_v6(): frag needed\n"); + goto tx_error; + } + + /* + * Okay, now see if we can stuff it in the buffer as-is. + */ + max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr); + + if (skb_headroom(skb) < max_headroom + || skb_cloned(skb) || skb_shared(skb)) { + struct sk_buff *new_skb = + skb_realloc_headroom(skb, max_headroom); + if (!new_skb) { + dst_release(&rt->u.dst); + kfree_skb(skb); + IP_VS_ERR_RL("ip_vs_tunnel_xmit_v6(): no memory\n"); + return NF_STOLEN; + } + kfree_skb(skb); + skb = new_skb; + old_iph = ipv6_hdr(skb); + } + + skb->transport_header = old_transport_header; + + skb_push(skb, sizeof(struct ipv6hdr)); + skb_reset_network_header(skb); + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); + + /* drop old route */ + dst_release(skb->dst); + skb->dst = &rt->u.dst; + + /* + * Push down and install the IPIP header. + */ + iph = ipv6_hdr(skb); + iph->version = 6; + iph->nexthdr = IPPROTO_IPV6; + iph->payload_len = old_iph->payload_len + sizeof(old_iph); + iph->priority = old_iph->priority; + memset(&iph->flow_lbl, 0, sizeof(iph->flow_lbl)); + iph->daddr = rt->rt6i_dst.addr; + iph->saddr = cp->vaddr.in6; /* rt->rt6i_src.addr; */ + iph->hop_limit = old_iph->hop_limit; + + /* Another hack: avoid icmp_send in ip_fragment */ + skb->local_df = 1; + + ip6_local_out(skb); + + LeaveFunction(10); + + return NF_STOLEN; + +tx_error_icmp: + dst_link_failure(skb); +tx_error: + kfree_skb(skb); + LeaveFunction(10); + return NF_STOLEN; +} +#endif + + +/* + * Direct Routing transmitter + * Used for ANY protocol + */ +int +ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp) +{ + struct rtable *rt; /* Route to the other host */ + struct iphdr *iph = ip_hdr(skb); + int mtu; + + EnterFunction(10); + + if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos)))) + goto tx_error_icmp; + + /* MTU checking */ + mtu = dst_mtu(&rt->u.dst); + if ((iph->frag_off & htons(IP_DF)) && skb->len > mtu) { + icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); + ip_rt_put(rt); + IP_VS_DBG_RL("ip_vs_dr_xmit(): frag needed\n"); + goto tx_error; + } + + /* + * Call ip_send_check because we are not sure it is called + * after ip_defrag. Is copy-on-write needed? + */ + if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) { + ip_rt_put(rt); + return NF_STOLEN; + } + ip_send_check(ip_hdr(skb)); + + /* drop old route */ + dst_release(skb->dst); + skb->dst = &rt->u.dst; + + /* Another hack: avoid icmp_send in ip_fragment */ + skb->local_df = 1; + + IP_VS_XMIT(PF_INET, skb, rt); + + LeaveFunction(10); + return NF_STOLEN; + + tx_error_icmp: + dst_link_failure(skb); + tx_error: + kfree_skb(skb); + LeaveFunction(10); + return NF_STOLEN; +} + +#ifdef CONFIG_IP_VS_IPV6 +int +ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp) +{ + struct rt6_info *rt; /* Route to the other host */ + int mtu; + + EnterFunction(10); + + rt = __ip_vs_get_out_rt_v6(cp); + if (!rt) + goto tx_error_icmp; + + /* MTU checking */ + mtu = dst_mtu(&rt->u.dst); + if (skb->len > mtu) { + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev); + dst_release(&rt->u.dst); + IP_VS_DBG_RL("ip_vs_dr_xmit_v6(): frag needed\n"); + goto tx_error; + } + + /* + * Call ip_send_check because we are not sure it is called + * after ip_defrag. Is copy-on-write needed? + */ + skb = skb_share_check(skb, GFP_ATOMIC); + if (unlikely(skb == NULL)) { + dst_release(&rt->u.dst); + return NF_STOLEN; + } + + /* drop old route */ + dst_release(skb->dst); + skb->dst = &rt->u.dst; + + /* Another hack: avoid icmp_send in ip_fragment */ + skb->local_df = 1; + + IP_VS_XMIT(PF_INET6, skb, rt); + + LeaveFunction(10); + return NF_STOLEN; + +tx_error_icmp: + dst_link_failure(skb); +tx_error: + kfree_skb(skb); + LeaveFunction(10); + return NF_STOLEN; +} +#endif + + +/* + * ICMP packet transmitter + * called by the ip_vs_in_icmp + */ +int +ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp, int offset) +{ + struct rtable *rt; /* Route to the other host */ + int mtu; + int rc; + + EnterFunction(10); + + /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be + forwarded directly here, because there is no need to + translate address/port back */ + if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) { + if (cp->packet_xmit) + rc = cp->packet_xmit(skb, cp, pp); + else + rc = NF_ACCEPT; + /* do not touch skb anymore */ + atomic_inc(&cp->in_pkts); + goto out; + } + + /* + * mangle and send the packet here (only for VS/NAT) + */ + + if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(ip_hdr(skb)->tos)))) + goto tx_error_icmp; + + /* MTU checking */ + mtu = dst_mtu(&rt->u.dst); + if ((skb->len > mtu) && (ip_hdr(skb)->frag_off & htons(IP_DF))) { + ip_rt_put(rt); + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); + IP_VS_DBG_RL("ip_vs_in_icmp(): frag needed\n"); + goto tx_error; + } + + /* copy-on-write the packet before mangling it */ + if (!skb_make_writable(skb, offset)) + goto tx_error_put; + + if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) + goto tx_error_put; + + /* drop the old route when skb is not shared */ + dst_release(skb->dst); + skb->dst = &rt->u.dst; + + ip_vs_nat_icmp(skb, pp, cp, 0); + + /* Another hack: avoid icmp_send in ip_fragment */ + skb->local_df = 1; + + IP_VS_XMIT(PF_INET, skb, rt); + + rc = NF_STOLEN; + goto out; + + tx_error_icmp: + dst_link_failure(skb); + tx_error: + dev_kfree_skb(skb); + rc = NF_STOLEN; + out: + LeaveFunction(10); + return rc; + tx_error_put: + ip_rt_put(rt); + goto tx_error; +} + +#ifdef CONFIG_IP_VS_IPV6 +int +ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp, int offset) +{ + struct rt6_info *rt; /* Route to the other host */ + int mtu; + int rc; + + EnterFunction(10); + + /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be + forwarded directly here, because there is no need to + translate address/port back */ + if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) { + if (cp->packet_xmit) + rc = cp->packet_xmit(skb, cp, pp); + else + rc = NF_ACCEPT; + /* do not touch skb anymore */ + atomic_inc(&cp->in_pkts); + goto out; + } + + /* + * mangle and send the packet here (only for VS/NAT) + */ + + rt = __ip_vs_get_out_rt_v6(cp); + if (!rt) + goto tx_error_icmp; + + /* MTU checking */ + mtu = dst_mtu(&rt->u.dst); + if (skb->len > mtu) { + dst_release(&rt->u.dst); + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev); + IP_VS_DBG_RL("ip_vs_in_icmp(): frag needed\n"); + goto tx_error; + } + + /* copy-on-write the packet before mangling it */ + if (!skb_make_writable(skb, offset)) + goto tx_error_put; + + if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) + goto tx_error_put; + + /* drop the old route when skb is not shared */ + dst_release(skb->dst); + skb->dst = &rt->u.dst; + + ip_vs_nat_icmp_v6(skb, pp, cp, 0); + + /* Another hack: avoid icmp_send in ip_fragment */ + skb->local_df = 1; + + IP_VS_XMIT(PF_INET6, skb, rt); + + rc = NF_STOLEN; + goto out; + +tx_error_icmp: + dst_link_failure(skb); +tx_error: + dev_kfree_skb(skb); + rc = NF_STOLEN; +out: + LeaveFunction(10); + return rc; +tx_error_put: + dst_release(&rt->u.dst); + goto tx_error; +} +#endif -- cgit v1.2.3 From 76108cea065cda58366d16a7eb6ca90d717a1396 Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Wed, 8 Oct 2008 11:35:00 +0200 Subject: netfilter: Use unsigned types for hooknum and pf vars and (try to) consistently use u_int8_t for the L3 family. Signed-off-by: Jan Engelhardt Signed-off-by: Patrick McHardy --- net/netfilter/core.c | 4 +-- net/netfilter/nf_conntrack_core.c | 6 ++-- net/netfilter/nf_conntrack_expect.c | 2 +- net/netfilter/nf_conntrack_h323_main.c | 3 +- net/netfilter/nf_conntrack_proto_dccp.c | 4 +-- net/netfilter/nf_conntrack_proto_generic.c | 2 +- net/netfilter/nf_conntrack_proto_gre.c | 2 +- net/netfilter/nf_conntrack_proto_sctp.c | 2 +- net/netfilter/nf_conntrack_proto_tcp.c | 6 ++-- net/netfilter/nf_conntrack_proto_udp.c | 4 +-- net/netfilter/nf_conntrack_proto_udplite.c | 4 +-- net/netfilter/nf_internals.h | 4 +-- net/netfilter/nf_log.c | 6 ++-- net/netfilter/nf_queue.c | 10 +++---- net/netfilter/nf_sockopt.c | 15 +++++----- net/netfilter/nfnetlink_log.c | 4 +-- net/netfilter/x_tables.c | 47 ++++++++++++++++-------------- net/netfilter/xt_connlimit.c | 2 +- net/netfilter/xt_conntrack.c | 8 ++--- net/netfilter/xt_hashlimit.c | 11 ++++--- 20 files changed, 75 insertions(+), 71 deletions(-) (limited to 'net/netfilter') diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 292fa28146f..26b8f489d7a 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -113,7 +113,7 @@ EXPORT_SYMBOL(nf_unregister_hooks); unsigned int nf_iterate(struct list_head *head, struct sk_buff *skb, - int hook, + unsigned int hook, const struct net_device *indev, const struct net_device *outdev, struct list_head **i, @@ -155,7 +155,7 @@ unsigned int nf_iterate(struct list_head *head, /* Returns 1 if okfn() needs to be executed by the caller, * -EPERM for NF_DROP, 0 otherwise. */ -int nf_hook_slow(int pf, unsigned int hook, struct sk_buff *skb, +int nf_hook_slow(u_int8_t pf, unsigned int hook, struct sk_buff *skb, struct net_device *indev, struct net_device *outdev, int (*okfn)(struct sk_buff *), diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 9d1830da8e8..6aaf64b5ded 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -665,7 +665,7 @@ resolve_normal_ct(struct sk_buff *skb, } unsigned int -nf_conntrack_in(int pf, unsigned int hooknum, struct sk_buff *skb) +nf_conntrack_in(u_int8_t pf, unsigned int hooknum, struct sk_buff *skb) { struct nf_conn *ct; enum ip_conntrack_info ctinfo; @@ -683,7 +683,7 @@ nf_conntrack_in(int pf, unsigned int hooknum, struct sk_buff *skb) } /* rcu_read_lock()ed by nf_hook_slow */ - l3proto = __nf_ct_l3proto_find((u_int16_t)pf); + l3proto = __nf_ct_l3proto_find(pf); ret = l3proto->get_l4proto(skb, skb_network_offset(skb), &dataoff, &protonum); if (ret <= 0) { @@ -693,7 +693,7 @@ nf_conntrack_in(int pf, unsigned int hooknum, struct sk_buff *skb) return -ret; } - l4proto = __nf_ct_l4proto_find((u_int16_t)pf, protonum); + l4proto = __nf_ct_l4proto_find(pf, protonum); /* It may be an special packet, error, unclean... * inverse of the return code tells to the netfilter diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index e8f0dead267..990fa12f2ee 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -241,7 +241,7 @@ struct nf_conntrack_expect *nf_ct_expect_alloc(struct nf_conn *me) EXPORT_SYMBOL_GPL(nf_ct_expect_alloc); void nf_ct_expect_init(struct nf_conntrack_expect *exp, unsigned int class, - int family, + u_int8_t family, const union nf_inet_addr *saddr, const union nf_inet_addr *daddr, u_int8_t proto, const __be16 *src, const __be16 *dst) diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c index 2f83c158934..5dc0478108a 100644 --- a/net/netfilter/nf_conntrack_h323_main.c +++ b/net/netfilter/nf_conntrack_h323_main.c @@ -709,7 +709,8 @@ static int expect_h245(struct sk_buff *skb, struct nf_conn *ct, /* If the calling party is on the same side of the forward-to party, * we don't need to track the second call */ static int callforward_do_filter(const union nf_inet_addr *src, - const union nf_inet_addr *dst, int family) + const union nf_inet_addr *dst, + u_int8_t family) { const struct nf_afinfo *afinfo; struct flowi fl1, fl2; diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c index e7866dd3cde..edc30358dc1 100644 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ b/net/netfilter/nf_conntrack_proto_dccp.c @@ -461,7 +461,7 @@ static u64 dccp_ack_seq(const struct dccp_hdr *dh) static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, - int pf, unsigned int hooknum) + u_int8_t pf, unsigned int hooknum) { enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); struct dccp_hdr _dh, *dh; @@ -546,7 +546,7 @@ static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb, } static int dccp_error(struct sk_buff *skb, unsigned int dataoff, - enum ip_conntrack_info *ctinfo, int pf, + enum ip_conntrack_info *ctinfo, u_int8_t pf, unsigned int hooknum) { struct dccp_hdr _dh, *dh; diff --git a/net/netfilter/nf_conntrack_proto_generic.c b/net/netfilter/nf_conntrack_proto_generic.c index e31b0e7bd0b..dbe680af85d 100644 --- a/net/netfilter/nf_conntrack_proto_generic.c +++ b/net/netfilter/nf_conntrack_proto_generic.c @@ -45,7 +45,7 @@ static int packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, - int pf, + u_int8_t pf, unsigned int hooknum) { nf_ct_refresh_acct(ct, ctinfo, skb, nf_ct_generic_timeout); diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c index 9bd03967fea..c5a78220fa3 100644 --- a/net/netfilter/nf_conntrack_proto_gre.c +++ b/net/netfilter/nf_conntrack_proto_gre.c @@ -219,7 +219,7 @@ static int gre_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, - int pf, + u_int8_t pf, unsigned int hooknum) { /* If we've seen traffic both ways, this is a GRE connection. diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index 30aa5b94a77..b5a90596d3f 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -287,7 +287,7 @@ static int sctp_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, - int pf, + u_int8_t pf, unsigned int hooknum) { enum sctp_conntrack new_state, old_state; diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 6f61261888e..539a8202025 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -486,7 +486,7 @@ static bool tcp_in_window(const struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, const struct tcphdr *tcph, - int pf) + u_int8_t pf) { struct ip_ct_tcp_state *sender = &state->seen[dir]; struct ip_ct_tcp_state *receiver = &state->seen[!dir]; @@ -749,7 +749,7 @@ static const u8 tcp_valid_flags[(TH_FIN|TH_SYN|TH_RST|TH_ACK|TH_URG) + 1] = static int tcp_error(struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info *ctinfo, - int pf, + u_int8_t pf, unsigned int hooknum) { const struct tcphdr *th; @@ -804,7 +804,7 @@ static int tcp_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, - int pf, + u_int8_t pf, unsigned int hooknum) { struct nf_conntrack_tuple *tuple; diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c index 8b21762e65d..2a965c4a0ea 100644 --- a/net/netfilter/nf_conntrack_proto_udp.c +++ b/net/netfilter/nf_conntrack_proto_udp.c @@ -66,7 +66,7 @@ static int udp_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, - int pf, + u_int8_t pf, unsigned int hooknum) { /* If we've seen traffic both ways, this is some kind of UDP @@ -91,7 +91,7 @@ static bool udp_new(struct nf_conn *ct, const struct sk_buff *skb, static int udp_error(struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info *ctinfo, - int pf, + u_int8_t pf, unsigned int hooknum) { unsigned int udplen = skb->len - dataoff; diff --git a/net/netfilter/nf_conntrack_proto_udplite.c b/net/netfilter/nf_conntrack_proto_udplite.c index 1fa62f3c24f..4fb6c8d83a8 100644 --- a/net/netfilter/nf_conntrack_proto_udplite.c +++ b/net/netfilter/nf_conntrack_proto_udplite.c @@ -65,7 +65,7 @@ static int udplite_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, - int pf, + u_int8_t pf, unsigned int hooknum) { /* If we've seen traffic both ways, this is some kind of UDP @@ -91,7 +91,7 @@ static bool udplite_new(struct nf_conn *ct, const struct sk_buff *skb, static int udplite_error(struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info *ctinfo, - int pf, + u_int8_t pf, unsigned int hooknum) { unsigned int udplen = skb->len - dataoff; diff --git a/net/netfilter/nf_internals.h b/net/netfilter/nf_internals.h index 196269c1e58..bf6609978af 100644 --- a/net/netfilter/nf_internals.h +++ b/net/netfilter/nf_internals.h @@ -15,7 +15,7 @@ /* core.c */ extern unsigned int nf_iterate(struct list_head *head, struct sk_buff *skb, - int hook, + unsigned int hook, const struct net_device *indev, const struct net_device *outdev, struct list_head **i, @@ -25,7 +25,7 @@ extern unsigned int nf_iterate(struct list_head *head, /* nf_queue.c */ extern int nf_queue(struct sk_buff *skb, struct list_head *elem, - int pf, unsigned int hook, + u_int8_t pf, unsigned int hook, struct net_device *indev, struct net_device *outdev, int (*okfn)(struct sk_buff *), diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c index 9fda6ee95a3..5c2f7332015 100644 --- a/net/netfilter/nf_log.c +++ b/net/netfilter/nf_log.c @@ -20,7 +20,7 @@ static DEFINE_MUTEX(nf_log_mutex); /* return EBUSY if somebody else is registered, EEXIST if the same logger * is registred, 0 on success. */ -int nf_log_register(int pf, const struct nf_logger *logger) +int nf_log_register(u_int8_t pf, const struct nf_logger *logger) { int ret; @@ -45,7 +45,7 @@ int nf_log_register(int pf, const struct nf_logger *logger) } EXPORT_SYMBOL(nf_log_register); -void nf_log_unregister_pf(int pf) +void nf_log_unregister_pf(u_int8_t pf) { if (pf >= NPROTO) return; @@ -73,7 +73,7 @@ void nf_log_unregister(const struct nf_logger *logger) } EXPORT_SYMBOL(nf_log_unregister); -void nf_log_packet(int pf, +void nf_log_packet(u_int8_t pf, unsigned int hooknum, const struct sk_buff *skb, const struct net_device *in, diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index 582ec3efc8a..f285086f629 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -22,7 +22,7 @@ static DEFINE_MUTEX(queue_handler_mutex); /* return EBUSY when somebody else is registered, return EEXIST if the * same handler is registered, return 0 in case of success. */ -int nf_register_queue_handler(int pf, const struct nf_queue_handler *qh) +int nf_register_queue_handler(u_int8_t pf, const struct nf_queue_handler *qh) { int ret; @@ -45,7 +45,7 @@ int nf_register_queue_handler(int pf, const struct nf_queue_handler *qh) EXPORT_SYMBOL(nf_register_queue_handler); /* The caller must flush their queue before this */ -int nf_unregister_queue_handler(int pf, const struct nf_queue_handler *qh) +int nf_unregister_queue_handler(u_int8_t pf, const struct nf_queue_handler *qh) { if (pf >= NPROTO) return -EINVAL; @@ -67,7 +67,7 @@ EXPORT_SYMBOL(nf_unregister_queue_handler); void nf_unregister_queue_handlers(const struct nf_queue_handler *qh) { - int pf; + u_int8_t pf; mutex_lock(&queue_handler_mutex); for (pf = 0; pf < NPROTO; pf++) { @@ -107,7 +107,7 @@ static void nf_queue_entry_release_refs(struct nf_queue_entry *entry) */ static int __nf_queue(struct sk_buff *skb, struct list_head *elem, - int pf, unsigned int hook, + u_int8_t pf, unsigned int hook, struct net_device *indev, struct net_device *outdev, int (*okfn)(struct sk_buff *), @@ -191,7 +191,7 @@ err: int nf_queue(struct sk_buff *skb, struct list_head *elem, - int pf, unsigned int hook, + u_int8_t pf, unsigned int hook, struct net_device *indev, struct net_device *outdev, int (*okfn)(struct sk_buff *), diff --git a/net/netfilter/nf_sockopt.c b/net/netfilter/nf_sockopt.c index 01489681fa9..f9b46de6a3d 100644 --- a/net/netfilter/nf_sockopt.c +++ b/net/netfilter/nf_sockopt.c @@ -60,7 +60,7 @@ void nf_unregister_sockopt(struct nf_sockopt_ops *reg) } EXPORT_SYMBOL(nf_unregister_sockopt); -static struct nf_sockopt_ops *nf_sockopt_find(struct sock *sk, int pf, +static struct nf_sockopt_ops *nf_sockopt_find(struct sock *sk, u_int8_t pf, int val, int get) { struct nf_sockopt_ops *ops; @@ -96,7 +96,7 @@ out: } /* Call get/setsockopt() */ -static int nf_sockopt(struct sock *sk, int pf, int val, +static int nf_sockopt(struct sock *sk, u_int8_t pf, int val, char __user *opt, int *len, int get) { struct nf_sockopt_ops *ops; @@ -115,21 +115,22 @@ static int nf_sockopt(struct sock *sk, int pf, int val, return ret; } -int nf_setsockopt(struct sock *sk, int pf, int val, char __user *opt, +int nf_setsockopt(struct sock *sk, u_int8_t pf, int val, char __user *opt, int len) { return nf_sockopt(sk, pf, val, opt, &len, 0); } EXPORT_SYMBOL(nf_setsockopt); -int nf_getsockopt(struct sock *sk, int pf, int val, char __user *opt, int *len) +int nf_getsockopt(struct sock *sk, u_int8_t pf, int val, char __user *opt, + int *len) { return nf_sockopt(sk, pf, val, opt, len, 1); } EXPORT_SYMBOL(nf_getsockopt); #ifdef CONFIG_COMPAT -static int compat_nf_sockopt(struct sock *sk, int pf, int val, +static int compat_nf_sockopt(struct sock *sk, u_int8_t pf, int val, char __user *opt, int *len, int get) { struct nf_sockopt_ops *ops; @@ -155,14 +156,14 @@ static int compat_nf_sockopt(struct sock *sk, int pf, int val, return ret; } -int compat_nf_setsockopt(struct sock *sk, int pf, +int compat_nf_setsockopt(struct sock *sk, u_int8_t pf, int val, char __user *opt, int len) { return compat_nf_sockopt(sk, pf, val, opt, &len, 0); } EXPORT_SYMBOL(compat_nf_setsockopt); -int compat_nf_getsockopt(struct sock *sk, int pf, +int compat_nf_getsockopt(struct sock *sk, u_int8_t pf, int val, char __user *opt, int *len) { return compat_nf_sockopt(sk, pf, val, opt, len, 1); diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index 9a35b57ab76..41e0105d382 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -359,7 +359,7 @@ static inline int __build_packet_message(struct nfulnl_instance *inst, const struct sk_buff *skb, unsigned int data_len, - unsigned int pf, + u_int8_t pf, unsigned int hooknum, const struct net_device *indev, const struct net_device *outdev, @@ -534,7 +534,7 @@ static struct nf_loginfo default_loginfo = { /* log handler for internal netfilter logging api */ static void -nfulnl_log_packet(unsigned int pf, +nfulnl_log_packet(u_int8_t pf, unsigned int hooknum, const struct sk_buff *skb, const struct net_device *in, diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 5d75cd86ebb..cf2f3e90cef 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -68,7 +68,8 @@ static const char *const xt_prefix[NPROTO] = { int xt_register_target(struct xt_target *target) { - int ret, af = target->family; + u_int8_t af = target->family; + int ret; ret = mutex_lock_interruptible(&xt[af].mutex); if (ret != 0) @@ -82,7 +83,7 @@ EXPORT_SYMBOL(xt_register_target); void xt_unregister_target(struct xt_target *target) { - int af = target->family; + u_int8_t af = target->family; mutex_lock(&xt[af].mutex); list_del(&target->list); @@ -123,7 +124,8 @@ EXPORT_SYMBOL(xt_unregister_targets); int xt_register_match(struct xt_match *match) { - int ret, af = match->family; + u_int8_t af = match->family; + int ret; ret = mutex_lock_interruptible(&xt[af].mutex); if (ret != 0) @@ -139,7 +141,7 @@ EXPORT_SYMBOL(xt_register_match); void xt_unregister_match(struct xt_match *match) { - int af = match->family; + u_int8_t af = match->family; mutex_lock(&xt[af].mutex); list_del(&match->list); @@ -185,7 +187,7 @@ EXPORT_SYMBOL(xt_unregister_matches); */ /* Find match, grabs ref. Returns ERR_PTR() on error. */ -struct xt_match *xt_find_match(int af, const char *name, u8 revision) +struct xt_match *xt_find_match(u8 af, const char *name, u8 revision) { struct xt_match *m; int err = 0; @@ -210,7 +212,7 @@ struct xt_match *xt_find_match(int af, const char *name, u8 revision) EXPORT_SYMBOL(xt_find_match); /* Find target, grabs ref. Returns ERR_PTR() on error. */ -struct xt_target *xt_find_target(int af, const char *name, u8 revision) +struct xt_target *xt_find_target(u8 af, const char *name, u8 revision) { struct xt_target *t; int err = 0; @@ -234,7 +236,7 @@ struct xt_target *xt_find_target(int af, const char *name, u8 revision) } EXPORT_SYMBOL(xt_find_target); -struct xt_target *xt_request_find_target(int af, const char *name, u8 revision) +struct xt_target *xt_request_find_target(u8 af, const char *name, u8 revision) { struct xt_target *target; @@ -246,7 +248,7 @@ struct xt_target *xt_request_find_target(int af, const char *name, u8 revision) } EXPORT_SYMBOL_GPL(xt_request_find_target); -static int match_revfn(int af, const char *name, u8 revision, int *bestp) +static int match_revfn(u8 af, const char *name, u8 revision, int *bestp) { const struct xt_match *m; int have_rev = 0; @@ -262,7 +264,7 @@ static int match_revfn(int af, const char *name, u8 revision, int *bestp) return have_rev; } -static int target_revfn(int af, const char *name, u8 revision, int *bestp) +static int target_revfn(u8 af, const char *name, u8 revision, int *bestp) { const struct xt_target *t; int have_rev = 0; @@ -279,7 +281,7 @@ static int target_revfn(int af, const char *name, u8 revision, int *bestp) } /* Returns true or false (if no such extension at all) */ -int xt_find_revision(int af, const char *name, u8 revision, int target, +int xt_find_revision(u8 af, const char *name, u8 revision, int target, int *err) { int have_rev, best = -1; @@ -337,7 +339,7 @@ int xt_check_match(const struct xt_match *match, unsigned short family, EXPORT_SYMBOL_GPL(xt_check_match); #ifdef CONFIG_COMPAT -int xt_compat_add_offset(int af, unsigned int offset, short delta) +int xt_compat_add_offset(u_int8_t af, unsigned int offset, short delta) { struct compat_delta *tmp; @@ -359,7 +361,7 @@ int xt_compat_add_offset(int af, unsigned int offset, short delta) } EXPORT_SYMBOL_GPL(xt_compat_add_offset); -void xt_compat_flush_offsets(int af) +void xt_compat_flush_offsets(u_int8_t af) { struct compat_delta *tmp, *next; @@ -373,7 +375,7 @@ void xt_compat_flush_offsets(int af) } EXPORT_SYMBOL_GPL(xt_compat_flush_offsets); -short xt_compat_calc_jump(int af, unsigned int offset) +short xt_compat_calc_jump(u_int8_t af, unsigned int offset) { struct compat_delta *tmp; short delta; @@ -590,7 +592,8 @@ void xt_free_table_info(struct xt_table_info *info) EXPORT_SYMBOL(xt_free_table_info); /* Find table by name, grabs mutex & ref. Returns ERR_PTR() on error. */ -struct xt_table *xt_find_table_lock(struct net *net, int af, const char *name) +struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af, + const char *name) { struct xt_table *t; @@ -612,13 +615,13 @@ void xt_table_unlock(struct xt_table *table) EXPORT_SYMBOL_GPL(xt_table_unlock); #ifdef CONFIG_COMPAT -void xt_compat_lock(int af) +void xt_compat_lock(u_int8_t af) { mutex_lock(&xt[af].compat_mutex); } EXPORT_SYMBOL_GPL(xt_compat_lock); -void xt_compat_unlock(int af) +void xt_compat_unlock(u_int8_t af) { mutex_unlock(&xt[af].compat_mutex); } @@ -722,13 +725,13 @@ EXPORT_SYMBOL_GPL(xt_unregister_table); #ifdef CONFIG_PROC_FS struct xt_names_priv { struct seq_net_private p; - int af; + u_int8_t af; }; static void *xt_table_seq_start(struct seq_file *seq, loff_t *pos) { struct xt_names_priv *priv = seq->private; struct net *net = seq_file_net(seq); - int af = priv->af; + u_int8_t af = priv->af; mutex_lock(&xt[af].mutex); return seq_list_start(&net->xt.tables[af], *pos); @@ -738,7 +741,7 @@ static void *xt_table_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct xt_names_priv *priv = seq->private; struct net *net = seq_file_net(seq); - int af = priv->af; + u_int8_t af = priv->af; return seq_list_next(v, &net->xt.tables[af], pos); } @@ -746,7 +749,7 @@ static void *xt_table_seq_next(struct seq_file *seq, void *v, loff_t *pos) static void xt_table_seq_stop(struct seq_file *seq, void *v) { struct xt_names_priv *priv = seq->private; - int af = priv->af; + u_int8_t af = priv->af; mutex_unlock(&xt[af].mutex); } @@ -922,7 +925,7 @@ static const struct file_operations xt_target_ops = { #endif /* CONFIG_PROC_FS */ -int xt_proto_init(struct net *net, int af) +int xt_proto_init(struct net *net, u_int8_t af) { #ifdef CONFIG_PROC_FS char buf[XT_FUNCTION_MAXNAMELEN]; @@ -974,7 +977,7 @@ out: } EXPORT_SYMBOL_GPL(xt_proto_init); -void xt_proto_fini(struct net *net, int af) +void xt_proto_fini(struct net *net, u_int8_t af) { #ifdef CONFIG_PROC_FS char buf[XT_FUNCTION_MAXNAMELEN]; diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c index 70907f6baac..1655e2cf25c 100644 --- a/net/netfilter/xt_connlimit.c +++ b/net/netfilter/xt_connlimit.c @@ -82,7 +82,7 @@ static inline bool already_closed(const struct nf_conn *conn) static inline unsigned int same_source_net(const union nf_inet_addr *addr, const union nf_inet_addr *mask, - const union nf_inet_addr *u3, unsigned int family) + const union nf_inet_addr *u3, u_int8_t family) { if (family == AF_INET) { return (addr->ip & mask->ip) == (u3->ip & mask->ip); diff --git a/net/netfilter/xt_conntrack.c b/net/netfilter/xt_conntrack.c index d61412f58ef..28a42a3fbff 100644 --- a/net/netfilter/xt_conntrack.c +++ b/net/netfilter/xt_conntrack.c @@ -133,7 +133,7 @@ conntrack_addrcmp(const union nf_inet_addr *kaddr, static inline bool conntrack_mt_origsrc(const struct nf_conn *ct, const struct xt_conntrack_mtinfo1 *info, - unsigned int family) + u_int8_t family) { return conntrack_addrcmp(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3, &info->origsrc_addr, &info->origsrc_mask, family); @@ -142,7 +142,7 @@ conntrack_mt_origsrc(const struct nf_conn *ct, static inline bool conntrack_mt_origdst(const struct nf_conn *ct, const struct xt_conntrack_mtinfo1 *info, - unsigned int family) + u_int8_t family) { return conntrack_addrcmp(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3, &info->origdst_addr, &info->origdst_mask, family); @@ -151,7 +151,7 @@ conntrack_mt_origdst(const struct nf_conn *ct, static inline bool conntrack_mt_replsrc(const struct nf_conn *ct, const struct xt_conntrack_mtinfo1 *info, - unsigned int family) + u_int8_t family) { return conntrack_addrcmp(&ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3, &info->replsrc_addr, &info->replsrc_mask, family); @@ -160,7 +160,7 @@ conntrack_mt_replsrc(const struct nf_conn *ct, static inline bool conntrack_mt_repldst(const struct nf_conn *ct, const struct xt_conntrack_mtinfo1 *info, - unsigned int family) + u_int8_t family) { return conntrack_addrcmp(&ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3, &info->repldst_addr, &info->repldst_mask, family); diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index d9418a26781..0c9268fd2e1 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -80,7 +80,7 @@ struct dsthash_ent { struct xt_hashlimit_htable { struct hlist_node node; /* global list of all htables */ atomic_t use; - int family; + u_int8_t family; struct hashlimit_cfg1 cfg; /* config */ @@ -185,7 +185,7 @@ dsthash_free(struct xt_hashlimit_htable *ht, struct dsthash_ent *ent) } static void htable_gc(unsigned long htlong); -static int htable_create_v0(struct xt_hashlimit_info *minfo, int family) +static int htable_create_v0(struct xt_hashlimit_info *minfo, u_int8_t family) { struct xt_hashlimit_htable *hinfo; unsigned int size; @@ -258,8 +258,7 @@ static int htable_create_v0(struct xt_hashlimit_info *minfo, int family) return 0; } -static int htable_create(struct xt_hashlimit_mtinfo1 *minfo, - unsigned int family) +static int htable_create(struct xt_hashlimit_mtinfo1 *minfo, u_int8_t family) { struct xt_hashlimit_htable *hinfo; unsigned int size; @@ -378,7 +377,7 @@ static void htable_destroy(struct xt_hashlimit_htable *hinfo) } static struct xt_hashlimit_htable *htable_find_get(const char *name, - int family) + u_int8_t family) { struct xt_hashlimit_htable *hinfo; struct hlist_node *pos; @@ -901,7 +900,7 @@ static void dl_seq_stop(struct seq_file *s, void *v) spin_unlock_bh(&htable->lock); } -static int dl_seq_real_show(struct dsthash_ent *ent, int family, +static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family, struct seq_file *s) { /* recalculate to show accurate numbers */ -- cgit v1.2.3 From e948b20a71a06a740c925d6ea22b59b4e17cfa0c Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Wed, 8 Oct 2008 11:35:00 +0200 Subject: netfilter: rename ipt_recent to xt_recent Like with other modules (such as ipt_state), ipt_recent.h is changed to forward definitions to (IOW include) xt_recent.h, and xt_recent.c is changed to use the new constant names. Signed-off-by: Jan Engelhardt Signed-off-by: Patrick McHardy --- net/netfilter/Kconfig | 11 + net/netfilter/Makefile | 1 + net/netfilter/xt_recent.c | 502 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 514 insertions(+) create mode 100644 net/netfilter/xt_recent.c (limited to 'net/netfilter') diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index ee898e74808..ccc78b07a1a 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -732,6 +732,17 @@ config NETFILTER_XT_MATCH_REALM If you want to compile it as a module, say M here and read . If unsure, say `N'. +config NETFILTER_XT_MATCH_RECENT + tristate '"recent" match support' + depends on NETFILTER_XTABLES + depends on NETFILTER_ADVANCED + ---help--- + This match is used for creating one or many lists of recently + used addresses and then matching against that/those list(s). + + Short options are available by using 'iptables -m recent -h' + Official Website: + config NETFILTER_XT_MATCH_SCTP tristate '"sctp" protocol match support (EXPERIMENTAL)' depends on NETFILTER_XTABLES && EXPERIMENTAL diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 3bd2cc556ae..f101cf61e6f 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -76,6 +76,7 @@ obj-$(CONFIG_NETFILTER_XT_MATCH_POLICY) += xt_policy.o obj-$(CONFIG_NETFILTER_XT_MATCH_QUOTA) += xt_quota.o obj-$(CONFIG_NETFILTER_XT_MATCH_RATEEST) += xt_rateest.o obj-$(CONFIG_NETFILTER_XT_MATCH_REALM) += xt_realm.o +obj-$(CONFIG_NETFILTER_XT_MATCH_RECENT) += xt_recent.o obj-$(CONFIG_NETFILTER_XT_MATCH_SCTP) += xt_sctp.o obj-$(CONFIG_NETFILTER_XT_MATCH_STATE) += xt_state.o obj-$(CONFIG_NETFILTER_XT_MATCH_STATISTIC) += xt_statistic.o diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c new file mode 100644 index 00000000000..422c0e4d66b --- /dev/null +++ b/net/netfilter/xt_recent.c @@ -0,0 +1,502 @@ +/* + * Copyright (c) 2006 Patrick McHardy + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This is a replacement of the old ipt_recent module, which carried the + * following copyright notice: + * + * Author: Stephen Frost + * Copyright 2002-2003, Stephen Frost, 2.5.x port by laforge@netfilter.org + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +MODULE_AUTHOR("Patrick McHardy "); +MODULE_DESCRIPTION("Xtables: \"recently-seen\" host matching for IPv4"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("ipt_recent"); + +static unsigned int ip_list_tot = 100; +static unsigned int ip_pkt_list_tot = 20; +static unsigned int ip_list_hash_size = 0; +static unsigned int ip_list_perms = 0644; +static unsigned int ip_list_uid = 0; +static unsigned int ip_list_gid = 0; +module_param(ip_list_tot, uint, 0400); +module_param(ip_pkt_list_tot, uint, 0400); +module_param(ip_list_hash_size, uint, 0400); +module_param(ip_list_perms, uint, 0400); +module_param(ip_list_uid, uint, 0400); +module_param(ip_list_gid, uint, 0400); +MODULE_PARM_DESC(ip_list_tot, "number of IPs to remember per list"); +MODULE_PARM_DESC(ip_pkt_list_tot, "number of packets per IP to remember (max. 255)"); +MODULE_PARM_DESC(ip_list_hash_size, "size of hash table used to look up IPs"); +MODULE_PARM_DESC(ip_list_perms, "permissions on /proc/net/ipt_recent/* files"); +MODULE_PARM_DESC(ip_list_uid,"owner of /proc/net/ipt_recent/* files"); +MODULE_PARM_DESC(ip_list_gid,"owning group of /proc/net/ipt_recent/* files"); + +struct recent_entry { + struct list_head list; + struct list_head lru_list; + __be32 addr; + u_int8_t ttl; + u_int8_t index; + u_int16_t nstamps; + unsigned long stamps[0]; +}; + +struct recent_table { + struct list_head list; + char name[XT_RECENT_NAME_LEN]; +#ifdef CONFIG_PROC_FS + struct proc_dir_entry *proc; +#endif + unsigned int refcnt; + unsigned int entries; + struct list_head lru_list; + struct list_head iphash[0]; +}; + +static LIST_HEAD(tables); +static DEFINE_SPINLOCK(recent_lock); +static DEFINE_MUTEX(recent_mutex); + +#ifdef CONFIG_PROC_FS +static struct proc_dir_entry *proc_dir; +static const struct file_operations recent_fops; +#endif + +static u_int32_t hash_rnd; +static int hash_rnd_initted; + +static unsigned int recent_entry_hash(__be32 addr) +{ + if (!hash_rnd_initted) { + get_random_bytes(&hash_rnd, 4); + hash_rnd_initted = 1; + } + return jhash_1word((__force u32)addr, hash_rnd) & (ip_list_hash_size - 1); +} + +static struct recent_entry * +recent_entry_lookup(const struct recent_table *table, __be32 addr, u_int8_t ttl) +{ + struct recent_entry *e; + unsigned int h; + + h = recent_entry_hash(addr); + list_for_each_entry(e, &table->iphash[h], list) + if (e->addr == addr && (ttl == e->ttl || !ttl || !e->ttl)) + return e; + return NULL; +} + +static void recent_entry_remove(struct recent_table *t, struct recent_entry *e) +{ + list_del(&e->list); + list_del(&e->lru_list); + kfree(e); + t->entries--; +} + +static struct recent_entry * +recent_entry_init(struct recent_table *t, __be32 addr, u_int8_t ttl) +{ + struct recent_entry *e; + + if (t->entries >= ip_list_tot) { + e = list_entry(t->lru_list.next, struct recent_entry, lru_list); + recent_entry_remove(t, e); + } + e = kmalloc(sizeof(*e) + sizeof(e->stamps[0]) * ip_pkt_list_tot, + GFP_ATOMIC); + if (e == NULL) + return NULL; + e->addr = addr; + e->ttl = ttl; + e->stamps[0] = jiffies; + e->nstamps = 1; + e->index = 1; + list_add_tail(&e->list, &t->iphash[recent_entry_hash(addr)]); + list_add_tail(&e->lru_list, &t->lru_list); + t->entries++; + return e; +} + +static void recent_entry_update(struct recent_table *t, struct recent_entry *e) +{ + e->stamps[e->index++] = jiffies; + if (e->index > e->nstamps) + e->nstamps = e->index; + e->index %= ip_pkt_list_tot; + list_move_tail(&e->lru_list, &t->lru_list); +} + +static struct recent_table *recent_table_lookup(const char *name) +{ + struct recent_table *t; + + list_for_each_entry(t, &tables, list) + if (!strcmp(t->name, name)) + return t; + return NULL; +} + +static void recent_table_flush(struct recent_table *t) +{ + struct recent_entry *e, *next; + unsigned int i; + + for (i = 0; i < ip_list_hash_size; i++) + list_for_each_entry_safe(e, next, &t->iphash[i], list) + recent_entry_remove(t, e); +} + +static bool +recent_mt(const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const struct xt_match *match, + const void *matchinfo, int offset, unsigned int protoff, + bool *hotdrop) +{ + const struct xt_recent_mtinfo *info = matchinfo; + struct recent_table *t; + struct recent_entry *e; + __be32 addr; + u_int8_t ttl; + bool ret = info->invert; + + if (info->side == XT_RECENT_DEST) + addr = ip_hdr(skb)->daddr; + else + addr = ip_hdr(skb)->saddr; + + ttl = ip_hdr(skb)->ttl; + /* use TTL as seen before forwarding */ + if (out && !skb->sk) + ttl++; + + spin_lock_bh(&recent_lock); + t = recent_table_lookup(info->name); + e = recent_entry_lookup(t, addr, + info->check_set & XT_RECENT_TTL ? ttl : 0); + if (e == NULL) { + if (!(info->check_set & XT_RECENT_SET)) + goto out; + e = recent_entry_init(t, addr, ttl); + if (e == NULL) + *hotdrop = true; + ret = !ret; + goto out; + } + + if (info->check_set & XT_RECENT_SET) + ret = !ret; + else if (info->check_set & XT_RECENT_REMOVE) { + recent_entry_remove(t, e); + ret = !ret; + } else if (info->check_set & (XT_RECENT_CHECK | XT_RECENT_UPDATE)) { + unsigned long time = jiffies - info->seconds * HZ; + unsigned int i, hits = 0; + + for (i = 0; i < e->nstamps; i++) { + if (info->seconds && time_after(time, e->stamps[i])) + continue; + if (++hits >= info->hit_count) { + ret = !ret; + break; + } + } + } + + if (info->check_set & XT_RECENT_SET || + (info->check_set & XT_RECENT_UPDATE && ret)) { + recent_entry_update(t, e); + e->ttl = ttl; + } +out: + spin_unlock_bh(&recent_lock); + return ret; +} + +static bool +recent_mt_check(const char *tablename, const void *ip, + const struct xt_match *match, void *matchinfo, + unsigned int hook_mask) +{ + const struct xt_recent_mtinfo *info = matchinfo; + struct recent_table *t; + unsigned i; + bool ret = false; + + if (hweight8(info->check_set & + (XT_RECENT_SET | XT_RECENT_REMOVE | + XT_RECENT_CHECK | XT_RECENT_UPDATE)) != 1) + return false; + if ((info->check_set & (XT_RECENT_SET | XT_RECENT_REMOVE)) && + (info->seconds || info->hit_count)) + return false; + if (info->hit_count > ip_pkt_list_tot) + return false; + if (info->name[0] == '\0' || + strnlen(info->name, XT_RECENT_NAME_LEN) == XT_RECENT_NAME_LEN) + return false; + + mutex_lock(&recent_mutex); + t = recent_table_lookup(info->name); + if (t != NULL) { + t->refcnt++; + ret = true; + goto out; + } + + t = kzalloc(sizeof(*t) + sizeof(t->iphash[0]) * ip_list_hash_size, + GFP_KERNEL); + if (t == NULL) + goto out; + t->refcnt = 1; + strcpy(t->name, info->name); + INIT_LIST_HEAD(&t->lru_list); + for (i = 0; i < ip_list_hash_size; i++) + INIT_LIST_HEAD(&t->iphash[i]); +#ifdef CONFIG_PROC_FS + t->proc = proc_create(t->name, ip_list_perms, proc_dir, &recent_fops); + if (t->proc == NULL) { + kfree(t); + goto out; + } + t->proc->uid = ip_list_uid; + t->proc->gid = ip_list_gid; + t->proc->data = t; +#endif + spin_lock_bh(&recent_lock); + list_add_tail(&t->list, &tables); + spin_unlock_bh(&recent_lock); + ret = true; +out: + mutex_unlock(&recent_mutex); + return ret; +} + +static void recent_mt_destroy(const struct xt_match *match, void *matchinfo) +{ + const struct xt_recent_mtinfo *info = matchinfo; + struct recent_table *t; + + mutex_lock(&recent_mutex); + t = recent_table_lookup(info->name); + if (--t->refcnt == 0) { + spin_lock_bh(&recent_lock); + list_del(&t->list); + spin_unlock_bh(&recent_lock); +#ifdef CONFIG_PROC_FS + remove_proc_entry(t->name, proc_dir); +#endif + recent_table_flush(t); + kfree(t); + } + mutex_unlock(&recent_mutex); +} + +#ifdef CONFIG_PROC_FS +struct recent_iter_state { + struct recent_table *table; + unsigned int bucket; +}; + +static void *recent_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(recent_lock) +{ + struct recent_iter_state *st = seq->private; + const struct recent_table *t = st->table; + struct recent_entry *e; + loff_t p = *pos; + + spin_lock_bh(&recent_lock); + + for (st->bucket = 0; st->bucket < ip_list_hash_size; st->bucket++) + list_for_each_entry(e, &t->iphash[st->bucket], list) + if (p-- == 0) + return e; + return NULL; +} + +static void *recent_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct recent_iter_state *st = seq->private; + const struct recent_table *t = st->table; + struct recent_entry *e = v; + struct list_head *head = e->list.next; + + while (head == &t->iphash[st->bucket]) { + if (++st->bucket >= ip_list_hash_size) + return NULL; + head = t->iphash[st->bucket].next; + } + (*pos)++; + return list_entry(head, struct recent_entry, list); +} + +static void recent_seq_stop(struct seq_file *s, void *v) + __releases(recent_lock) +{ + spin_unlock_bh(&recent_lock); +} + +static int recent_seq_show(struct seq_file *seq, void *v) +{ + const struct recent_entry *e = v; + unsigned int i; + + i = (e->index - 1) % ip_pkt_list_tot; + seq_printf(seq, "src=%u.%u.%u.%u ttl: %u last_seen: %lu oldest_pkt: %u", + NIPQUAD(e->addr), e->ttl, e->stamps[i], e->index); + for (i = 0; i < e->nstamps; i++) + seq_printf(seq, "%s %lu", i ? "," : "", e->stamps[i]); + seq_printf(seq, "\n"); + return 0; +} + +static const struct seq_operations recent_seq_ops = { + .start = recent_seq_start, + .next = recent_seq_next, + .stop = recent_seq_stop, + .show = recent_seq_show, +}; + +static int recent_seq_open(struct inode *inode, struct file *file) +{ + struct proc_dir_entry *pde = PDE(inode); + struct recent_iter_state *st; + + st = __seq_open_private(file, &recent_seq_ops, sizeof(*st)); + if (st == NULL) + return -ENOMEM; + + st->table = pde->data; + return 0; +} + +static ssize_t recent_proc_write(struct file *file, const char __user *input, + size_t size, loff_t *loff) +{ + const struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode); + struct recent_table *t = pde->data; + struct recent_entry *e; + char buf[sizeof("+255.255.255.255")], *c = buf; + __be32 addr; + int add; + + if (size > sizeof(buf)) + size = sizeof(buf); + if (copy_from_user(buf, input, size)) + return -EFAULT; + while (isspace(*c)) + c++; + + if (size - (c - buf) < 5) + return c - buf; + if (!strncmp(c, "clear", 5)) { + c += 5; + spin_lock_bh(&recent_lock); + recent_table_flush(t); + spin_unlock_bh(&recent_lock); + return c - buf; + } + + switch (*c) { + case '-': + add = 0; + c++; + break; + case '+': + c++; + default: + add = 1; + break; + } + addr = in_aton(c); + + spin_lock_bh(&recent_lock); + e = recent_entry_lookup(t, addr, 0); + if (e == NULL) { + if (add) + recent_entry_init(t, addr, 0); + } else { + if (add) + recent_entry_update(t, e); + else + recent_entry_remove(t, e); + } + spin_unlock_bh(&recent_lock); + return size; +} + +static const struct file_operations recent_fops = { + .open = recent_seq_open, + .read = seq_read, + .write = recent_proc_write, + .release = seq_release_private, + .owner = THIS_MODULE, +}; +#endif /* CONFIG_PROC_FS */ + +static struct xt_match recent_mt_reg __read_mostly = { + .name = "recent", + .family = AF_INET, + .match = recent_mt, + .matchsize = sizeof(struct xt_recent_mtinfo), + .checkentry = recent_mt_check, + .destroy = recent_mt_destroy, + .me = THIS_MODULE, +}; + +static int __init recent_mt_init(void) +{ + int err; + + if (!ip_list_tot || !ip_pkt_list_tot || ip_pkt_list_tot > 255) + return -EINVAL; + ip_list_hash_size = 1 << fls(ip_list_tot); + + err = xt_register_match(&recent_mt_reg); +#ifdef CONFIG_PROC_FS + if (err) + return err; + proc_dir = proc_mkdir("ipt_recent", init_net.proc_net); + if (proc_dir == NULL) { + xt_unregister_match(&recent_mt_reg); + err = -ENOMEM; + } +#endif + return err; +} + +static void __exit recent_mt_exit(void) +{ + BUG_ON(!list_empty(&tables)); + xt_unregister_match(&recent_mt_reg); +#ifdef CONFIG_PROC_FS + remove_proc_entry("ipt_recent", init_net.proc_net); +#endif +} + +module_init(recent_mt_init); +module_exit(recent_mt_exit); -- cgit v1.2.3 From 079aa88fe7172b7650c7cf2c0bc01662bafea236 Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Wed, 8 Oct 2008 11:35:00 +0200 Subject: netfilter: xt_recent: IPv6 support This updates xt_recent to support the IPv6 address family. The new /proc/net/xt_recent directory must be used for this. The old proc interface can also be configured out. Signed-off-by: Jan Engelhardt Signed-off-by: Patrick McHardy --- net/netfilter/Kconfig | 7 ++ net/netfilter/xt_recent.c | 300 +++++++++++++++++++++++++++++++++++++--------- 2 files changed, 253 insertions(+), 54 deletions(-) (limited to 'net/netfilter') diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index ccc78b07a1a..4a464857f21 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -743,6 +743,13 @@ config NETFILTER_XT_MATCH_RECENT Short options are available by using 'iptables -m recent -h' Official Website: +config NETFILTER_XT_MATCH_RECENT_PROC_COMPAT + bool 'Enable obsolete /proc/net/ipt_recent' + depends on NETFILTER_XT_MATCH_RECENT && PROC_FS + ---help--- + This option enables the old /proc/net/ipt_recent interface, + which has been obsoleted by /proc/net/xt_recent. + config NETFILTER_XT_MATCH_SCTP tristate '"sctp" protocol match support (EXPERIMENTAL)' depends on NETFILTER_XTABLES && EXPERIMENTAL diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c index 422c0e4d66b..adc2e2f1b09 100644 --- a/net/netfilter/xt_recent.c +++ b/net/netfilter/xt_recent.c @@ -1,5 +1,6 @@ /* * Copyright (c) 2006 Patrick McHardy + * Copyright © CC Computer Consultants GmbH, 2007 - 2008 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -13,6 +14,8 @@ */ #include #include +#include +#include #include #include #include @@ -30,9 +33,11 @@ #include MODULE_AUTHOR("Patrick McHardy "); +MODULE_AUTHOR("Jan Engelhardt "); MODULE_DESCRIPTION("Xtables: \"recently-seen\" host matching for IPv4"); MODULE_LICENSE("GPL"); MODULE_ALIAS("ipt_recent"); +MODULE_ALIAS("ip6t_recent"); static unsigned int ip_list_tot = 100; static unsigned int ip_pkt_list_tot = 20; @@ -49,14 +54,15 @@ module_param(ip_list_gid, uint, 0400); MODULE_PARM_DESC(ip_list_tot, "number of IPs to remember per list"); MODULE_PARM_DESC(ip_pkt_list_tot, "number of packets per IP to remember (max. 255)"); MODULE_PARM_DESC(ip_list_hash_size, "size of hash table used to look up IPs"); -MODULE_PARM_DESC(ip_list_perms, "permissions on /proc/net/ipt_recent/* files"); -MODULE_PARM_DESC(ip_list_uid,"owner of /proc/net/ipt_recent/* files"); -MODULE_PARM_DESC(ip_list_gid,"owning group of /proc/net/ipt_recent/* files"); +MODULE_PARM_DESC(ip_list_perms, "permissions on /proc/net/xt_recent/* files"); +MODULE_PARM_DESC(ip_list_uid,"owner of /proc/net/xt_recent/* files"); +MODULE_PARM_DESC(ip_list_gid,"owning group of /proc/net/xt_recent/* files"); struct recent_entry { struct list_head list; struct list_head lru_list; - __be32 addr; + union nf_inet_addr addr; + u_int16_t family; u_int8_t ttl; u_int8_t index; u_int16_t nstamps; @@ -67,7 +73,7 @@ struct recent_table { struct list_head list; char name[XT_RECENT_NAME_LEN]; #ifdef CONFIG_PROC_FS - struct proc_dir_entry *proc; + struct proc_dir_entry *proc_old, *proc; #endif unsigned int refcnt; unsigned int entries; @@ -80,31 +86,53 @@ static DEFINE_SPINLOCK(recent_lock); static DEFINE_MUTEX(recent_mutex); #ifdef CONFIG_PROC_FS -static struct proc_dir_entry *proc_dir; -static const struct file_operations recent_fops; +#ifdef CONFIG_NETFILTER_XT_MATCH_RECENT_PROC_COMPAT +static struct proc_dir_entry *proc_old_dir; +#endif +static struct proc_dir_entry *recent_proc_dir; +static const struct file_operations recent_old_fops, recent_mt_fops; #endif static u_int32_t hash_rnd; -static int hash_rnd_initted; +static bool hash_rnd_initted; + +static unsigned int recent_entry_hash4(const union nf_inet_addr *addr) +{ + if (!hash_rnd_initted) { + get_random_bytes(&hash_rnd, sizeof(hash_rnd)); + hash_rnd_initted = true; + } + return jhash_1word((__force u32)addr->ip, hash_rnd) & + (ip_list_hash_size - 1); +} -static unsigned int recent_entry_hash(__be32 addr) +static unsigned int recent_entry_hash6(const union nf_inet_addr *addr) { if (!hash_rnd_initted) { - get_random_bytes(&hash_rnd, 4); - hash_rnd_initted = 1; + get_random_bytes(&hash_rnd, sizeof(hash_rnd)); + hash_rnd_initted = true; } - return jhash_1word((__force u32)addr, hash_rnd) & (ip_list_hash_size - 1); + return jhash2((u32 *)addr->ip6, ARRAY_SIZE(addr->ip6), hash_rnd) & + (ip_list_hash_size - 1); } static struct recent_entry * -recent_entry_lookup(const struct recent_table *table, __be32 addr, u_int8_t ttl) +recent_entry_lookup(const struct recent_table *table, + const union nf_inet_addr *addrp, u_int16_t family, + u_int8_t ttl) { struct recent_entry *e; unsigned int h; - h = recent_entry_hash(addr); + if (family == AF_INET) + h = recent_entry_hash4(addrp); + else + h = recent_entry_hash6(addrp); + list_for_each_entry(e, &table->iphash[h], list) - if (e->addr == addr && (ttl == e->ttl || !ttl || !e->ttl)) + if (e->family == family && + memcmp(&e->addr, addrp, sizeof(e->addr)) == 0 && + (ttl == e->ttl || ttl == 0 || e->ttl == 0)) return e; return NULL; } @@ -118,7 +146,8 @@ static void recent_entry_remove(struct recent_table *t, struct recent_entry *e) } static struct recent_entry * -recent_entry_init(struct recent_table *t, __be32 addr, u_int8_t ttl) +recent_entry_init(struct recent_table *t, const union nf_inet_addr *addr, + u_int16_t family, u_int8_t ttl) { struct recent_entry *e; @@ -130,12 +159,16 @@ recent_entry_init(struct recent_table *t, __be32 addr, u_int8_t ttl) GFP_ATOMIC); if (e == NULL) return NULL; - e->addr = addr; + memcpy(&e->addr, addr, sizeof(e->addr)); e->ttl = ttl; e->stamps[0] = jiffies; e->nstamps = 1; e->index = 1; - list_add_tail(&e->list, &t->iphash[recent_entry_hash(addr)]); + e->family = family; + if (family == AF_INET) + list_add_tail(&e->list, &t->iphash[recent_entry_hash4(addr)]); + else + list_add_tail(&e->list, &t->iphash[recent_entry_hash6(addr)]); list_add_tail(&e->lru_list, &t->lru_list); t->entries++; return e; @@ -179,28 +212,42 @@ recent_mt(const struct sk_buff *skb, const struct net_device *in, const struct xt_recent_mtinfo *info = matchinfo; struct recent_table *t; struct recent_entry *e; - __be32 addr; + union nf_inet_addr addr = {}; u_int8_t ttl; bool ret = info->invert; - if (info->side == XT_RECENT_DEST) - addr = ip_hdr(skb)->daddr; - else - addr = ip_hdr(skb)->saddr; + if (match->family == AF_INET) { + const struct iphdr *iph = ip_hdr(skb); + + if (info->side == XT_RECENT_DEST) + addr.ip = iph->daddr; + else + addr.ip = iph->saddr; + + ttl = iph->ttl; + } else { + const struct ipv6hdr *iph = ipv6_hdr(skb); + + if (info->side == XT_RECENT_DEST) + memcpy(&addr.in6, &iph->daddr, sizeof(addr.in6)); + else + memcpy(&addr.in6, &iph->saddr, sizeof(addr.in6)); + + ttl = iph->hop_limit; + } - ttl = ip_hdr(skb)->ttl; /* use TTL as seen before forwarding */ if (out && !skb->sk) ttl++; spin_lock_bh(&recent_lock); t = recent_table_lookup(info->name); - e = recent_entry_lookup(t, addr, - info->check_set & XT_RECENT_TTL ? ttl : 0); + e = recent_entry_lookup(t, &addr, match->family, + (info->check_set & XT_RECENT_TTL) ? ttl : 0); if (e == NULL) { if (!(info->check_set & XT_RECENT_SET)) goto out; - e = recent_entry_init(t, addr, ttl); + e = recent_entry_init(t, &addr, match->family, ttl); if (e == NULL) *hotdrop = true; ret = !ret; @@ -277,11 +324,24 @@ recent_mt_check(const char *tablename, const void *ip, for (i = 0; i < ip_list_hash_size; i++) INIT_LIST_HEAD(&t->iphash[i]); #ifdef CONFIG_PROC_FS - t->proc = proc_create(t->name, ip_list_perms, proc_dir, &recent_fops); + t->proc = proc_create(t->name, ip_list_perms, recent_proc_dir, + &recent_mt_fops); if (t->proc == NULL) { kfree(t); goto out; } +#ifdef CONFIG_NETFILTER_XT_MATCH_RECENT_PROC_COMPAT + t->proc_old = proc_create(t->name, ip_list_perms, proc_old_dir, + &recent_old_fops); + if (t->proc_old == NULL) { + remove_proc_entry(t->name, proc_old_dir); + kfree(t); + goto out; + } + t->proc_old->uid = ip_list_uid; + t->proc_old->gid = ip_list_gid; + t->proc_old->data = t; +#endif t->proc->uid = ip_list_uid; t->proc->gid = ip_list_gid; t->proc->data = t; @@ -307,7 +367,10 @@ static void recent_mt_destroy(const struct xt_match *match, void *matchinfo) list_del(&t->list); spin_unlock_bh(&recent_lock); #ifdef CONFIG_PROC_FS - remove_proc_entry(t->name, proc_dir); +#ifdef CONFIG_NETFILTER_XT_MATCH_RECENT_PROC_COMPAT + remove_proc_entry(t->name, proc_old_dir); +#endif + remove_proc_entry(t->name, recent_proc_dir); #endif recent_table_flush(t); kfree(t); @@ -317,7 +380,7 @@ static void recent_mt_destroy(const struct xt_match *match, void *matchinfo) #ifdef CONFIG_PROC_FS struct recent_iter_state { - struct recent_table *table; + const struct recent_table *table; unsigned int bucket; }; @@ -342,8 +405,8 @@ static void *recent_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct recent_iter_state *st = seq->private; const struct recent_table *t = st->table; - struct recent_entry *e = v; - struct list_head *head = e->list.next; + const struct recent_entry *e = v; + const struct list_head *head = e->list.next; while (head == &t->iphash[st->bucket]) { if (++st->bucket >= ip_list_hash_size) @@ -366,8 +429,14 @@ static int recent_seq_show(struct seq_file *seq, void *v) unsigned int i; i = (e->index - 1) % ip_pkt_list_tot; - seq_printf(seq, "src=%u.%u.%u.%u ttl: %u last_seen: %lu oldest_pkt: %u", - NIPQUAD(e->addr), e->ttl, e->stamps[i], e->index); + if (e->family == AF_INET) + seq_printf(seq, "src=" NIPQUAD_FMT " ttl: %u last_seen: %lu " + "oldest_pkt: %u", NIPQUAD(e->addr.ip), e->ttl, + e->stamps[i], e->index); + else + seq_printf(seq, "src=" NIP6_FMT " ttl: %u last_seen: %lu " + "oldest_pkt: %u", NIP6(e->addr.in6), e->ttl, + e->stamps[i], e->index); for (i = 0; i < e->nstamps; i++) seq_printf(seq, "%s %lu", i ? "," : "", e->stamps[i]); seq_printf(seq, "\n"); @@ -394,8 +463,22 @@ static int recent_seq_open(struct inode *inode, struct file *file) return 0; } -static ssize_t recent_proc_write(struct file *file, const char __user *input, - size_t size, loff_t *loff) +#ifdef CONFIG_NETFILTER_XT_MATCH_RECENT_PROC_COMPAT +static int recent_old_seq_open(struct inode *inode, struct file *filp) +{ + static bool warned_of_old; + + if (unlikely(!warned_of_old)) { + printk(KERN_INFO KBUILD_MODNAME ": Use of /proc/net/ipt_recent" + " is deprecated; use /proc/net/xt_recent.\n"); + warned_of_old = true; + } + return recent_seq_open(inode, filp); +} + +static ssize_t recent_old_proc_write(struct file *file, + const char __user *input, + size_t size, loff_t *loff) { const struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode); struct recent_table *t = pde->data; @@ -408,6 +491,7 @@ static ssize_t recent_proc_write(struct file *file, const char __user *input, size = sizeof(buf); if (copy_from_user(buf, input, size)) return -EFAULT; + while (isspace(*c)) c++; @@ -435,10 +519,10 @@ static ssize_t recent_proc_write(struct file *file, const char __user *input, addr = in_aton(c); spin_lock_bh(&recent_lock); - e = recent_entry_lookup(t, addr, 0); + e = recent_entry_lookup(t, (const void *)&addr, PF_INET, 0); if (e == NULL) { if (add) - recent_entry_init(t, addr, 0); + recent_entry_init(t, (const void *)&addr, PF_INET, 0); } else { if (add) recent_entry_update(t, e); @@ -449,23 +533,118 @@ static ssize_t recent_proc_write(struct file *file, const char __user *input, return size; } -static const struct file_operations recent_fops = { - .open = recent_seq_open, +static const struct file_operations recent_old_fops = { + .open = recent_old_seq_open, .read = seq_read, - .write = recent_proc_write, + .write = recent_old_proc_write, .release = seq_release_private, .owner = THIS_MODULE, }; +#endif + +static ssize_t +recent_mt_proc_write(struct file *file, const char __user *input, + size_t size, loff_t *loff) +{ + const struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode); + struct recent_table *t = pde->data; + struct recent_entry *e; + char buf[sizeof("+b335:1d35:1e55:dead:c0de:1715:5afe:c0de")]; + const char *c = buf; + union nf_inet_addr addr; + u_int16_t family; + bool add, succ; + + if (size == 0) + return 0; + if (size > sizeof(buf)) + size = sizeof(buf); + if (copy_from_user(buf, input, size) != 0) + return -EFAULT; + + /* Strict protocol! */ + if (*loff != 0) + return -ESPIPE; + switch (*c) { + case '/': /* flush table */ + spin_lock_bh(&recent_lock); + recent_table_flush(t); + spin_unlock_bh(&recent_lock); + return size; + case '-': /* remove address */ + add = false; + break; + case '+': /* add address */ + add = true; + break; + default: + printk(KERN_INFO KBUILD_MODNAME ": Need +ip, -ip or /\n"); + return -EINVAL; + } + + ++c; + --size; + if (strnchr(c, size, ':') != NULL) { + family = AF_INET6; + succ = in6_pton(c, size, (void *)&addr, '\n', NULL); + } else { + family = AF_INET; + succ = in4_pton(c, size, (void *)&addr, '\n', NULL); + } + + if (!succ) { + printk(KERN_INFO KBUILD_MODNAME ": illegal address written " + "to procfs\n"); + return -EINVAL; + } + + spin_lock_bh(&recent_lock); + e = recent_entry_lookup(t, &addr, family, 0); + if (e == NULL) { + if (add) + recent_entry_init(t, &addr, family, 0); + } else { + if (add) + recent_entry_update(t, e); + else + recent_entry_remove(t, e); + } + spin_unlock_bh(&recent_lock); + /* Note we removed one above */ + *loff += size + 1; + return size + 1; +} + +static const struct file_operations recent_mt_fops = { + .open = recent_seq_open, + .read = seq_read, + .write = recent_mt_proc_write, + .release = seq_release_private, + .owner = THIS_MODULE, +}; #endif /* CONFIG_PROC_FS */ -static struct xt_match recent_mt_reg __read_mostly = { - .name = "recent", - .family = AF_INET, - .match = recent_mt, - .matchsize = sizeof(struct xt_recent_mtinfo), - .checkentry = recent_mt_check, - .destroy = recent_mt_destroy, - .me = THIS_MODULE, +static struct xt_match recent_mt_reg[] __read_mostly = { + { + .name = "recent", + .revision = 0, + .family = AF_INET, + .match = recent_mt, + .matchsize = sizeof(struct xt_recent_mtinfo), + .checkentry = recent_mt_check, + .destroy = recent_mt_destroy, + .me = THIS_MODULE, + }, + { + .name = "recent", + .revision = 0, + .family = AF_INET6, + .match = recent_mt, + .matchsize = sizeof(struct xt_recent_mtinfo), + .checkentry = recent_mt_check, + .destroy = recent_mt_destroy, + .me = THIS_MODULE, + }, }; static int __init recent_mt_init(void) @@ -476,15 +655,25 @@ static int __init recent_mt_init(void) return -EINVAL; ip_list_hash_size = 1 << fls(ip_list_tot); - err = xt_register_match(&recent_mt_reg); + err = xt_register_matches(recent_mt_reg, ARRAY_SIZE(recent_mt_reg)); #ifdef CONFIG_PROC_FS if (err) return err; - proc_dir = proc_mkdir("ipt_recent", init_net.proc_net); - if (proc_dir == NULL) { - xt_unregister_match(&recent_mt_reg); + recent_proc_dir = proc_mkdir("xt_recent", init_net.proc_net); + if (recent_proc_dir == NULL) { + xt_unregister_matches(recent_mt_reg, ARRAY_SIZE(recent_mt_reg)); + err = -ENOMEM; + } +#ifdef CONFIG_NETFILTER_XT_MATCH_RECENT_PROC_COMPAT + if (err < 0) + return err; + proc_old_dir = proc_mkdir("ipt_recent", init_net.proc_net); + if (proc_old_dir == NULL) { + remove_proc_entry("xt_recent", init_net.proc_net); + xt_unregister_matches(recent_mt_reg, ARRAY_SIZE(recent_mt_reg)); err = -ENOMEM; } +#endif #endif return err; } @@ -492,9 +681,12 @@ static int __init recent_mt_init(void) static void __exit recent_mt_exit(void) { BUG_ON(!list_empty(&tables)); - xt_unregister_match(&recent_mt_reg); + xt_unregister_matches(recent_mt_reg, ARRAY_SIZE(recent_mt_reg)); #ifdef CONFIG_PROC_FS +#ifdef CONFIG_NETFILTER_XT_MATCH_RECENT_PROC_COMPAT remove_proc_entry("ipt_recent", init_net.proc_net); +#endif + remove_proc_entry("xt_recent", init_net.proc_net); #endif } -- cgit v1.2.3 From 7e9c6eeb136a46dfd941852803b3a9dd78939b69 Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Wed, 8 Oct 2008 11:35:00 +0200 Subject: netfilter: Introduce NFPROTO_* constants The netfilter subsystem only supports a handful of protocols (much less than PF_*) and even non-PF protocols like ARP and pseudo-protocols like PF_BRIDGE. By creating NFPROTO_*, we can earn a few memory savings on arrays that previously were always PF_MAX-sized and keep the pseudo-protocols to ourselves. Signed-off-by: Jan Engelhardt Signed-off-by: Patrick McHardy --- net/netfilter/core.c | 6 +++--- net/netfilter/nf_log.c | 12 ++++++------ net/netfilter/nf_queue.c | 12 ++++++------ net/netfilter/x_tables.c | 18 ++++++++++-------- 4 files changed, 25 insertions(+), 23 deletions(-) (limited to 'net/netfilter') diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 26b8f489d7a..b16cd79951c 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -26,7 +26,7 @@ static DEFINE_MUTEX(afinfo_mutex); -const struct nf_afinfo *nf_afinfo[NPROTO] __read_mostly; +const struct nf_afinfo *nf_afinfo[NFPROTO_NUMPROTO] __read_mostly; EXPORT_SYMBOL(nf_afinfo); int nf_register_afinfo(const struct nf_afinfo *afinfo) @@ -51,7 +51,7 @@ void nf_unregister_afinfo(const struct nf_afinfo *afinfo) } EXPORT_SYMBOL_GPL(nf_unregister_afinfo); -struct list_head nf_hooks[NPROTO][NF_MAX_HOOKS] __read_mostly; +struct list_head nf_hooks[NFPROTO_NUMPROTO][NF_MAX_HOOKS] __read_mostly; EXPORT_SYMBOL(nf_hooks); static DEFINE_MUTEX(nf_hook_mutex); @@ -264,7 +264,7 @@ EXPORT_SYMBOL(proc_net_netfilter); void __init netfilter_init(void) { int i, h; - for (i = 0; i < NPROTO; i++) { + for (i = 0; i < ARRAY_SIZE(nf_hooks); i++) { for (h = 0; h < NF_MAX_HOOKS; h++) INIT_LIST_HEAD(&nf_hooks[i][h]); } diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c index 5c2f7332015..fa8ae5d2659 100644 --- a/net/netfilter/nf_log.c +++ b/net/netfilter/nf_log.c @@ -15,7 +15,7 @@ #define NF_LOG_PREFIXLEN 128 -static const struct nf_logger *nf_loggers[NPROTO] __read_mostly; +static const struct nf_logger *nf_loggers[NFPROTO_NUMPROTO] __read_mostly; static DEFINE_MUTEX(nf_log_mutex); /* return EBUSY if somebody else is registered, EEXIST if the same logger @@ -24,7 +24,7 @@ int nf_log_register(u_int8_t pf, const struct nf_logger *logger) { int ret; - if (pf >= NPROTO) + if (pf >= ARRAY_SIZE(nf_loggers)) return -EINVAL; /* Any setup of logging members must be done before @@ -47,7 +47,7 @@ EXPORT_SYMBOL(nf_log_register); void nf_log_unregister_pf(u_int8_t pf) { - if (pf >= NPROTO) + if (pf >= ARRAY_SIZE(nf_loggers)) return; mutex_lock(&nf_log_mutex); rcu_assign_pointer(nf_loggers[pf], NULL); @@ -63,7 +63,7 @@ void nf_log_unregister(const struct nf_logger *logger) int i; mutex_lock(&nf_log_mutex); - for (i = 0; i < NPROTO; i++) { + for (i = 0; i < ARRAY_SIZE(nf_loggers); i++) { if (nf_loggers[i] == logger) rcu_assign_pointer(nf_loggers[i], NULL); } @@ -103,7 +103,7 @@ static void *seq_start(struct seq_file *seq, loff_t *pos) { rcu_read_lock(); - if (*pos >= NPROTO) + if (*pos >= ARRAY_SIZE(nf_loggers)) return NULL; return pos; @@ -113,7 +113,7 @@ static void *seq_next(struct seq_file *s, void *v, loff_t *pos) { (*pos)++; - if (*pos >= NPROTO) + if (*pos >= ARRAY_SIZE(nf_loggers)) return NULL; return pos; diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index f285086f629..4f2310c93e0 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -16,7 +16,7 @@ * long term mutex. The handler must provide an an outfn() to accept packets * for queueing and must reinject all packets it receives, no matter what. */ -static const struct nf_queue_handler *queue_handler[NPROTO]; +static const struct nf_queue_handler *queue_handler[NFPROTO_NUMPROTO] __read_mostly; static DEFINE_MUTEX(queue_handler_mutex); @@ -26,7 +26,7 @@ int nf_register_queue_handler(u_int8_t pf, const struct nf_queue_handler *qh) { int ret; - if (pf >= NPROTO) + if (pf >= ARRAY_SIZE(queue_handler)) return -EINVAL; mutex_lock(&queue_handler_mutex); @@ -47,7 +47,7 @@ EXPORT_SYMBOL(nf_register_queue_handler); /* The caller must flush their queue before this */ int nf_unregister_queue_handler(u_int8_t pf, const struct nf_queue_handler *qh) { - if (pf >= NPROTO) + if (pf >= ARRAY_SIZE(queue_handler)) return -EINVAL; mutex_lock(&queue_handler_mutex); @@ -70,7 +70,7 @@ void nf_unregister_queue_handlers(const struct nf_queue_handler *qh) u_int8_t pf; mutex_lock(&queue_handler_mutex); - for (pf = 0; pf < NPROTO; pf++) { + for (pf = 0; pf < ARRAY_SIZE(queue_handler); pf++) { if (queue_handler[pf] == qh) rcu_assign_pointer(queue_handler[pf], NULL); } @@ -285,7 +285,7 @@ EXPORT_SYMBOL(nf_reinject); #ifdef CONFIG_PROC_FS static void *seq_start(struct seq_file *seq, loff_t *pos) { - if (*pos >= NPROTO) + if (*pos >= ARRAY_SIZE(queue_handler)) return NULL; return pos; @@ -295,7 +295,7 @@ static void *seq_next(struct seq_file *s, void *v, loff_t *pos) { (*pos)++; - if (*pos >= NPROTO) + if (*pos >= ARRAY_SIZE(queue_handler)) return NULL; return pos; diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index cf2f3e90cef..2a7eb1da5d0 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -58,10 +58,12 @@ static struct xt_af *xt; #define duprintf(format, args...) #endif -static const char *const xt_prefix[NPROTO] = { - [AF_INET] = "ip", - [AF_INET6] = "ip6", - [NF_ARP] = "arp", +static const char *const xt_prefix[NFPROTO_NUMPROTO] = { + [NFPROTO_UNSPEC] = "x", + [NFPROTO_IPV4] = "ip", + [NFPROTO_ARP] = "arp", + [NFPROTO_BRIDGE] = "eb", + [NFPROTO_IPV6] = "ip6", }; /* Registration hooks for targets. */ @@ -932,7 +934,7 @@ int xt_proto_init(struct net *net, u_int8_t af) struct proc_dir_entry *proc; #endif - if (af >= NPROTO) + if (af >= ARRAY_SIZE(xt_prefix)) return -EINVAL; @@ -1001,7 +1003,7 @@ static int __net_init xt_net_init(struct net *net) { int i; - for (i = 0; i < NPROTO; i++) + for (i = 0; i < NFPROTO_NUMPROTO; i++) INIT_LIST_HEAD(&net->xt.tables[i]); return 0; } @@ -1014,11 +1016,11 @@ static int __init xt_init(void) { int i, rv; - xt = kmalloc(sizeof(struct xt_af) * NPROTO, GFP_KERNEL); + xt = kmalloc(sizeof(struct xt_af) * NFPROTO_NUMPROTO, GFP_KERNEL); if (!xt) return -ENOMEM; - for (i = 0; i < NPROTO; i++) { + for (i = 0; i < NFPROTO_NUMPROTO; i++) { mutex_init(&xt[i].mutex); #ifdef CONFIG_COMPAT mutex_init(&xt[i].compat_mutex); -- cgit v1.2.3 From ee999d8b9573df1b547aacdc6d79f86eb79c25cd Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Wed, 8 Oct 2008 11:35:01 +0200 Subject: netfilter: x_tables: use NFPROTO_* in extensions Signed-off-by: Jan Engelhardt Signed-off-by: Patrick McHardy --- net/netfilter/xt_CLASSIFY.c | 4 ++-- net/netfilter/xt_CONNMARK.c | 8 ++++---- net/netfilter/xt_CONNSECMARK.c | 4 ++-- net/netfilter/xt_DSCP.c | 10 +++++----- net/netfilter/xt_MARK.c | 12 ++++++------ net/netfilter/xt_NFLOG.c | 4 ++-- net/netfilter/xt_NFQUEUE.c | 4 ++-- net/netfilter/xt_NOTRACK.c | 4 ++-- net/netfilter/xt_RATEEST.c | 4 ++-- net/netfilter/xt_SECMARK.c | 4 ++-- net/netfilter/xt_TCPMSS.c | 4 ++-- net/netfilter/xt_TCPOPTSTRIP.c | 4 ++-- net/netfilter/xt_TRACE.c | 4 ++-- net/netfilter/xt_comment.c | 4 ++-- net/netfilter/xt_connbytes.c | 4 ++-- net/netfilter/xt_connlimit.c | 10 +++++----- net/netfilter/xt_connmark.c | 8 ++++---- net/netfilter/xt_conntrack.c | 10 +++++----- net/netfilter/xt_dccp.c | 4 ++-- net/netfilter/xt_dscp.c | 12 ++++++------ net/netfilter/xt_esp.c | 4 ++-- net/netfilter/xt_hashlimit.c | 40 +++++++++++++++++++--------------------- net/netfilter/xt_helper.c | 4 ++-- net/netfilter/xt_iprange.c | 6 +++--- net/netfilter/xt_length.c | 4 ++-- net/netfilter/xt_limit.c | 4 ++-- net/netfilter/xt_mac.c | 4 ++-- net/netfilter/xt_mark.c | 8 ++++---- net/netfilter/xt_multiport.c | 8 ++++---- net/netfilter/xt_owner.c | 8 ++++---- net/netfilter/xt_physdev.c | 4 ++-- net/netfilter/xt_pkttype.c | 8 ++++---- net/netfilter/xt_policy.c | 8 ++++---- net/netfilter/xt_quota.c | 4 ++-- net/netfilter/xt_rateest.c | 4 ++-- net/netfilter/xt_realm.c | 2 +- net/netfilter/xt_recent.c | 21 +++++++++++---------- net/netfilter/xt_sctp.c | 4 ++-- net/netfilter/xt_state.c | 4 ++-- net/netfilter/xt_statistic.c | 4 ++-- net/netfilter/xt_string.c | 8 ++++---- net/netfilter/xt_tcpmss.c | 4 ++-- net/netfilter/xt_tcpudp.c | 12 ++++++------ net/netfilter/xt_time.c | 4 ++-- net/netfilter/xt_u32.c | 4 ++-- 45 files changed, 153 insertions(+), 154 deletions(-) (limited to 'net/netfilter') diff --git a/net/netfilter/xt_CLASSIFY.c b/net/netfilter/xt_CLASSIFY.c index 77a52bf8322..9d68da1748b 100644 --- a/net/netfilter/xt_CLASSIFY.c +++ b/net/netfilter/xt_CLASSIFY.c @@ -39,7 +39,7 @@ classify_tg(struct sk_buff *skb, const struct net_device *in, static struct xt_target classify_tg_reg[] __read_mostly = { { - .family = AF_INET, + .family = NFPROTO_IPV4, .name = "CLASSIFY", .target = classify_tg, .targetsize = sizeof(struct xt_classify_target_info), @@ -51,7 +51,7 @@ static struct xt_target classify_tg_reg[] __read_mostly = { }, { .name = "CLASSIFY", - .family = AF_INET6, + .family = NFPROTO_IPV6, .target = classify_tg, .targetsize = sizeof(struct xt_classify_target_info), .table = "mangle", diff --git a/net/netfilter/xt_CONNMARK.c b/net/netfilter/xt_CONNMARK.c index 5fecfb4794b..e72e5d01752 100644 --- a/net/netfilter/xt_CONNMARK.c +++ b/net/netfilter/xt_CONNMARK.c @@ -197,7 +197,7 @@ static struct xt_target connmark_tg_reg[] __read_mostly = { { .name = "CONNMARK", .revision = 0, - .family = AF_INET, + .family = NFPROTO_IPV4, .checkentry = connmark_tg_check_v0, .destroy = connmark_tg_destroy, .target = connmark_tg_v0, @@ -212,7 +212,7 @@ static struct xt_target connmark_tg_reg[] __read_mostly = { { .name = "CONNMARK", .revision = 0, - .family = AF_INET6, + .family = NFPROTO_IPV6, .checkentry = connmark_tg_check_v0, .destroy = connmark_tg_destroy, .target = connmark_tg_v0, @@ -227,7 +227,7 @@ static struct xt_target connmark_tg_reg[] __read_mostly = { { .name = "CONNMARK", .revision = 1, - .family = AF_INET, + .family = NFPROTO_IPV4, .checkentry = connmark_tg_check, .target = connmark_tg, .targetsize = sizeof(struct xt_connmark_tginfo1), @@ -237,7 +237,7 @@ static struct xt_target connmark_tg_reg[] __read_mostly = { { .name = "CONNMARK", .revision = 1, - .family = AF_INET6, + .family = NFPROTO_IPV6, .checkentry = connmark_tg_check, .target = connmark_tg, .targetsize = sizeof(struct xt_connmark_tginfo1), diff --git a/net/netfilter/xt_CONNSECMARK.c b/net/netfilter/xt_CONNSECMARK.c index 76ca1f2421e..ae939e54dfa 100644 --- a/net/netfilter/xt_CONNSECMARK.c +++ b/net/netfilter/xt_CONNSECMARK.c @@ -127,7 +127,7 @@ connsecmark_tg_destroy(const struct xt_target *target, void *targinfo) static struct xt_target connsecmark_tg_reg[] __read_mostly = { { .name = "CONNSECMARK", - .family = AF_INET, + .family = NFPROTO_IPV4, .checkentry = connsecmark_tg_check, .destroy = connsecmark_tg_destroy, .target = connsecmark_tg, @@ -136,7 +136,7 @@ static struct xt_target connsecmark_tg_reg[] __read_mostly = { }, { .name = "CONNSECMARK", - .family = AF_INET6, + .family = NFPROTO_IPV6, .checkentry = connsecmark_tg_check, .destroy = connsecmark_tg_destroy, .target = connsecmark_tg, diff --git a/net/netfilter/xt_DSCP.c b/net/netfilter/xt_DSCP.c index 97efd74c04f..f0b4958528e 100644 --- a/net/netfilter/xt_DSCP.c +++ b/net/netfilter/xt_DSCP.c @@ -165,7 +165,7 @@ tos_tg6(struct sk_buff *skb, const struct net_device *in, static struct xt_target dscp_tg_reg[] __read_mostly = { { .name = "DSCP", - .family = AF_INET, + .family = NFPROTO_IPV4, .checkentry = dscp_tg_check, .target = dscp_tg, .targetsize = sizeof(struct xt_DSCP_info), @@ -174,7 +174,7 @@ static struct xt_target dscp_tg_reg[] __read_mostly = { }, { .name = "DSCP", - .family = AF_INET6, + .family = NFPROTO_IPV6, .checkentry = dscp_tg_check, .target = dscp_tg6, .targetsize = sizeof(struct xt_DSCP_info), @@ -184,7 +184,7 @@ static struct xt_target dscp_tg_reg[] __read_mostly = { { .name = "TOS", .revision = 0, - .family = AF_INET, + .family = NFPROTO_IPV4, .table = "mangle", .target = tos_tg_v0, .targetsize = sizeof(struct ipt_tos_target_info), @@ -194,7 +194,7 @@ static struct xt_target dscp_tg_reg[] __read_mostly = { { .name = "TOS", .revision = 1, - .family = AF_INET, + .family = NFPROTO_IPV4, .table = "mangle", .target = tos_tg, .targetsize = sizeof(struct xt_tos_target_info), @@ -203,7 +203,7 @@ static struct xt_target dscp_tg_reg[] __read_mostly = { { .name = "TOS", .revision = 1, - .family = AF_INET6, + .family = NFPROTO_IPV6, .table = "mangle", .target = tos_tg6, .targetsize = sizeof(struct xt_tos_target_info), diff --git a/net/netfilter/xt_MARK.c b/net/netfilter/xt_MARK.c index f9ce20b5898..55ef0796c76 100644 --- a/net/netfilter/xt_MARK.c +++ b/net/netfilter/xt_MARK.c @@ -161,7 +161,7 @@ static int mark_tg_compat_to_user_v1(void __user *dst, void *src) static struct xt_target mark_tg_reg[] __read_mostly = { { .name = "MARK", - .family = AF_INET, + .family = NFPROTO_IPV4, .revision = 0, .checkentry = mark_tg_check_v0, .target = mark_tg_v0, @@ -176,7 +176,7 @@ static struct xt_target mark_tg_reg[] __read_mostly = { }, { .name = "MARK", - .family = AF_INET, + .family = NFPROTO_IPV4, .revision = 1, .checkentry = mark_tg_check_v1, .target = mark_tg_v1, @@ -191,7 +191,7 @@ static struct xt_target mark_tg_reg[] __read_mostly = { }, { .name = "MARK", - .family = AF_INET6, + .family = NFPROTO_IPV6, .revision = 0, .checkentry = mark_tg_check_v0, .target = mark_tg_v0, @@ -206,7 +206,7 @@ static struct xt_target mark_tg_reg[] __read_mostly = { }, { .name = "MARK", - .family = AF_INET6, + .family = NFPROTO_IPV6, .revision = 1, .checkentry = mark_tg_check_v1, .target = mark_tg_v1, @@ -222,7 +222,7 @@ static struct xt_target mark_tg_reg[] __read_mostly = { { .name = "MARK", .revision = 2, - .family = AF_INET, + .family = NFPROTO_IPV4, .target = mark_tg, .targetsize = sizeof(struct xt_mark_tginfo2), .me = THIS_MODULE, @@ -230,7 +230,7 @@ static struct xt_target mark_tg_reg[] __read_mostly = { { .name = "MARK", .revision = 2, - .family = AF_INET6, + .family = NFPROTO_IPV6, .target = mark_tg, .targetsize = sizeof(struct xt_mark_tginfo2), .me = THIS_MODULE, diff --git a/net/netfilter/xt_NFLOG.c b/net/netfilter/xt_NFLOG.c index 19ae8efae65..9b095520176 100644 --- a/net/netfilter/xt_NFLOG.c +++ b/net/netfilter/xt_NFLOG.c @@ -55,7 +55,7 @@ nflog_tg_check(const char *tablename, const void *entry, static struct xt_target nflog_tg_reg[] __read_mostly = { { .name = "NFLOG", - .family = AF_INET, + .family = NFPROTO_IPV4, .checkentry = nflog_tg_check, .target = nflog_tg, .targetsize = sizeof(struct xt_nflog_info), @@ -63,7 +63,7 @@ static struct xt_target nflog_tg_reg[] __read_mostly = { }, { .name = "NFLOG", - .family = AF_INET6, + .family = NFPROTO_IPV6, .checkentry = nflog_tg_check, .target = nflog_tg, .targetsize = sizeof(struct xt_nflog_info), diff --git a/net/netfilter/xt_NFQUEUE.c b/net/netfilter/xt_NFQUEUE.c index beb24d19a56..c03c2e8d06f 100644 --- a/net/netfilter/xt_NFQUEUE.c +++ b/net/netfilter/xt_NFQUEUE.c @@ -36,14 +36,14 @@ nfqueue_tg(struct sk_buff *skb, const struct net_device *in, static struct xt_target nfqueue_tg_reg[] __read_mostly = { { .name = "NFQUEUE", - .family = AF_INET, + .family = NFPROTO_IPV4, .target = nfqueue_tg, .targetsize = sizeof(struct xt_NFQ_info), .me = THIS_MODULE, }, { .name = "NFQUEUE", - .family = AF_INET6, + .family = NFPROTO_IPV6, .target = nfqueue_tg, .targetsize = sizeof(struct xt_NFQ_info), .me = THIS_MODULE, diff --git a/net/netfilter/xt_NOTRACK.c b/net/netfilter/xt_NOTRACK.c index 6c9de611eb8..b9ee268b37c 100644 --- a/net/netfilter/xt_NOTRACK.c +++ b/net/netfilter/xt_NOTRACK.c @@ -35,14 +35,14 @@ notrack_tg(struct sk_buff *skb, const struct net_device *in, static struct xt_target notrack_tg_reg[] __read_mostly = { { .name = "NOTRACK", - .family = AF_INET, + .family = NFPROTO_IPV4, .target = notrack_tg, .table = "raw", .me = THIS_MODULE, }, { .name = "NOTRACK", - .family = AF_INET6, + .family = NFPROTO_IPV6, .target = notrack_tg, .table = "raw", .me = THIS_MODULE, diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c index 64d6ad38029..f7114fc5cfc 100644 --- a/net/netfilter/xt_RATEEST.c +++ b/net/netfilter/xt_RATEEST.c @@ -159,7 +159,7 @@ static void xt_rateest_tg_destroy(const struct xt_target *target, static struct xt_target xt_rateest_target[] __read_mostly = { { - .family = AF_INET, + .family = NFPROTO_IPV4, .name = "RATEEST", .target = xt_rateest_tg, .checkentry = xt_rateest_tg_checkentry, @@ -168,7 +168,7 @@ static struct xt_target xt_rateest_target[] __read_mostly = { .me = THIS_MODULE, }, { - .family = AF_INET6, + .family = NFPROTO_IPV6, .name = "RATEEST", .target = xt_rateest_tg, .checkentry = xt_rateest_tg_checkentry, diff --git a/net/netfilter/xt_SECMARK.c b/net/netfilter/xt_SECMARK.c index 94f87ee7552..8f8f57b93a6 100644 --- a/net/netfilter/xt_SECMARK.c +++ b/net/netfilter/xt_SECMARK.c @@ -128,7 +128,7 @@ static void secmark_tg_destroy(const struct xt_target *target, void *targinfo) static struct xt_target secmark_tg_reg[] __read_mostly = { { .name = "SECMARK", - .family = AF_INET, + .family = NFPROTO_IPV4, .checkentry = secmark_tg_check, .destroy = secmark_tg_destroy, .target = secmark_tg, @@ -137,7 +137,7 @@ static struct xt_target secmark_tg_reg[] __read_mostly = { }, { .name = "SECMARK", - .family = AF_INET6, + .family = NFPROTO_IPV6, .checkentry = secmark_tg_check, .destroy = secmark_tg_destroy, .target = secmark_tg, diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c index beb5094703c..b868f995239 100644 --- a/net/netfilter/xt_TCPMSS.c +++ b/net/netfilter/xt_TCPMSS.c @@ -289,7 +289,7 @@ tcpmss_tg6_check(const char *tablename, const void *entry, static struct xt_target tcpmss_tg_reg[] __read_mostly = { { - .family = AF_INET, + .family = NFPROTO_IPV4, .name = "TCPMSS", .checkentry = tcpmss_tg4_check, .target = tcpmss_tg4, @@ -299,7 +299,7 @@ static struct xt_target tcpmss_tg_reg[] __read_mostly = { }, #if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) { - .family = AF_INET6, + .family = NFPROTO_IPV6, .name = "TCPMSS", .checkentry = tcpmss_tg6_check, .target = tcpmss_tg6, diff --git a/net/netfilter/xt_TCPOPTSTRIP.c b/net/netfilter/xt_TCPOPTSTRIP.c index 9685b6fcbc8..2e0ae6cc5d9 100644 --- a/net/netfilter/xt_TCPOPTSTRIP.c +++ b/net/netfilter/xt_TCPOPTSTRIP.c @@ -106,7 +106,7 @@ tcpoptstrip_tg6(struct sk_buff *skb, const struct net_device *in, static struct xt_target tcpoptstrip_tg_reg[] __read_mostly = { { .name = "TCPOPTSTRIP", - .family = AF_INET, + .family = NFPROTO_IPV4, .table = "mangle", .proto = IPPROTO_TCP, .target = tcpoptstrip_tg4, @@ -116,7 +116,7 @@ static struct xt_target tcpoptstrip_tg_reg[] __read_mostly = { #if defined(CONFIG_IP6_NF_MANGLE) || defined(CONFIG_IP6_NF_MANGLE_MODULE) { .name = "TCPOPTSTRIP", - .family = AF_INET6, + .family = NFPROTO_IPV6, .table = "mangle", .proto = IPPROTO_TCP, .target = tcpoptstrip_tg6, diff --git a/net/netfilter/xt_TRACE.c b/net/netfilter/xt_TRACE.c index 30dab79a343..e1bcad57914 100644 --- a/net/netfilter/xt_TRACE.c +++ b/net/netfilter/xt_TRACE.c @@ -22,14 +22,14 @@ trace_tg(struct sk_buff *skb, const struct net_device *in, static struct xt_target trace_tg_reg[] __read_mostly = { { .name = "TRACE", - .family = AF_INET, + .family = NFPROTO_IPV4, .target = trace_tg, .table = "raw", .me = THIS_MODULE, }, { .name = "TRACE", - .family = AF_INET6, + .family = NFPROTO_IPV6, .target = trace_tg, .table = "raw", .me = THIS_MODULE, diff --git a/net/netfilter/xt_comment.c b/net/netfilter/xt_comment.c index 89f47364e84..fa211b2ab87 100644 --- a/net/netfilter/xt_comment.c +++ b/net/netfilter/xt_comment.c @@ -28,14 +28,14 @@ comment_mt(const struct sk_buff *skb, const struct net_device *in, static struct xt_match comment_mt_reg[] __read_mostly = { { .name = "comment", - .family = AF_INET, + .family = NFPROTO_IPV4, .match = comment_mt, .matchsize = sizeof(struct xt_comment_info), .me = THIS_MODULE }, { .name = "comment", - .family = AF_INET6, + .family = NFPROTO_IPV6, .match = comment_mt, .matchsize = sizeof(struct xt_comment_info), .me = THIS_MODULE diff --git a/net/netfilter/xt_connbytes.c b/net/netfilter/xt_connbytes.c index 3e39c4fe193..d2cd22a49c9 100644 --- a/net/netfilter/xt_connbytes.c +++ b/net/netfilter/xt_connbytes.c @@ -130,7 +130,7 @@ connbytes_mt_destroy(const struct xt_match *match, void *matchinfo) static struct xt_match connbytes_mt_reg[] __read_mostly = { { .name = "connbytes", - .family = AF_INET, + .family = NFPROTO_IPV4, .checkentry = connbytes_mt_check, .match = connbytes_mt, .destroy = connbytes_mt_destroy, @@ -139,7 +139,7 @@ static struct xt_match connbytes_mt_reg[] __read_mostly = { }, { .name = "connbytes", - .family = AF_INET6, + .family = NFPROTO_IPV6, .checkentry = connbytes_mt_check, .match = connbytes_mt, .destroy = connbytes_mt_destroy, diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c index 1655e2cf25c..d2453d182d6 100644 --- a/net/netfilter/xt_connlimit.c +++ b/net/netfilter/xt_connlimit.c @@ -84,7 +84,7 @@ same_source_net(const union nf_inet_addr *addr, const union nf_inet_addr *mask, const union nf_inet_addr *u3, u_int8_t family) { - if (family == AF_INET) { + if (family == NFPROTO_IPV4) { return (addr->ip & mask->ip) == (u3->ip & mask->ip); } else { union nf_inet_addr lh, rh; @@ -114,7 +114,7 @@ static int count_them(struct xt_connlimit_data *data, int matches = 0; - if (match->family == AF_INET6) + if (match->family == NFPROTO_IPV6) hash = &data->iphash[connlimit_iphash6(addr, mask)]; else hash = &data->iphash[connlimit_iphash(addr->ip & mask->ip)]; @@ -198,7 +198,7 @@ connlimit_mt(const struct sk_buff *skb, const struct net_device *in, match->family, &tuple)) goto hotdrop; - if (match->family == AF_INET6) { + if (match->family == NFPROTO_IPV6) { const struct ipv6hdr *iph = ipv6_hdr(skb); memcpy(&addr.ip6, &iph->saddr, sizeof(iph->saddr)); } else { @@ -276,7 +276,7 @@ connlimit_mt_destroy(const struct xt_match *match, void *matchinfo) static struct xt_match connlimit_mt_reg[] __read_mostly = { { .name = "connlimit", - .family = AF_INET, + .family = NFPROTO_IPV4, .checkentry = connlimit_mt_check, .match = connlimit_mt, .matchsize = sizeof(struct xt_connlimit_info), @@ -285,7 +285,7 @@ static struct xt_match connlimit_mt_reg[] __read_mostly = { }, { .name = "connlimit", - .family = AF_INET6, + .family = NFPROTO_IPV6, .checkentry = connlimit_mt_check, .match = connlimit_mt, .matchsize = sizeof(struct xt_connlimit_info), diff --git a/net/netfilter/xt_connmark.c b/net/netfilter/xt_connmark.c index aaa1b96691f..0577b8ff4e1 100644 --- a/net/netfilter/xt_connmark.c +++ b/net/netfilter/xt_connmark.c @@ -140,7 +140,7 @@ static struct xt_match connmark_mt_reg[] __read_mostly = { { .name = "connmark", .revision = 0, - .family = AF_INET, + .family = NFPROTO_IPV4, .checkentry = connmark_mt_check_v0, .match = connmark_mt_v0, .destroy = connmark_mt_destroy, @@ -155,7 +155,7 @@ static struct xt_match connmark_mt_reg[] __read_mostly = { { .name = "connmark", .revision = 0, - .family = AF_INET6, + .family = NFPROTO_IPV6, .checkentry = connmark_mt_check_v0, .match = connmark_mt_v0, .destroy = connmark_mt_destroy, @@ -170,7 +170,7 @@ static struct xt_match connmark_mt_reg[] __read_mostly = { { .name = "connmark", .revision = 1, - .family = AF_INET, + .family = NFPROTO_IPV4, .checkentry = connmark_mt_check, .match = connmark_mt, .matchsize = sizeof(struct xt_connmark_mtinfo1), @@ -180,7 +180,7 @@ static struct xt_match connmark_mt_reg[] __read_mostly = { { .name = "connmark", .revision = 1, - .family = AF_INET6, + .family = NFPROTO_IPV6, .checkentry = connmark_mt_check, .match = connmark_mt, .matchsize = sizeof(struct xt_connmark_mtinfo1), diff --git a/net/netfilter/xt_conntrack.c b/net/netfilter/xt_conntrack.c index 28a42a3fbff..392b457f9c2 100644 --- a/net/netfilter/xt_conntrack.c +++ b/net/netfilter/xt_conntrack.c @@ -121,9 +121,9 @@ conntrack_addrcmp(const union nf_inet_addr *kaddr, const union nf_inet_addr *uaddr, const union nf_inet_addr *umask, unsigned int l3proto) { - if (l3proto == AF_INET) + if (l3proto == NFPROTO_IPV4) return ((kaddr->ip ^ uaddr->ip) & umask->ip) == 0; - else if (l3proto == AF_INET6) + else if (l3proto == NFPROTO_IPV6) return ipv6_masked_addr_cmp(&kaddr->in6, &umask->in6, &uaddr->in6) == 0; else @@ -356,7 +356,7 @@ static struct xt_match conntrack_mt_reg[] __read_mostly = { { .name = "conntrack", .revision = 0, - .family = AF_INET, + .family = NFPROTO_IPV4, .match = conntrack_mt_v0, .checkentry = conntrack_mt_check, .destroy = conntrack_mt_destroy, @@ -371,7 +371,7 @@ static struct xt_match conntrack_mt_reg[] __read_mostly = { { .name = "conntrack", .revision = 1, - .family = AF_INET, + .family = NFPROTO_IPV4, .matchsize = sizeof(struct xt_conntrack_mtinfo1), .match = conntrack_mt, .checkentry = conntrack_mt_check, @@ -381,7 +381,7 @@ static struct xt_match conntrack_mt_reg[] __read_mostly = { { .name = "conntrack", .revision = 1, - .family = AF_INET6, + .family = NFPROTO_IPV6, .matchsize = sizeof(struct xt_conntrack_mtinfo1), .match = conntrack_mt, .checkentry = conntrack_mt_check, diff --git a/net/netfilter/xt_dccp.c b/net/netfilter/xt_dccp.c index 8b6522186d9..87971f47132 100644 --- a/net/netfilter/xt_dccp.c +++ b/net/netfilter/xt_dccp.c @@ -138,7 +138,7 @@ dccp_mt_check(const char *tablename, const void *inf, static struct xt_match dccp_mt_reg[] __read_mostly = { { .name = "dccp", - .family = AF_INET, + .family = NFPROTO_IPV4, .checkentry = dccp_mt_check, .match = dccp_mt, .matchsize = sizeof(struct xt_dccp_info), @@ -147,7 +147,7 @@ static struct xt_match dccp_mt_reg[] __read_mostly = { }, { .name = "dccp", - .family = AF_INET6, + .family = NFPROTO_IPV6, .checkentry = dccp_mt_check, .match = dccp_mt, .matchsize = sizeof(struct xt_dccp_info), diff --git a/net/netfilter/xt_dscp.c b/net/netfilter/xt_dscp.c index 26f4aab9c42..7f03aa13a95 100644 --- a/net/netfilter/xt_dscp.c +++ b/net/netfilter/xt_dscp.c @@ -80,7 +80,7 @@ static bool tos_mt(const struct sk_buff *skb, const struct net_device *in, { const struct xt_tos_match_info *info = matchinfo; - if (match->family == AF_INET) + if (match->family == NFPROTO_IPV4) return ((ip_hdr(skb)->tos & info->tos_mask) == info->tos_value) ^ !!info->invert; else @@ -91,7 +91,7 @@ static bool tos_mt(const struct sk_buff *skb, const struct net_device *in, static struct xt_match dscp_mt_reg[] __read_mostly = { { .name = "dscp", - .family = AF_INET, + .family = NFPROTO_IPV4, .checkentry = dscp_mt_check, .match = dscp_mt, .matchsize = sizeof(struct xt_dscp_info), @@ -99,7 +99,7 @@ static struct xt_match dscp_mt_reg[] __read_mostly = { }, { .name = "dscp", - .family = AF_INET6, + .family = NFPROTO_IPV6, .checkentry = dscp_mt_check, .match = dscp_mt6, .matchsize = sizeof(struct xt_dscp_info), @@ -108,7 +108,7 @@ static struct xt_match dscp_mt_reg[] __read_mostly = { { .name = "tos", .revision = 0, - .family = AF_INET, + .family = NFPROTO_IPV4, .match = tos_mt_v0, .matchsize = sizeof(struct ipt_tos_info), .me = THIS_MODULE, @@ -116,7 +116,7 @@ static struct xt_match dscp_mt_reg[] __read_mostly = { { .name = "tos", .revision = 1, - .family = AF_INET, + .family = NFPROTO_IPV4, .match = tos_mt, .matchsize = sizeof(struct xt_tos_match_info), .me = THIS_MODULE, @@ -124,7 +124,7 @@ static struct xt_match dscp_mt_reg[] __read_mostly = { { .name = "tos", .revision = 1, - .family = AF_INET6, + .family = NFPROTO_IPV6, .match = tos_mt, .matchsize = sizeof(struct xt_tos_match_info), .me = THIS_MODULE, diff --git a/net/netfilter/xt_esp.c b/net/netfilter/xt_esp.c index a133eb9b23e..045c4deecaf 100644 --- a/net/netfilter/xt_esp.c +++ b/net/netfilter/xt_esp.c @@ -88,7 +88,7 @@ esp_mt_check(const char *tablename, const void *ip_void, static struct xt_match esp_mt_reg[] __read_mostly = { { .name = "esp", - .family = AF_INET, + .family = NFPROTO_IPV4, .checkentry = esp_mt_check, .match = esp_mt, .matchsize = sizeof(struct xt_esp), @@ -97,7 +97,7 @@ static struct xt_match esp_mt_reg[] __read_mostly = { }, { .name = "esp", - .family = AF_INET6, + .family = NFPROTO_IPV6, .checkentry = esp_mt_check, .match = esp_mt, .matchsize = sizeof(struct xt_esp), diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index 0c9268fd2e1..7bae369603d 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -218,7 +218,7 @@ static int htable_create_v0(struct xt_hashlimit_info *minfo, u_int8_t family) hinfo->cfg.gc_interval = minfo->cfg.gc_interval; hinfo->cfg.expire = minfo->cfg.expire; - if (family == AF_INET) + if (family == NFPROTO_IPV4) hinfo->cfg.srcmask = hinfo->cfg.dstmask = 32; else hinfo->cfg.srcmask = hinfo->cfg.dstmask = 128; @@ -237,11 +237,10 @@ static int htable_create_v0(struct xt_hashlimit_info *minfo, u_int8_t family) hinfo->family = family; hinfo->rnd_initialized = 0; spin_lock_init(&hinfo->lock); - hinfo->pde = - proc_create_data(minfo->name, 0, - family == AF_INET ? hashlimit_procdir4 : - hashlimit_procdir6, - &dl_file_ops, hinfo); + hinfo->pde = proc_create_data(minfo->name, 0, + (family == NFPROTO_IPV4) ? + hashlimit_procdir4 : hashlimit_procdir6, + &dl_file_ops, hinfo); if (!hinfo->pde) { vfree(hinfo); return -1; @@ -300,11 +299,10 @@ static int htable_create(struct xt_hashlimit_mtinfo1 *minfo, u_int8_t family) hinfo->rnd_initialized = 0; spin_lock_init(&hinfo->lock); - hinfo->pde = - proc_create_data(minfo->name, 0, - family == AF_INET ? hashlimit_procdir4 : - hashlimit_procdir6, - &dl_file_ops, hinfo); + hinfo->pde = proc_create_data(minfo->name, 0, + (family == NFPROTO_IPV4) ? + hashlimit_procdir4 : hashlimit_procdir6, + &dl_file_ops, hinfo); if (hinfo->pde == NULL) { vfree(hinfo); return -1; @@ -370,7 +368,7 @@ static void htable_destroy(struct xt_hashlimit_htable *hinfo) /* remove proc entry */ remove_proc_entry(hinfo->pde->name, - hinfo->family == AF_INET ? hashlimit_procdir4 : + hinfo->family == NFPROTO_IPV4 ? hashlimit_procdir4 : hashlimit_procdir6); htable_selective_cleanup(hinfo, select_all); vfree(hinfo); @@ -501,7 +499,7 @@ hashlimit_init_dst(const struct xt_hashlimit_htable *hinfo, memset(dst, 0, sizeof(*dst)); switch (hinfo->family) { - case AF_INET: + case NFPROTO_IPV4: if (hinfo->cfg.mode & XT_HASHLIMIT_HASH_DIP) dst->ip.dst = maskl(ip_hdr(skb)->daddr, hinfo->cfg.dstmask); @@ -515,7 +513,7 @@ hashlimit_init_dst(const struct xt_hashlimit_htable *hinfo, nexthdr = ip_hdr(skb)->protocol; break; #if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) - case AF_INET6: + case NFPROTO_IPV6: if (hinfo->cfg.mode & XT_HASHLIMIT_HASH_DIP) { memcpy(&dst->ip6.dst, &ipv6_hdr(skb)->daddr, sizeof(dst->ip6.dst)); @@ -737,7 +735,7 @@ hashlimit_mt_check(const char *tablename, const void *inf, return false; if (info->name[sizeof(info->name)-1] != '\0') return false; - if (match->family == AF_INET) { + if (match->family == NFPROTO_IPV4) { if (info->cfg.srcmask > 32 || info->cfg.dstmask > 32) return false; } else { @@ -805,7 +803,7 @@ static struct xt_match hashlimit_mt_reg[] __read_mostly = { { .name = "hashlimit", .revision = 0, - .family = AF_INET, + .family = NFPROTO_IPV4, .match = hashlimit_mt_v0, .matchsize = sizeof(struct xt_hashlimit_info), #ifdef CONFIG_COMPAT @@ -820,7 +818,7 @@ static struct xt_match hashlimit_mt_reg[] __read_mostly = { { .name = "hashlimit", .revision = 1, - .family = AF_INET, + .family = NFPROTO_IPV4, .match = hashlimit_mt, .matchsize = sizeof(struct xt_hashlimit_mtinfo1), .checkentry = hashlimit_mt_check, @@ -830,7 +828,7 @@ static struct xt_match hashlimit_mt_reg[] __read_mostly = { #if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) { .name = "hashlimit", - .family = AF_INET6, + .family = NFPROTO_IPV6, .match = hashlimit_mt_v0, .matchsize = sizeof(struct xt_hashlimit_info), #ifdef CONFIG_COMPAT @@ -845,7 +843,7 @@ static struct xt_match hashlimit_mt_reg[] __read_mostly = { { .name = "hashlimit", .revision = 1, - .family = AF_INET6, + .family = NFPROTO_IPV6, .match = hashlimit_mt, .matchsize = sizeof(struct xt_hashlimit_mtinfo1), .checkentry = hashlimit_mt_check, @@ -907,7 +905,7 @@ static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family, rateinfo_recalc(ent, jiffies); switch (family) { - case AF_INET: + case NFPROTO_IPV4: return seq_printf(s, "%ld %u.%u.%u.%u:%u->" "%u.%u.%u.%u:%u %u %u %u\n", (long)(ent->expires - jiffies)/HZ, @@ -918,7 +916,7 @@ static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family, ent->rateinfo.credit, ent->rateinfo.credit_cap, ent->rateinfo.cost); #if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) - case AF_INET6: + case NFPROTO_IPV6: return seq_printf(s, "%ld " NIP6_FMT ":%u->" NIP6_FMT ":%u %u %u %u\n", (long)(ent->expires - jiffies)/HZ, diff --git a/net/netfilter/xt_helper.c b/net/netfilter/xt_helper.c index dada2905d66..134d94324eb 100644 --- a/net/netfilter/xt_helper.c +++ b/net/netfilter/xt_helper.c @@ -81,7 +81,7 @@ static void helper_mt_destroy(const struct xt_match *match, void *matchinfo) static struct xt_match helper_mt_reg[] __read_mostly = { { .name = "helper", - .family = AF_INET, + .family = NFPROTO_IPV4, .checkentry = helper_mt_check, .match = helper_mt, .destroy = helper_mt_destroy, @@ -90,7 +90,7 @@ static struct xt_match helper_mt_reg[] __read_mostly = { }, { .name = "helper", - .family = AF_INET6, + .family = NFPROTO_IPV6, .checkentry = helper_mt_check, .match = helper_mt, .destroy = helper_mt_destroy, diff --git a/net/netfilter/xt_iprange.c b/net/netfilter/xt_iprange.c index c63e9333c75..a7498cc48dc 100644 --- a/net/netfilter/xt_iprange.c +++ b/net/netfilter/xt_iprange.c @@ -141,7 +141,7 @@ static struct xt_match iprange_mt_reg[] __read_mostly = { { .name = "iprange", .revision = 0, - .family = AF_INET, + .family = NFPROTO_IPV4, .match = iprange_mt_v0, .matchsize = sizeof(struct ipt_iprange_info), .me = THIS_MODULE, @@ -149,7 +149,7 @@ static struct xt_match iprange_mt_reg[] __read_mostly = { { .name = "iprange", .revision = 1, - .family = AF_INET, + .family = NFPROTO_IPV4, .match = iprange_mt4, .matchsize = sizeof(struct xt_iprange_mtinfo), .me = THIS_MODULE, @@ -157,7 +157,7 @@ static struct xt_match iprange_mt_reg[] __read_mostly = { { .name = "iprange", .revision = 1, - .family = AF_INET6, + .family = NFPROTO_IPV6, .match = iprange_mt6, .matchsize = sizeof(struct xt_iprange_mtinfo), .me = THIS_MODULE, diff --git a/net/netfilter/xt_length.c b/net/netfilter/xt_length.c index b8640f97295..b8612d1914b 100644 --- a/net/netfilter/xt_length.c +++ b/net/netfilter/xt_length.c @@ -48,14 +48,14 @@ length_mt6(const struct sk_buff *skb, const struct net_device *in, static struct xt_match length_mt_reg[] __read_mostly = { { .name = "length", - .family = AF_INET, + .family = NFPROTO_IPV4, .match = length_mt, .matchsize = sizeof(struct xt_length_info), .me = THIS_MODULE, }, { .name = "length", - .family = AF_INET6, + .family = NFPROTO_IPV6, .match = length_mt6, .matchsize = sizeof(struct xt_length_info), .me = THIS_MODULE, diff --git a/net/netfilter/xt_limit.c b/net/netfilter/xt_limit.c index aad9ab8d204..584d66893c4 100644 --- a/net/netfilter/xt_limit.c +++ b/net/netfilter/xt_limit.c @@ -170,7 +170,7 @@ static int limit_mt_compat_to_user(void __user *dst, void *src) static struct xt_match limit_mt_reg[] __read_mostly = { { .name = "limit", - .family = AF_INET, + .family = NFPROTO_IPV4, .checkentry = limit_mt_check, .match = limit_mt, .matchsize = sizeof(struct xt_rateinfo), @@ -183,7 +183,7 @@ static struct xt_match limit_mt_reg[] __read_mostly = { }, { .name = "limit", - .family = AF_INET6, + .family = NFPROTO_IPV6, .checkentry = limit_mt_check, .match = limit_mt, .matchsize = sizeof(struct xt_rateinfo), diff --git a/net/netfilter/xt_mac.c b/net/netfilter/xt_mac.c index b3e96a0ec17..60db240098a 100644 --- a/net/netfilter/xt_mac.c +++ b/net/netfilter/xt_mac.c @@ -42,7 +42,7 @@ mac_mt(const struct sk_buff *skb, const struct net_device *in, static struct xt_match mac_mt_reg[] __read_mostly = { { .name = "mac", - .family = AF_INET, + .family = NFPROTO_IPV4, .match = mac_mt, .matchsize = sizeof(struct xt_mac_info), .hooks = (1 << NF_INET_PRE_ROUTING) | @@ -52,7 +52,7 @@ static struct xt_match mac_mt_reg[] __read_mostly = { }, { .name = "mac", - .family = AF_INET6, + .family = NFPROTO_IPV6, .match = mac_mt, .matchsize = sizeof(struct xt_mac_info), .hooks = (1 << NF_INET_PRE_ROUTING) | diff --git a/net/netfilter/xt_mark.c b/net/netfilter/xt_mark.c index 9f78f6120fb..c66affd5722 100644 --- a/net/netfilter/xt_mark.c +++ b/net/netfilter/xt_mark.c @@ -92,7 +92,7 @@ static struct xt_match mark_mt_reg[] __read_mostly = { { .name = "mark", .revision = 0, - .family = AF_INET, + .family = NFPROTO_IPV4, .checkentry = mark_mt_check_v0, .match = mark_mt_v0, .matchsize = sizeof(struct xt_mark_info), @@ -106,7 +106,7 @@ static struct xt_match mark_mt_reg[] __read_mostly = { { .name = "mark", .revision = 0, - .family = AF_INET6, + .family = NFPROTO_IPV6, .checkentry = mark_mt_check_v0, .match = mark_mt_v0, .matchsize = sizeof(struct xt_mark_info), @@ -120,7 +120,7 @@ static struct xt_match mark_mt_reg[] __read_mostly = { { .name = "mark", .revision = 1, - .family = AF_INET, + .family = NFPROTO_IPV4, .match = mark_mt, .matchsize = sizeof(struct xt_mark_mtinfo1), .me = THIS_MODULE, @@ -128,7 +128,7 @@ static struct xt_match mark_mt_reg[] __read_mostly = { { .name = "mark", .revision = 1, - .family = AF_INET6, + .family = NFPROTO_IPV6, .match = mark_mt, .matchsize = sizeof(struct xt_mark_mtinfo1), .me = THIS_MODULE, diff --git a/net/netfilter/xt_multiport.c b/net/netfilter/xt_multiport.c index fd88c489b70..f6fe008ab8c 100644 --- a/net/netfilter/xt_multiport.c +++ b/net/netfilter/xt_multiport.c @@ -216,7 +216,7 @@ multiport_mt6_check(const char *tablename, const void *info, static struct xt_match multiport_mt_reg[] __read_mostly = { { .name = "multiport", - .family = AF_INET, + .family = NFPROTO_IPV4, .revision = 0, .checkentry = multiport_mt_check_v0, .match = multiport_mt_v0, @@ -225,7 +225,7 @@ static struct xt_match multiport_mt_reg[] __read_mostly = { }, { .name = "multiport", - .family = AF_INET, + .family = NFPROTO_IPV4, .revision = 1, .checkentry = multiport_mt_check, .match = multiport_mt, @@ -234,7 +234,7 @@ static struct xt_match multiport_mt_reg[] __read_mostly = { }, { .name = "multiport", - .family = AF_INET6, + .family = NFPROTO_IPV6, .revision = 0, .checkentry = multiport_mt6_check_v0, .match = multiport_mt_v0, @@ -243,7 +243,7 @@ static struct xt_match multiport_mt_reg[] __read_mostly = { }, { .name = "multiport", - .family = AF_INET6, + .family = NFPROTO_IPV6, .revision = 1, .checkentry = multiport_mt6_check, .match = multiport_mt, diff --git a/net/netfilter/xt_owner.c b/net/netfilter/xt_owner.c index 9059c16144c..d1c3b7ae9b4 100644 --- a/net/netfilter/xt_owner.c +++ b/net/netfilter/xt_owner.c @@ -153,7 +153,7 @@ static struct xt_match owner_mt_reg[] __read_mostly = { { .name = "owner", .revision = 0, - .family = AF_INET, + .family = NFPROTO_IPV4, .match = owner_mt_v0, .matchsize = sizeof(struct ipt_owner_info), .checkentry = owner_mt_check_v0, @@ -164,7 +164,7 @@ static struct xt_match owner_mt_reg[] __read_mostly = { { .name = "owner", .revision = 0, - .family = AF_INET6, + .family = NFPROTO_IPV6, .match = owner_mt6_v0, .matchsize = sizeof(struct ip6t_owner_info), .checkentry = owner_mt6_check_v0, @@ -175,7 +175,7 @@ static struct xt_match owner_mt_reg[] __read_mostly = { { .name = "owner", .revision = 1, - .family = AF_INET, + .family = NFPROTO_IPV4, .match = owner_mt, .matchsize = sizeof(struct xt_owner_match_info), .hooks = (1 << NF_INET_LOCAL_OUT) | @@ -185,7 +185,7 @@ static struct xt_match owner_mt_reg[] __read_mostly = { { .name = "owner", .revision = 1, - .family = AF_INET6, + .family = NFPROTO_IPV6, .match = owner_mt, .matchsize = sizeof(struct xt_owner_match_info), .hooks = (1 << NF_INET_LOCAL_OUT) | diff --git a/net/netfilter/xt_physdev.c b/net/netfilter/xt_physdev.c index 4ec1094bda9..72a0bdd53fa 100644 --- a/net/netfilter/xt_physdev.c +++ b/net/netfilter/xt_physdev.c @@ -121,7 +121,7 @@ physdev_mt_check(const char *tablename, const void *ip, static struct xt_match physdev_mt_reg[] __read_mostly = { { .name = "physdev", - .family = AF_INET, + .family = NFPROTO_IPV4, .checkentry = physdev_mt_check, .match = physdev_mt, .matchsize = sizeof(struct xt_physdev_info), @@ -129,7 +129,7 @@ static struct xt_match physdev_mt_reg[] __read_mostly = { }, { .name = "physdev", - .family = AF_INET6, + .family = NFPROTO_IPV6, .checkentry = physdev_mt_check, .match = physdev_mt, .matchsize = sizeof(struct xt_physdev_info), diff --git a/net/netfilter/xt_pkttype.c b/net/netfilter/xt_pkttype.c index 7936f7e2325..81e86d319a8 100644 --- a/net/netfilter/xt_pkttype.c +++ b/net/netfilter/xt_pkttype.c @@ -33,10 +33,10 @@ pkttype_mt(const struct sk_buff *skb, const struct net_device *in, if (skb->pkt_type != PACKET_LOOPBACK) type = skb->pkt_type; - else if (match->family == AF_INET && + else if (match->family == NFPROTO_IPV4 && ipv4_is_multicast(ip_hdr(skb)->daddr)) type = PACKET_MULTICAST; - else if (match->family == AF_INET6 && + else if (match->family == NFPROTO_IPV6 && ipv6_hdr(skb)->daddr.s6_addr[0] == 0xFF) type = PACKET_MULTICAST; else @@ -48,14 +48,14 @@ pkttype_mt(const struct sk_buff *skb, const struct net_device *in, static struct xt_match pkttype_mt_reg[] __read_mostly = { { .name = "pkttype", - .family = AF_INET, + .family = NFPROTO_IPV4, .match = pkttype_mt, .matchsize = sizeof(struct xt_pkttype_info), .me = THIS_MODULE, }, { .name = "pkttype", - .family = AF_INET6, + .family = NFPROTO_IPV6, .match = pkttype_mt, .matchsize = sizeof(struct xt_pkttype_info), .me = THIS_MODULE, diff --git a/net/netfilter/xt_policy.c b/net/netfilter/xt_policy.c index d351582b2a3..f1d514e9d0a 100644 --- a/net/netfilter/xt_policy.c +++ b/net/netfilter/xt_policy.c @@ -26,9 +26,9 @@ xt_addr_cmp(const union nf_inet_addr *a1, const union nf_inet_addr *m, const union nf_inet_addr *a2, unsigned short family) { switch (family) { - case AF_INET: + case NFPROTO_IPV4: return ((a1->ip ^ a2->ip) & m->ip) == 0; - case AF_INET6: + case NFPROTO_IPV6: return ipv6_masked_addr_cmp(&a1->in6, &m->in6, &a2->in6) == 0; } return false; @@ -165,7 +165,7 @@ policy_mt_check(const char *tablename, const void *ip_void, static struct xt_match policy_mt_reg[] __read_mostly = { { .name = "policy", - .family = AF_INET, + .family = NFPROTO_IPV4, .checkentry = policy_mt_check, .match = policy_mt, .matchsize = sizeof(struct xt_policy_info), @@ -173,7 +173,7 @@ static struct xt_match policy_mt_reg[] __read_mostly = { }, { .name = "policy", - .family = AF_INET6, + .family = NFPROTO_IPV6, .checkentry = policy_mt_check, .match = policy_mt, .matchsize = sizeof(struct xt_policy_info), diff --git a/net/netfilter/xt_quota.c b/net/netfilter/xt_quota.c index 3b021d0c522..59f61e32b62 100644 --- a/net/netfilter/xt_quota.c +++ b/net/netfilter/xt_quota.c @@ -57,7 +57,7 @@ quota_mt_check(const char *tablename, const void *entry, static struct xt_match quota_mt_reg[] __read_mostly = { { .name = "quota", - .family = AF_INET, + .family = NFPROTO_IPV4, .checkentry = quota_mt_check, .match = quota_mt, .matchsize = sizeof(struct xt_quota_info), @@ -65,7 +65,7 @@ static struct xt_match quota_mt_reg[] __read_mostly = { }, { .name = "quota", - .family = AF_INET6, + .family = NFPROTO_IPV6, .checkentry = quota_mt_check, .match = quota_mt, .matchsize = sizeof(struct xt_quota_info), diff --git a/net/netfilter/xt_rateest.c b/net/netfilter/xt_rateest.c index ebd84f1b4f6..ba1cb5760f4 100644 --- a/net/netfilter/xt_rateest.c +++ b/net/netfilter/xt_rateest.c @@ -139,7 +139,7 @@ static void xt_rateest_mt_destroy(const struct xt_match *match, static struct xt_match xt_rateest_match[] __read_mostly = { { - .family = AF_INET, + .family = NFPROTO_IPV4, .name = "rateest", .match = xt_rateest_mt, .checkentry = xt_rateest_mt_checkentry, @@ -148,7 +148,7 @@ static struct xt_match xt_rateest_match[] __read_mostly = { .me = THIS_MODULE, }, { - .family = AF_INET6, + .family = NFPROTO_IPV6, .name = "rateest", .match = xt_rateest_mt, .checkentry = xt_rateest_mt_checkentry, diff --git a/net/netfilter/xt_realm.c b/net/netfilter/xt_realm.c index 7df1627c536..ef65756d489 100644 --- a/net/netfilter/xt_realm.c +++ b/net/netfilter/xt_realm.c @@ -39,7 +39,7 @@ static struct xt_match realm_mt_reg __read_mostly = { .matchsize = sizeof(struct xt_realm_info), .hooks = (1 << NF_INET_POST_ROUTING) | (1 << NF_INET_FORWARD) | (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_LOCAL_IN), - .family = AF_INET, + .family = NFPROTO_IPV4, .me = THIS_MODULE }; diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c index adc2e2f1b09..4a916e2624d 100644 --- a/net/netfilter/xt_recent.c +++ b/net/netfilter/xt_recent.c @@ -124,7 +124,7 @@ recent_entry_lookup(const struct recent_table *table, struct recent_entry *e; unsigned int h; - if (family == AF_INET) + if (family == NFPROTO_IPV4) h = recent_entry_hash4(addrp); else h = recent_entry_hash6(addrp); @@ -165,7 +165,7 @@ recent_entry_init(struct recent_table *t, const union nf_inet_addr *addr, e->nstamps = 1; e->index = 1; e->family = family; - if (family == AF_INET) + if (family == NFPROTO_IPV4) list_add_tail(&e->list, &t->iphash[recent_entry_hash4(addr)]); else list_add_tail(&e->list, &t->iphash[recent_entry_hash6(addr)]); @@ -216,7 +216,7 @@ recent_mt(const struct sk_buff *skb, const struct net_device *in, u_int8_t ttl; bool ret = info->invert; - if (match->family == AF_INET) { + if (match->family == NFPROTO_IPV4) { const struct iphdr *iph = ip_hdr(skb); if (info->side == XT_RECENT_DEST) @@ -429,7 +429,7 @@ static int recent_seq_show(struct seq_file *seq, void *v) unsigned int i; i = (e->index - 1) % ip_pkt_list_tot; - if (e->family == AF_INET) + if (e->family == NFPROTO_IPV4) seq_printf(seq, "src=" NIPQUAD_FMT " ttl: %u last_seen: %lu " "oldest_pkt: %u", NIPQUAD(e->addr.ip), e->ttl, e->stamps[i], e->index); @@ -519,10 +519,11 @@ static ssize_t recent_old_proc_write(struct file *file, addr = in_aton(c); spin_lock_bh(&recent_lock); - e = recent_entry_lookup(t, (const void *)&addr, PF_INET, 0); + e = recent_entry_lookup(t, (const void *)&addr, NFPROTO_IPV4, 0); if (e == NULL) { if (add) - recent_entry_init(t, (const void *)&addr, PF_INET, 0); + recent_entry_init(t, (const void *)&addr, + NFPROTO_IPV4, 0); } else { if (add) recent_entry_update(t, e); @@ -585,10 +586,10 @@ recent_mt_proc_write(struct file *file, const char __user *input, ++c; --size; if (strnchr(c, size, ':') != NULL) { - family = AF_INET6; + family = NFPROTO_IPV6; succ = in6_pton(c, size, (void *)&addr, '\n', NULL); } else { - family = AF_INET; + family = NFPROTO_IPV4; succ = in4_pton(c, size, (void *)&addr, '\n', NULL); } @@ -628,7 +629,7 @@ static struct xt_match recent_mt_reg[] __read_mostly = { { .name = "recent", .revision = 0, - .family = AF_INET, + .family = NFPROTO_IPV4, .match = recent_mt, .matchsize = sizeof(struct xt_recent_mtinfo), .checkentry = recent_mt_check, @@ -638,7 +639,7 @@ static struct xt_match recent_mt_reg[] __read_mostly = { { .name = "recent", .revision = 0, - .family = AF_INET6, + .family = NFPROTO_IPV6, .match = recent_mt, .matchsize = sizeof(struct xt_recent_mtinfo), .checkentry = recent_mt_check, diff --git a/net/netfilter/xt_sctp.c b/net/netfilter/xt_sctp.c index e6e4681fa04..ab67aca4d8f 100644 --- a/net/netfilter/xt_sctp.c +++ b/net/netfilter/xt_sctp.c @@ -169,7 +169,7 @@ sctp_mt_check(const char *tablename, const void *inf, static struct xt_match sctp_mt_reg[] __read_mostly = { { .name = "sctp", - .family = AF_INET, + .family = NFPROTO_IPV4, .checkentry = sctp_mt_check, .match = sctp_mt, .matchsize = sizeof(struct xt_sctp_info), @@ -178,7 +178,7 @@ static struct xt_match sctp_mt_reg[] __read_mostly = { }, { .name = "sctp", - .family = AF_INET6, + .family = NFPROTO_IPV6, .checkentry = sctp_mt_check, .match = sctp_mt, .matchsize = sizeof(struct xt_sctp_info), diff --git a/net/netfilter/xt_state.c b/net/netfilter/xt_state.c index a776dc36a19..f92f8bcc1e3 100644 --- a/net/netfilter/xt_state.c +++ b/net/netfilter/xt_state.c @@ -61,7 +61,7 @@ static void state_mt_destroy(const struct xt_match *match, void *matchinfo) static struct xt_match state_mt_reg[] __read_mostly = { { .name = "state", - .family = AF_INET, + .family = NFPROTO_IPV4, .checkentry = state_mt_check, .match = state_mt, .destroy = state_mt_destroy, @@ -70,7 +70,7 @@ static struct xt_match state_mt_reg[] __read_mostly = { }, { .name = "state", - .family = AF_INET6, + .family = NFPROTO_IPV6, .checkentry = state_mt_check, .match = state_mt, .destroy = state_mt_destroy, diff --git a/net/netfilter/xt_statistic.c b/net/netfilter/xt_statistic.c index 43133080da7..fd3bb1400df 100644 --- a/net/netfilter/xt_statistic.c +++ b/net/netfilter/xt_statistic.c @@ -69,7 +69,7 @@ statistic_mt_check(const char *tablename, const void *entry, static struct xt_match statistic_mt_reg[] __read_mostly = { { .name = "statistic", - .family = AF_INET, + .family = NFPROTO_IPV4, .checkentry = statistic_mt_check, .match = statistic_mt, .matchsize = sizeof(struct xt_statistic_info), @@ -77,7 +77,7 @@ static struct xt_match statistic_mt_reg[] __read_mostly = { }, { .name = "statistic", - .family = AF_INET6, + .family = NFPROTO_IPV6, .checkentry = statistic_mt_check, .match = statistic_mt, .matchsize = sizeof(struct xt_statistic_info), diff --git a/net/netfilter/xt_string.c b/net/netfilter/xt_string.c index 4903182a062..50169718377 100644 --- a/net/netfilter/xt_string.c +++ b/net/netfilter/xt_string.c @@ -85,7 +85,7 @@ static struct xt_match string_mt_reg[] __read_mostly = { { .name = "string", .revision = 0, - .family = AF_INET, + .family = NFPROTO_IPV4, .checkentry = string_mt_check, .match = string_mt, .destroy = string_mt_destroy, @@ -95,7 +95,7 @@ static struct xt_match string_mt_reg[] __read_mostly = { { .name = "string", .revision = 1, - .family = AF_INET, + .family = NFPROTO_IPV4, .checkentry = string_mt_check, .match = string_mt, .destroy = string_mt_destroy, @@ -105,7 +105,7 @@ static struct xt_match string_mt_reg[] __read_mostly = { { .name = "string", .revision = 0, - .family = AF_INET6, + .family = NFPROTO_IPV6, .checkentry = string_mt_check, .match = string_mt, .destroy = string_mt_destroy, @@ -115,7 +115,7 @@ static struct xt_match string_mt_reg[] __read_mostly = { { .name = "string", .revision = 1, - .family = AF_INET6, + .family = NFPROTO_IPV6, .checkentry = string_mt_check, .match = string_mt, .destroy = string_mt_destroy, diff --git a/net/netfilter/xt_tcpmss.c b/net/netfilter/xt_tcpmss.c index 6771bf01275..4791c7cbe5a 100644 --- a/net/netfilter/xt_tcpmss.c +++ b/net/netfilter/xt_tcpmss.c @@ -83,7 +83,7 @@ dropit: static struct xt_match tcpmss_mt_reg[] __read_mostly = { { .name = "tcpmss", - .family = AF_INET, + .family = NFPROTO_IPV4, .match = tcpmss_mt, .matchsize = sizeof(struct xt_tcpmss_match_info), .proto = IPPROTO_TCP, @@ -91,7 +91,7 @@ static struct xt_match tcpmss_mt_reg[] __read_mostly = { }, { .name = "tcpmss", - .family = AF_INET6, + .family = NFPROTO_IPV6, .match = tcpmss_mt, .matchsize = sizeof(struct xt_tcpmss_match_info), .proto = IPPROTO_TCP, diff --git a/net/netfilter/xt_tcpudp.c b/net/netfilter/xt_tcpudp.c index 951b06b8d70..5a6268cbb9f 100644 --- a/net/netfilter/xt_tcpudp.c +++ b/net/netfilter/xt_tcpudp.c @@ -186,7 +186,7 @@ udp_mt_check(const char *tablename, const void *info, static struct xt_match tcpudp_mt_reg[] __read_mostly = { { .name = "tcp", - .family = AF_INET, + .family = NFPROTO_IPV4, .checkentry = tcp_mt_check, .match = tcp_mt, .matchsize = sizeof(struct xt_tcp), @@ -195,7 +195,7 @@ static struct xt_match tcpudp_mt_reg[] __read_mostly = { }, { .name = "tcp", - .family = AF_INET6, + .family = NFPROTO_IPV6, .checkentry = tcp_mt_check, .match = tcp_mt, .matchsize = sizeof(struct xt_tcp), @@ -204,7 +204,7 @@ static struct xt_match tcpudp_mt_reg[] __read_mostly = { }, { .name = "udp", - .family = AF_INET, + .family = NFPROTO_IPV4, .checkentry = udp_mt_check, .match = udp_mt, .matchsize = sizeof(struct xt_udp), @@ -213,7 +213,7 @@ static struct xt_match tcpudp_mt_reg[] __read_mostly = { }, { .name = "udp", - .family = AF_INET6, + .family = NFPROTO_IPV6, .checkentry = udp_mt_check, .match = udp_mt, .matchsize = sizeof(struct xt_udp), @@ -222,7 +222,7 @@ static struct xt_match tcpudp_mt_reg[] __read_mostly = { }, { .name = "udplite", - .family = AF_INET, + .family = NFPROTO_IPV4, .checkentry = udp_mt_check, .match = udp_mt, .matchsize = sizeof(struct xt_udp), @@ -231,7 +231,7 @@ static struct xt_match tcpudp_mt_reg[] __read_mostly = { }, { .name = "udplite", - .family = AF_INET6, + .family = NFPROTO_IPV6, .checkentry = udp_mt_check, .match = udp_mt, .matchsize = sizeof(struct xt_udp), diff --git a/net/netfilter/xt_time.c b/net/netfilter/xt_time.c index 307a2c3c2df..fe9dae2b4f5 100644 --- a/net/netfilter/xt_time.c +++ b/net/netfilter/xt_time.c @@ -240,7 +240,7 @@ time_mt_check(const char *tablename, const void *ip, static struct xt_match time_mt_reg[] __read_mostly = { { .name = "time", - .family = AF_INET, + .family = NFPROTO_IPV4, .match = time_mt, .matchsize = sizeof(struct xt_time_info), .checkentry = time_mt_check, @@ -248,7 +248,7 @@ static struct xt_match time_mt_reg[] __read_mostly = { }, { .name = "time", - .family = AF_INET6, + .family = NFPROTO_IPV6, .match = time_mt, .matchsize = sizeof(struct xt_time_info), .checkentry = time_mt_check, diff --git a/net/netfilter/xt_u32.c b/net/netfilter/xt_u32.c index 627e0f336d5..ed9f8340611 100644 --- a/net/netfilter/xt_u32.c +++ b/net/netfilter/xt_u32.c @@ -102,14 +102,14 @@ u32_mt(const struct sk_buff *skb, const struct net_device *in, static struct xt_match u32_mt_reg[] __read_mostly = { { .name = "u32", - .family = AF_INET, + .family = NFPROTO_IPV4, .match = u32_mt, .matchsize = sizeof(struct xt_u32), .me = THIS_MODULE, }, { .name = "u32", - .family = AF_INET6, + .family = NFPROTO_IPV6, .match = u32_mt, .matchsize = sizeof(struct xt_u32), .me = THIS_MODULE, -- cgit v1.2.3 From 55b69e91040c685a064198bd76e59885b7ad26c6 Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Wed, 8 Oct 2008 11:35:01 +0200 Subject: netfilter: implement NFPROTO_UNSPEC as a wildcard for extensions When a match or target is looked up using xt_find_{match,target}, Xtables will also search the NFPROTO_UNSPEC module list. This allows for protocol-independent extensions (like xt_time) to be reused from other components (e.g. arptables, ebtables). Extensions that take different codepaths depending on match->family or target->family of course cannot use NFPROTO_UNSPEC within the registration structure (e.g. xt_pkttype). Signed-off-by: Jan Engelhardt Signed-off-by: Patrick McHardy --- net/netfilter/x_tables.c | 10 ++++++++++ net/netfilter/xt_CLASSIFY.c | 38 ++++++++++++-------------------------- net/netfilter/xt_MARK.c | 10 +--------- net/netfilter/xt_RATEEST.c | 33 +++++++++++---------------------- net/netfilter/xt_SECMARK.c | 32 +++++++++++--------------------- net/netfilter/xt_TRACE.c | 26 +++++++++----------------- net/netfilter/xt_limit.c | 40 +++++++++++++--------------------------- net/netfilter/xt_mark.c | 26 ++------------------------ net/netfilter/xt_quota.c | 29 ++++++++++------------------- net/netfilter/xt_rateest.c | 33 +++++++++++---------------------- net/netfilter/xt_statistic.c | 31 ++++++++++--------------------- net/netfilter/xt_string.c | 31 ++++++------------------------- net/netfilter/xt_time.c | 28 +++++++++------------------- net/netfilter/xt_u32.c | 26 +++++++++----------------- 14 files changed, 124 insertions(+), 269 deletions(-) (limited to 'net/netfilter') diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 2a7eb1da5d0..aece6c2d134 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -209,6 +209,11 @@ struct xt_match *xt_find_match(u8 af, const char *name, u8 revision) } } mutex_unlock(&xt[af].mutex); + + if (af != NFPROTO_UNSPEC) + /* Try searching again in the family-independent list */ + return xt_find_match(NFPROTO_UNSPEC, name, revision); + return ERR_PTR(err); } EXPORT_SYMBOL(xt_find_match); @@ -234,6 +239,11 @@ struct xt_target *xt_find_target(u8 af, const char *name, u8 revision) } } mutex_unlock(&xt[af].mutex); + + if (af != NFPROTO_UNSPEC) + /* Try searching again in the family-independent list */ + return xt_find_target(NFPROTO_UNSPEC, name, revision); + return ERR_PTR(err); } EXPORT_SYMBOL(xt_find_target); diff --git a/net/netfilter/xt_CLASSIFY.c b/net/netfilter/xt_CLASSIFY.c index 9d68da1748b..8cffa295dd3 100644 --- a/net/netfilter/xt_CLASSIFY.c +++ b/net/netfilter/xt_CLASSIFY.c @@ -37,40 +37,26 @@ classify_tg(struct sk_buff *skb, const struct net_device *in, return XT_CONTINUE; } -static struct xt_target classify_tg_reg[] __read_mostly = { - { - .family = NFPROTO_IPV4, - .name = "CLASSIFY", - .target = classify_tg, - .targetsize = sizeof(struct xt_classify_target_info), - .table = "mangle", - .hooks = (1 << NF_INET_LOCAL_OUT) | - (1 << NF_INET_FORWARD) | - (1 << NF_INET_POST_ROUTING), - .me = THIS_MODULE, - }, - { - .name = "CLASSIFY", - .family = NFPROTO_IPV6, - .target = classify_tg, - .targetsize = sizeof(struct xt_classify_target_info), - .table = "mangle", - .hooks = (1 << NF_INET_LOCAL_OUT) | - (1 << NF_INET_FORWARD) | - (1 << NF_INET_POST_ROUTING), - .me = THIS_MODULE, - }, +static struct xt_target classify_tg_reg __read_mostly = { + .name = "CLASSIFY", + .revision = 0, + .family = NFPROTO_UNSPEC, + .table = "mangle", + .hooks = (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_FORWARD) | + (1 << NF_INET_POST_ROUTING), + .target = classify_tg, + .targetsize = sizeof(struct xt_classify_target_info), + .me = THIS_MODULE, }; static int __init classify_tg_init(void) { - return xt_register_targets(classify_tg_reg, - ARRAY_SIZE(classify_tg_reg)); + return xt_register_target(&classify_tg_reg); } static void __exit classify_tg_exit(void) { - xt_unregister_targets(classify_tg_reg, ARRAY_SIZE(classify_tg_reg)); + xt_unregister_target(&classify_tg_reg); } module_init(classify_tg_init); diff --git a/net/netfilter/xt_MARK.c b/net/netfilter/xt_MARK.c index 55ef0796c76..c8ea7a80970 100644 --- a/net/netfilter/xt_MARK.c +++ b/net/netfilter/xt_MARK.c @@ -222,15 +222,7 @@ static struct xt_target mark_tg_reg[] __read_mostly = { { .name = "MARK", .revision = 2, - .family = NFPROTO_IPV4, - .target = mark_tg, - .targetsize = sizeof(struct xt_mark_tginfo2), - .me = THIS_MODULE, - }, - { - .name = "MARK", - .revision = 2, - .family = NFPROTO_IPV6, + .family = NFPROTO_UNSPEC, .target = mark_tg, .targetsize = sizeof(struct xt_mark_tginfo2), .me = THIS_MODULE, diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c index f7114fc5cfc..da7946e6ecb 100644 --- a/net/netfilter/xt_RATEEST.c +++ b/net/netfilter/xt_RATEEST.c @@ -157,25 +157,15 @@ static void xt_rateest_tg_destroy(const struct xt_target *target, xt_rateest_put(info->est); } -static struct xt_target xt_rateest_target[] __read_mostly = { - { - .family = NFPROTO_IPV4, - .name = "RATEEST", - .target = xt_rateest_tg, - .checkentry = xt_rateest_tg_checkentry, - .destroy = xt_rateest_tg_destroy, - .targetsize = sizeof(struct xt_rateest_target_info), - .me = THIS_MODULE, - }, - { - .family = NFPROTO_IPV6, - .name = "RATEEST", - .target = xt_rateest_tg, - .checkentry = xt_rateest_tg_checkentry, - .destroy = xt_rateest_tg_destroy, - .targetsize = sizeof(struct xt_rateest_target_info), - .me = THIS_MODULE, - }, +static struct xt_target xt_rateest_tg_reg __read_mostly = { + .name = "RATEEST", + .revision = 0, + .family = NFPROTO_UNSPEC, + .target = xt_rateest_tg, + .checkentry = xt_rateest_tg_checkentry, + .destroy = xt_rateest_tg_destroy, + .targetsize = sizeof(struct xt_rateest_target_info), + .me = THIS_MODULE, }; static int __init xt_rateest_tg_init(void) @@ -186,13 +176,12 @@ static int __init xt_rateest_tg_init(void) INIT_HLIST_HEAD(&rateest_hash[i]); get_random_bytes(&jhash_rnd, sizeof(jhash_rnd)); - return xt_register_targets(xt_rateest_target, - ARRAY_SIZE(xt_rateest_target)); + return xt_register_target(&xt_rateest_tg_reg); } static void __exit xt_rateest_tg_fini(void) { - xt_unregister_targets(xt_rateest_target, ARRAY_SIZE(xt_rateest_target)); + xt_unregister_target(&xt_rateest_tg_reg); } diff --git a/net/netfilter/xt_SECMARK.c b/net/netfilter/xt_SECMARK.c index 8f8f57b93a6..2a2ab833481 100644 --- a/net/netfilter/xt_SECMARK.c +++ b/net/netfilter/xt_SECMARK.c @@ -125,35 +125,25 @@ static void secmark_tg_destroy(const struct xt_target *target, void *targinfo) } } -static struct xt_target secmark_tg_reg[] __read_mostly = { - { - .name = "SECMARK", - .family = NFPROTO_IPV4, - .checkentry = secmark_tg_check, - .destroy = secmark_tg_destroy, - .target = secmark_tg, - .targetsize = sizeof(struct xt_secmark_target_info), - .me = THIS_MODULE, - }, - { - .name = "SECMARK", - .family = NFPROTO_IPV6, - .checkentry = secmark_tg_check, - .destroy = secmark_tg_destroy, - .target = secmark_tg, - .targetsize = sizeof(struct xt_secmark_target_info), - .me = THIS_MODULE, - }, +static struct xt_target secmark_tg_reg __read_mostly = { + .name = "SECMARK", + .revision = 0, + .family = NFPROTO_UNSPEC, + .checkentry = secmark_tg_check, + .destroy = secmark_tg_destroy, + .target = secmark_tg, + .targetsize = sizeof(struct xt_secmark_target_info), + .me = THIS_MODULE, }; static int __init secmark_tg_init(void) { - return xt_register_targets(secmark_tg_reg, ARRAY_SIZE(secmark_tg_reg)); + return xt_register_target(&secmark_tg_reg); } static void __exit secmark_tg_exit(void) { - xt_unregister_targets(secmark_tg_reg, ARRAY_SIZE(secmark_tg_reg)); + xt_unregister_target(&secmark_tg_reg); } module_init(secmark_tg_init); diff --git a/net/netfilter/xt_TRACE.c b/net/netfilter/xt_TRACE.c index e1bcad57914..da35f9f1cd7 100644 --- a/net/netfilter/xt_TRACE.c +++ b/net/netfilter/xt_TRACE.c @@ -19,31 +19,23 @@ trace_tg(struct sk_buff *skb, const struct net_device *in, return XT_CONTINUE; } -static struct xt_target trace_tg_reg[] __read_mostly = { - { - .name = "TRACE", - .family = NFPROTO_IPV4, - .target = trace_tg, - .table = "raw", - .me = THIS_MODULE, - }, - { - .name = "TRACE", - .family = NFPROTO_IPV6, - .target = trace_tg, - .table = "raw", - .me = THIS_MODULE, - }, +static struct xt_target trace_tg_reg __read_mostly = { + .name = "TRACE", + .revision = 0, + .family = NFPROTO_UNSPEC, + .table = "raw", + .target = trace_tg, + .me = THIS_MODULE, }; static int __init trace_tg_init(void) { - return xt_register_targets(trace_tg_reg, ARRAY_SIZE(trace_tg_reg)); + return xt_register_target(&trace_tg_reg); } static void __exit trace_tg_exit(void) { - xt_unregister_targets(trace_tg_reg, ARRAY_SIZE(trace_tg_reg)); + xt_unregister_target(&trace_tg_reg); } module_init(trace_tg_init); diff --git a/net/netfilter/xt_limit.c b/net/netfilter/xt_limit.c index 584d66893c4..00247bd1095 100644 --- a/net/netfilter/xt_limit.c +++ b/net/netfilter/xt_limit.c @@ -167,43 +167,29 @@ static int limit_mt_compat_to_user(void __user *dst, void *src) } #endif /* CONFIG_COMPAT */ -static struct xt_match limit_mt_reg[] __read_mostly = { - { - .name = "limit", - .family = NFPROTO_IPV4, - .checkentry = limit_mt_check, - .match = limit_mt, - .matchsize = sizeof(struct xt_rateinfo), +static struct xt_match limit_mt_reg __read_mostly = { + .name = "limit", + .revision = 0, + .family = NFPROTO_UNSPEC, + .match = limit_mt, + .checkentry = limit_mt_check, + .matchsize = sizeof(struct xt_rateinfo), #ifdef CONFIG_COMPAT - .compatsize = sizeof(struct compat_xt_rateinfo), - .compat_from_user = limit_mt_compat_from_user, - .compat_to_user = limit_mt_compat_to_user, + .compatsize = sizeof(struct compat_xt_rateinfo), + .compat_from_user = limit_mt_compat_from_user, + .compat_to_user = limit_mt_compat_to_user, #endif - .me = THIS_MODULE, - }, - { - .name = "limit", - .family = NFPROTO_IPV6, - .checkentry = limit_mt_check, - .match = limit_mt, - .matchsize = sizeof(struct xt_rateinfo), -#ifdef CONFIG_COMPAT - .compatsize = sizeof(struct compat_xt_rateinfo), - .compat_from_user = limit_mt_compat_from_user, - .compat_to_user = limit_mt_compat_to_user, -#endif - .me = THIS_MODULE, - }, + .me = THIS_MODULE, }; static int __init limit_mt_init(void) { - return xt_register_matches(limit_mt_reg, ARRAY_SIZE(limit_mt_reg)); + return xt_register_match(&limit_mt_reg); } static void __exit limit_mt_exit(void) { - xt_unregister_matches(limit_mt_reg, ARRAY_SIZE(limit_mt_reg)); + xt_unregister_match(&limit_mt_reg); } module_init(limit_mt_init); diff --git a/net/netfilter/xt_mark.c b/net/netfilter/xt_mark.c index c66affd5722..96dd2b63b6b 100644 --- a/net/netfilter/xt_mark.c +++ b/net/netfilter/xt_mark.c @@ -92,7 +92,7 @@ static struct xt_match mark_mt_reg[] __read_mostly = { { .name = "mark", .revision = 0, - .family = NFPROTO_IPV4, + .family = NFPROTO_UNSPEC, .checkentry = mark_mt_check_v0, .match = mark_mt_v0, .matchsize = sizeof(struct xt_mark_info), @@ -103,32 +103,10 @@ static struct xt_match mark_mt_reg[] __read_mostly = { #endif .me = THIS_MODULE, }, - { - .name = "mark", - .revision = 0, - .family = NFPROTO_IPV6, - .checkentry = mark_mt_check_v0, - .match = mark_mt_v0, - .matchsize = sizeof(struct xt_mark_info), -#ifdef CONFIG_COMPAT - .compatsize = sizeof(struct compat_xt_mark_info), - .compat_from_user = mark_mt_compat_from_user_v0, - .compat_to_user = mark_mt_compat_to_user_v0, -#endif - .me = THIS_MODULE, - }, - { - .name = "mark", - .revision = 1, - .family = NFPROTO_IPV4, - .match = mark_mt, - .matchsize = sizeof(struct xt_mark_mtinfo1), - .me = THIS_MODULE, - }, { .name = "mark", .revision = 1, - .family = NFPROTO_IPV6, + .family = NFPROTO_UNSPEC, .match = mark_mt, .matchsize = sizeof(struct xt_mark_mtinfo1), .me = THIS_MODULE, diff --git a/net/netfilter/xt_quota.c b/net/netfilter/xt_quota.c index 59f61e32b62..a3c8798f0cc 100644 --- a/net/netfilter/xt_quota.c +++ b/net/netfilter/xt_quota.c @@ -54,33 +54,24 @@ quota_mt_check(const char *tablename, const void *entry, return true; } -static struct xt_match quota_mt_reg[] __read_mostly = { - { - .name = "quota", - .family = NFPROTO_IPV4, - .checkentry = quota_mt_check, - .match = quota_mt, - .matchsize = sizeof(struct xt_quota_info), - .me = THIS_MODULE - }, - { - .name = "quota", - .family = NFPROTO_IPV6, - .checkentry = quota_mt_check, - .match = quota_mt, - .matchsize = sizeof(struct xt_quota_info), - .me = THIS_MODULE - }, +static struct xt_match quota_mt_reg __read_mostly = { + .name = "quota", + .revision = 0, + .family = NFPROTO_UNSPEC, + .match = quota_mt, + .checkentry = quota_mt_check, + .matchsize = sizeof(struct xt_quota_info), + .me = THIS_MODULE, }; static int __init quota_mt_init(void) { - return xt_register_matches(quota_mt_reg, ARRAY_SIZE(quota_mt_reg)); + return xt_register_match("a_mt_reg); } static void __exit quota_mt_exit(void) { - xt_unregister_matches(quota_mt_reg, ARRAY_SIZE(quota_mt_reg)); + xt_unregister_match("a_mt_reg); } module_init(quota_mt_init); diff --git a/net/netfilter/xt_rateest.c b/net/netfilter/xt_rateest.c index ba1cb5760f4..4dcfd7353db 100644 --- a/net/netfilter/xt_rateest.c +++ b/net/netfilter/xt_rateest.c @@ -137,36 +137,25 @@ static void xt_rateest_mt_destroy(const struct xt_match *match, xt_rateest_put(info->est2); } -static struct xt_match xt_rateest_match[] __read_mostly = { - { - .family = NFPROTO_IPV4, - .name = "rateest", - .match = xt_rateest_mt, - .checkentry = xt_rateest_mt_checkentry, - .destroy = xt_rateest_mt_destroy, - .matchsize = sizeof(struct xt_rateest_match_info), - .me = THIS_MODULE, - }, - { - .family = NFPROTO_IPV6, - .name = "rateest", - .match = xt_rateest_mt, - .checkentry = xt_rateest_mt_checkentry, - .destroy = xt_rateest_mt_destroy, - .matchsize = sizeof(struct xt_rateest_match_info), - .me = THIS_MODULE, - }, +static struct xt_match xt_rateest_mt_reg __read_mostly = { + .name = "rateest", + .revision = 0, + .family = NFPROTO_UNSPEC, + .match = xt_rateest_mt, + .checkentry = xt_rateest_mt_checkentry, + .destroy = xt_rateest_mt_destroy, + .matchsize = sizeof(struct xt_rateest_match_info), + .me = THIS_MODULE, }; static int __init xt_rateest_mt_init(void) { - return xt_register_matches(xt_rateest_match, - ARRAY_SIZE(xt_rateest_match)); + return xt_register_match(&xt_rateest_mt_reg); } static void __exit xt_rateest_mt_fini(void) { - xt_unregister_matches(xt_rateest_match, ARRAY_SIZE(xt_rateest_match)); + xt_unregister_match(&xt_rateest_mt_reg); } MODULE_AUTHOR("Patrick McHardy "); diff --git a/net/netfilter/xt_statistic.c b/net/netfilter/xt_statistic.c index fd3bb1400df..f41a92322e6 100644 --- a/net/netfilter/xt_statistic.c +++ b/net/netfilter/xt_statistic.c @@ -66,35 +66,24 @@ statistic_mt_check(const char *tablename, const void *entry, return true; } -static struct xt_match statistic_mt_reg[] __read_mostly = { - { - .name = "statistic", - .family = NFPROTO_IPV4, - .checkentry = statistic_mt_check, - .match = statistic_mt, - .matchsize = sizeof(struct xt_statistic_info), - .me = THIS_MODULE, - }, - { - .name = "statistic", - .family = NFPROTO_IPV6, - .checkentry = statistic_mt_check, - .match = statistic_mt, - .matchsize = sizeof(struct xt_statistic_info), - .me = THIS_MODULE, - }, +static struct xt_match xt_statistic_mt_reg __read_mostly = { + .name = "statistic", + .revision = 0, + .family = NFPROTO_UNSPEC, + .match = statistic_mt, + .checkentry = statistic_mt_check, + .matchsize = sizeof(struct xt_statistic_info), + .me = THIS_MODULE, }; static int __init statistic_mt_init(void) { - return xt_register_matches(statistic_mt_reg, - ARRAY_SIZE(statistic_mt_reg)); + return xt_register_match(&xt_statistic_mt_reg); } static void __exit statistic_mt_exit(void) { - xt_unregister_matches(statistic_mt_reg, - ARRAY_SIZE(statistic_mt_reg)); + xt_unregister_match(&xt_statistic_mt_reg); } module_init(statistic_mt_init); diff --git a/net/netfilter/xt_string.c b/net/netfilter/xt_string.c index 50169718377..18d8884e737 100644 --- a/net/netfilter/xt_string.c +++ b/net/netfilter/xt_string.c @@ -81,11 +81,11 @@ static void string_mt_destroy(const struct xt_match *match, void *matchinfo) textsearch_destroy(STRING_TEXT_PRIV(matchinfo)->config); } -static struct xt_match string_mt_reg[] __read_mostly = { +static struct xt_match xt_string_mt_reg[] __read_mostly = { { .name = "string", .revision = 0, - .family = NFPROTO_IPV4, + .family = NFPROTO_UNSPEC, .checkentry = string_mt_check, .match = string_mt, .destroy = string_mt_destroy, @@ -95,27 +95,7 @@ static struct xt_match string_mt_reg[] __read_mostly = { { .name = "string", .revision = 1, - .family = NFPROTO_IPV4, - .checkentry = string_mt_check, - .match = string_mt, - .destroy = string_mt_destroy, - .matchsize = sizeof(struct xt_string_info), - .me = THIS_MODULE - }, - { - .name = "string", - .revision = 0, - .family = NFPROTO_IPV6, - .checkentry = string_mt_check, - .match = string_mt, - .destroy = string_mt_destroy, - .matchsize = sizeof(struct xt_string_info), - .me = THIS_MODULE - }, - { - .name = "string", - .revision = 1, - .family = NFPROTO_IPV6, + .family = NFPROTO_UNSPEC, .checkentry = string_mt_check, .match = string_mt, .destroy = string_mt_destroy, @@ -126,12 +106,13 @@ static struct xt_match string_mt_reg[] __read_mostly = { static int __init string_mt_init(void) { - return xt_register_matches(string_mt_reg, ARRAY_SIZE(string_mt_reg)); + return xt_register_matches(xt_string_mt_reg, + ARRAY_SIZE(xt_string_mt_reg)); } static void __exit string_mt_exit(void) { - xt_unregister_matches(string_mt_reg, ARRAY_SIZE(string_mt_reg)); + xt_unregister_matches(xt_string_mt_reg, ARRAY_SIZE(xt_string_mt_reg)); } module_init(string_mt_init); diff --git a/net/netfilter/xt_time.c b/net/netfilter/xt_time.c index fe9dae2b4f5..32d4c769caa 100644 --- a/net/netfilter/xt_time.c +++ b/net/netfilter/xt_time.c @@ -237,33 +237,23 @@ time_mt_check(const char *tablename, const void *ip, return true; } -static struct xt_match time_mt_reg[] __read_mostly = { - { - .name = "time", - .family = NFPROTO_IPV4, - .match = time_mt, - .matchsize = sizeof(struct xt_time_info), - .checkentry = time_mt_check, - .me = THIS_MODULE, - }, - { - .name = "time", - .family = NFPROTO_IPV6, - .match = time_mt, - .matchsize = sizeof(struct xt_time_info), - .checkentry = time_mt_check, - .me = THIS_MODULE, - }, +static struct xt_match xt_time_mt_reg __read_mostly = { + .name = "time", + .family = NFPROTO_UNSPEC, + .match = time_mt, + .checkentry = time_mt_check, + .matchsize = sizeof(struct xt_time_info), + .me = THIS_MODULE, }; static int __init time_mt_init(void) { - return xt_register_matches(time_mt_reg, ARRAY_SIZE(time_mt_reg)); + return xt_register_match(&xt_time_mt_reg); } static void __exit time_mt_exit(void) { - xt_unregister_matches(time_mt_reg, ARRAY_SIZE(time_mt_reg)); + xt_unregister_match(&xt_time_mt_reg); } module_init(time_mt_init); diff --git a/net/netfilter/xt_u32.c b/net/netfilter/xt_u32.c index ed9f8340611..a6b971dc5d3 100644 --- a/net/netfilter/xt_u32.c +++ b/net/netfilter/xt_u32.c @@ -99,31 +99,23 @@ u32_mt(const struct sk_buff *skb, const struct net_device *in, return ret ^ data->invert; } -static struct xt_match u32_mt_reg[] __read_mostly = { - { - .name = "u32", - .family = NFPROTO_IPV4, - .match = u32_mt, - .matchsize = sizeof(struct xt_u32), - .me = THIS_MODULE, - }, - { - .name = "u32", - .family = NFPROTO_IPV6, - .match = u32_mt, - .matchsize = sizeof(struct xt_u32), - .me = THIS_MODULE, - }, +static struct xt_match xt_u32_mt_reg __read_mostly = { + .name = "u32", + .revision = 0, + .family = NFPROTO_UNSPEC, + .match = u32_mt, + .matchsize = sizeof(struct xt_u32), + .me = THIS_MODULE, }; static int __init u32_mt_init(void) { - return xt_register_matches(u32_mt_reg, ARRAY_SIZE(u32_mt_reg)); + return xt_register_match(&xt_u32_mt_reg); } static void __exit u32_mt_exit(void) { - xt_unregister_matches(u32_mt_reg, ARRAY_SIZE(u32_mt_reg)); + xt_unregister_match(&xt_u32_mt_reg); } module_init(u32_mt_init); -- cgit v1.2.3 From dfdb8d791877052bbb527d9688d94a064721d8f7 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 8 Oct 2008 11:35:02 +0200 Subject: netfilter: netns nf_conntrack: add netns boilerplate One comment: #ifdefs around #include is necessary to overcome amazing compile breakages in NOTRACK-in-netns patch (see below). Signed-off-by: Alexey Dobriyan Signed-off-by: Patrick McHardy --- net/netfilter/nf_conntrack_core.c | 4 ++-- net/netfilter/nf_conntrack_expect.c | 4 ++-- net/netfilter/nf_conntrack_standalone.c | 21 ++++++++++++++++++--- 3 files changed, 22 insertions(+), 7 deletions(-) (limited to 'net/netfilter') diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 6aaf64b5ded..ee79e932589 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1006,7 +1006,7 @@ EXPORT_SYMBOL_GPL(nf_conntrack_flush); /* Mishearing the voices in his head, our hero wonders how he's supposed to kill the mall. */ -void nf_conntrack_cleanup(void) +void nf_conntrack_cleanup(struct net *net) { rcu_assign_pointer(ip_ct_attach, NULL); @@ -1120,7 +1120,7 @@ EXPORT_SYMBOL_GPL(nf_conntrack_set_hashsize); module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint, &nf_conntrack_htable_size, 0600); -int __init nf_conntrack_init(void) +int nf_conntrack_init(struct net *net) { int max_factor = 8; int ret; diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index 990fa12f2ee..e6a79f2a7c5 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -537,7 +537,7 @@ static const struct file_operations exp_file_ops = { }; #endif /* CONFIG_PROC_FS */ -static int __init exp_proc_init(void) +static int exp_proc_init(void) { #ifdef CONFIG_PROC_FS struct proc_dir_entry *proc; @@ -558,7 +558,7 @@ static void exp_proc_remove(void) module_param_named(expect_hashsize, nf_ct_expect_hsize, uint, 0600); -int __init nf_conntrack_expect_init(void) +int nf_conntrack_expect_init(void) { int err = -ENOMEM; diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 8509db14670..81dec17196d 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -440,11 +440,26 @@ static void nf_conntrack_standalone_fini_sysctl(void) } #endif /* CONFIG_SYSCTL */ +static int nf_conntrack_net_init(struct net *net) +{ + return nf_conntrack_init(net); +} + +static void nf_conntrack_net_exit(struct net *net) +{ + nf_conntrack_cleanup(net); +} + +static struct pernet_operations nf_conntrack_net_ops = { + .init = nf_conntrack_net_init, + .exit = nf_conntrack_net_exit, +}; + static int __init nf_conntrack_standalone_init(void) { int ret; - ret = nf_conntrack_init(); + ret = register_pernet_subsys(&nf_conntrack_net_ops); if (ret < 0) goto out; ret = nf_conntrack_standalone_init_proc(); @@ -458,7 +473,7 @@ static int __init nf_conntrack_standalone_init(void) out_sysctl: nf_conntrack_standalone_fini_proc(); out_proc: - nf_conntrack_cleanup(); + unregister_pernet_subsys(&nf_conntrack_net_ops); out: return ret; } @@ -467,7 +482,7 @@ static void __exit nf_conntrack_standalone_fini(void) { nf_conntrack_standalone_fini_sysctl(); nf_conntrack_standalone_fini_proc(); - nf_conntrack_cleanup(); + unregister_pernet_subsys(&nf_conntrack_net_ops); } module_init(nf_conntrack_standalone_init); -- cgit v1.2.3 From 5a1fb391d881905e89623d78858d05b248cbc86a Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 8 Oct 2008 11:35:02 +0200 Subject: netfilter: netns nf_conntrack: add ->ct_net -- pointer from conntrack to netns Conntrack (struct nf_conn) gets pointer to netns: ->ct_net -- netns in which it was created. It comes from netdevice. ->ct_net is write-once field. Every conntrack in system has ->ct_net initialized, no exceptions. ->ct_net doesn't pin netns: conntracks are recycled after timeouts and pinning background traffic will prevent netns from even starting shutdown sequence. Right now every conntrack is created in init_net. Signed-off-by: Alexey Dobriyan Signed-off-by: Patrick McHardy --- net/netfilter/nf_conntrack_core.c | 17 +++++++++++++---- net/netfilter/nf_conntrack_netlink.c | 2 +- 2 files changed, 14 insertions(+), 5 deletions(-) (limited to 'net/netfilter') diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index ee79e932589..cefc338f6e5 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -464,7 +464,8 @@ static noinline int early_drop(unsigned int hash) return dropped; } -struct nf_conn *nf_conntrack_alloc(const struct nf_conntrack_tuple *orig, +struct nf_conn *nf_conntrack_alloc(struct net *net, + const struct nf_conntrack_tuple *orig, const struct nf_conntrack_tuple *repl, gfp_t gfp) { @@ -503,6 +504,9 @@ struct nf_conn *nf_conntrack_alloc(const struct nf_conntrack_tuple *orig, ct->tuplehash[IP_CT_DIR_REPLY].tuple = *repl; /* Don't set timer yet: wait for confirmation */ setup_timer(&ct->timeout, death_by_timeout, (unsigned long)ct); +#ifdef CONFIG_NET_NS + ct->ct_net = net; +#endif INIT_RCU_HEAD(&ct->rcu); return ct; @@ -528,7 +532,8 @@ EXPORT_SYMBOL_GPL(nf_conntrack_free); /* Allocate a new conntrack: we return -ENOMEM if classification failed due to stress. Otherwise it really is unclassifiable. */ static struct nf_conntrack_tuple_hash * -init_conntrack(const struct nf_conntrack_tuple *tuple, +init_conntrack(struct net *net, + const struct nf_conntrack_tuple *tuple, struct nf_conntrack_l3proto *l3proto, struct nf_conntrack_l4proto *l4proto, struct sk_buff *skb, @@ -544,7 +549,7 @@ init_conntrack(const struct nf_conntrack_tuple *tuple, return NULL; } - ct = nf_conntrack_alloc(tuple, &repl_tuple, GFP_ATOMIC); + ct = nf_conntrack_alloc(net, tuple, &repl_tuple, GFP_ATOMIC); if (ct == NULL || IS_ERR(ct)) { pr_debug("Can't allocate conntrack.\n"); return (struct nf_conntrack_tuple_hash *)ct; @@ -631,7 +636,8 @@ resolve_normal_ct(struct sk_buff *skb, /* look for tuple match */ h = nf_conntrack_find_get(&tuple); if (!h) { - h = init_conntrack(&tuple, l3proto, l4proto, skb, dataoff); + h = init_conntrack(&init_net, &tuple, l3proto, l4proto, skb, + dataoff); if (!h) return NULL; if (IS_ERR(h)) @@ -1185,6 +1191,9 @@ int nf_conntrack_init(struct net *net) /* Set up fake conntrack: - to never be deleted, not in any hashes */ +#ifdef CONFIG_NET_NS + nf_conntrack_untracked.ct_net = &init_net; +#endif atomic_set(&nf_conntrack_untracked.ct_general.use, 1); /* - and look it like as a confirmed connection */ set_bit(IPS_CONFIRMED_BIT, &nf_conntrack_untracked.status); diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index a8752031adc..da3cdc8db70 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1125,7 +1125,7 @@ ctnetlink_create_conntrack(struct nlattr *cda[], struct nf_conn_help *help; struct nf_conntrack_helper *helper; - ct = nf_conntrack_alloc(otuple, rtuple, GFP_KERNEL); + ct = nf_conntrack_alloc(&init_net, otuple, rtuple, GFP_KERNEL); if (ct == NULL || IS_ERR(ct)) return -ENOMEM; -- cgit v1.2.3 From 49ac8713b6d064adf7474080fdccebd7cce76be0 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 8 Oct 2008 11:35:03 +0200 Subject: netfilter: netns nf_conntrack: per-netns conntrack count Sysctls and proc files are stubbed to init_net's one. This is temporary. Signed-off-by: Alexey Dobriyan Signed-off-by: Patrick McHardy --- net/netfilter/nf_conntrack_core.c | 18 ++++++++---------- net/netfilter/nf_conntrack_standalone.c | 4 ++-- 2 files changed, 10 insertions(+), 12 deletions(-) (limited to 'net/netfilter') diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index cefc338f6e5..8299b3490e7 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -44,10 +44,6 @@ DEFINE_SPINLOCK(nf_conntrack_lock); EXPORT_SYMBOL_GPL(nf_conntrack_lock); -/* nf_conntrack_standalone needs this */ -atomic_t nf_conntrack_count = ATOMIC_INIT(0); -EXPORT_SYMBOL_GPL(nf_conntrack_count); - unsigned int nf_conntrack_htable_size __read_mostly; EXPORT_SYMBOL_GPL(nf_conntrack_htable_size); @@ -477,13 +473,13 @@ struct nf_conn *nf_conntrack_alloc(struct net *net, } /* We don't want any race condition at early drop stage */ - atomic_inc(&nf_conntrack_count); + atomic_inc(&net->ct.count); if (nf_conntrack_max && - unlikely(atomic_read(&nf_conntrack_count) > nf_conntrack_max)) { + unlikely(atomic_read(&net->ct.count) > nf_conntrack_max)) { unsigned int hash = hash_conntrack(orig); if (!early_drop(hash)) { - atomic_dec(&nf_conntrack_count); + atomic_dec(&net->ct.count); if (net_ratelimit()) printk(KERN_WARNING "nf_conntrack: table full, dropping" @@ -495,7 +491,7 @@ struct nf_conn *nf_conntrack_alloc(struct net *net, ct = kmem_cache_zalloc(nf_conntrack_cachep, gfp); if (ct == NULL) { pr_debug("nf_conntrack_alloc: Can't alloc conntrack.\n"); - atomic_dec(&nf_conntrack_count); + atomic_dec(&net->ct.count); return ERR_PTR(-ENOMEM); } @@ -516,10 +512,11 @@ EXPORT_SYMBOL_GPL(nf_conntrack_alloc); static void nf_conntrack_free_rcu(struct rcu_head *head) { struct nf_conn *ct = container_of(head, struct nf_conn, rcu); + struct net *net = nf_ct_net(ct); nf_ct_ext_free(ct); kmem_cache_free(nf_conntrack_cachep, ct); - atomic_dec(&nf_conntrack_count); + atomic_dec(&net->ct.count); } void nf_conntrack_free(struct nf_conn *ct) @@ -1024,7 +1021,7 @@ void nf_conntrack_cleanup(struct net *net) nf_ct_event_cache_flush(); i_see_dead_people: nf_conntrack_flush(); - if (atomic_read(&nf_conntrack_count) != 0) { + if (atomic_read(&net->ct.count) != 0) { schedule(); goto i_see_dead_people; } @@ -1148,6 +1145,7 @@ int nf_conntrack_init(struct net *net) * entries. */ max_factor = 4; } + atomic_set(&net->ct.count, 0); nf_conntrack_hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size, &nf_conntrack_vmalloc); if (!nf_conntrack_hash) { diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 81dec17196d..021b505907d 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -226,7 +226,7 @@ static void ct_cpu_seq_stop(struct seq_file *seq, void *v) static int ct_cpu_seq_show(struct seq_file *seq, void *v) { - unsigned int nr_conntracks = atomic_read(&nf_conntrack_count); + unsigned int nr_conntracks = atomic_read(&init_net.ct.count); const struct ip_conntrack_stat *st = v; if (v == SEQ_START_TOKEN) { @@ -338,7 +338,7 @@ static ctl_table nf_ct_sysctl_table[] = { { .ctl_name = NET_NF_CONNTRACK_COUNT, .procname = "nf_conntrack_count", - .data = &nf_conntrack_count, + .data = &init_net.ct.count, .maxlen = sizeof(int), .mode = 0444, .proc_handler = &proc_dointvec, -- cgit v1.2.3 From 400dad39d1c33fe797e47326d87a3f54d0ac5181 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 8 Oct 2008 11:35:03 +0200 Subject: netfilter: netns nf_conntrack: per-netns conntrack hash * make per-netns conntrack hash Other solution is to add ->ct_net pointer to tuplehashes and still has one hash, I tried that it's ugly and requires more code deep down in protocol modules et al. * propagate netns pointer to where needed, e. g. to conntrack iterators. Signed-off-by: Alexey Dobriyan Signed-off-by: Patrick McHardy --- net/netfilter/nf_conntrack_core.c | 74 +++++++++++++++++---------------- net/netfilter/nf_conntrack_helper.c | 2 +- net/netfilter/nf_conntrack_netlink.c | 16 +++---- net/netfilter/nf_conntrack_pptp.c | 2 +- net/netfilter/nf_conntrack_proto.c | 4 +- net/netfilter/nf_conntrack_standalone.c | 4 +- net/netfilter/xt_connlimit.c | 2 +- 7 files changed, 53 insertions(+), 51 deletions(-) (limited to 'net/netfilter') diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 8299b3490e7..da56b260552 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -50,15 +50,11 @@ EXPORT_SYMBOL_GPL(nf_conntrack_htable_size); int nf_conntrack_max __read_mostly; EXPORT_SYMBOL_GPL(nf_conntrack_max); -struct hlist_head *nf_conntrack_hash __read_mostly; -EXPORT_SYMBOL_GPL(nf_conntrack_hash); - struct nf_conn nf_conntrack_untracked __read_mostly; EXPORT_SYMBOL_GPL(nf_conntrack_untracked); unsigned int nf_ct_log_invalid __read_mostly; HLIST_HEAD(unconfirmed); -static int nf_conntrack_vmalloc __read_mostly; static struct kmem_cache *nf_conntrack_cachep __read_mostly; DEFINE_PER_CPU(struct ip_conntrack_stat, nf_conntrack_stat); @@ -242,7 +238,7 @@ static void death_by_timeout(unsigned long ul_conntrack) } struct nf_conntrack_tuple_hash * -__nf_conntrack_find(const struct nf_conntrack_tuple *tuple) +__nf_conntrack_find(struct net *net, const struct nf_conntrack_tuple *tuple) { struct nf_conntrack_tuple_hash *h; struct hlist_node *n; @@ -252,7 +248,7 @@ __nf_conntrack_find(const struct nf_conntrack_tuple *tuple) * at least once for the stats anyway. */ local_bh_disable(); - hlist_for_each_entry_rcu(h, n, &nf_conntrack_hash[hash], hnode) { + hlist_for_each_entry_rcu(h, n, &net->ct.hash[hash], hnode) { if (nf_ct_tuple_equal(tuple, &h->tuple)) { NF_CT_STAT_INC(found); local_bh_enable(); @@ -268,13 +264,13 @@ EXPORT_SYMBOL_GPL(__nf_conntrack_find); /* Find a connection corresponding to a tuple. */ struct nf_conntrack_tuple_hash * -nf_conntrack_find_get(const struct nf_conntrack_tuple *tuple) +nf_conntrack_find_get(struct net *net, const struct nf_conntrack_tuple *tuple) { struct nf_conntrack_tuple_hash *h; struct nf_conn *ct; rcu_read_lock(); - h = __nf_conntrack_find(tuple); + h = __nf_conntrack_find(net, tuple); if (h) { ct = nf_ct_tuplehash_to_ctrack(h); if (unlikely(!atomic_inc_not_zero(&ct->ct_general.use))) @@ -290,10 +286,12 @@ static void __nf_conntrack_hash_insert(struct nf_conn *ct, unsigned int hash, unsigned int repl_hash) { + struct net *net = nf_ct_net(ct); + hlist_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnode, - &nf_conntrack_hash[hash]); + &net->ct.hash[hash]); hlist_add_head_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnode, - &nf_conntrack_hash[repl_hash]); + &net->ct.hash[repl_hash]); } void nf_conntrack_hash_insert(struct nf_conn *ct) @@ -319,8 +317,10 @@ __nf_conntrack_confirm(struct sk_buff *skb) struct nf_conn_help *help; struct hlist_node *n; enum ip_conntrack_info ctinfo; + struct net *net; ct = nf_ct_get(skb, &ctinfo); + net = nf_ct_net(ct); /* ipt_REJECT uses nf_conntrack_attach to attach related ICMP/TCP RST packets in other direction. Actual packet @@ -347,11 +347,11 @@ __nf_conntrack_confirm(struct sk_buff *skb) /* See if there's one in the list already, including reverse: NAT could have grabbed it without realizing, since we're not in the hash. If there is, we lost race. */ - hlist_for_each_entry(h, n, &nf_conntrack_hash[hash], hnode) + hlist_for_each_entry(h, n, &net->ct.hash[hash], hnode) if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, &h->tuple)) goto out; - hlist_for_each_entry(h, n, &nf_conntrack_hash[repl_hash], hnode) + hlist_for_each_entry(h, n, &net->ct.hash[repl_hash], hnode) if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple, &h->tuple)) goto out; @@ -394,6 +394,7 @@ int nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple, const struct nf_conn *ignored_conntrack) { + struct net *net = nf_ct_net(ignored_conntrack); struct nf_conntrack_tuple_hash *h; struct hlist_node *n; unsigned int hash = hash_conntrack(tuple); @@ -402,7 +403,7 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple, * least once for the stats anyway. */ rcu_read_lock_bh(); - hlist_for_each_entry_rcu(h, n, &nf_conntrack_hash[hash], hnode) { + hlist_for_each_entry_rcu(h, n, &net->ct.hash[hash], hnode) { if (nf_ct_tuplehash_to_ctrack(h) != ignored_conntrack && nf_ct_tuple_equal(tuple, &h->tuple)) { NF_CT_STAT_INC(found); @@ -421,7 +422,7 @@ EXPORT_SYMBOL_GPL(nf_conntrack_tuple_taken); /* There's a small race here where we may free a just-assured connection. Too bad: we're in trouble anyway. */ -static noinline int early_drop(unsigned int hash) +static noinline int early_drop(struct net *net, unsigned int hash) { /* Use oldest entry, which is roughly LRU */ struct nf_conntrack_tuple_hash *h; @@ -432,7 +433,7 @@ static noinline int early_drop(unsigned int hash) rcu_read_lock(); for (i = 0; i < nf_conntrack_htable_size; i++) { - hlist_for_each_entry_rcu(h, n, &nf_conntrack_hash[hash], + hlist_for_each_entry_rcu(h, n, &net->ct.hash[hash], hnode) { tmp = nf_ct_tuplehash_to_ctrack(h); if (!test_bit(IPS_ASSURED_BIT, &tmp->status)) @@ -478,7 +479,7 @@ struct nf_conn *nf_conntrack_alloc(struct net *net, if (nf_conntrack_max && unlikely(atomic_read(&net->ct.count) > nf_conntrack_max)) { unsigned int hash = hash_conntrack(orig); - if (!early_drop(hash)) { + if (!early_drop(net, hash)) { atomic_dec(&net->ct.count); if (net_ratelimit()) printk(KERN_WARNING @@ -631,7 +632,7 @@ resolve_normal_ct(struct sk_buff *skb, } /* look for tuple match */ - h = nf_conntrack_find_get(&tuple); + h = nf_conntrack_find_get(&init_net, &tuple); if (!h) { h = init_conntrack(&init_net, &tuple, l3proto, l4proto, skb, dataoff); @@ -941,7 +942,7 @@ static void nf_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb) /* Bring out ya dead! */ static struct nf_conn * -get_next_corpse(int (*iter)(struct nf_conn *i, void *data), +get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data), void *data, unsigned int *bucket) { struct nf_conntrack_tuple_hash *h; @@ -950,7 +951,7 @@ get_next_corpse(int (*iter)(struct nf_conn *i, void *data), spin_lock_bh(&nf_conntrack_lock); for (; *bucket < nf_conntrack_htable_size; (*bucket)++) { - hlist_for_each_entry(h, n, &nf_conntrack_hash[*bucket], hnode) { + hlist_for_each_entry(h, n, &net->ct.hash[*bucket], hnode) { ct = nf_ct_tuplehash_to_ctrack(h); if (iter(ct, data)) goto found; @@ -969,13 +970,14 @@ found: return ct; } -void -nf_ct_iterate_cleanup(int (*iter)(struct nf_conn *i, void *data), void *data) +void nf_ct_iterate_cleanup(struct net *net, + int (*iter)(struct nf_conn *i, void *data), + void *data) { struct nf_conn *ct; unsigned int bucket = 0; - while ((ct = get_next_corpse(iter, data, &bucket)) != NULL) { + while ((ct = get_next_corpse(net, iter, data, &bucket)) != NULL) { /* Time to push up daises... */ if (del_timer(&ct->timeout)) death_by_timeout((unsigned long)ct); @@ -1001,9 +1003,9 @@ void nf_ct_free_hashtable(struct hlist_head *hash, int vmalloced, unsigned int s } EXPORT_SYMBOL_GPL(nf_ct_free_hashtable); -void nf_conntrack_flush(void) +void nf_conntrack_flush(struct net *net) { - nf_ct_iterate_cleanup(kill_all, NULL); + nf_ct_iterate_cleanup(net, kill_all, NULL); } EXPORT_SYMBOL_GPL(nf_conntrack_flush); @@ -1020,7 +1022,7 @@ void nf_conntrack_cleanup(struct net *net) nf_ct_event_cache_flush(); i_see_dead_people: - nf_conntrack_flush(); + nf_conntrack_flush(net); if (atomic_read(&net->ct.count) != 0) { schedule(); goto i_see_dead_people; @@ -1032,7 +1034,7 @@ void nf_conntrack_cleanup(struct net *net) rcu_assign_pointer(nf_ct_destroy, NULL); kmem_cache_destroy(nf_conntrack_cachep); - nf_ct_free_hashtable(nf_conntrack_hash, nf_conntrack_vmalloc, + nf_ct_free_hashtable(net->ct.hash, net->ct.hash_vmalloc, nf_conntrack_htable_size); nf_conntrack_acct_fini(); @@ -1097,8 +1099,8 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp) */ spin_lock_bh(&nf_conntrack_lock); for (i = 0; i < nf_conntrack_htable_size; i++) { - while (!hlist_empty(&nf_conntrack_hash[i])) { - h = hlist_entry(nf_conntrack_hash[i].first, + while (!hlist_empty(&init_net.ct.hash[i])) { + h = hlist_entry(init_net.ct.hash[i].first, struct nf_conntrack_tuple_hash, hnode); hlist_del_rcu(&h->hnode); bucket = __hash_conntrack(&h->tuple, hashsize, rnd); @@ -1106,12 +1108,12 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp) } } old_size = nf_conntrack_htable_size; - old_vmalloced = nf_conntrack_vmalloc; - old_hash = nf_conntrack_hash; + old_vmalloced = init_net.ct.hash_vmalloc; + old_hash = init_net.ct.hash; nf_conntrack_htable_size = hashsize; - nf_conntrack_vmalloc = vmalloced; - nf_conntrack_hash = hash; + init_net.ct.hash_vmalloc = vmalloced; + init_net.ct.hash = hash; nf_conntrack_hash_rnd = rnd; spin_unlock_bh(&nf_conntrack_lock); @@ -1146,9 +1148,9 @@ int nf_conntrack_init(struct net *net) max_factor = 4; } atomic_set(&net->ct.count, 0); - nf_conntrack_hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size, - &nf_conntrack_vmalloc); - if (!nf_conntrack_hash) { + net->ct.hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size, + &net->ct.hash_vmalloc); + if (!net->ct.hash) { printk(KERN_ERR "Unable to create nf_conntrack_hash\n"); goto err_out; } @@ -1207,7 +1209,7 @@ out_fini_proto: err_free_conntrack_slab: kmem_cache_destroy(nf_conntrack_cachep); err_free_hash: - nf_ct_free_hashtable(nf_conntrack_hash, nf_conntrack_vmalloc, + nf_ct_free_hashtable(net->ct.hash, net->ct.hash_vmalloc, nf_conntrack_htable_size); err_out: return -ENOMEM; diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index 8e0b4c8f62a..d91278dfdaf 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -159,7 +159,7 @@ void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me) hlist_for_each_entry(h, n, &unconfirmed, hnode) unhelp(h, me); for (i = 0; i < nf_conntrack_htable_size; i++) { - hlist_for_each_entry(h, n, &nf_conntrack_hash[i], hnode) + hlist_for_each_entry(h, n, &init_net.ct.hash[i], hnode) unhelp(h, me); } spin_unlock_bh(&nf_conntrack_lock); diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index da3cdc8db70..918a3358a12 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -549,7 +549,7 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) last = (struct nf_conn *)cb->args[1]; for (; cb->args[0] < nf_conntrack_htable_size; cb->args[0]++) { restart: - hlist_for_each_entry_rcu(h, n, &nf_conntrack_hash[cb->args[0]], + hlist_for_each_entry_rcu(h, n, &init_net.ct.hash[cb->args[0]], hnode) { if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL) continue; @@ -794,14 +794,14 @@ ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb, err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY, u3); else { /* Flush the whole table */ - nf_conntrack_flush(); + nf_conntrack_flush(&init_net); return 0; } if (err < 0) return err; - h = nf_conntrack_find_get(&tuple); + h = nf_conntrack_find_get(&init_net, &tuple); if (!h) return -ENOENT; @@ -847,7 +847,7 @@ ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb, if (err < 0) return err; - h = nf_conntrack_find_get(&tuple); + h = nf_conntrack_find_get(&init_net, &tuple); if (!h) return -ENOENT; @@ -1213,9 +1213,9 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, spin_lock_bh(&nf_conntrack_lock); if (cda[CTA_TUPLE_ORIG]) - h = __nf_conntrack_find(&otuple); + h = __nf_conntrack_find(&init_net, &otuple); else if (cda[CTA_TUPLE_REPLY]) - h = __nf_conntrack_find(&rtuple); + h = __nf_conntrack_find(&init_net, &rtuple); if (h == NULL) { struct nf_conntrack_tuple master; @@ -1230,7 +1230,7 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, if (err < 0) goto out_unlock; - master_h = __nf_conntrack_find(&master); + master_h = __nf_conntrack_find(&init_net, &master); if (master_h == NULL) { err = -ENOENT; goto out_unlock; @@ -1670,7 +1670,7 @@ ctnetlink_create_expect(struct nlattr *cda[], u_int8_t u3) return err; /* Look for master conntrack of this expectation */ - h = nf_conntrack_find_get(&master_tuple); + h = nf_conntrack_find_get(&init_net, &master_tuple); if (!h) return -ENOENT; ct = nf_ct_tuplehash_to_ctrack(h); diff --git a/net/netfilter/nf_conntrack_pptp.c b/net/netfilter/nf_conntrack_pptp.c index 97e54b0e43a..7caf45b59d2 100644 --- a/net/netfilter/nf_conntrack_pptp.c +++ b/net/netfilter/nf_conntrack_pptp.c @@ -143,7 +143,7 @@ static int destroy_sibling_or_exp(const struct nf_conntrack_tuple *t) pr_debug("trying to timeout ct or exp for tuple "); nf_ct_dump_tuple(t); - h = nf_conntrack_find_get(t); + h = nf_conntrack_find_get(&init_net, t); if (h) { sibling = nf_ct_tuplehash_to_ctrack(h); pr_debug("setting timeout of conntrack %p to 0\n", sibling); diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index a49fc932629..3a2f7ef997f 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -219,7 +219,7 @@ void nf_conntrack_l3proto_unregister(struct nf_conntrack_l3proto *proto) synchronize_rcu(); /* Remove all contrack entries for this protocol */ - nf_ct_iterate_cleanup(kill_l3proto, proto); + nf_ct_iterate_cleanup(&init_net, kill_l3proto, proto); } EXPORT_SYMBOL_GPL(nf_conntrack_l3proto_unregister); @@ -328,7 +328,7 @@ void nf_conntrack_l4proto_unregister(struct nf_conntrack_l4proto *l4proto) synchronize_rcu(); /* Remove all contrack entries for this protocol */ - nf_ct_iterate_cleanup(kill_l4proto, l4proto); + nf_ct_iterate_cleanup(&init_net, kill_l4proto, l4proto); } EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_unregister); diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 021b505907d..5456e4b9424 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -51,7 +51,7 @@ static struct hlist_node *ct_get_first(struct seq_file *seq) for (st->bucket = 0; st->bucket < nf_conntrack_htable_size; st->bucket++) { - n = rcu_dereference(nf_conntrack_hash[st->bucket].first); + n = rcu_dereference(init_net.ct.hash[st->bucket].first); if (n) return n; } @@ -67,7 +67,7 @@ static struct hlist_node *ct_get_next(struct seq_file *seq, while (head == NULL) { if (++st->bucket >= nf_conntrack_htable_size) return NULL; - head = rcu_dereference(nf_conntrack_hash[st->bucket].first); + head = rcu_dereference(init_net.ct.hash[st->bucket].first); } return head; } diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c index d2453d182d6..bd00830ff69 100644 --- a/net/netfilter/xt_connlimit.c +++ b/net/netfilter/xt_connlimit.c @@ -123,7 +123,7 @@ static int count_them(struct xt_connlimit_data *data, /* check the saved connections */ list_for_each_entry_safe(conn, tmp, hash, list) { - found = __nf_conntrack_find(&conn->tuple); + found = __nf_conntrack_find(&init_net, &conn->tuple); found_ct = NULL; if (found != NULL) -- cgit v1.2.3 From 9b03f38d0487f3908696242286d934c9b38f9d2a Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 8 Oct 2008 11:35:03 +0200 Subject: netfilter: netns nf_conntrack: per-netns expectations Make per-netns a) expectation hash and b) expectations count. Expectations always belongs to netns to which it's master conntrack belong. This is natural and doesn't bloat expectation. Proc files and leaf users are stubbed to init_net, this is temporary. Signed-off-by: Alexey Dobriyan Signed-off-by: Patrick McHardy --- net/netfilter/nf_conntrack_core.c | 8 ++--- net/netfilter/nf_conntrack_expect.c | 55 +++++++++++++++++----------------- net/netfilter/nf_conntrack_h323_main.c | 2 +- net/netfilter/nf_conntrack_helper.c | 2 +- net/netfilter/nf_conntrack_netlink.c | 13 ++++---- net/netfilter/nf_conntrack_pptp.c | 4 +-- net/netfilter/nf_conntrack_sip.c | 2 +- 7 files changed, 44 insertions(+), 42 deletions(-) (limited to 'net/netfilter') diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index da56b260552..c188edea249 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -562,7 +562,7 @@ init_conntrack(struct net *net, nf_ct_acct_ext_add(ct, GFP_ATOMIC); spin_lock_bh(&nf_conntrack_lock); - exp = nf_ct_find_expectation(tuple); + exp = nf_ct_find_expectation(net, tuple); if (exp) { pr_debug("conntrack: expectation arrives ct=%p exp=%p\n", ct, exp); @@ -1038,7 +1038,7 @@ void nf_conntrack_cleanup(struct net *net) nf_conntrack_htable_size); nf_conntrack_acct_fini(); - nf_conntrack_expect_fini(); + nf_conntrack_expect_fini(net); nf_conntrack_helper_fini(); nf_conntrack_proto_fini(); } @@ -1173,7 +1173,7 @@ int nf_conntrack_init(struct net *net) if (ret < 0) goto err_free_conntrack_slab; - ret = nf_conntrack_expect_init(); + ret = nf_conntrack_expect_init(net); if (ret < 0) goto out_fini_proto; @@ -1203,7 +1203,7 @@ int nf_conntrack_init(struct net *net) out_fini_helper: nf_conntrack_helper_fini(); out_fini_expect: - nf_conntrack_expect_fini(); + nf_conntrack_expect_fini(net); out_fini_proto: nf_conntrack_proto_fini(); err_free_conntrack_slab: diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index e6a79f2a7c5..5307316356e 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -28,17 +28,12 @@ #include #include -struct hlist_head *nf_ct_expect_hash __read_mostly; -EXPORT_SYMBOL_GPL(nf_ct_expect_hash); - unsigned int nf_ct_expect_hsize __read_mostly; EXPORT_SYMBOL_GPL(nf_ct_expect_hsize); static unsigned int nf_ct_expect_hash_rnd __read_mostly; -static unsigned int nf_ct_expect_count; unsigned int nf_ct_expect_max __read_mostly; static int nf_ct_expect_hash_rnd_initted __read_mostly; -static int nf_ct_expect_vmalloc; static struct kmem_cache *nf_ct_expect_cachep __read_mostly; @@ -46,12 +41,13 @@ static struct kmem_cache *nf_ct_expect_cachep __read_mostly; void nf_ct_unlink_expect(struct nf_conntrack_expect *exp) { struct nf_conn_help *master_help = nfct_help(exp->master); + struct net *net = nf_ct_exp_net(exp); NF_CT_ASSERT(master_help); NF_CT_ASSERT(!timer_pending(&exp->timeout)); hlist_del_rcu(&exp->hnode); - nf_ct_expect_count--; + net->ct.expect_count--; hlist_del(&exp->lnode); master_help->expecting[exp->class]--; @@ -87,17 +83,17 @@ static unsigned int nf_ct_expect_dst_hash(const struct nf_conntrack_tuple *tuple } struct nf_conntrack_expect * -__nf_ct_expect_find(const struct nf_conntrack_tuple *tuple) +__nf_ct_expect_find(struct net *net, const struct nf_conntrack_tuple *tuple) { struct nf_conntrack_expect *i; struct hlist_node *n; unsigned int h; - if (!nf_ct_expect_count) + if (!net->ct.expect_count) return NULL; h = nf_ct_expect_dst_hash(tuple); - hlist_for_each_entry_rcu(i, n, &nf_ct_expect_hash[h], hnode) { + hlist_for_each_entry_rcu(i, n, &net->ct.expect_hash[h], hnode) { if (nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)) return i; } @@ -107,12 +103,12 @@ EXPORT_SYMBOL_GPL(__nf_ct_expect_find); /* Just find a expectation corresponding to a tuple. */ struct nf_conntrack_expect * -nf_ct_expect_find_get(const struct nf_conntrack_tuple *tuple) +nf_ct_expect_find_get(struct net *net, const struct nf_conntrack_tuple *tuple) { struct nf_conntrack_expect *i; rcu_read_lock(); - i = __nf_ct_expect_find(tuple); + i = __nf_ct_expect_find(net, tuple); if (i && !atomic_inc_not_zero(&i->use)) i = NULL; rcu_read_unlock(); @@ -124,17 +120,17 @@ EXPORT_SYMBOL_GPL(nf_ct_expect_find_get); /* If an expectation for this connection is found, it gets delete from * global list then returned. */ struct nf_conntrack_expect * -nf_ct_find_expectation(const struct nf_conntrack_tuple *tuple) +nf_ct_find_expectation(struct net *net, const struct nf_conntrack_tuple *tuple) { struct nf_conntrack_expect *i, *exp = NULL; struct hlist_node *n; unsigned int h; - if (!nf_ct_expect_count) + if (!net->ct.expect_count) return NULL; h = nf_ct_expect_dst_hash(tuple); - hlist_for_each_entry(i, n, &nf_ct_expect_hash[h], hnode) { + hlist_for_each_entry(i, n, &net->ct.expect_hash[h], hnode) { if (!(i->flags & NF_CT_EXPECT_INACTIVE) && nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)) { exp = i; @@ -311,6 +307,7 @@ EXPORT_SYMBOL_GPL(nf_ct_expect_put); static void nf_ct_expect_insert(struct nf_conntrack_expect *exp) { struct nf_conn_help *master_help = nfct_help(exp->master); + struct net *net = nf_ct_exp_net(exp); const struct nf_conntrack_expect_policy *p; unsigned int h = nf_ct_expect_dst_hash(&exp->tuple); @@ -319,8 +316,8 @@ static void nf_ct_expect_insert(struct nf_conntrack_expect *exp) hlist_add_head(&exp->lnode, &master_help->expectations); master_help->expecting[exp->class]++; - hlist_add_head_rcu(&exp->hnode, &nf_ct_expect_hash[h]); - nf_ct_expect_count++; + hlist_add_head_rcu(&exp->hnode, &net->ct.expect_hash[h]); + net->ct.expect_count++; setup_timer(&exp->timeout, nf_ct_expectation_timed_out, (unsigned long)exp); @@ -371,6 +368,7 @@ int nf_ct_expect_related(struct nf_conntrack_expect *expect) struct nf_conntrack_expect *i; struct nf_conn *master = expect->master; struct nf_conn_help *master_help = nfct_help(master); + struct net *net = nf_ct_exp_net(expect); struct hlist_node *n; unsigned int h; int ret; @@ -383,7 +381,7 @@ int nf_ct_expect_related(struct nf_conntrack_expect *expect) goto out; } h = nf_ct_expect_dst_hash(&expect->tuple); - hlist_for_each_entry(i, n, &nf_ct_expect_hash[h], hnode) { + hlist_for_each_entry(i, n, &net->ct.expect_hash[h], hnode) { if (expect_matches(i, expect)) { /* Refresh timer: if it's dying, ignore.. */ if (refresh_timer(i)) { @@ -406,7 +404,7 @@ int nf_ct_expect_related(struct nf_conntrack_expect *expect) } } - if (nf_ct_expect_count >= nf_ct_expect_max) { + if (net->ct.expect_count >= nf_ct_expect_max) { if (net_ratelimit()) printk(KERN_WARNING "nf_conntrack: expectation table full\n"); @@ -430,11 +428,12 @@ struct ct_expect_iter_state { static struct hlist_node *ct_expect_get_first(struct seq_file *seq) { + struct net *net = &init_net; struct ct_expect_iter_state *st = seq->private; struct hlist_node *n; for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) { - n = rcu_dereference(nf_ct_expect_hash[st->bucket].first); + n = rcu_dereference(net->ct.expect_hash[st->bucket].first); if (n) return n; } @@ -444,13 +443,14 @@ static struct hlist_node *ct_expect_get_first(struct seq_file *seq) static struct hlist_node *ct_expect_get_next(struct seq_file *seq, struct hlist_node *head) { + struct net *net = &init_net; struct ct_expect_iter_state *st = seq->private; head = rcu_dereference(head->next); while (head == NULL) { if (++st->bucket >= nf_ct_expect_hsize) return NULL; - head = rcu_dereference(nf_ct_expect_hash[st->bucket].first); + head = rcu_dereference(net->ct.expect_hash[st->bucket].first); } return head; } @@ -558,7 +558,7 @@ static void exp_proc_remove(void) module_param_named(expect_hashsize, nf_ct_expect_hsize, uint, 0600); -int nf_conntrack_expect_init(void) +int nf_conntrack_expect_init(struct net *net) { int err = -ENOMEM; @@ -569,9 +569,10 @@ int nf_conntrack_expect_init(void) } nf_ct_expect_max = nf_ct_expect_hsize * 4; - nf_ct_expect_hash = nf_ct_alloc_hashtable(&nf_ct_expect_hsize, - &nf_ct_expect_vmalloc); - if (nf_ct_expect_hash == NULL) + net->ct.expect_count = 0; + net->ct.expect_hash = nf_ct_alloc_hashtable(&nf_ct_expect_hsize, + &net->ct.expect_vmalloc); + if (net->ct.expect_hash == NULL) goto err1; nf_ct_expect_cachep = kmem_cache_create("nf_conntrack_expect", @@ -589,16 +590,16 @@ int nf_conntrack_expect_init(void) err3: kmem_cache_destroy(nf_ct_expect_cachep); err2: - nf_ct_free_hashtable(nf_ct_expect_hash, nf_ct_expect_vmalloc, + nf_ct_free_hashtable(net->ct.expect_hash, net->ct.expect_vmalloc, nf_ct_expect_hsize); err1: return err; } -void nf_conntrack_expect_fini(void) +void nf_conntrack_expect_fini(struct net *net) { exp_proc_remove(); kmem_cache_destroy(nf_ct_expect_cachep); - nf_ct_free_hashtable(nf_ct_expect_hash, nf_ct_expect_vmalloc, + nf_ct_free_hashtable(net->ct.expect_hash, net->ct.expect_vmalloc, nf_ct_expect_hsize); } diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c index 5dc0478108a..dfb826c973d 100644 --- a/net/netfilter/nf_conntrack_h323_main.c +++ b/net/netfilter/nf_conntrack_h323_main.c @@ -1219,7 +1219,7 @@ static struct nf_conntrack_expect *find_expect(struct nf_conn *ct, tuple.dst.u.tcp.port = port; tuple.dst.protonum = IPPROTO_TCP; - exp = __nf_ct_expect_find(&tuple); + exp = __nf_ct_expect_find(&init_net, &tuple); if (exp && exp->master == ct) return exp; return NULL; diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index d91278dfdaf..c793db810cd 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -145,7 +145,7 @@ void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me) /* Get rid of expectations */ for (i = 0; i < nf_ct_expect_hsize; i++) { hlist_for_each_entry_safe(exp, n, next, - &nf_ct_expect_hash[i], hnode) { + &init_net.ct.expect_hash[i], hnode) { struct nf_conn_help *help = nfct_help(exp->master); if ((help->helper == me || exp->helper == me) && del_timer(&exp->timeout)) { diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 918a3358a12..cadfd15b44f 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1458,6 +1458,7 @@ static int ctnetlink_exp_done(struct netlink_callback *cb) static int ctnetlink_exp_dump_table(struct sk_buff *skb, struct netlink_callback *cb) { + struct net *net = &init_net; struct nf_conntrack_expect *exp, *last; struct nfgenmsg *nfmsg = NLMSG_DATA(cb->nlh); struct hlist_node *n; @@ -1467,7 +1468,7 @@ ctnetlink_exp_dump_table(struct sk_buff *skb, struct netlink_callback *cb) last = (struct nf_conntrack_expect *)cb->args[1]; for (; cb->args[0] < nf_ct_expect_hsize; cb->args[0]++) { restart: - hlist_for_each_entry(exp, n, &nf_ct_expect_hash[cb->args[0]], + hlist_for_each_entry(exp, n, &net->ct.expect_hash[cb->args[0]], hnode) { if (l3proto && exp->tuple.src.l3num != l3proto) continue; @@ -1529,7 +1530,7 @@ ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb, if (err < 0) return err; - exp = nf_ct_expect_find_get(&tuple); + exp = nf_ct_expect_find_get(&init_net, &tuple); if (!exp) return -ENOENT; @@ -1583,7 +1584,7 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb, return err; /* bump usage count to 2 */ - exp = nf_ct_expect_find_get(&tuple); + exp = nf_ct_expect_find_get(&init_net, &tuple); if (!exp) return -ENOENT; @@ -1613,7 +1614,7 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb, } for (i = 0; i < nf_ct_expect_hsize; i++) { hlist_for_each_entry_safe(exp, n, next, - &nf_ct_expect_hash[i], + &init_net.ct.expect_hash[i], hnode) { m_help = nfct_help(exp->master); if (m_help->helper == h @@ -1629,7 +1630,7 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb, spin_lock_bh(&nf_conntrack_lock); for (i = 0; i < nf_ct_expect_hsize; i++) { hlist_for_each_entry_safe(exp, n, next, - &nf_ct_expect_hash[i], + &init_net.ct.expect_hash[i], hnode) { if (del_timer(&exp->timeout)) { nf_ct_unlink_expect(exp); @@ -1724,7 +1725,7 @@ ctnetlink_new_expect(struct sock *ctnl, struct sk_buff *skb, return err; spin_lock_bh(&nf_conntrack_lock); - exp = __nf_ct_expect_find(&tuple); + exp = __nf_ct_expect_find(&init_net, &tuple); if (!exp) { spin_unlock_bh(&nf_conntrack_lock); diff --git a/net/netfilter/nf_conntrack_pptp.c b/net/netfilter/nf_conntrack_pptp.c index 7caf45b59d2..5db7df5d19b 100644 --- a/net/netfilter/nf_conntrack_pptp.c +++ b/net/netfilter/nf_conntrack_pptp.c @@ -121,7 +121,7 @@ static void pptp_expectfn(struct nf_conn *ct, pr_debug("trying to unexpect other dir: "); nf_ct_dump_tuple(&inv_t); - exp_other = nf_ct_expect_find_get(&inv_t); + exp_other = nf_ct_expect_find_get(&init_net, &inv_t); if (exp_other) { /* delete other expectation. */ pr_debug("found\n"); @@ -154,7 +154,7 @@ static int destroy_sibling_or_exp(const struct nf_conntrack_tuple *t) nf_ct_put(sibling); return 1; } else { - exp = nf_ct_expect_find_get(t); + exp = nf_ct_expect_find_get(&init_net, t); if (exp) { pr_debug("unexpect_related of expect %p\n", exp); nf_ct_unexpect_related(exp); diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c index 1fa306be60f..a006080eb38 100644 --- a/net/netfilter/nf_conntrack_sip.c +++ b/net/netfilter/nf_conntrack_sip.c @@ -775,7 +775,7 @@ static int set_expected_rtp_rtcp(struct sk_buff *skb, rcu_read_lock(); do { - exp = __nf_ct_expect_find(&tuple); + exp = __nf_ct_expect_find(&init_net, &tuple); if (!exp || exp->master == ct || nfct_help(exp->master)->helper != nfct_help(ct)->helper || -- cgit v1.2.3 From 63c9a26264be108b52de087724673f8664570e34 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 8 Oct 2008 11:35:04 +0200 Subject: netfilter: netns nf_conntrack: per-netns unconfirmed list What is confirmed connection in one netns can very well be unconfirmed in another one. Signed-off-by: Alexey Dobriyan Signed-off-by: Patrick McHardy --- net/netfilter/nf_conntrack_core.c | 7 ++++--- net/netfilter/nf_conntrack_helper.c | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'net/netfilter') diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index c188edea249..2a105db1330 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -54,7 +54,6 @@ struct nf_conn nf_conntrack_untracked __read_mostly; EXPORT_SYMBOL_GPL(nf_conntrack_untracked); unsigned int nf_ct_log_invalid __read_mostly; -HLIST_HEAD(unconfirmed); static struct kmem_cache *nf_conntrack_cachep __read_mostly; DEFINE_PER_CPU(struct ip_conntrack_stat, nf_conntrack_stat); @@ -596,7 +595,8 @@ init_conntrack(struct net *net, } /* Overload tuple linked list to put us in unconfirmed list. */ - hlist_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnode, &unconfirmed); + hlist_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnode, + &net->ct.unconfirmed); spin_unlock_bh(&nf_conntrack_lock); @@ -957,7 +957,7 @@ get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data), goto found; } } - hlist_for_each_entry(h, n, &unconfirmed, hnode) { + hlist_for_each_entry(h, n, &net->ct.unconfirmed, hnode) { ct = nf_ct_tuplehash_to_ctrack(h); if (iter(ct, data)) set_bit(IPS_DYING_BIT, &ct->status); @@ -1154,6 +1154,7 @@ int nf_conntrack_init(struct net *net) printk(KERN_ERR "Unable to create nf_conntrack_hash\n"); goto err_out; } + INIT_HLIST_HEAD(&net->ct.unconfirmed); nf_conntrack_max = max_factor * nf_conntrack_htable_size; diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index c793db810cd..920e778539a 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -156,7 +156,7 @@ void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me) } /* Get rid of expecteds, set helpers to NULL. */ - hlist_for_each_entry(h, n, &unconfirmed, hnode) + hlist_for_each_entry(h, n, &init_net.ct.unconfirmed, hnode) unhelp(h, me); for (i = 0; i < nf_conntrack_htable_size; i++) { hlist_for_each_entry(h, n, &init_net.ct.hash[i], hnode) -- cgit v1.2.3 From a702a65fc1376fc1f6757ec2a6960348af3f1876 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 8 Oct 2008 11:35:04 +0200 Subject: netfilter: netns nf_conntrack: pass netns pointer to nf_conntrack_in() It's deducible from skb->dev or skb->dst->dev, but we know netns at the moment of call, so pass it down and use for finding and creating conntracks. Signed-off-by: Alexey Dobriyan Signed-off-by: Patrick McHardy --- net/netfilter/nf_conntrack_core.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'net/netfilter') diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 2a105db1330..5c96d9732c7 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -611,7 +611,8 @@ init_conntrack(struct net *net, /* On success, returns conntrack ptr, sets skb->nfct and ctinfo */ static inline struct nf_conn * -resolve_normal_ct(struct sk_buff *skb, +resolve_normal_ct(struct net *net, + struct sk_buff *skb, unsigned int dataoff, u_int16_t l3num, u_int8_t protonum, @@ -632,10 +633,9 @@ resolve_normal_ct(struct sk_buff *skb, } /* look for tuple match */ - h = nf_conntrack_find_get(&init_net, &tuple); + h = nf_conntrack_find_get(net, &tuple); if (!h) { - h = init_conntrack(&init_net, &tuple, l3proto, l4proto, skb, - dataoff); + h = init_conntrack(net, &tuple, l3proto, l4proto, skb, dataoff); if (!h) return NULL; if (IS_ERR(h)) @@ -669,7 +669,8 @@ resolve_normal_ct(struct sk_buff *skb, } unsigned int -nf_conntrack_in(u_int8_t pf, unsigned int hooknum, struct sk_buff *skb) +nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum, + struct sk_buff *skb) { struct nf_conn *ct; enum ip_conntrack_info ctinfo; @@ -709,8 +710,8 @@ nf_conntrack_in(u_int8_t pf, unsigned int hooknum, struct sk_buff *skb) return -ret; } - ct = resolve_normal_ct(skb, dataoff, pf, protonum, l3proto, l4proto, - &set_reply, &ctinfo); + ct = resolve_normal_ct(net, skb, dataoff, pf, protonum, + l3proto, l4proto, &set_reply, &ctinfo); if (!ct) { /* Not valid part of a connection */ NF_CT_STAT_INC_ATOMIC(invalid); -- cgit v1.2.3 From 74c51a1497033e6ff7b8096797daca233a4a30df Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 8 Oct 2008 11:35:05 +0200 Subject: netfilter: netns nf_conntrack: pass netns pointer to L4 protocol's ->error hook Again, it's deducible from skb, but we're going to use it for nf_conntrack_checksum and statistics, so just pass it from upper layer. Signed-off-by: Alexey Dobriyan Signed-off-by: Patrick McHardy --- net/netfilter/nf_conntrack_core.c | 12 +++++++----- net/netfilter/nf_conntrack_proto_dccp.c | 6 +++--- net/netfilter/nf_conntrack_proto_tcp.c | 3 ++- net/netfilter/nf_conntrack_proto_udp.c | 2 +- net/netfilter/nf_conntrack_proto_udplite.c | 4 +++- 5 files changed, 16 insertions(+), 11 deletions(-) (limited to 'net/netfilter') diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 5c96d9732c7..251f020c7c1 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -703,11 +703,13 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum, /* It may be an special packet, error, unclean... * inverse of the return code tells to the netfilter * core what to do with the packet. */ - if (l4proto->error != NULL && - (ret = l4proto->error(skb, dataoff, &ctinfo, pf, hooknum)) <= 0) { - NF_CT_STAT_INC_ATOMIC(error); - NF_CT_STAT_INC_ATOMIC(invalid); - return -ret; + if (l4proto->error != NULL) { + ret = l4proto->error(net, skb, dataoff, &ctinfo, pf, hooknum); + if (ret <= 0) { + NF_CT_STAT_INC_ATOMIC(error); + NF_CT_STAT_INC_ATOMIC(invalid); + return -ret; + } } ct = resolve_normal_ct(net, skb, dataoff, pf, protonum, diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c index edc30358dc1..6ead8da3e9e 100644 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ b/net/netfilter/nf_conntrack_proto_dccp.c @@ -545,9 +545,9 @@ static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb, return NF_ACCEPT; } -static int dccp_error(struct sk_buff *skb, unsigned int dataoff, - enum ip_conntrack_info *ctinfo, u_int8_t pf, - unsigned int hooknum) +static int dccp_error(struct net *net, struct sk_buff *skb, + unsigned int dataoff, enum ip_conntrack_info *ctinfo, + u_int8_t pf, unsigned int hooknum) { struct dccp_hdr _dh, *dh; unsigned int dccp_len = skb->len - dataoff; diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 539a8202025..4e71de2405f 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -746,7 +746,8 @@ static const u8 tcp_valid_flags[(TH_FIN|TH_SYN|TH_RST|TH_ACK|TH_URG) + 1] = }; /* Protect conntrack agaist broken packets. Code taken from ipt_unclean.c. */ -static int tcp_error(struct sk_buff *skb, +static int tcp_error(struct net *net, + struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info *ctinfo, u_int8_t pf, diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c index 2a965c4a0ea..8a245beb2c9 100644 --- a/net/netfilter/nf_conntrack_proto_udp.c +++ b/net/netfilter/nf_conntrack_proto_udp.c @@ -89,7 +89,7 @@ static bool udp_new(struct nf_conn *ct, const struct sk_buff *skb, return true; } -static int udp_error(struct sk_buff *skb, unsigned int dataoff, +static int udp_error(struct net *net, struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info *ctinfo, u_int8_t pf, unsigned int hooknum) diff --git a/net/netfilter/nf_conntrack_proto_udplite.c b/net/netfilter/nf_conntrack_proto_udplite.c index 4fb6c8d83a8..981701919a7 100644 --- a/net/netfilter/nf_conntrack_proto_udplite.c +++ b/net/netfilter/nf_conntrack_proto_udplite.c @@ -89,7 +89,9 @@ static bool udplite_new(struct nf_conn *ct, const struct sk_buff *skb, return true; } -static int udplite_error(struct sk_buff *skb, unsigned int dataoff, +static int udplite_error(struct net *net, + struct sk_buff *skb, + unsigned int dataoff, enum ip_conntrack_info *ctinfo, u_int8_t pf, unsigned int hooknum) -- cgit v1.2.3 From b2ce2c7479d9b60dd268203e56bb738e78fd5fda Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 8 Oct 2008 11:35:05 +0200 Subject: netfilter: netns nf_conntrack: per-netns /proc/net/nf_conntrack, /proc/net/stat/nf_conntrack Signed-off-by: Alexey Dobriyan Signed-off-by: Patrick McHardy --- net/netfilter/nf_conntrack_standalone.c | 51 ++++++++++++++++++++------------- 1 file changed, 31 insertions(+), 20 deletions(-) (limited to 'net/netfilter') diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 5456e4b9424..02eaf872277 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -40,18 +40,20 @@ print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple, EXPORT_SYMBOL_GPL(print_tuple); struct ct_iter_state { + struct seq_net_private p; unsigned int bucket; }; static struct hlist_node *ct_get_first(struct seq_file *seq) { + struct net *net = seq_file_net(seq); struct ct_iter_state *st = seq->private; struct hlist_node *n; for (st->bucket = 0; st->bucket < nf_conntrack_htable_size; st->bucket++) { - n = rcu_dereference(init_net.ct.hash[st->bucket].first); + n = rcu_dereference(net->ct.hash[st->bucket].first); if (n) return n; } @@ -61,13 +63,14 @@ static struct hlist_node *ct_get_first(struct seq_file *seq) static struct hlist_node *ct_get_next(struct seq_file *seq, struct hlist_node *head) { + struct net *net = seq_file_net(seq); struct ct_iter_state *st = seq->private; head = rcu_dereference(head->next); while (head == NULL) { if (++st->bucket >= nf_conntrack_htable_size) return NULL; - head = rcu_dereference(init_net.ct.hash[st->bucket].first); + head = rcu_dereference(net->ct.hash[st->bucket].first); } return head; } @@ -177,7 +180,7 @@ static const struct seq_operations ct_seq_ops = { static int ct_open(struct inode *inode, struct file *file) { - return seq_open_private(file, &ct_seq_ops, + return seq_open_net(inode, file, &ct_seq_ops, sizeof(struct ct_iter_state)); } @@ -186,7 +189,7 @@ static const struct file_operations ct_file_ops = { .open = ct_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release_private, + .release = seq_release_net, }; static void *ct_cpu_seq_start(struct seq_file *seq, loff_t *pos) @@ -277,38 +280,38 @@ static const struct file_operations ct_cpu_seq_fops = { .release = seq_release, }; -static int nf_conntrack_standalone_init_proc(void) +static int nf_conntrack_standalone_init_proc(struct net *net) { struct proc_dir_entry *pde; - pde = proc_net_fops_create(&init_net, "nf_conntrack", 0440, &ct_file_ops); + pde = proc_net_fops_create(net, "nf_conntrack", 0440, &ct_file_ops); if (!pde) goto out_nf_conntrack; - pde = proc_create("nf_conntrack", S_IRUGO, init_net.proc_net_stat, + pde = proc_create("nf_conntrack", S_IRUGO, net->proc_net_stat, &ct_cpu_seq_fops); if (!pde) goto out_stat_nf_conntrack; return 0; out_stat_nf_conntrack: - proc_net_remove(&init_net, "nf_conntrack"); + proc_net_remove(net, "nf_conntrack"); out_nf_conntrack: return -ENOMEM; } -static void nf_conntrack_standalone_fini_proc(void) +static void nf_conntrack_standalone_fini_proc(struct net *net) { - remove_proc_entry("nf_conntrack", init_net.proc_net_stat); - proc_net_remove(&init_net, "nf_conntrack"); + remove_proc_entry("nf_conntrack", net->proc_net_stat); + proc_net_remove(net, "nf_conntrack"); } #else -static int nf_conntrack_standalone_init_proc(void) +static int nf_conntrack_standalone_init_proc(struct net *net) { return 0; } -static void nf_conntrack_standalone_fini_proc(void) +static void nf_conntrack_standalone_fini_proc(struct net *net) { } #endif /* CONFIG_PROC_FS */ @@ -442,11 +445,25 @@ static void nf_conntrack_standalone_fini_sysctl(void) static int nf_conntrack_net_init(struct net *net) { - return nf_conntrack_init(net); + int ret; + + ret = nf_conntrack_init(net); + if (ret < 0) + goto out_init; + ret = nf_conntrack_standalone_init_proc(net); + if (ret < 0) + goto out_proc; + return 0; + +out_proc: + nf_conntrack_cleanup(net); +out_init: + return ret; } static void nf_conntrack_net_exit(struct net *net) { + nf_conntrack_standalone_fini_proc(net); nf_conntrack_cleanup(net); } @@ -462,17 +479,12 @@ static int __init nf_conntrack_standalone_init(void) ret = register_pernet_subsys(&nf_conntrack_net_ops); if (ret < 0) goto out; - ret = nf_conntrack_standalone_init_proc(); - if (ret < 0) - goto out_proc; ret = nf_conntrack_standalone_init_sysctl(); if (ret < 0) goto out_sysctl; return 0; out_sysctl: - nf_conntrack_standalone_fini_proc(); -out_proc: unregister_pernet_subsys(&nf_conntrack_net_ops); out: return ret; @@ -481,7 +493,6 @@ out: static void __exit nf_conntrack_standalone_fini(void) { nf_conntrack_standalone_fini_sysctl(); - nf_conntrack_standalone_fini_proc(); unregister_pernet_subsys(&nf_conntrack_net_ops); } -- cgit v1.2.3 From dc5129f8df7cc3f2f04b322728d71c42795d34cc Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 8 Oct 2008 11:35:06 +0200 Subject: netfilter: netns nf_conntrack: per-netns /proc/net/nf_conntrack_expect Signed-off-by: Alexey Dobriyan Signed-off-by: Patrick McHardy --- net/netfilter/nf_conntrack_expect.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'net/netfilter') diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index 5307316356e..6a09200049e 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -423,12 +423,13 @@ EXPORT_SYMBOL_GPL(nf_ct_expect_related); #ifdef CONFIG_PROC_FS struct ct_expect_iter_state { + struct seq_net_private p; unsigned int bucket; }; static struct hlist_node *ct_expect_get_first(struct seq_file *seq) { - struct net *net = &init_net; + struct net *net = seq_file_net(seq); struct ct_expect_iter_state *st = seq->private; struct hlist_node *n; @@ -443,7 +444,7 @@ static struct hlist_node *ct_expect_get_first(struct seq_file *seq) static struct hlist_node *ct_expect_get_next(struct seq_file *seq, struct hlist_node *head) { - struct net *net = &init_net; + struct net *net = seq_file_net(seq); struct ct_expect_iter_state *st = seq->private; head = rcu_dereference(head->next); @@ -524,7 +525,7 @@ static const struct seq_operations exp_seq_ops = { static int exp_open(struct inode *inode, struct file *file) { - return seq_open_private(file, &exp_seq_ops, + return seq_open_net(inode, file, &exp_seq_ops, sizeof(struct ct_expect_iter_state)); } @@ -533,26 +534,26 @@ static const struct file_operations exp_file_ops = { .open = exp_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release_private, + .release = seq_release_net, }; #endif /* CONFIG_PROC_FS */ -static int exp_proc_init(void) +static int exp_proc_init(struct net *net) { #ifdef CONFIG_PROC_FS struct proc_dir_entry *proc; - proc = proc_net_fops_create(&init_net, "nf_conntrack_expect", 0440, &exp_file_ops); + proc = proc_net_fops_create(net, "nf_conntrack_expect", 0440, &exp_file_ops); if (!proc) return -ENOMEM; #endif /* CONFIG_PROC_FS */ return 0; } -static void exp_proc_remove(void) +static void exp_proc_remove(struct net *net) { #ifdef CONFIG_PROC_FS - proc_net_remove(&init_net, "nf_conntrack_expect"); + proc_net_remove(net, "nf_conntrack_expect"); #endif /* CONFIG_PROC_FS */ } @@ -581,7 +582,7 @@ int nf_conntrack_expect_init(struct net *net) if (!nf_ct_expect_cachep) goto err2; - err = exp_proc_init(); + err = exp_proc_init(net); if (err < 0) goto err3; @@ -598,7 +599,7 @@ err1: void nf_conntrack_expect_fini(struct net *net) { - exp_proc_remove(); + exp_proc_remove(net); kmem_cache_destroy(nf_ct_expect_cachep); nf_ct_free_hashtable(net->ct.expect_hash, net->ct.expect_vmalloc, nf_ct_expect_hsize); -- cgit v1.2.3 From 68047937677f2dffb5c47b57ce8baba5714b2142 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 8 Oct 2008 11:35:06 +0200 Subject: netfilter: netns nf_conntrack: unregister helper in every netns Signed-off-by: Alexey Dobriyan Signed-off-by: Patrick McHardy --- net/netfilter/nf_conntrack_helper.c | 40 ++++++++++++++++++++++--------------- 1 file changed, 24 insertions(+), 16 deletions(-) (limited to 'net/netfilter') diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index 920e778539a..9c06b9f86ad 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -123,29 +123,18 @@ int nf_conntrack_helper_register(struct nf_conntrack_helper *me) } EXPORT_SYMBOL_GPL(nf_conntrack_helper_register); -void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me) +static void __nf_conntrack_helper_unregister(struct nf_conntrack_helper *me, + struct net *net) { struct nf_conntrack_tuple_hash *h; struct nf_conntrack_expect *exp; const struct hlist_node *n, *next; unsigned int i; - mutex_lock(&nf_ct_helper_mutex); - hlist_del_rcu(&me->hnode); - nf_ct_helper_count--; - mutex_unlock(&nf_ct_helper_mutex); - - /* Make sure every nothing is still using the helper unless its a - * connection in the hash. - */ - synchronize_rcu(); - - spin_lock_bh(&nf_conntrack_lock); - /* Get rid of expectations */ for (i = 0; i < nf_ct_expect_hsize; i++) { hlist_for_each_entry_safe(exp, n, next, - &init_net.ct.expect_hash[i], hnode) { + &net->ct.expect_hash[i], hnode) { struct nf_conn_help *help = nfct_help(exp->master); if ((help->helper == me || exp->helper == me) && del_timer(&exp->timeout)) { @@ -156,12 +145,31 @@ void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me) } /* Get rid of expecteds, set helpers to NULL. */ - hlist_for_each_entry(h, n, &init_net.ct.unconfirmed, hnode) + hlist_for_each_entry(h, n, &net->ct.unconfirmed, hnode) unhelp(h, me); for (i = 0; i < nf_conntrack_htable_size; i++) { - hlist_for_each_entry(h, n, &init_net.ct.hash[i], hnode) + hlist_for_each_entry(h, n, &net->ct.hash[i], hnode) unhelp(h, me); } +} + +void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me) +{ + struct net *net; + + mutex_lock(&nf_ct_helper_mutex); + hlist_del_rcu(&me->hnode); + nf_ct_helper_count--; + mutex_unlock(&nf_ct_helper_mutex); + + /* Make sure every nothing is still using the helper unless its a + * connection in the hash. + */ + synchronize_rcu(); + + spin_lock_bh(&nf_conntrack_lock); + for_each_net(net) + __nf_conntrack_helper_unregister(me, net); spin_unlock_bh(&nf_conntrack_lock); } EXPORT_SYMBOL_GPL(nf_conntrack_helper_unregister); -- cgit v1.2.3 From 678d66753091a4102910392fb6198a6c6ce7f510 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 8 Oct 2008 11:35:07 +0200 Subject: netfilter: netns nf_conntrack: cleanup after L3 and L4 proto unregister in every netns Signed-off-by: Alexey Dobriyan Signed-off-by: Patrick McHardy --- net/netfilter/nf_conntrack_proto.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'net/netfilter') diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index 3a2f7ef997f..a59a307e685 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -207,6 +207,8 @@ EXPORT_SYMBOL_GPL(nf_conntrack_l3proto_register); void nf_conntrack_l3proto_unregister(struct nf_conntrack_l3proto *proto) { + struct net *net; + BUG_ON(proto->l3proto >= AF_MAX); mutex_lock(&nf_ct_proto_mutex); @@ -219,7 +221,8 @@ void nf_conntrack_l3proto_unregister(struct nf_conntrack_l3proto *proto) synchronize_rcu(); /* Remove all contrack entries for this protocol */ - nf_ct_iterate_cleanup(&init_net, kill_l3proto, proto); + for_each_net(net) + nf_ct_iterate_cleanup(net, kill_l3proto, proto); } EXPORT_SYMBOL_GPL(nf_conntrack_l3proto_unregister); @@ -316,6 +319,8 @@ EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_register); void nf_conntrack_l4proto_unregister(struct nf_conntrack_l4proto *l4proto) { + struct net *net; + BUG_ON(l4proto->l3proto >= PF_MAX); mutex_lock(&nf_ct_proto_mutex); @@ -328,7 +333,8 @@ void nf_conntrack_l4proto_unregister(struct nf_conntrack_l4proto *l4proto) synchronize_rcu(); /* Remove all contrack entries for this protocol */ - nf_ct_iterate_cleanup(&init_net, kill_l4proto, l4proto); + for_each_net(net) + nf_ct_iterate_cleanup(net, kill_l4proto, l4proto); } EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_unregister); -- cgit v1.2.3 From a71996fccce4b2086a26036aa3c915365ca36926 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 8 Oct 2008 11:35:07 +0200 Subject: netfilter: netns nf_conntrack: pass conntrack to nf_conntrack_event_cache() not skb This is cleaner, we already know conntrack to which event is relevant. Signed-off-by: Alexey Dobriyan Signed-off-by: Patrick McHardy --- net/netfilter/nf_conntrack_core.c | 10 +++++----- net/netfilter/nf_conntrack_ftp.c | 9 +++++---- net/netfilter/nf_conntrack_proto_gre.c | 2 +- net/netfilter/nf_conntrack_proto_sctp.c | 4 ++-- net/netfilter/nf_conntrack_proto_tcp.c | 6 +++--- net/netfilter/nf_conntrack_proto_udp.c | 2 +- net/netfilter/nf_conntrack_proto_udplite.c | 2 +- net/netfilter/xt_CONNMARK.c | 8 ++++---- net/netfilter/xt_CONNSECMARK.c | 2 +- 9 files changed, 23 insertions(+), 22 deletions(-) (limited to 'net/netfilter') diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 251f020c7c1..01f59c57730 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -370,14 +370,14 @@ __nf_conntrack_confirm(struct sk_buff *skb) spin_unlock_bh(&nf_conntrack_lock); help = nfct_help(ct); if (help && help->helper) - nf_conntrack_event_cache(IPCT_HELPER, skb); + nf_conntrack_event_cache(IPCT_HELPER, ct); #ifdef CONFIG_NF_NAT_NEEDED if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) || test_bit(IPS_DST_NAT_DONE_BIT, &ct->status)) - nf_conntrack_event_cache(IPCT_NATINFO, skb); + nf_conntrack_event_cache(IPCT_NATINFO, ct); #endif nf_conntrack_event_cache(master_ct(ct) ? - IPCT_RELATED : IPCT_NEW, skb); + IPCT_RELATED : IPCT_NEW, ct); return NF_ACCEPT; out: @@ -740,7 +740,7 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum, } if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status)) - nf_conntrack_event_cache(IPCT_STATUS, skb); + nf_conntrack_event_cache(IPCT_STATUS, ct); return ret; } @@ -853,7 +853,7 @@ acct: /* must be unlocked when calling event cache */ if (event) - nf_conntrack_event_cache(event, skb); + nf_conntrack_event_cache(event, ct); } EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct); diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c index bb20672fe03..4f7107107e9 100644 --- a/net/netfilter/nf_conntrack_ftp.c +++ b/net/netfilter/nf_conntrack_ftp.c @@ -318,7 +318,8 @@ static int find_nl_seq(u32 seq, const struct nf_ct_ftp_master *info, int dir) } /* We don't update if it's older than what we have. */ -static void update_nl_seq(u32 nl_seq, struct nf_ct_ftp_master *info, int dir, +static void update_nl_seq(struct nf_conn *ct, u32 nl_seq, + struct nf_ct_ftp_master *info, int dir, struct sk_buff *skb) { unsigned int i, oldest = NUM_SEQ_TO_REMEMBER; @@ -336,11 +337,11 @@ static void update_nl_seq(u32 nl_seq, struct nf_ct_ftp_master *info, int dir, if (info->seq_aft_nl_num[dir] < NUM_SEQ_TO_REMEMBER) { info->seq_aft_nl[dir][info->seq_aft_nl_num[dir]++] = nl_seq; - nf_conntrack_event_cache(IPCT_HELPINFO_VOLATILE, skb); + nf_conntrack_event_cache(IPCT_HELPINFO_VOLATILE, ct); } else if (oldest != NUM_SEQ_TO_REMEMBER && after(nl_seq, info->seq_aft_nl[dir][oldest])) { info->seq_aft_nl[dir][oldest] = nl_seq; - nf_conntrack_event_cache(IPCT_HELPINFO_VOLATILE, skb); + nf_conntrack_event_cache(IPCT_HELPINFO_VOLATILE, ct); } } @@ -509,7 +510,7 @@ out_update_nl: /* Now if this ends in \n, update ftp info. Seq may have been * adjusted by NAT code. */ if (ends_in_nl) - update_nl_seq(seq, ct_ftp_info, dir, skb); + update_nl_seq(ct, seq, ct_ftp_info, dir, skb); out: spin_unlock_bh(&nf_ftp_lock); return ret; diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c index c5a78220fa3..5b1273a01fe 100644 --- a/net/netfilter/nf_conntrack_proto_gre.c +++ b/net/netfilter/nf_conntrack_proto_gre.c @@ -229,7 +229,7 @@ static int gre_packet(struct nf_conn *ct, ct->proto.gre.stream_timeout); /* Also, more likely to be important, and not a probe. */ set_bit(IPS_ASSURED_BIT, &ct->status); - nf_conntrack_event_cache(IPCT_STATUS, skb); + nf_conntrack_event_cache(IPCT_STATUS, ct); } else nf_ct_refresh_acct(ct, ctinfo, skb, ct->proto.gre.timeout); diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index b5a90596d3f..ae8c2609e23 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -369,7 +369,7 @@ static int sctp_packet(struct nf_conn *ct, ct->proto.sctp.state = new_state; if (old_state != new_state) - nf_conntrack_event_cache(IPCT_PROTOINFO, skb); + nf_conntrack_event_cache(IPCT_PROTOINFO, ct); } write_unlock_bh(&sctp_lock); @@ -380,7 +380,7 @@ static int sctp_packet(struct nf_conn *ct, new_state == SCTP_CONNTRACK_ESTABLISHED) { pr_debug("Setting assured bit\n"); set_bit(IPS_ASSURED_BIT, &ct->status); - nf_conntrack_event_cache(IPCT_STATUS, skb); + nf_conntrack_event_cache(IPCT_STATUS, ct); } return NF_ACCEPT; diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 4e71de2405f..b5d62d66e02 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -969,9 +969,9 @@ static int tcp_packet(struct nf_conn *ct, timeout = tcp_timeouts[new_state]; write_unlock_bh(&tcp_lock); - nf_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb); + nf_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, ct); if (new_state != old_state) - nf_conntrack_event_cache(IPCT_PROTOINFO, skb); + nf_conntrack_event_cache(IPCT_PROTOINFO, ct); if (!test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { /* If only reply is a RST, we can consider ourselves not to @@ -990,7 +990,7 @@ static int tcp_packet(struct nf_conn *ct, after SYN_RECV or a valid answer for a picked up connection. */ set_bit(IPS_ASSURED_BIT, &ct->status); - nf_conntrack_event_cache(IPCT_STATUS, skb); + nf_conntrack_event_cache(IPCT_STATUS, ct); } nf_ct_refresh_acct(ct, ctinfo, skb, timeout); diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c index 8a245beb2c9..e0ee89e179c 100644 --- a/net/netfilter/nf_conntrack_proto_udp.c +++ b/net/netfilter/nf_conntrack_proto_udp.c @@ -75,7 +75,7 @@ static int udp_packet(struct nf_conn *ct, nf_ct_refresh_acct(ct, ctinfo, skb, nf_ct_udp_timeout_stream); /* Also, more likely to be important, and not a probe */ if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status)) - nf_conntrack_event_cache(IPCT_STATUS, skb); + nf_conntrack_event_cache(IPCT_STATUS, ct); } else nf_ct_refresh_acct(ct, ctinfo, skb, nf_ct_udp_timeout); diff --git a/net/netfilter/nf_conntrack_proto_udplite.c b/net/netfilter/nf_conntrack_proto_udplite.c index 981701919a7..c5b77c8f86c 100644 --- a/net/netfilter/nf_conntrack_proto_udplite.c +++ b/net/netfilter/nf_conntrack_proto_udplite.c @@ -75,7 +75,7 @@ static int udplite_packet(struct nf_conn *ct, nf_ct_udplite_timeout_stream); /* Also, more likely to be important, and not a probe */ if (!test_and_set_bit(IPS_ASSURED_BIT, &ct->status)) - nf_conntrack_event_cache(IPCT_STATUS, skb); + nf_conntrack_event_cache(IPCT_STATUS, ct); } else nf_ct_refresh_acct(ct, ctinfo, skb, nf_ct_udplite_timeout); diff --git a/net/netfilter/xt_CONNMARK.c b/net/netfilter/xt_CONNMARK.c index e72e5d01752..e1415c3f5c9 100644 --- a/net/netfilter/xt_CONNMARK.c +++ b/net/netfilter/xt_CONNMARK.c @@ -54,7 +54,7 @@ connmark_tg_v0(struct sk_buff *skb, const struct net_device *in, newmark = (ct->mark & ~markinfo->mask) | markinfo->mark; if (newmark != ct->mark) { ct->mark = newmark; - nf_conntrack_event_cache(IPCT_MARK, skb); + nf_conntrack_event_cache(IPCT_MARK, ct); } break; case XT_CONNMARK_SAVE: @@ -62,7 +62,7 @@ connmark_tg_v0(struct sk_buff *skb, const struct net_device *in, (skb->mark & markinfo->mask); if (ct->mark != newmark) { ct->mark = newmark; - nf_conntrack_event_cache(IPCT_MARK, skb); + nf_conntrack_event_cache(IPCT_MARK, ct); } break; case XT_CONNMARK_RESTORE: @@ -95,7 +95,7 @@ connmark_tg(struct sk_buff *skb, const struct net_device *in, newmark = (ct->mark & ~info->ctmask) ^ info->ctmark; if (ct->mark != newmark) { ct->mark = newmark; - nf_conntrack_event_cache(IPCT_MARK, skb); + nf_conntrack_event_cache(IPCT_MARK, ct); } break; case XT_CONNMARK_SAVE: @@ -103,7 +103,7 @@ connmark_tg(struct sk_buff *skb, const struct net_device *in, (skb->mark & info->nfmask); if (ct->mark != newmark) { ct->mark = newmark; - nf_conntrack_event_cache(IPCT_MARK, skb); + nf_conntrack_event_cache(IPCT_MARK, ct); } break; case XT_CONNMARK_RESTORE: diff --git a/net/netfilter/xt_CONNSECMARK.c b/net/netfilter/xt_CONNSECMARK.c index ae939e54dfa..5f221c3bd35 100644 --- a/net/netfilter/xt_CONNSECMARK.c +++ b/net/netfilter/xt_CONNSECMARK.c @@ -43,7 +43,7 @@ static void secmark_save(const struct sk_buff *skb) ct = nf_ct_get(skb, &ctinfo); if (ct && !ct->secmark) { ct->secmark = skb->secmark; - nf_conntrack_event_cache(IPCT_SECMARK, skb); + nf_conntrack_event_cache(IPCT_SECMARK, ct); } } } -- cgit v1.2.3 From 6058fa6bb96a5b6145cba10c5171f09c2783ca69 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 8 Oct 2008 11:35:07 +0200 Subject: netfilter: netns nf_conntrack: per-netns event cache Heh, last minute proof-reading of this patch made me think, that this is actually unneeded, simply because "ct" pointers will be different for different conntracks in different netns, just like they are different in one netns. Not so sure anymore. [Patrick: pointers will be different, flushing can only be done while inactive though and thus it needs to be per netns] Signed-off-by: Alexey Dobriyan Signed-off-by: Patrick McHardy --- net/netfilter/nf_conntrack_core.c | 12 +++++++++--- net/netfilter/nf_conntrack_ecache.c | 26 +++++++++++++++++++------- 2 files changed, 28 insertions(+), 10 deletions(-) (limited to 'net/netfilter') diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 01f59c57730..b55944e5e4e 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1023,7 +1023,8 @@ void nf_conntrack_cleanup(struct net *net) delete... */ synchronize_net(); - nf_ct_event_cache_flush(); + nf_ct_event_cache_flush(net); + nf_conntrack_ecache_fini(net); i_see_dead_people: nf_conntrack_flush(net); if (atomic_read(&net->ct.count) != 0) { @@ -1151,11 +1152,14 @@ int nf_conntrack_init(struct net *net) max_factor = 4; } atomic_set(&net->ct.count, 0); + ret = nf_conntrack_ecache_init(net); + if (ret < 0) + goto err_ecache; net->ct.hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size, &net->ct.hash_vmalloc); if (!net->ct.hash) { printk(KERN_ERR "Unable to create nf_conntrack_hash\n"); - goto err_out; + goto err_hash; } INIT_HLIST_HEAD(&net->ct.unconfirmed); @@ -1215,6 +1219,8 @@ err_free_conntrack_slab: err_free_hash: nf_ct_free_hashtable(net->ct.hash, net->ct.hash_vmalloc, nf_conntrack_htable_size); -err_out: +err_hash: + nf_conntrack_ecache_fini(net); +err_ecache: return -ENOMEM; } diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c index 83c41ac3505..a5f5e2e65d1 100644 --- a/net/netfilter/nf_conntrack_ecache.c +++ b/net/netfilter/nf_conntrack_ecache.c @@ -29,9 +29,6 @@ EXPORT_SYMBOL_GPL(nf_conntrack_chain); ATOMIC_NOTIFIER_HEAD(nf_ct_expect_chain); EXPORT_SYMBOL_GPL(nf_ct_expect_chain); -DEFINE_PER_CPU(struct nf_conntrack_ecache, nf_conntrack_ecache); -EXPORT_PER_CPU_SYMBOL_GPL(nf_conntrack_ecache); - /* deliver cached events and clear cache entry - must be called with locally * disabled softirqs */ static inline void @@ -51,10 +48,11 @@ __nf_ct_deliver_cached_events(struct nf_conntrack_ecache *ecache) * by code prior to async packet handling for freeing the skb */ void nf_ct_deliver_cached_events(const struct nf_conn *ct) { + struct net *net = nf_ct_net(ct); struct nf_conntrack_ecache *ecache; local_bh_disable(); - ecache = &__get_cpu_var(nf_conntrack_ecache); + ecache = per_cpu_ptr(net->ct.ecache, raw_smp_processor_id()); if (ecache->ct == ct) __nf_ct_deliver_cached_events(ecache); local_bh_enable(); @@ -64,10 +62,11 @@ EXPORT_SYMBOL_GPL(nf_ct_deliver_cached_events); /* Deliver cached events for old pending events, if current conntrack != old */ void __nf_ct_event_cache_init(struct nf_conn *ct) { + struct net *net = nf_ct_net(ct); struct nf_conntrack_ecache *ecache; /* take care of delivering potentially old events */ - ecache = &__get_cpu_var(nf_conntrack_ecache); + ecache = per_cpu_ptr(net->ct.ecache, raw_smp_processor_id()); BUG_ON(ecache->ct == ct); if (ecache->ct) __nf_ct_deliver_cached_events(ecache); @@ -79,18 +78,31 @@ EXPORT_SYMBOL_GPL(__nf_ct_event_cache_init); /* flush the event cache - touches other CPU's data and must not be called * while packets are still passing through the code */ -void nf_ct_event_cache_flush(void) +void nf_ct_event_cache_flush(struct net *net) { struct nf_conntrack_ecache *ecache; int cpu; for_each_possible_cpu(cpu) { - ecache = &per_cpu(nf_conntrack_ecache, cpu); + ecache = per_cpu_ptr(net->ct.ecache, cpu); if (ecache->ct) nf_ct_put(ecache->ct); } } +int nf_conntrack_ecache_init(struct net *net) +{ + net->ct.ecache = alloc_percpu(struct nf_conntrack_ecache); + if (!net->ct.ecache) + return -ENOMEM; + return 0; +} + +void nf_conntrack_ecache_fini(struct net *net) +{ + free_percpu(net->ct.ecache); +} + int nf_conntrack_register_notifier(struct notifier_block *nb) { return atomic_notifier_chain_register(&nf_conntrack_chain, nb); -- cgit v1.2.3 From 0d55af8791bfb42e04cc456b348910582f230343 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 8 Oct 2008 11:35:07 +0200 Subject: netfilter: netns nf_conntrack: per-netns statistics Signed-off-by: Alexey Dobriyan Signed-off-by: Patrick McHardy --- net/netfilter/nf_conntrack_core.c | 49 ++++++++++++++++++--------------- net/netfilter/nf_conntrack_expect.c | 4 +-- net/netfilter/nf_conntrack_standalone.c | 4 +-- 3 files changed, 31 insertions(+), 26 deletions(-) (limited to 'net/netfilter') diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index b55944e5e4e..1e87fa0cd3a 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -56,9 +56,6 @@ EXPORT_SYMBOL_GPL(nf_conntrack_untracked); unsigned int nf_ct_log_invalid __read_mostly; static struct kmem_cache *nf_conntrack_cachep __read_mostly; -DEFINE_PER_CPU(struct ip_conntrack_stat, nf_conntrack_stat); -EXPORT_PER_CPU_SYMBOL(nf_conntrack_stat); - static int nf_conntrack_hash_rnd_initted; static unsigned int nf_conntrack_hash_rnd; @@ -171,6 +168,7 @@ static void destroy_conntrack(struct nf_conntrack *nfct) { struct nf_conn *ct = (struct nf_conn *)nfct; + struct net *net = nf_ct_net(ct); struct nf_conntrack_l4proto *l4proto; pr_debug("destroy_conntrack(%p)\n", ct); @@ -203,7 +201,7 @@ destroy_conntrack(struct nf_conntrack *nfct) hlist_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnode); } - NF_CT_STAT_INC(delete); + NF_CT_STAT_INC(net, delete); spin_unlock_bh(&nf_conntrack_lock); if (ct->master) @@ -216,6 +214,7 @@ destroy_conntrack(struct nf_conntrack *nfct) static void death_by_timeout(unsigned long ul_conntrack) { struct nf_conn *ct = (void *)ul_conntrack; + struct net *net = nf_ct_net(ct); struct nf_conn_help *help = nfct_help(ct); struct nf_conntrack_helper *helper; @@ -230,7 +229,7 @@ static void death_by_timeout(unsigned long ul_conntrack) spin_lock_bh(&nf_conntrack_lock); /* Inside lock so preempt is disabled on module removal path. * Otherwise we can get spurious warnings. */ - NF_CT_STAT_INC(delete_list); + NF_CT_STAT_INC(net, delete_list); clean_from_lists(ct); spin_unlock_bh(&nf_conntrack_lock); nf_ct_put(ct); @@ -249,11 +248,11 @@ __nf_conntrack_find(struct net *net, const struct nf_conntrack_tuple *tuple) local_bh_disable(); hlist_for_each_entry_rcu(h, n, &net->ct.hash[hash], hnode) { if (nf_ct_tuple_equal(tuple, &h->tuple)) { - NF_CT_STAT_INC(found); + NF_CT_STAT_INC(net, found); local_bh_enable(); return h; } - NF_CT_STAT_INC(searched); + NF_CT_STAT_INC(net, searched); } local_bh_enable(); @@ -366,7 +365,7 @@ __nf_conntrack_confirm(struct sk_buff *skb) add_timer(&ct->timeout); atomic_inc(&ct->ct_general.use); set_bit(IPS_CONFIRMED_BIT, &ct->status); - NF_CT_STAT_INC(insert); + NF_CT_STAT_INC(net, insert); spin_unlock_bh(&nf_conntrack_lock); help = nfct_help(ct); if (help && help->helper) @@ -381,7 +380,7 @@ __nf_conntrack_confirm(struct sk_buff *skb) return NF_ACCEPT; out: - NF_CT_STAT_INC(insert_failed); + NF_CT_STAT_INC(net, insert_failed); spin_unlock_bh(&nf_conntrack_lock); return NF_DROP; } @@ -405,11 +404,11 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple, hlist_for_each_entry_rcu(h, n, &net->ct.hash[hash], hnode) { if (nf_ct_tuplehash_to_ctrack(h) != ignored_conntrack && nf_ct_tuple_equal(tuple, &h->tuple)) { - NF_CT_STAT_INC(found); + NF_CT_STAT_INC(net, found); rcu_read_unlock_bh(); return 1; } - NF_CT_STAT_INC(searched); + NF_CT_STAT_INC(net, searched); } rcu_read_unlock_bh(); @@ -454,7 +453,7 @@ static noinline int early_drop(struct net *net, unsigned int hash) if (del_timer(&ct->timeout)) { death_by_timeout((unsigned long)ct); dropped = 1; - NF_CT_STAT_INC_ATOMIC(early_drop); + NF_CT_STAT_INC_ATOMIC(net, early_drop); } nf_ct_put(ct); return dropped; @@ -581,7 +580,7 @@ init_conntrack(struct net *net, ct->secmark = exp->master->secmark; #endif nf_conntrack_get(&ct->master->ct_general); - NF_CT_STAT_INC(expect_new); + NF_CT_STAT_INC(net, expect_new); } else { struct nf_conntrack_helper *helper; @@ -591,7 +590,7 @@ init_conntrack(struct net *net, if (help) rcu_assign_pointer(help->helper, helper); } - NF_CT_STAT_INC(new); + NF_CT_STAT_INC(net, new); } /* Overload tuple linked list to put us in unconfirmed list. */ @@ -683,7 +682,7 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum, /* Previously seen (loopback or untracked)? Ignore. */ if (skb->nfct) { - NF_CT_STAT_INC_ATOMIC(ignore); + NF_CT_STAT_INC_ATOMIC(net, ignore); return NF_ACCEPT; } @@ -693,8 +692,8 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum, &dataoff, &protonum); if (ret <= 0) { pr_debug("not prepared to track yet or error occured\n"); - NF_CT_STAT_INC_ATOMIC(error); - NF_CT_STAT_INC_ATOMIC(invalid); + NF_CT_STAT_INC_ATOMIC(net, error); + NF_CT_STAT_INC_ATOMIC(net, invalid); return -ret; } @@ -706,8 +705,8 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum, if (l4proto->error != NULL) { ret = l4proto->error(net, skb, dataoff, &ctinfo, pf, hooknum); if (ret <= 0) { - NF_CT_STAT_INC_ATOMIC(error); - NF_CT_STAT_INC_ATOMIC(invalid); + NF_CT_STAT_INC_ATOMIC(net, error); + NF_CT_STAT_INC_ATOMIC(net, invalid); return -ret; } } @@ -716,13 +715,13 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum, l3proto, l4proto, &set_reply, &ctinfo); if (!ct) { /* Not valid part of a connection */ - NF_CT_STAT_INC_ATOMIC(invalid); + NF_CT_STAT_INC_ATOMIC(net, invalid); return NF_ACCEPT; } if (IS_ERR(ct)) { /* Too stressed to deal. */ - NF_CT_STAT_INC_ATOMIC(drop); + NF_CT_STAT_INC_ATOMIC(net, drop); return NF_DROP; } @@ -735,7 +734,7 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum, pr_debug("nf_conntrack_in: Can't track with proto module\n"); nf_conntrack_put(skb->nfct); skb->nfct = NULL; - NF_CT_STAT_INC_ATOMIC(invalid); + NF_CT_STAT_INC_ATOMIC(net, invalid); return -ret; } @@ -1043,6 +1042,7 @@ void nf_conntrack_cleanup(struct net *net) nf_conntrack_acct_fini(); nf_conntrack_expect_fini(net); + free_percpu(net->ct.stat); nf_conntrack_helper_fini(); nf_conntrack_proto_fini(); } @@ -1152,6 +1152,9 @@ int nf_conntrack_init(struct net *net) max_factor = 4; } atomic_set(&net->ct.count, 0); + net->ct.stat = alloc_percpu(struct ip_conntrack_stat); + if (!net->ct.stat) + goto err_stat; ret = nf_conntrack_ecache_init(net); if (ret < 0) goto err_ecache; @@ -1222,5 +1225,7 @@ err_free_hash: err_hash: nf_conntrack_ecache_fini(net); err_ecache: + free_percpu(net->ct.stat); +err_stat: return -ENOMEM; } diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index 6a09200049e..b7f75117161 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -53,7 +53,7 @@ void nf_ct_unlink_expect(struct nf_conntrack_expect *exp) master_help->expecting[exp->class]--; nf_ct_expect_put(exp); - NF_CT_STAT_INC(expect_delete); + NF_CT_STAT_INC(net, expect_delete); } EXPORT_SYMBOL_GPL(nf_ct_unlink_expect); @@ -326,7 +326,7 @@ static void nf_ct_expect_insert(struct nf_conntrack_expect *exp) add_timer(&exp->timeout); atomic_inc(&exp->use); - NF_CT_STAT_INC(expect_create); + NF_CT_STAT_INC(net, expect_create); } /* Race with expectations being used means we could have none to find; OK. */ diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 02eaf872277..a4fdbbf363a 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -203,7 +203,7 @@ static void *ct_cpu_seq_start(struct seq_file *seq, loff_t *pos) if (!cpu_possible(cpu)) continue; *pos = cpu + 1; - return &per_cpu(nf_conntrack_stat, cpu); + return per_cpu_ptr(init_net.ct.stat, cpu); } return NULL; @@ -217,7 +217,7 @@ static void *ct_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos) if (!cpu_possible(cpu)) continue; *pos = cpu + 1; - return &per_cpu(nf_conntrack_stat, cpu); + return per_cpu_ptr(init_net.ct.stat, cpu); } return NULL; -- cgit v1.2.3 From 8e9df80180b73d4107bf8fbf28b1633c541d2770 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 8 Oct 2008 11:35:08 +0200 Subject: netfilter: netns nf_conntrack: per-netns /proc/net/stat/nf_conntrack, /proc/net/stat/ip_conntrack Show correct conntrack count, while I'm at it. Signed-off-by: Alexey Dobriyan Signed-off-by: Patrick McHardy --- net/netfilter/nf_conntrack_standalone.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'net/netfilter') diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index a4fdbbf363a..169760ddc16 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -194,6 +194,7 @@ static const struct file_operations ct_file_ops = { static void *ct_cpu_seq_start(struct seq_file *seq, loff_t *pos) { + struct net *net = seq_file_net(seq); int cpu; if (*pos == 0) @@ -203,7 +204,7 @@ static void *ct_cpu_seq_start(struct seq_file *seq, loff_t *pos) if (!cpu_possible(cpu)) continue; *pos = cpu + 1; - return per_cpu_ptr(init_net.ct.stat, cpu); + return per_cpu_ptr(net->ct.stat, cpu); } return NULL; @@ -211,13 +212,14 @@ static void *ct_cpu_seq_start(struct seq_file *seq, loff_t *pos) static void *ct_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos) { + struct net *net = seq_file_net(seq); int cpu; for (cpu = *pos; cpu < NR_CPUS; ++cpu) { if (!cpu_possible(cpu)) continue; *pos = cpu + 1; - return per_cpu_ptr(init_net.ct.stat, cpu); + return per_cpu_ptr(net->ct.stat, cpu); } return NULL; @@ -229,7 +231,8 @@ static void ct_cpu_seq_stop(struct seq_file *seq, void *v) static int ct_cpu_seq_show(struct seq_file *seq, void *v) { - unsigned int nr_conntracks = atomic_read(&init_net.ct.count); + struct net *net = seq_file_net(seq); + unsigned int nr_conntracks = atomic_read(&net->ct.count); const struct ip_conntrack_stat *st = v; if (v == SEQ_START_TOKEN) { @@ -269,7 +272,8 @@ static const struct seq_operations ct_cpu_seq_ops = { static int ct_cpu_seq_open(struct inode *inode, struct file *file) { - return seq_open(file, &ct_cpu_seq_ops); + return seq_open_net(inode, file, &ct_cpu_seq_ops, + sizeof(struct seq_net_private)); } static const struct file_operations ct_cpu_seq_fops = { @@ -277,7 +281,7 @@ static const struct file_operations ct_cpu_seq_fops = { .open = ct_cpu_seq_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release, + .release = seq_release_net, }; static int nf_conntrack_standalone_init_proc(struct net *net) -- cgit v1.2.3 From 802507071b72ed5025747126099cbc6d1542f596 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 8 Oct 2008 11:35:08 +0200 Subject: netfilter: netns nf_conntrack: per-netns net.netfilter.nf_conntrack_count sysctl Note, sysctl table is always duplicated, this is simpler and less special-cased. Signed-off-by: Alexey Dobriyan Signed-off-by: Patrick McHardy --- net/netfilter/nf_conntrack_standalone.c | 73 ++++++++++++++++++--------------- 1 file changed, 41 insertions(+), 32 deletions(-) (limited to 'net/netfilter') diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 169760ddc16..64b4f95b367 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -330,7 +330,6 @@ EXPORT_SYMBOL_GPL(nf_conntrack_checksum); static int log_invalid_proto_min = 0; static int log_invalid_proto_max = 255; -static struct ctl_table_header *nf_ct_sysctl_header; static struct ctl_table_header *nf_ct_netfilter_header; static ctl_table nf_ct_sysctl_table[] = { @@ -409,40 +408,58 @@ static struct ctl_path nf_ct_path[] = { EXPORT_SYMBOL_GPL(nf_ct_log_invalid); -static int nf_conntrack_standalone_init_sysctl(void) +static int nf_conntrack_standalone_init_sysctl(struct net *net) { - nf_ct_netfilter_header = - register_sysctl_paths(nf_ct_path, nf_ct_netfilter_table); - if (!nf_ct_netfilter_header) - goto out; - - nf_ct_sysctl_header = - register_sysctl_paths(nf_net_netfilter_sysctl_path, - nf_ct_sysctl_table); - if (!nf_ct_sysctl_header) + struct ctl_table *table; + + if (net_eq(net, &init_net)) { + nf_ct_netfilter_header = + register_sysctl_paths(nf_ct_path, nf_ct_netfilter_table); + if (!nf_ct_netfilter_header) + goto out; + } + + table = kmemdup(nf_ct_sysctl_table, sizeof(nf_ct_sysctl_table), + GFP_KERNEL); + if (!table) + goto out_kmemdup; + + table[1].data = &net->ct.count; + + net->ct.sysctl_header = register_net_sysctl_table(net, + nf_net_netfilter_sysctl_path, table); + if (!net->ct.sysctl_header) goto out_unregister_netfilter; return 0; out_unregister_netfilter: - unregister_sysctl_table(nf_ct_netfilter_header); + kfree(table); +out_kmemdup: + if (net_eq(net, &init_net)) + unregister_sysctl_table(nf_ct_netfilter_header); out: printk("nf_conntrack: can't register to sysctl.\n"); return -ENOMEM; } -static void nf_conntrack_standalone_fini_sysctl(void) +static void nf_conntrack_standalone_fini_sysctl(struct net *net) { - unregister_sysctl_table(nf_ct_netfilter_header); - unregister_sysctl_table(nf_ct_sysctl_header); + struct ctl_table *table; + + if (net_eq(net, &init_net)) + unregister_sysctl_table(nf_ct_netfilter_header); + table = net->ct.sysctl_header->ctl_table_arg; + unregister_net_sysctl_table(net->ct.sysctl_header); + kfree(table); } #else -static int nf_conntrack_standalone_init_sysctl(void) +static int nf_conntrack_standalone_init_sysctl(struct net *net) { return 0; } -static void nf_conntrack_standalone_fini_sysctl(void) +static void nf_conntrack_standalone_fini_sysctl(struct net *net) { } #endif /* CONFIG_SYSCTL */ @@ -457,8 +474,13 @@ static int nf_conntrack_net_init(struct net *net) ret = nf_conntrack_standalone_init_proc(net); if (ret < 0) goto out_proc; + ret = nf_conntrack_standalone_init_sysctl(net); + if (ret < 0) + goto out_sysctl; return 0; +out_sysctl: + nf_conntrack_standalone_fini_proc(net); out_proc: nf_conntrack_cleanup(net); out_init: @@ -467,6 +489,7 @@ out_init: static void nf_conntrack_net_exit(struct net *net) { + nf_conntrack_standalone_fini_sysctl(net); nf_conntrack_standalone_fini_proc(net); nf_conntrack_cleanup(net); } @@ -478,25 +501,11 @@ static struct pernet_operations nf_conntrack_net_ops = { static int __init nf_conntrack_standalone_init(void) { - int ret; - - ret = register_pernet_subsys(&nf_conntrack_net_ops); - if (ret < 0) - goto out; - ret = nf_conntrack_standalone_init_sysctl(); - if (ret < 0) - goto out_sysctl; - return 0; - -out_sysctl: - unregister_pernet_subsys(&nf_conntrack_net_ops); -out: - return ret; + return register_pernet_subsys(&nf_conntrack_net_ops); } static void __exit nf_conntrack_standalone_fini(void) { - nf_conntrack_standalone_fini_sysctl(); unregister_pernet_subsys(&nf_conntrack_net_ops); } -- cgit v1.2.3 From c04d05529a6e0bf97183a2caf76a0c7f07f5b78c Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 8 Oct 2008 11:35:08 +0200 Subject: netfilter: netns nf_conntrack: per-netns net.netfilter.nf_conntrack_checksum sysctl Signed-off-by: Alexey Dobriyan Signed-off-by: Patrick McHardy --- net/netfilter/nf_conntrack_proto_dccp.c | 2 +- net/netfilter/nf_conntrack_proto_tcp.c | 2 +- net/netfilter/nf_conntrack_proto_udp.c | 2 +- net/netfilter/nf_conntrack_proto_udplite.c | 2 +- net/netfilter/nf_conntrack_standalone.c | 7 +++---- 5 files changed, 7 insertions(+), 8 deletions(-) (limited to 'net/netfilter') diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c index 6ead8da3e9e..769680e68b5 100644 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ b/net/netfilter/nf_conntrack_proto_dccp.c @@ -575,7 +575,7 @@ static int dccp_error(struct net *net, struct sk_buff *skb, } } - if (nf_conntrack_checksum && hooknum == NF_INET_PRE_ROUTING && + if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && nf_checksum_partial(skb, hooknum, dataoff, cscov, IPPROTO_DCCP, pf)) { msg = "nf_ct_dccp: bad checksum "; diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index b5d62d66e02..131c9be4470 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -780,7 +780,7 @@ static int tcp_error(struct net *net, * because the checksum is assumed to be correct. */ /* FIXME: Source route IP option packets --RR */ - if (nf_conntrack_checksum && hooknum == NF_INET_PRE_ROUTING && + if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && nf_checksum(skb, hooknum, dataoff, IPPROTO_TCP, pf)) { if (LOG_INVALID(IPPROTO_TCP)) nf_log_packet(pf, 0, skb, NULL, NULL, NULL, diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c index e0ee89e179c..3d3fffe3f8b 100644 --- a/net/netfilter/nf_conntrack_proto_udp.c +++ b/net/netfilter/nf_conntrack_proto_udp.c @@ -123,7 +123,7 @@ static int udp_error(struct net *net, struct sk_buff *skb, unsigned int dataoff, * We skip checking packets on the outgoing path * because the checksum is assumed to be correct. * FIXME: Source route IP option packets --RR */ - if (nf_conntrack_checksum && hooknum == NF_INET_PRE_ROUTING && + if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && nf_checksum(skb, hooknum, dataoff, IPPROTO_UDP, pf)) { if (LOG_INVALID(IPPROTO_UDP)) nf_log_packet(pf, 0, skb, NULL, NULL, NULL, diff --git a/net/netfilter/nf_conntrack_proto_udplite.c b/net/netfilter/nf_conntrack_proto_udplite.c index c5b77c8f86c..3d1697c4f91 100644 --- a/net/netfilter/nf_conntrack_proto_udplite.c +++ b/net/netfilter/nf_conntrack_proto_udplite.c @@ -129,7 +129,7 @@ static int udplite_error(struct net *net, } /* Checksum invalid? Ignore. */ - if (nf_conntrack_checksum && hooknum == NF_INET_PRE_ROUTING && + if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && nf_checksum_partial(skb, hooknum, dataoff, cscov, IPPROTO_UDP, pf)) { if (LOG_INVALID(IPPROTO_UDPLITE)) diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 64b4f95b367..5cd06637977 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -322,9 +322,6 @@ static void nf_conntrack_standalone_fini_proc(struct net *net) /* Sysctl support */ -int nf_conntrack_checksum __read_mostly = 1; -EXPORT_SYMBOL_GPL(nf_conntrack_checksum); - #ifdef CONFIG_SYSCTL /* Log invalid packets of a given protocol */ static int log_invalid_proto_min = 0; @@ -360,7 +357,7 @@ static ctl_table nf_ct_sysctl_table[] = { { .ctl_name = NET_NF_CONNTRACK_CHECKSUM, .procname = "nf_conntrack_checksum", - .data = &nf_conntrack_checksum, + .data = &init_net.ct.sysctl_checksum, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = &proc_dointvec, @@ -425,6 +422,7 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net) goto out_kmemdup; table[1].data = &net->ct.count; + table[3].data = &net->ct.sysctl_checksum; net->ct.sysctl_header = register_net_sysctl_table(net, nf_net_netfilter_sysctl_path, table); @@ -474,6 +472,7 @@ static int nf_conntrack_net_init(struct net *net) ret = nf_conntrack_standalone_init_proc(net); if (ret < 0) goto out_proc; + net->ct.sysctl_checksum = 1; ret = nf_conntrack_standalone_init_sysctl(net); if (ret < 0) goto out_sysctl; -- cgit v1.2.3 From c2a2c7e0cc39e7f9336cd67e8307a110bdba82f3 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 8 Oct 2008 11:35:08 +0200 Subject: netfilter: netns nf_conntrack: per-netns net.netfilter.nf_conntrack_log_invalid sysctl Signed-off-by: Alexey Dobriyan Signed-off-by: Patrick McHardy --- net/netfilter/nf_conntrack_core.c | 1 - net/netfilter/nf_conntrack_proto_dccp.c | 10 ++++++---- net/netfilter/nf_conntrack_proto_tcp.c | 18 ++++++++++-------- net/netfilter/nf_conntrack_proto_udp.c | 6 +++--- net/netfilter/nf_conntrack_proto_udplite.c | 8 ++++---- net/netfilter/nf_conntrack_standalone.c | 6 +++--- 6 files changed, 26 insertions(+), 23 deletions(-) (limited to 'net/netfilter') diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 1e87fa0cd3a..ade0bb3ab2e 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -53,7 +53,6 @@ EXPORT_SYMBOL_GPL(nf_conntrack_max); struct nf_conn nf_conntrack_untracked __read_mostly; EXPORT_SYMBOL_GPL(nf_conntrack_untracked); -unsigned int nf_ct_log_invalid __read_mostly; static struct kmem_cache *nf_conntrack_cachep __read_mostly; static int nf_conntrack_hash_rnd_initted; diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c index 769680e68b5..8fcf1762fab 100644 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ b/net/netfilter/nf_conntrack_proto_dccp.c @@ -418,6 +418,7 @@ static bool dccp_invert_tuple(struct nf_conntrack_tuple *inv, static bool dccp_new(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff) { + struct net *net = nf_ct_net(ct); struct dccp_hdr _dh, *dh; const char *msg; u_int8_t state; @@ -445,7 +446,7 @@ static bool dccp_new(struct nf_conn *ct, const struct sk_buff *skb, return true; out_invalid: - if (LOG_INVALID(IPPROTO_DCCP)) + if (LOG_INVALID(net, IPPROTO_DCCP)) nf_log_packet(nf_ct_l3num(ct), 0, skb, NULL, NULL, NULL, msg); return false; } @@ -463,6 +464,7 @@ static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, u_int8_t pf, unsigned int hooknum) { + struct net *net = nf_ct_net(ct); enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); struct dccp_hdr _dh, *dh; u_int8_t type, old_state, new_state; @@ -524,13 +526,13 @@ static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb, ct->proto.dccp.last_pkt = type; write_unlock_bh(&dccp_lock); - if (LOG_INVALID(IPPROTO_DCCP)) + if (LOG_INVALID(net, IPPROTO_DCCP)) nf_log_packet(pf, 0, skb, NULL, NULL, NULL, "nf_ct_dccp: invalid packet ignored "); return NF_ACCEPT; case CT_DCCP_INVALID: write_unlock_bh(&dccp_lock); - if (LOG_INVALID(IPPROTO_DCCP)) + if (LOG_INVALID(net, IPPROTO_DCCP)) nf_log_packet(pf, 0, skb, NULL, NULL, NULL, "nf_ct_dccp: invalid state transition "); return -NF_ACCEPT; @@ -590,7 +592,7 @@ static int dccp_error(struct net *net, struct sk_buff *skb, return NF_ACCEPT; out_invalid: - if (LOG_INVALID(IPPROTO_DCCP)) + if (LOG_INVALID(net, IPPROTO_DCCP)) nf_log_packet(pf, 0, skb, NULL, NULL, NULL, msg); return -NF_ACCEPT; } diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 131c9be4470..f947ec41e39 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -488,6 +488,7 @@ static bool tcp_in_window(const struct nf_conn *ct, const struct tcphdr *tcph, u_int8_t pf) { + struct net *net = nf_ct_net(ct); struct ip_ct_tcp_state *sender = &state->seen[dir]; struct ip_ct_tcp_state *receiver = &state->seen[!dir]; const struct nf_conntrack_tuple *tuple = &ct->tuplehash[dir].tuple; @@ -668,7 +669,7 @@ static bool tcp_in_window(const struct nf_conn *ct, if (sender->flags & IP_CT_TCP_FLAG_BE_LIBERAL || nf_ct_tcp_be_liberal) res = true; - if (!res && LOG_INVALID(IPPROTO_TCP)) + if (!res && LOG_INVALID(net, IPPROTO_TCP)) nf_log_packet(pf, 0, skb, NULL, NULL, NULL, "nf_ct_tcp: %s ", before(seq, sender->td_maxend + 1) ? @@ -761,7 +762,7 @@ static int tcp_error(struct net *net, /* Smaller that minimal TCP header? */ th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph); if (th == NULL) { - if (LOG_INVALID(IPPROTO_TCP)) + if (LOG_INVALID(net, IPPROTO_TCP)) nf_log_packet(pf, 0, skb, NULL, NULL, NULL, "nf_ct_tcp: short packet "); return -NF_ACCEPT; @@ -769,7 +770,7 @@ static int tcp_error(struct net *net, /* Not whole TCP header or malformed packet */ if (th->doff*4 < sizeof(struct tcphdr) || tcplen < th->doff*4) { - if (LOG_INVALID(IPPROTO_TCP)) + if (LOG_INVALID(net, IPPROTO_TCP)) nf_log_packet(pf, 0, skb, NULL, NULL, NULL, "nf_ct_tcp: truncated/malformed packet "); return -NF_ACCEPT; @@ -782,7 +783,7 @@ static int tcp_error(struct net *net, /* FIXME: Source route IP option packets --RR */ if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && nf_checksum(skb, hooknum, dataoff, IPPROTO_TCP, pf)) { - if (LOG_INVALID(IPPROTO_TCP)) + if (LOG_INVALID(net, IPPROTO_TCP)) nf_log_packet(pf, 0, skb, NULL, NULL, NULL, "nf_ct_tcp: bad TCP checksum "); return -NF_ACCEPT; @@ -791,7 +792,7 @@ static int tcp_error(struct net *net, /* Check TCP flags. */ tcpflags = (((u_int8_t *)th)[13] & ~(TH_ECE|TH_CWR|TH_PUSH)); if (!tcp_valid_flags[tcpflags]) { - if (LOG_INVALID(IPPROTO_TCP)) + if (LOG_INVALID(net, IPPROTO_TCP)) nf_log_packet(pf, 0, skb, NULL, NULL, NULL, "nf_ct_tcp: invalid TCP flag combination "); return -NF_ACCEPT; @@ -808,6 +809,7 @@ static int tcp_packet(struct nf_conn *ct, u_int8_t pf, unsigned int hooknum) { + struct net *net = nf_ct_net(ct); struct nf_conntrack_tuple *tuple; enum tcp_conntrack new_state, old_state; enum ip_conntrack_dir dir; @@ -886,7 +888,7 @@ static int tcp_packet(struct nf_conn *ct, * thus initiate a clean new session. */ write_unlock_bh(&tcp_lock); - if (LOG_INVALID(IPPROTO_TCP)) + if (LOG_INVALID(net, IPPROTO_TCP)) nf_log_packet(pf, 0, skb, NULL, NULL, NULL, "nf_ct_tcp: killing out of sync session "); nf_ct_kill(ct); @@ -899,7 +901,7 @@ static int tcp_packet(struct nf_conn *ct, segment_seq_plus_len(ntohl(th->seq), skb->len, dataoff, th); write_unlock_bh(&tcp_lock); - if (LOG_INVALID(IPPROTO_TCP)) + if (LOG_INVALID(net, IPPROTO_TCP)) nf_log_packet(pf, 0, skb, NULL, NULL, NULL, "nf_ct_tcp: invalid packet ignored "); return NF_ACCEPT; @@ -908,7 +910,7 @@ static int tcp_packet(struct nf_conn *ct, pr_debug("nf_ct_tcp: Invalid dir=%i index=%u ostate=%u\n", dir, get_conntrack_index(th), old_state); write_unlock_bh(&tcp_lock); - if (LOG_INVALID(IPPROTO_TCP)) + if (LOG_INVALID(net, IPPROTO_TCP)) nf_log_packet(pf, 0, skb, NULL, NULL, NULL, "nf_ct_tcp: invalid state "); return -NF_ACCEPT; diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c index 3d3fffe3f8b..7c2ca48698b 100644 --- a/net/netfilter/nf_conntrack_proto_udp.c +++ b/net/netfilter/nf_conntrack_proto_udp.c @@ -101,7 +101,7 @@ static int udp_error(struct net *net, struct sk_buff *skb, unsigned int dataoff, /* Header is too small? */ hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); if (hdr == NULL) { - if (LOG_INVALID(IPPROTO_UDP)) + if (LOG_INVALID(net, IPPROTO_UDP)) nf_log_packet(pf, 0, skb, NULL, NULL, NULL, "nf_ct_udp: short packet "); return -NF_ACCEPT; @@ -109,7 +109,7 @@ static int udp_error(struct net *net, struct sk_buff *skb, unsigned int dataoff, /* Truncated/malformed packets */ if (ntohs(hdr->len) > udplen || ntohs(hdr->len) < sizeof(*hdr)) { - if (LOG_INVALID(IPPROTO_UDP)) + if (LOG_INVALID(net, IPPROTO_UDP)) nf_log_packet(pf, 0, skb, NULL, NULL, NULL, "nf_ct_udp: truncated/malformed packet "); return -NF_ACCEPT; @@ -125,7 +125,7 @@ static int udp_error(struct net *net, struct sk_buff *skb, unsigned int dataoff, * FIXME: Source route IP option packets --RR */ if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && nf_checksum(skb, hooknum, dataoff, IPPROTO_UDP, pf)) { - if (LOG_INVALID(IPPROTO_UDP)) + if (LOG_INVALID(net, IPPROTO_UDP)) nf_log_packet(pf, 0, skb, NULL, NULL, NULL, "nf_ct_udp: bad UDP checksum "); return -NF_ACCEPT; diff --git a/net/netfilter/nf_conntrack_proto_udplite.c b/net/netfilter/nf_conntrack_proto_udplite.c index 3d1697c4f91..d22d839e4f9 100644 --- a/net/netfilter/nf_conntrack_proto_udplite.c +++ b/net/netfilter/nf_conntrack_proto_udplite.c @@ -104,7 +104,7 @@ static int udplite_error(struct net *net, /* Header is too small? */ hdr = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); if (hdr == NULL) { - if (LOG_INVALID(IPPROTO_UDPLITE)) + if (LOG_INVALID(net, IPPROTO_UDPLITE)) nf_log_packet(pf, 0, skb, NULL, NULL, NULL, "nf_ct_udplite: short packet "); return -NF_ACCEPT; @@ -114,7 +114,7 @@ static int udplite_error(struct net *net, if (cscov == 0) cscov = udplen; else if (cscov < sizeof(*hdr) || cscov > udplen) { - if (LOG_INVALID(IPPROTO_UDPLITE)) + if (LOG_INVALID(net, IPPROTO_UDPLITE)) nf_log_packet(pf, 0, skb, NULL, NULL, NULL, "nf_ct_udplite: invalid checksum coverage "); return -NF_ACCEPT; @@ -122,7 +122,7 @@ static int udplite_error(struct net *net, /* UDPLITE mandates checksums */ if (!hdr->check) { - if (LOG_INVALID(IPPROTO_UDPLITE)) + if (LOG_INVALID(net, IPPROTO_UDPLITE)) nf_log_packet(pf, 0, skb, NULL, NULL, NULL, "nf_ct_udplite: checksum missing "); return -NF_ACCEPT; @@ -132,7 +132,7 @@ static int udplite_error(struct net *net, if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && nf_checksum_partial(skb, hooknum, dataoff, cscov, IPPROTO_UDP, pf)) { - if (LOG_INVALID(IPPROTO_UDPLITE)) + if (LOG_INVALID(net, IPPROTO_UDPLITE)) nf_log_packet(pf, 0, skb, NULL, NULL, NULL, "nf_ct_udplite: bad UDPLite checksum "); return -NF_ACCEPT; diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 5cd06637977..98106d4e89f 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -365,7 +365,7 @@ static ctl_table nf_ct_sysctl_table[] = { { .ctl_name = NET_NF_CONNTRACK_LOG_INVALID, .procname = "nf_conntrack_log_invalid", - .data = &nf_ct_log_invalid, + .data = &init_net.ct.sysctl_log_invalid, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = &proc_dointvec_minmax, @@ -403,8 +403,6 @@ static struct ctl_path nf_ct_path[] = { { } }; -EXPORT_SYMBOL_GPL(nf_ct_log_invalid); - static int nf_conntrack_standalone_init_sysctl(struct net *net) { struct ctl_table *table; @@ -423,6 +421,7 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net) table[1].data = &net->ct.count; table[3].data = &net->ct.sysctl_checksum; + table[4].data = &net->ct.sysctl_log_invalid; net->ct.sysctl_header = register_net_sysctl_table(net, nf_net_netfilter_sysctl_path, table); @@ -473,6 +472,7 @@ static int nf_conntrack_net_init(struct net *net) if (ret < 0) goto out_proc; net->ct.sysctl_checksum = 1; + net->ct.sysctl_log_invalid = 0; ret = nf_conntrack_standalone_init_sysctl(net); if (ret < 0) goto out_sysctl; -- cgit v1.2.3 From d716a4dfbbdf0d4731d596a96e5f4b0d892ac168 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 8 Oct 2008 11:35:09 +0200 Subject: netfilter: netns nf_conntrack: per-netns conntrack accounting Signed-off-by: Alexey Dobriyan Signed-off-by: Patrick McHardy --- net/netfilter/nf_conntrack_acct.c | 100 +++++++++++++++++++++++++++----------- net/netfilter/nf_conntrack_core.c | 4 +- 2 files changed, 74 insertions(+), 30 deletions(-) (limited to 'net/netfilter') diff --git a/net/netfilter/nf_conntrack_acct.c b/net/netfilter/nf_conntrack_acct.c index 59bd8b903a1..03591d37b9c 100644 --- a/net/netfilter/nf_conntrack_acct.c +++ b/net/netfilter/nf_conntrack_acct.c @@ -22,19 +22,17 @@ #define NF_CT_ACCT_DEFAULT 0 #endif -int nf_ct_acct __read_mostly = NF_CT_ACCT_DEFAULT; -EXPORT_SYMBOL_GPL(nf_ct_acct); +static int nf_ct_acct __read_mostly = NF_CT_ACCT_DEFAULT; module_param_named(acct, nf_ct_acct, bool, 0644); MODULE_PARM_DESC(acct, "Enable connection tracking flow accounting."); #ifdef CONFIG_SYSCTL -static struct ctl_table_header *acct_sysctl_header; static struct ctl_table acct_sysctl_table[] = { { .ctl_name = CTL_UNNUMBERED, .procname = "nf_conntrack_acct", - .data = &nf_ct_acct, + .data = &init_net.ct.sysctl_acct, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = &proc_dointvec, @@ -64,41 +62,87 @@ static struct nf_ct_ext_type acct_extend __read_mostly = { .id = NF_CT_EXT_ACCT, }; -int nf_conntrack_acct_init(void) +#ifdef CONFIG_SYSCTL +static int nf_conntrack_acct_init_sysctl(struct net *net) { - int ret; + struct ctl_table *table; -#ifdef CONFIG_NF_CT_ACCT - printk(KERN_WARNING "CONFIG_NF_CT_ACCT is deprecated and will be removed soon. Plase use\n"); - printk(KERN_WARNING "nf_conntrack.acct=1 kernel paramater, acct=1 nf_conntrack module option or\n"); - printk(KERN_WARNING "sysctl net.netfilter.nf_conntrack_acct=1 to enable it.\n"); -#endif + table = kmemdup(acct_sysctl_table, sizeof(acct_sysctl_table), + GFP_KERNEL); + if (!table) + goto out; + + table[0].data = &net->ct.sysctl_acct; - ret = nf_ct_extend_register(&acct_extend); - if (ret < 0) { - printk(KERN_ERR "nf_conntrack_acct: Unable to register extension\n"); - return ret; + net->ct.acct_sysctl_header = register_net_sysctl_table(net, + nf_net_netfilter_sysctl_path, table); + if (!net->ct.acct_sysctl_header) { + printk(KERN_ERR "nf_conntrack_acct: can't register to sysctl.\n"); + goto out_register; } + return 0; -#ifdef CONFIG_SYSCTL - acct_sysctl_header = register_sysctl_paths(nf_net_netfilter_sysctl_path, - acct_sysctl_table); +out_register: + kfree(table); +out: + return -ENOMEM; +} - if (!acct_sysctl_header) { - nf_ct_extend_unregister(&acct_extend); +static void nf_conntrack_acct_fini_sysctl(struct net *net) +{ + struct ctl_table *table; - printk(KERN_ERR "nf_conntrack_acct: can't register to sysctl.\n"); - return -ENOMEM; - } + table = net->ct.acct_sysctl_header->ctl_table_arg; + unregister_net_sysctl_table(net->ct.acct_sysctl_header); + kfree(table); +} +#else +static int nf_conntrack_acct_init_sysctl(struct net *net) +{ + return 0; +} + +static void nf_conntrack_acct_fini_sysctl(struct net *net) +{ +} +#endif + +int nf_conntrack_acct_init(struct net *net) +{ + int ret; + + net->ct.sysctl_acct = nf_ct_acct; + + if (net_eq(net, &init_net)) { +#ifdef CONFIG_NF_CT_ACCT + printk(KERN_WARNING "CONFIG_NF_CT_ACCT is deprecated and will be removed soon. Plase use\n"); + printk(KERN_WARNING "nf_conntrack.acct=1 kernel paramater, acct=1 nf_conntrack module option or\n"); + printk(KERN_WARNING "sysctl net.netfilter.nf_conntrack_acct=1 to enable it.\n"); #endif + ret = nf_ct_extend_register(&acct_extend); + if (ret < 0) { + printk(KERN_ERR "nf_conntrack_acct: Unable to register extension\n"); + goto out_extend_register; + } + } + + ret = nf_conntrack_acct_init_sysctl(net); + if (ret < 0) + goto out_sysctl; + return 0; + +out_sysctl: + if (net_eq(net, &init_net)) + nf_ct_extend_unregister(&acct_extend); +out_extend_register: + return ret; } -void nf_conntrack_acct_fini(void) +void nf_conntrack_acct_fini(struct net *net) { -#ifdef CONFIG_SYSCTL - unregister_sysctl_table(acct_sysctl_header); -#endif - nf_ct_extend_unregister(&acct_extend); + nf_conntrack_acct_fini_sysctl(net); + if (net_eq(net, &init_net)) + nf_ct_extend_unregister(&acct_extend); } diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index ade0bb3ab2e..bb26460d897 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1039,7 +1039,7 @@ void nf_conntrack_cleanup(struct net *net) nf_ct_free_hashtable(net->ct.hash, net->ct.hash_vmalloc, nf_conntrack_htable_size); - nf_conntrack_acct_fini(); + nf_conntrack_acct_fini(net); nf_conntrack_expect_fini(net); free_percpu(net->ct.stat); nf_conntrack_helper_fini(); @@ -1191,7 +1191,7 @@ int nf_conntrack_init(struct net *net) if (ret < 0) goto out_fini_expect; - ret = nf_conntrack_acct_init(); + ret = nf_conntrack_acct_init(net); if (ret < 0) goto out_fini_helper; -- cgit v1.2.3 From 08f6547d266fdba087f7fa7963fc0610be5b7cd7 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 8 Oct 2008 11:35:09 +0200 Subject: netfilter: netns nf_conntrack: final netns tweaks Add init_net checks to not remove kmem_caches twice and so on. Refactor functions to split code which should be executed only for init_net into one place. ip_ct_attach and ip_ct_destroy assignments remain separate, because they're separate stages in setup and teardown. NOTE: NOTRACK code is in for-every-net part. It will be made per-netns after we decidce how to do it correctly. Signed-off-by: Alexey Dobriyan Signed-off-by: Patrick McHardy --- net/netfilter/nf_conntrack_core.c | 151 +++++++++++++++++++++++------------- net/netfilter/nf_conntrack_expect.c | 26 ++++--- 2 files changed, 114 insertions(+), 63 deletions(-) (limited to 'net/netfilter') diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index bb26460d897..27de3c7b006 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1010,17 +1010,15 @@ void nf_conntrack_flush(struct net *net) } EXPORT_SYMBOL_GPL(nf_conntrack_flush); -/* Mishearing the voices in his head, our hero wonders how he's - supposed to kill the mall. */ -void nf_conntrack_cleanup(struct net *net) +static void nf_conntrack_cleanup_init_net(void) { - rcu_assign_pointer(ip_ct_attach, NULL); - - /* This makes sure all current packets have passed through - netfilter framework. Roll on, two-stage module - delete... */ - synchronize_net(); + nf_conntrack_helper_fini(); + nf_conntrack_proto_fini(); + kmem_cache_destroy(nf_conntrack_cachep); +} +static void nf_conntrack_cleanup_net(struct net *net) +{ nf_ct_event_cache_flush(net); nf_conntrack_ecache_fini(net); i_see_dead_people: @@ -1033,17 +1031,31 @@ void nf_conntrack_cleanup(struct net *net) while (atomic_read(&nf_conntrack_untracked.ct_general.use) > 1) schedule(); - rcu_assign_pointer(nf_ct_destroy, NULL); - - kmem_cache_destroy(nf_conntrack_cachep); nf_ct_free_hashtable(net->ct.hash, net->ct.hash_vmalloc, nf_conntrack_htable_size); - nf_conntrack_acct_fini(net); nf_conntrack_expect_fini(net); free_percpu(net->ct.stat); - nf_conntrack_helper_fini(); - nf_conntrack_proto_fini(); +} + +/* Mishearing the voices in his head, our hero wonders how he's + supposed to kill the mall. */ +void nf_conntrack_cleanup(struct net *net) +{ + if (net_eq(net, &init_net)) + rcu_assign_pointer(ip_ct_attach, NULL); + + /* This makes sure all current packets have passed through + netfilter framework. Roll on, two-stage module + delete... */ + synchronize_net(); + + nf_conntrack_cleanup_net(net); + + if (net_eq(net, &init_net)) { + rcu_assign_pointer(nf_ct_destroy, NULL); + nf_conntrack_cleanup_init_net(); + } } struct hlist_head *nf_ct_alloc_hashtable(unsigned int *sizep, int *vmalloced) @@ -1128,7 +1140,7 @@ EXPORT_SYMBOL_GPL(nf_conntrack_set_hashsize); module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint, &nf_conntrack_htable_size, 0600); -int nf_conntrack_init(struct net *net) +static int nf_conntrack_init_init_net(void) { int max_factor = 8; int ret; @@ -1150,21 +1162,6 @@ int nf_conntrack_init(struct net *net) * entries. */ max_factor = 4; } - atomic_set(&net->ct.count, 0); - net->ct.stat = alloc_percpu(struct ip_conntrack_stat); - if (!net->ct.stat) - goto err_stat; - ret = nf_conntrack_ecache_init(net); - if (ret < 0) - goto err_ecache; - net->ct.hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size, - &net->ct.hash_vmalloc); - if (!net->ct.hash) { - printk(KERN_ERR "Unable to create nf_conntrack_hash\n"); - goto err_hash; - } - INIT_HLIST_HEAD(&net->ct.unconfirmed); - nf_conntrack_max = max_factor * nf_conntrack_htable_size; printk("nf_conntrack version %s (%u buckets, %d max)\n", @@ -1176,28 +1173,55 @@ int nf_conntrack_init(struct net *net) 0, 0, NULL); if (!nf_conntrack_cachep) { printk(KERN_ERR "Unable to create nf_conn slab cache\n"); - goto err_free_hash; + ret = -ENOMEM; + goto err_cache; } ret = nf_conntrack_proto_init(); if (ret < 0) - goto err_free_conntrack_slab; - - ret = nf_conntrack_expect_init(net); - if (ret < 0) - goto out_fini_proto; + goto err_proto; ret = nf_conntrack_helper_init(); if (ret < 0) - goto out_fini_expect; + goto err_helper; + + return 0; + +err_helper: + nf_conntrack_proto_fini(); +err_proto: + kmem_cache_destroy(nf_conntrack_cachep); +err_cache: + return ret; +} + +static int nf_conntrack_init_net(struct net *net) +{ + int ret; + atomic_set(&net->ct.count, 0); + INIT_HLIST_HEAD(&net->ct.unconfirmed); + net->ct.stat = alloc_percpu(struct ip_conntrack_stat); + if (!net->ct.stat) { + ret = -ENOMEM; + goto err_stat; + } + ret = nf_conntrack_ecache_init(net); + if (ret < 0) + goto err_ecache; + net->ct.hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size, + &net->ct.hash_vmalloc); + if (!net->ct.hash) { + ret = -ENOMEM; + printk(KERN_ERR "Unable to create nf_conntrack_hash\n"); + goto err_hash; + } + ret = nf_conntrack_expect_init(net); + if (ret < 0) + goto err_expect; ret = nf_conntrack_acct_init(net); if (ret < 0) - goto out_fini_helper; - - /* For use by REJECT target */ - rcu_assign_pointer(ip_ct_attach, nf_conntrack_attach); - rcu_assign_pointer(nf_ct_destroy, destroy_conntrack); + goto err_acct; /* Set up fake conntrack: - to never be deleted, not in any hashes */ @@ -1208,17 +1232,11 @@ int nf_conntrack_init(struct net *net) /* - and look it like as a confirmed connection */ set_bit(IPS_CONFIRMED_BIT, &nf_conntrack_untracked.status); - return ret; + return 0; -out_fini_helper: - nf_conntrack_helper_fini(); -out_fini_expect: +err_acct: nf_conntrack_expect_fini(net); -out_fini_proto: - nf_conntrack_proto_fini(); -err_free_conntrack_slab: - kmem_cache_destroy(nf_conntrack_cachep); -err_free_hash: +err_expect: nf_ct_free_hashtable(net->ct.hash, net->ct.hash_vmalloc, nf_conntrack_htable_size); err_hash: @@ -1226,5 +1244,32 @@ err_hash: err_ecache: free_percpu(net->ct.stat); err_stat: - return -ENOMEM; + return ret; +} + +int nf_conntrack_init(struct net *net) +{ + int ret; + + if (net_eq(net, &init_net)) { + ret = nf_conntrack_init_init_net(); + if (ret < 0) + goto out_init_net; + } + ret = nf_conntrack_init_net(net); + if (ret < 0) + goto out_net; + + if (net_eq(net, &init_net)) { + /* For use by REJECT target */ + rcu_assign_pointer(ip_ct_attach, nf_conntrack_attach); + rcu_assign_pointer(nf_ct_destroy, destroy_conntrack); + } + return 0; + +out_net: + if (net_eq(net, &init_net)) + nf_conntrack_cleanup_init_net(); +out_init_net: + return ret; } diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index b7f75117161..37a703bc3b8 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -563,12 +563,14 @@ int nf_conntrack_expect_init(struct net *net) { int err = -ENOMEM; - if (!nf_ct_expect_hsize) { - nf_ct_expect_hsize = nf_conntrack_htable_size / 256; - if (!nf_ct_expect_hsize) - nf_ct_expect_hsize = 1; + if (net_eq(net, &init_net)) { + if (!nf_ct_expect_hsize) { + nf_ct_expect_hsize = nf_conntrack_htable_size / 256; + if (!nf_ct_expect_hsize) + nf_ct_expect_hsize = 1; + } + nf_ct_expect_max = nf_ct_expect_hsize * 4; } - nf_ct_expect_max = nf_ct_expect_hsize * 4; net->ct.expect_count = 0; net->ct.expect_hash = nf_ct_alloc_hashtable(&nf_ct_expect_hsize, @@ -576,11 +578,13 @@ int nf_conntrack_expect_init(struct net *net) if (net->ct.expect_hash == NULL) goto err1; - nf_ct_expect_cachep = kmem_cache_create("nf_conntrack_expect", + if (net_eq(net, &init_net)) { + nf_ct_expect_cachep = kmem_cache_create("nf_conntrack_expect", sizeof(struct nf_conntrack_expect), 0, 0, NULL); - if (!nf_ct_expect_cachep) - goto err2; + if (!nf_ct_expect_cachep) + goto err2; + } err = exp_proc_init(net); if (err < 0) @@ -589,7 +593,8 @@ int nf_conntrack_expect_init(struct net *net) return 0; err3: - kmem_cache_destroy(nf_ct_expect_cachep); + if (net_eq(net, &init_net)) + kmem_cache_destroy(nf_ct_expect_cachep); err2: nf_ct_free_hashtable(net->ct.expect_hash, net->ct.expect_vmalloc, nf_ct_expect_hsize); @@ -600,7 +605,8 @@ err1: void nf_conntrack_expect_fini(struct net *net) { exp_proc_remove(net); - kmem_cache_destroy(nf_ct_expect_cachep); + if (net_eq(net, &init_net)) + kmem_cache_destroy(nf_ct_expect_cachep); nf_ct_free_hashtable(net->ct.expect_hash, net->ct.expect_vmalloc, nf_ct_expect_hsize); } -- cgit v1.2.3 From a5c3a8005cb7a36ebcc5b849f6045069ce4f7ca8 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 8 Oct 2008 11:35:09 +0200 Subject: netfilter: netns nf_conntrack: SIP conntracking in netns Signed-off-by: Alexey Dobriyan Signed-off-by: Patrick McHardy --- net/netfilter/nf_conntrack_sip.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/netfilter') diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c index a006080eb38..6813f1c8863 100644 --- a/net/netfilter/nf_conntrack_sip.c +++ b/net/netfilter/nf_conntrack_sip.c @@ -736,6 +736,7 @@ static int set_expected_rtp_rtcp(struct sk_buff *skb, struct nf_conntrack_expect *exp, *rtp_exp, *rtcp_exp; enum ip_conntrack_info ctinfo; struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + struct net *net = nf_ct_net(ct); enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); union nf_inet_addr *saddr; struct nf_conntrack_tuple tuple; @@ -775,7 +776,7 @@ static int set_expected_rtp_rtcp(struct sk_buff *skb, rcu_read_lock(); do { - exp = __nf_ct_expect_find(&init_net, &tuple); + exp = __nf_ct_expect_find(net, &tuple); if (!exp || exp->master == ct || nfct_help(exp->master)->helper != nfct_help(ct)->helper || -- cgit v1.2.3 From 84541cc13a3bb31a58c096dde3517461e3ad91c2 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 8 Oct 2008 11:35:09 +0200 Subject: netfilter: netns nf_conntrack: H323 conntracking in netns Signed-off-by: Alexey Dobriyan Signed-off-by: Patrick McHardy --- net/netfilter/nf_conntrack_h323_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/netfilter') diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c index dfb826c973d..c1504f71cdf 100644 --- a/net/netfilter/nf_conntrack_h323_main.c +++ b/net/netfilter/nf_conntrack_h323_main.c @@ -1210,6 +1210,7 @@ static struct nf_conntrack_expect *find_expect(struct nf_conn *ct, union nf_inet_addr *addr, __be16 port) { + struct net *net = nf_ct_net(ct); struct nf_conntrack_expect *exp; struct nf_conntrack_tuple tuple; @@ -1219,7 +1220,7 @@ static struct nf_conntrack_expect *find_expect(struct nf_conn *ct, tuple.dst.u.tcp.port = port; tuple.dst.protonum = IPPROTO_TCP; - exp = __nf_ct_expect_find(&init_net, &tuple); + exp = __nf_ct_expect_find(net, &tuple); if (exp && exp->master == ct) return exp; return NULL; -- cgit v1.2.3 From 3bb0d1c00f86b13bb184193a8f0189ddd6f0459f Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 8 Oct 2008 11:35:10 +0200 Subject: netfilter: netns nf_conntrack: GRE conntracking in netns * make keymap list per-netns * per-netns keymal lock (not strictly necessary) * flush keymap at netns stop and module unload. Signed-off-by: Alexey Dobriyan Signed-off-by: Patrick McHardy --- net/netfilter/nf_conntrack_pptp.c | 2 +- net/netfilter/nf_conntrack_proto_gre.c | 97 ++++++++++++++++++++++++++-------- 2 files changed, 75 insertions(+), 24 deletions(-) (limited to 'net/netfilter') diff --git a/net/netfilter/nf_conntrack_pptp.c b/net/netfilter/nf_conntrack_pptp.c index 5db7df5d19b..e47d5de41cc 100644 --- a/net/netfilter/nf_conntrack_pptp.c +++ b/net/netfilter/nf_conntrack_pptp.c @@ -602,7 +602,7 @@ static int __init nf_conntrack_pptp_init(void) static void __exit nf_conntrack_pptp_fini(void) { nf_conntrack_helper_unregister(&pptp); - nf_ct_gre_keymap_flush(); + nf_ct_gre_keymap_flush(&init_net); } module_init(nf_conntrack_pptp_init); diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c index 5b1273a01fe..a2cdbcbf64c 100644 --- a/net/netfilter/nf_conntrack_proto_gre.c +++ b/net/netfilter/nf_conntrack_proto_gre.c @@ -29,8 +29,11 @@ #include #include #include +#include #include - +#include +#include +#include #include #include #include @@ -40,19 +43,23 @@ #define GRE_TIMEOUT (30 * HZ) #define GRE_STREAM_TIMEOUT (180 * HZ) -static DEFINE_RWLOCK(nf_ct_gre_lock); -static LIST_HEAD(gre_keymap_list); +static int proto_gre_net_id; +struct netns_proto_gre { + rwlock_t keymap_lock; + struct list_head keymap_list; +}; -void nf_ct_gre_keymap_flush(void) +void nf_ct_gre_keymap_flush(struct net *net) { + struct netns_proto_gre *net_gre = net_generic(net, proto_gre_net_id); struct nf_ct_gre_keymap *km, *tmp; - write_lock_bh(&nf_ct_gre_lock); - list_for_each_entry_safe(km, tmp, &gre_keymap_list, list) { + write_lock_bh(&net_gre->keymap_lock); + list_for_each_entry_safe(km, tmp, &net_gre->keymap_list, list) { list_del(&km->list); kfree(km); } - write_unlock_bh(&nf_ct_gre_lock); + write_unlock_bh(&net_gre->keymap_lock); } EXPORT_SYMBOL(nf_ct_gre_keymap_flush); @@ -67,19 +74,20 @@ static inline int gre_key_cmpfn(const struct nf_ct_gre_keymap *km, } /* look up the source key for a given tuple */ -static __be16 gre_keymap_lookup(struct nf_conntrack_tuple *t) +static __be16 gre_keymap_lookup(struct net *net, struct nf_conntrack_tuple *t) { + struct netns_proto_gre *net_gre = net_generic(net, proto_gre_net_id); struct nf_ct_gre_keymap *km; __be16 key = 0; - read_lock_bh(&nf_ct_gre_lock); - list_for_each_entry(km, &gre_keymap_list, list) { + read_lock_bh(&net_gre->keymap_lock); + list_for_each_entry(km, &net_gre->keymap_list, list) { if (gre_key_cmpfn(km, t)) { key = km->tuple.src.u.gre.key; break; } } - read_unlock_bh(&nf_ct_gre_lock); + read_unlock_bh(&net_gre->keymap_lock); pr_debug("lookup src key 0x%x for ", key); nf_ct_dump_tuple(t); @@ -91,20 +99,22 @@ static __be16 gre_keymap_lookup(struct nf_conntrack_tuple *t) int nf_ct_gre_keymap_add(struct nf_conn *ct, enum ip_conntrack_dir dir, struct nf_conntrack_tuple *t) { + struct net *net = nf_ct_net(ct); + struct netns_proto_gre *net_gre = net_generic(net, proto_gre_net_id); struct nf_conn_help *help = nfct_help(ct); struct nf_ct_gre_keymap **kmp, *km; kmp = &help->help.ct_pptp_info.keymap[dir]; if (*kmp) { /* check whether it's a retransmission */ - read_lock_bh(&nf_ct_gre_lock); - list_for_each_entry(km, &gre_keymap_list, list) { + read_lock_bh(&net_gre->keymap_lock); + list_for_each_entry(km, &net_gre->keymap_list, list) { if (gre_key_cmpfn(km, t) && km == *kmp) { - read_unlock_bh(&nf_ct_gre_lock); + read_unlock_bh(&net_gre->keymap_lock); return 0; } } - read_unlock_bh(&nf_ct_gre_lock); + read_unlock_bh(&net_gre->keymap_lock); pr_debug("trying to override keymap_%s for ct %p\n", dir == IP_CT_DIR_REPLY ? "reply" : "orig", ct); return -EEXIST; @@ -119,9 +129,9 @@ int nf_ct_gre_keymap_add(struct nf_conn *ct, enum ip_conntrack_dir dir, pr_debug("adding new entry %p: ", km); nf_ct_dump_tuple(&km->tuple); - write_lock_bh(&nf_ct_gre_lock); - list_add_tail(&km->list, &gre_keymap_list); - write_unlock_bh(&nf_ct_gre_lock); + write_lock_bh(&net_gre->keymap_lock); + list_add_tail(&km->list, &net_gre->keymap_list); + write_unlock_bh(&net_gre->keymap_lock); return 0; } @@ -130,12 +140,14 @@ EXPORT_SYMBOL_GPL(nf_ct_gre_keymap_add); /* destroy the keymap entries associated with specified master ct */ void nf_ct_gre_keymap_destroy(struct nf_conn *ct) { + struct net *net = nf_ct_net(ct); + struct netns_proto_gre *net_gre = net_generic(net, proto_gre_net_id); struct nf_conn_help *help = nfct_help(ct); enum ip_conntrack_dir dir; pr_debug("entering for ct %p\n", ct); - write_lock_bh(&nf_ct_gre_lock); + write_lock_bh(&net_gre->keymap_lock); for (dir = IP_CT_DIR_ORIGINAL; dir < IP_CT_DIR_MAX; dir++) { if (help->help.ct_pptp_info.keymap[dir]) { pr_debug("removing %p from list\n", @@ -145,7 +157,7 @@ void nf_ct_gre_keymap_destroy(struct nf_conn *ct) help->help.ct_pptp_info.keymap[dir] = NULL; } } - write_unlock_bh(&nf_ct_gre_lock); + write_unlock_bh(&net_gre->keymap_lock); } EXPORT_SYMBOL_GPL(nf_ct_gre_keymap_destroy); @@ -164,6 +176,7 @@ static bool gre_invert_tuple(struct nf_conntrack_tuple *tuple, static bool gre_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, struct nf_conntrack_tuple *tuple) { + struct net *net = dev_net(skb->dev ? skb->dev : skb->dst->dev); const struct gre_hdr_pptp *pgrehdr; struct gre_hdr_pptp _pgrehdr; __be16 srckey; @@ -190,7 +203,7 @@ static bool gre_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, } tuple->dst.u.gre.key = pgrehdr->call_id; - srckey = gre_keymap_lookup(tuple); + srckey = gre_keymap_lookup(net, tuple); tuple->src.u.gre.key = srckey; return true; @@ -285,15 +298,53 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_gre4 __read_mostly = { #endif }; +static int proto_gre_net_init(struct net *net) +{ + struct netns_proto_gre *net_gre; + int rv; + + net_gre = kmalloc(sizeof(struct netns_proto_gre), GFP_KERNEL); + if (!net_gre) + return -ENOMEM; + rwlock_init(&net_gre->keymap_lock); + INIT_LIST_HEAD(&net_gre->keymap_list); + + rv = net_assign_generic(net, proto_gre_net_id, net_gre); + if (rv < 0) + kfree(net_gre); + return rv; +} + +static void proto_gre_net_exit(struct net *net) +{ + struct netns_proto_gre *net_gre = net_generic(net, proto_gre_net_id); + + nf_ct_gre_keymap_flush(net); + kfree(net_gre); +} + +static struct pernet_operations proto_gre_net_ops = { + .init = proto_gre_net_init, + .exit = proto_gre_net_exit, +}; + static int __init nf_ct_proto_gre_init(void) { - return nf_conntrack_l4proto_register(&nf_conntrack_l4proto_gre4); + int rv; + + rv = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_gre4); + if (rv < 0) + return rv; + rv = register_pernet_gen_device(&proto_gre_net_id, &proto_gre_net_ops); + if (rv < 0) + nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_gre4); + return rv; } static void nf_ct_proto_gre_fini(void) { nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_gre4); - nf_ct_gre_keymap_flush(); + unregister_pernet_gen_device(proto_gre_net_id, &proto_gre_net_ops); } module_init(nf_ct_proto_gre_init); -- cgit v1.2.3 From 0e6e75af921d1f4799eeb9f83a31c86ab7cdeb8f Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 8 Oct 2008 11:35:10 +0200 Subject: netfilter: netns nf_conntrack: PPTP conntracking in netns Signed-off-by: Alexey Dobriyan Signed-off-by: Patrick McHardy --- net/netfilter/nf_conntrack_pptp.c | 36 ++++++++++++++++++++++++++++-------- 1 file changed, 28 insertions(+), 8 deletions(-) (limited to 'net/netfilter') diff --git a/net/netfilter/nf_conntrack_pptp.c b/net/netfilter/nf_conntrack_pptp.c index e47d5de41cc..373e51e91ce 100644 --- a/net/netfilter/nf_conntrack_pptp.c +++ b/net/netfilter/nf_conntrack_pptp.c @@ -98,6 +98,7 @@ EXPORT_SYMBOL(pptp_msg_name); static void pptp_expectfn(struct nf_conn *ct, struct nf_conntrack_expect *exp) { + struct net *net = nf_ct_net(ct); typeof(nf_nat_pptp_hook_expectfn) nf_nat_pptp_expectfn; pr_debug("increasing timeouts\n"); @@ -121,7 +122,7 @@ static void pptp_expectfn(struct nf_conn *ct, pr_debug("trying to unexpect other dir: "); nf_ct_dump_tuple(&inv_t); - exp_other = nf_ct_expect_find_get(&init_net, &inv_t); + exp_other = nf_ct_expect_find_get(net, &inv_t); if (exp_other) { /* delete other expectation. */ pr_debug("found\n"); @@ -134,7 +135,8 @@ static void pptp_expectfn(struct nf_conn *ct, rcu_read_unlock(); } -static int destroy_sibling_or_exp(const struct nf_conntrack_tuple *t) +static int destroy_sibling_or_exp(struct net *net, + const struct nf_conntrack_tuple *t) { const struct nf_conntrack_tuple_hash *h; struct nf_conntrack_expect *exp; @@ -143,7 +145,7 @@ static int destroy_sibling_or_exp(const struct nf_conntrack_tuple *t) pr_debug("trying to timeout ct or exp for tuple "); nf_ct_dump_tuple(t); - h = nf_conntrack_find_get(&init_net, t); + h = nf_conntrack_find_get(net, t); if (h) { sibling = nf_ct_tuplehash_to_ctrack(h); pr_debug("setting timeout of conntrack %p to 0\n", sibling); @@ -154,7 +156,7 @@ static int destroy_sibling_or_exp(const struct nf_conntrack_tuple *t) nf_ct_put(sibling); return 1; } else { - exp = nf_ct_expect_find_get(&init_net, t); + exp = nf_ct_expect_find_get(net, t); if (exp) { pr_debug("unexpect_related of expect %p\n", exp); nf_ct_unexpect_related(exp); @@ -168,6 +170,7 @@ static int destroy_sibling_or_exp(const struct nf_conntrack_tuple *t) /* timeout GRE data connections */ static void pptp_destroy_siblings(struct nf_conn *ct) { + struct net *net = nf_ct_net(ct); const struct nf_conn_help *help = nfct_help(ct); struct nf_conntrack_tuple t; @@ -178,7 +181,7 @@ static void pptp_destroy_siblings(struct nf_conn *ct) t.dst.protonum = IPPROTO_GRE; t.src.u.gre.key = help->help.ct_pptp_info.pns_call_id; t.dst.u.gre.key = help->help.ct_pptp_info.pac_call_id; - if (!destroy_sibling_or_exp(&t)) + if (!destroy_sibling_or_exp(net, &t)) pr_debug("failed to timeout original pns->pac ct/exp\n"); /* try reply (pac->pns) tuple */ @@ -186,7 +189,7 @@ static void pptp_destroy_siblings(struct nf_conn *ct) t.dst.protonum = IPPROTO_GRE; t.src.u.gre.key = help->help.ct_pptp_info.pac_call_id; t.dst.u.gre.key = help->help.ct_pptp_info.pns_call_id; - if (!destroy_sibling_or_exp(&t)) + if (!destroy_sibling_or_exp(net, &t)) pr_debug("failed to timeout reply pac->pns ct/exp\n"); } @@ -594,15 +597,32 @@ static struct nf_conntrack_helper pptp __read_mostly = { .expect_policy = &pptp_exp_policy, }; +static void nf_conntrack_pptp_net_exit(struct net *net) +{ + nf_ct_gre_keymap_flush(net); +} + +static struct pernet_operations nf_conntrack_pptp_net_ops = { + .exit = nf_conntrack_pptp_net_exit, +}; + static int __init nf_conntrack_pptp_init(void) { - return nf_conntrack_helper_register(&pptp); + int rv; + + rv = nf_conntrack_helper_register(&pptp); + if (rv < 0) + return rv; + rv = register_pernet_subsys(&nf_conntrack_pptp_net_ops); + if (rv < 0) + nf_conntrack_helper_unregister(&pptp); + return rv; } static void __exit nf_conntrack_pptp_fini(void) { nf_conntrack_helper_unregister(&pptp); - nf_ct_gre_keymap_flush(&init_net); + unregister_pernet_subsys(&nf_conntrack_pptp_net_ops); } module_init(nf_conntrack_pptp_init); -- cgit v1.2.3 From 4de6f16b9ec2422fa7ef9c22f7b1c8d5a55499b4 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 8 Oct 2008 11:35:11 +0200 Subject: netfilter: enable netfilter in netns From kernel perspective, allow entrance in nf_hook_slow(). Stuff which uses nf_register_hook/nf_register_hooks, but otherwise not netns-ready: DECnet netfilter ipt_CLUSTERIP nf_nat_standalone.c together with XFRM (?) IPVS several individual match modules (like hashlimit) ctnetlink NOTRACK all sorts of queueing and reporting to userspace L3 and L4 protocol sysctls, bridge sysctls probably something else Anyway critical mass has been achieved, there is no reason to hide netfilter any longer. From userspace perspective, allow to manipulate all sorts of iptables/ip6tables/arptables rules. Signed-off-by: Alexey Dobriyan Signed-off-by: Patrick McHardy --- net/netfilter/core.c | 8 -------- net/netfilter/nf_sockopt.c | 3 --- 2 files changed, 11 deletions(-) (limited to 'net/netfilter') diff --git a/net/netfilter/core.c b/net/netfilter/core.c index b16cd79951c..a90ac83c591 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -165,14 +165,6 @@ int nf_hook_slow(u_int8_t pf, unsigned int hook, struct sk_buff *skb, unsigned int verdict; int ret = 0; -#ifdef CONFIG_NET_NS - struct net *net; - - net = indev == NULL ? dev_net(outdev) : dev_net(indev); - if (net != &init_net) - return 1; -#endif - /* We may already have this, but read-locks nest anyway */ rcu_read_lock(); diff --git a/net/netfilter/nf_sockopt.c b/net/netfilter/nf_sockopt.c index f9b46de6a3d..8ab829f8657 100644 --- a/net/netfilter/nf_sockopt.c +++ b/net/netfilter/nf_sockopt.c @@ -65,9 +65,6 @@ static struct nf_sockopt_ops *nf_sockopt_find(struct sock *sk, u_int8_t pf, { struct nf_sockopt_ops *ops; - if (!net_eq(sock_net(sk), &init_net)) - return ERR_PTR(-ENOPROTOOPT); - if (mutex_lock_interruptible(&nf_sockopt_mutex) != 0) return ERR_PTR(-EINTR); -- cgit v1.2.3 From 9ad2d745a23853927a19789b034d9eb2e62d78ee Mon Sep 17 00:00:00 2001 From: KOVACS Krisztian Date: Wed, 8 Oct 2008 11:35:12 +0200 Subject: netfilter: iptables tproxy core The iptables tproxy core is a module that contains the common routines used by various tproxy related modules (TPROXY target and socket match) Signed-off-by: KOVACS Krisztian Signed-off-by: Patrick McHardy --- net/netfilter/Kconfig | 15 +++++++ net/netfilter/Makefile | 3 ++ net/netfilter/nf_tproxy_core.c | 96 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 114 insertions(+) create mode 100644 net/netfilter/nf_tproxy_core.c (limited to 'net/netfilter') diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 4a464857f21..ed1dcfb61e1 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -287,6 +287,21 @@ config NF_CT_NETLINK help This option enables support for a netlink-based userspace interface +# transparent proxy support +config NETFILTER_TPROXY + tristate "Transparent proxying support (EXPERIMENTAL)" + depends on EXPERIMENTAL + depends on IP_NF_MANGLE + depends on NETFILTER_ADVANCED + help + This option enables transparent proxying support, that is, + support for handling non-locally bound IPv4 TCP and UDP sockets. + For it to work you will have to configure certain iptables rules + and use policy routing. For more information on how to set it up + see Documentation/networking/tproxy.txt. + + To compile it as a module, choose M here. If unsure, say N. + config NETFILTER_XTABLES tristate "Netfilter Xtables support (required for ip_tables)" default m if NETFILTER_ADVANCED=n diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index f101cf61e6f..fc8bbb48d38 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -34,6 +34,9 @@ obj-$(CONFIG_NF_CONNTRACK_SANE) += nf_conntrack_sane.o obj-$(CONFIG_NF_CONNTRACK_SIP) += nf_conntrack_sip.o obj-$(CONFIG_NF_CONNTRACK_TFTP) += nf_conntrack_tftp.o +# transparent proxy support +obj-$(CONFIG_NETFILTER_TPROXY) += nf_tproxy_core.o + # generic X tables obj-$(CONFIG_NETFILTER_XTABLES) += x_tables.o xt_tcpudp.o diff --git a/net/netfilter/nf_tproxy_core.c b/net/netfilter/nf_tproxy_core.c new file mode 100644 index 00000000000..fe34f4bf74c --- /dev/null +++ b/net/netfilter/nf_tproxy_core.c @@ -0,0 +1,96 @@ +/* + * Transparent proxy support for Linux/iptables + * + * Copyright (c) 2006-2007 BalaBit IT Ltd. + * Author: Balazs Scheidler, Krisztian Kovacs + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ + +#include +#include + +#include +#include +#include +#include +#include + +struct sock * +nf_tproxy_get_sock_v4(struct net *net, const u8 protocol, + const __be32 saddr, const __be32 daddr, + const __be16 sport, const __be16 dport, + const struct net_device *in, bool listening_only) +{ + struct sock *sk; + + /* look up socket */ + switch (protocol) { + case IPPROTO_TCP: + if (listening_only) + sk = __inet_lookup_listener(net, &tcp_hashinfo, + daddr, ntohs(dport), + in->ifindex); + else + sk = __inet_lookup(net, &tcp_hashinfo, + saddr, sport, daddr, dport, + in->ifindex); + break; + case IPPROTO_UDP: + sk = udp4_lib_lookup(net, saddr, sport, daddr, dport, + in->ifindex); + break; + default: + WARN_ON(1); + sk = NULL; + } + + pr_debug("tproxy socket lookup: proto %u %08x:%u -> %08x:%u, listener only: %d, sock %p\n", + protocol, ntohl(saddr), ntohs(sport), ntohl(daddr), ntohs(dport), listening_only, sk); + + return sk; +} +EXPORT_SYMBOL_GPL(nf_tproxy_get_sock_v4); + +static void +nf_tproxy_destructor(struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + + skb->sk = NULL; + skb->destructor = NULL; + + if (sk) + nf_tproxy_put_sock(sk); +} + +/* consumes sk */ +int +nf_tproxy_assign_sock(struct sk_buff *skb, struct sock *sk) +{ + if (inet_sk(sk)->transparent) { + skb->sk = sk; + skb->destructor = nf_tproxy_destructor; + return 1; + } else + nf_tproxy_put_sock(sk); + + return 0; +} +EXPORT_SYMBOL_GPL(nf_tproxy_assign_sock); + +static int __init nf_tproxy_init(void) +{ + pr_info("NF_TPROXY: Transparent proxy support initialized, version 4.1.0\n"); + pr_info("NF_TPROXY: Copyright (c) 2006-2007 BalaBit IT Ltd.\n"); + return 0; +} + +module_init(nf_tproxy_init); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Krisztian Kovacs"); +MODULE_DESCRIPTION("Transparent proxy support core routines"); -- cgit v1.2.3 From 136cdc71fd54e77463e570643ac76e2b696e48a0 Mon Sep 17 00:00:00 2001 From: KOVACS Krisztian Date: Wed, 8 Oct 2008 11:35:12 +0200 Subject: netfilter: iptables socket match Add iptables 'socket' match, which matches packets for which a TCP/UDP socket lookup succeeds. Signed-off-by: KOVACS Krisztian Signed-off-by: Patrick McHardy --- net/netfilter/Kconfig | 15 ++++ net/netfilter/Makefile | 1 + net/netfilter/xt_socket.c | 192 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 208 insertions(+) create mode 100644 net/netfilter/xt_socket.c (limited to 'net/netfilter') diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index ed1dcfb61e1..f6c80729948 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -778,6 +778,21 @@ config NETFILTER_XT_MATCH_SCTP If you want to compile it as a module, say M here and read . If unsure, say `N'. +config NETFILTER_XT_MATCH_SOCKET + tristate '"socket" match support (EXPERIMENTAL)' + depends on EXPERIMENTAL + depends on NETFILTER_TPROXY + depends on NETFILTER_XTABLES + depends on NETFILTER_ADVANCED + select NF_DEFRAG_IPV4 + help + This option adds a `socket' match, which can be used to match + packets for which a TCP or UDP socket lookup finds a valid socket. + It can be used in combination with the MARK target and policy + routing to implement full featured non-locally bound sockets. + + To compile it as a module, choose M here. If unsure, say N. + config NETFILTER_XT_MATCH_STATE tristate '"state" match support' depends on NETFILTER_XTABLES diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index fc8bbb48d38..1cdc3a13d01 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -81,6 +81,7 @@ obj-$(CONFIG_NETFILTER_XT_MATCH_RATEEST) += xt_rateest.o obj-$(CONFIG_NETFILTER_XT_MATCH_REALM) += xt_realm.o obj-$(CONFIG_NETFILTER_XT_MATCH_RECENT) += xt_recent.o obj-$(CONFIG_NETFILTER_XT_MATCH_SCTP) += xt_sctp.o +obj-$(CONFIG_NETFILTER_XT_MATCH_SOCKET) += xt_socket.o obj-$(CONFIG_NETFILTER_XT_MATCH_STATE) += xt_state.o obj-$(CONFIG_NETFILTER_XT_MATCH_STATISTIC) += xt_statistic.o obj-$(CONFIG_NETFILTER_XT_MATCH_STRING) += xt_string.o diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c new file mode 100644 index 00000000000..ac9db17c7b9 --- /dev/null +++ b/net/netfilter/xt_socket.c @@ -0,0 +1,192 @@ +/* + * Transparent proxy support for Linux/iptables + * + * Copyright (C) 2007-2008 BalaBit IT Ltd. + * Author: Krisztian Kovacs + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) +#define XT_SOCKET_HAVE_CONNTRACK 1 +#include +#endif + +static int +extract_icmp_fields(const struct sk_buff *skb, + u8 *protocol, + __be32 *raddr, + __be32 *laddr, + __be16 *rport, + __be16 *lport) +{ + unsigned int outside_hdrlen = ip_hdrlen(skb); + struct iphdr *inside_iph, _inside_iph; + struct icmphdr *icmph, _icmph; + __be16 *ports, _ports[2]; + + icmph = skb_header_pointer(skb, outside_hdrlen, + sizeof(_icmph), &_icmph); + if (icmph == NULL) + return 1; + + switch (icmph->type) { + case ICMP_DEST_UNREACH: + case ICMP_SOURCE_QUENCH: + case ICMP_REDIRECT: + case ICMP_TIME_EXCEEDED: + case ICMP_PARAMETERPROB: + break; + default: + return 1; + } + + inside_iph = skb_header_pointer(skb, outside_hdrlen + + sizeof(struct icmphdr), + sizeof(_inside_iph), &_inside_iph); + if (inside_iph == NULL) + return 1; + + if (inside_iph->protocol != IPPROTO_TCP && + inside_iph->protocol != IPPROTO_UDP) + return 1; + + ports = skb_header_pointer(skb, outside_hdrlen + + sizeof(struct icmphdr) + + (inside_iph->ihl << 2), + sizeof(_ports), &_ports); + if (ports == NULL) + return 1; + + /* the inside IP packet is the one quoted from our side, thus + * its saddr is the local address */ + *protocol = inside_iph->protocol; + *laddr = inside_iph->saddr; + *lport = ports[0]; + *raddr = inside_iph->daddr; + *rport = ports[1]; + + return 0; +} + + +static bool +socket_mt(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const struct xt_match *match, + const void *matchinfo, + int offset, + unsigned int protoff, + bool *hotdrop) +{ + const struct iphdr *iph = ip_hdr(skb); + struct udphdr _hdr, *hp = NULL; + struct sock *sk; + __be32 daddr, saddr; + __be16 dport, sport; + u8 protocol; +#ifdef XT_SOCKET_HAVE_CONNTRACK + struct nf_conn const *ct; + enum ip_conntrack_info ctinfo; +#endif + + if (iph->protocol == IPPROTO_UDP || iph->protocol == IPPROTO_TCP) { + hp = skb_header_pointer(skb, ip_hdrlen(skb), + sizeof(_hdr), &_hdr); + if (hp == NULL) + return false; + + protocol = iph->protocol; + saddr = iph->saddr; + sport = hp->source; + daddr = iph->daddr; + dport = hp->dest; + + } else if (iph->protocol == IPPROTO_ICMP) { + if (extract_icmp_fields(skb, &protocol, &saddr, &daddr, + &sport, &dport)) + return false; + } else { + return false; + } + +#ifdef XT_SOCKET_HAVE_CONNTRACK + /* Do the lookup with the original socket address in case this is a + * reply packet of an established SNAT-ted connection. */ + + ct = nf_ct_get(skb, &ctinfo); + if (ct && (ct != &nf_conntrack_untracked) && + ((iph->protocol != IPPROTO_ICMP && + ctinfo == IP_CT_IS_REPLY + IP_CT_ESTABLISHED) || + (iph->protocol == IPPROTO_ICMP && + ctinfo == IP_CT_IS_REPLY + IP_CT_RELATED)) && + (ct->status & IPS_SRC_NAT_DONE)) { + + daddr = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip; + dport = (iph->protocol == IPPROTO_TCP) ? + ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.tcp.port : + ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.udp.port; + } +#endif + + sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), protocol, + saddr, daddr, sport, dport, in, false); + if (sk != NULL) { + bool wildcard = (inet_sk(sk)->rcv_saddr == 0); + + nf_tproxy_put_sock(sk); + if (wildcard) + sk = NULL; + } + + pr_debug("socket match: proto %u %08x:%u -> %08x:%u " + "(orig %08x:%u) sock %p\n", + protocol, ntohl(saddr), ntohs(sport), + ntohl(daddr), ntohs(dport), + ntohl(iph->daddr), hp ? ntohs(hp->dest) : 0, sk); + + return (sk != NULL); +} + +static struct xt_match socket_mt_reg __read_mostly = { + .name = "socket", + .family = AF_INET, + .match = socket_mt, + .hooks = 1 << NF_INET_PRE_ROUTING, + .me = THIS_MODULE, +}; + +static int __init socket_mt_init(void) +{ + nf_defrag_ipv4_enable(); + return xt_register_match(&socket_mt_reg); +} + +static void __exit socket_mt_exit(void) +{ + xt_unregister_match(&socket_mt_reg); +} + +module_init(socket_mt_init); +module_exit(socket_mt_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Krisztian Kovacs, Balazs Scheidler"); +MODULE_DESCRIPTION("x_tables socket match module"); +MODULE_ALIAS("ipt_socket"); -- cgit v1.2.3 From e84392707e10301b93121e1b74e2823db50cdf9e Mon Sep 17 00:00:00 2001 From: KOVACS Krisztian Date: Wed, 8 Oct 2008 11:35:12 +0200 Subject: netfilter: iptables TPROXY target The TPROXY target implements redirection of non-local TCP/UDP traffic to local sockets. Additionally, it's possible to manipulate the packet mark if and only if a socket has been found. (We need this because we cannot use multiple targets in the same iptables rule.) Signed-off-by: KOVACS Krisztian Signed-off-by: Patrick McHardy --- net/netfilter/Kconfig | 15 +++++++ net/netfilter/Makefile | 1 + net/netfilter/xt_TPROXY.c | 112 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 128 insertions(+) create mode 100644 net/netfilter/xt_TPROXY.c (limited to 'net/netfilter') diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index f6c80729948..de18bba619f 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -421,6 +421,21 @@ config NETFILTER_XT_TARGET_RATEEST To compile it as a module, choose M here. If unsure, say N. +config NETFILTER_XT_TARGET_TPROXY + tristate '"TPROXY" target support (EXPERIMENTAL)' + depends on EXPERIMENTAL + depends on NETFILTER_TPROXY + depends on NETFILTER_XTABLES + depends on NETFILTER_ADVANCED + select NF_DEFRAG_IPV4 + help + This option adds a `TPROXY' target, which is somewhat similar to + REDIRECT. It can only be used in the mangle table and is useful + to redirect traffic to a transparent proxy. It does _not_ depend + on Netfilter connection tracking and NAT, unlike REDIRECT. + + To compile it as a module, choose M here. If unsure, say N. + config NETFILTER_XT_TARGET_TRACE tristate '"TRACE" target support' depends on NETFILTER_XTABLES diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 1cdc3a13d01..8ce67665882 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -51,6 +51,7 @@ obj-$(CONFIG_NETFILTER_XT_TARGET_NFQUEUE) += xt_NFQUEUE.o obj-$(CONFIG_NETFILTER_XT_TARGET_NOTRACK) += xt_NOTRACK.o obj-$(CONFIG_NETFILTER_XT_TARGET_RATEEST) += xt_RATEEST.o obj-$(CONFIG_NETFILTER_XT_TARGET_SECMARK) += xt_SECMARK.o +obj-$(CONFIG_NETFILTER_XT_TARGET_TPROXY) += xt_TPROXY.o obj-$(CONFIG_NETFILTER_XT_TARGET_TCPMSS) += xt_TCPMSS.o obj-$(CONFIG_NETFILTER_XT_TARGET_TCPOPTSTRIP) += xt_TCPOPTSTRIP.o obj-$(CONFIG_NETFILTER_XT_TARGET_TRACE) += xt_TRACE.o diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c new file mode 100644 index 00000000000..183f251d2f0 --- /dev/null +++ b/net/netfilter/xt_TPROXY.c @@ -0,0 +1,112 @@ +/* + * Transparent proxy support for Linux/iptables + * + * Copyright (c) 2006-2007 BalaBit IT Ltd. + * Author: Balazs Scheidler, Krisztian Kovacs + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +static unsigned int +tproxy_tg(struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + unsigned int hooknum, + const struct xt_target *target, + const void *targinfo) +{ + const struct iphdr *iph = ip_hdr(skb); + const struct xt_tproxy_target_info *tgi = targinfo; + struct udphdr _hdr, *hp; + struct sock *sk; + + hp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_hdr), &_hdr); + if (hp == NULL) + return NF_DROP; + + sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), iph->protocol, + iph->saddr, tgi->laddr ? tgi->laddr : iph->daddr, + hp->source, tgi->lport ? tgi->lport : hp->dest, + in, true); + + /* NOTE: assign_sock consumes our sk reference */ + if (sk && nf_tproxy_assign_sock(skb, sk)) { + /* This should be in a separate target, but we don't do multiple + targets on the same rule yet */ + skb->mark = (skb->mark & ~tgi->mark_mask) ^ tgi->mark_value; + + pr_debug("redirecting: proto %u %08x:%u -> %08x:%u, mark: %x\n", + iph->protocol, ntohl(iph->daddr), ntohs(hp->dest), + ntohl(tgi->laddr), ntohs(tgi->lport), skb->mark); + return NF_ACCEPT; + } + + pr_debug("no socket, dropping: proto %u %08x:%u -> %08x:%u, mark: %x\n", + iph->protocol, ntohl(iph->daddr), ntohs(hp->dest), + ntohl(tgi->laddr), ntohs(tgi->lport), skb->mark); + return NF_DROP; +} + +static bool +tproxy_tg_check(const char *tablename, + const void *entry, + const struct xt_target *target, + void *targetinfo, + unsigned int hook_mask) +{ + const struct ipt_ip *i = entry; + + if ((i->proto == IPPROTO_TCP || i->proto == IPPROTO_UDP) + && !(i->invflags & IPT_INV_PROTO)) + return true; + + pr_info("xt_TPROXY: Can be used only in combination with " + "either -p tcp or -p udp\n"); + return false; +} + +static struct xt_target tproxy_tg_reg __read_mostly = { + .name = "TPROXY", + .family = AF_INET, + .table = "mangle", + .target = tproxy_tg, + .targetsize = sizeof(struct xt_tproxy_target_info), + .checkentry = tproxy_tg_check, + .hooks = 1 << NF_INET_PRE_ROUTING, + .me = THIS_MODULE, +}; + +static int __init tproxy_tg_init(void) +{ + nf_defrag_ipv4_enable(); + return xt_register_target(&tproxy_tg_reg); +} + +static void __exit tproxy_tg_exit(void) +{ + xt_unregister_target(&tproxy_tg_reg); +} + +module_init(tproxy_tg_init); +module_exit(tproxy_tg_exit); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Krisztian Kovacs"); +MODULE_DESCRIPTION("Netfilter transparent proxy (TPROXY) target module."); +MODULE_ALIAS("ipt_TPROXY"); -- cgit v1.2.3 From 043ef46c7690bfdbd5b012e15812a14a19ca5604 Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Wed, 8 Oct 2008 11:35:15 +0200 Subject: netfilter: move Ebtables to use Xtables Signed-off-by: Jan Engelhardt Signed-off-by: Patrick McHardy --- net/netfilter/x_tables.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'net/netfilter') diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index aece6c2d134..0e23f42e341 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -30,7 +30,7 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Harald Welte "); -MODULE_DESCRIPTION("[ip,ip6,arp]_tables backend module"); +MODULE_DESCRIPTION("{ip,ip6,arp,eb}_tables backend module"); #define SMP_ALIGN(x) (((x) + SMP_CACHE_BYTES-1) & ~(SMP_CACHE_BYTES-1)) @@ -325,7 +325,12 @@ int xt_check_match(const struct xt_match *match, unsigned short family, unsigned int size, const char *table, unsigned int hook_mask, unsigned short proto, int inv_proto) { - if (XT_ALIGN(match->matchsize) != size) { + if (XT_ALIGN(match->matchsize) != size && + match->matchsize != -1) { + /* + * ebt_among is exempt from centralized matchsize checking + * because it uses a dynamic-size data set. + */ printk("%s_tables: %s match: invalid size %Zu != %u\n", xt_prefix[family], match->name, XT_ALIGN(match->matchsize), size); -- cgit v1.2.3 From 102befab75c438bfa356c6976026326728771ebc Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Wed, 8 Oct 2008 11:35:15 +0200 Subject: netfilter: x_tables: output bad hook mask in hexadecimal It is a mask, and masks are most useful in hex. Signed-off-by: Jan Engelhardt Signed-off-by: Patrick McHardy --- net/netfilter/x_tables.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/netfilter') diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 0e23f42e341..3b1fc40cc27 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -342,7 +342,7 @@ int xt_check_match(const struct xt_match *match, unsigned short family, return -EINVAL; } if (match->hooks && (hook_mask & ~match->hooks) != 0) { - printk("%s_tables: %s match: bad hook_mask %u/%u\n", + printk("%s_tables: %s match: bad hook_mask %#x/%#x\n", xt_prefix[family], match->name, hook_mask, match->hooks); return -EINVAL; } @@ -483,7 +483,7 @@ int xt_check_target(const struct xt_target *target, unsigned short family, return -EINVAL; } if (target->hooks && (hook_mask & ~target->hooks) != 0) { - printk("%s_tables: %s target: bad hook_mask %u/%u\n", + printk("%s_tables: %s target: bad hook_mask %#x/%#x\n", xt_prefix[family], target->name, hook_mask, target->hooks); return -EINVAL; -- cgit v1.2.3 From 367c679007fa4f990eb7ee381326ec59d8148b0e Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Wed, 8 Oct 2008 11:35:17 +0200 Subject: netfilter: xtables: do centralized checkentry call (1/2) It used to be that {ip,ip6,etc}_tables called extension->checkentry themselves, but this can be moved into the xtables core. Signed-off-by: Jan Engelhardt Signed-off-by: Patrick McHardy --- net/netfilter/x_tables.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'net/netfilter') diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 3b1fc40cc27..d1f2fb3e8f2 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -323,7 +323,8 @@ EXPORT_SYMBOL_GPL(xt_find_revision); int xt_check_match(const struct xt_match *match, unsigned short family, unsigned int size, const char *table, unsigned int hook_mask, - unsigned short proto, int inv_proto) + unsigned short proto, int inv_proto, const void *entry, + void *matchinfo) { if (XT_ALIGN(match->matchsize) != size && match->matchsize != -1) { @@ -351,6 +352,9 @@ int xt_check_match(const struct xt_match *match, unsigned short family, xt_prefix[family], match->name, match->proto); return -EINVAL; } + if (match->checkentry != NULL && + !match->checkentry(table, entry, match, matchinfo, hook_mask)) + return -EINVAL; return 0; } EXPORT_SYMBOL_GPL(xt_check_match); @@ -469,7 +473,8 @@ EXPORT_SYMBOL_GPL(xt_compat_match_to_user); int xt_check_target(const struct xt_target *target, unsigned short family, unsigned int size, const char *table, unsigned int hook_mask, - unsigned short proto, int inv_proto) + unsigned short proto, int inv_proto, const void *entry, + void *targinfo) { if (XT_ALIGN(target->targetsize) != size) { printk("%s_tables: %s target: invalid size %Zu != %u\n", @@ -493,6 +498,9 @@ int xt_check_target(const struct xt_target *target, unsigned short family, xt_prefix[family], target->name, target->proto); return -EINVAL; } + if (target->checkentry != NULL && + !target->checkentry(table, entry, target, targinfo, hook_mask)) + return -EINVAL; return 0; } EXPORT_SYMBOL_GPL(xt_check_target); -- cgit v1.2.3 From aba0d34800d7f56493b4d5548cc06498a4d69124 Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Wed, 8 Oct 2008 11:35:17 +0200 Subject: netfilter: xtables: sort extensions alphabetically in Kconfig Signed-off-by: Jan Engelhardt Signed-off-by: Patrick McHardy --- net/netfilter/Kconfig | 100 +++++++++++++++++++++++++------------------------- 1 file changed, 50 insertions(+), 50 deletions(-) (limited to 'net/netfilter') diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index de18bba619f..9ad74e8bc5b 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -340,6 +340,18 @@ config NETFILTER_XT_TARGET_CONNMARK . The module will be called ipt_CONNMARK.ko. If unsure, say `N'. +config NETFILTER_XT_TARGET_CONNSECMARK + tristate '"CONNSECMARK" target support' + depends on NETFILTER_XTABLES && NF_CONNTRACK && NF_CONNTRACK_SECMARK + default m if NETFILTER_ADVANCED=n + help + The CONNSECMARK target copies security markings from packets + to connections, and restores security markings from connections + to packets (if the packets are not already marked). This would + normally be used in conjunction with the SECMARK target. + + To compile it as a module, choose M here. If unsure, say N. + config NETFILTER_XT_TARGET_DSCP tristate '"DSCP" and "TOS" target support' depends on NETFILTER_XTABLES @@ -371,18 +383,6 @@ config NETFILTER_XT_TARGET_MARK To compile it as a module, choose M here. If unsure, say N. -config NETFILTER_XT_TARGET_NFQUEUE - tristate '"NFQUEUE" target Support' - depends on NETFILTER_XTABLES - depends on NETFILTER_ADVANCED - help - This target replaced the old obsolete QUEUE target. - - As opposed to QUEUE, it supports 65535 different queues, - not just one. - - To compile it as a module, choose M here. If unsure, say N. - config NETFILTER_XT_TARGET_NFLOG tristate '"NFLOG" target support' depends on NETFILTER_XTABLES @@ -395,6 +395,18 @@ config NETFILTER_XT_TARGET_NFLOG To compile it as a module, choose M here. If unsure, say N. +config NETFILTER_XT_TARGET_NFQUEUE + tristate '"NFQUEUE" target Support' + depends on NETFILTER_XTABLES + depends on NETFILTER_ADVANCED + help + This target replaced the old obsolete QUEUE target. + + As opposed to QUEUE, it supports 65535 different queues, + not just one. + + To compile it as a module, choose M here. If unsure, say N. + config NETFILTER_XT_TARGET_NOTRACK tristate '"NOTRACK" target support' depends on NETFILTER_XTABLES @@ -459,18 +471,6 @@ config NETFILTER_XT_TARGET_SECMARK To compile it as a module, choose M here. If unsure, say N. -config NETFILTER_XT_TARGET_CONNSECMARK - tristate '"CONNSECMARK" target support' - depends on NETFILTER_XTABLES && NF_CONNTRACK && NF_CONNTRACK_SECMARK - default m if NETFILTER_ADVANCED=n - help - The CONNSECMARK target copies security markings from packets - to connections, and restores security markings from connections - to packets (if the packets are not already marked). This would - normally be used in conjunction with the SECMARK target. - - To compile it as a module, choose M here. If unsure, say N. - config NETFILTER_XT_TARGET_TCPMSS tristate '"TCPMSS" target support' depends on NETFILTER_XTABLES && (IPV6 || IPV6=n) @@ -607,6 +607,21 @@ config NETFILTER_XT_MATCH_ESP To compile it as a module, choose M here. If unsure, say N. +config NETFILTER_XT_MATCH_HASHLIMIT + tristate '"hashlimit" match support' + depends on NETFILTER_XTABLES && (IP6_NF_IPTABLES || IP6_NF_IPTABLES=n) + depends on NETFILTER_ADVANCED + help + This option adds a `hashlimit' match. + + As opposed to `limit', this match dynamically creates a hash table + of limit buckets, based on your selection of source/destination + addresses and/or ports. + + It enables you to express policies like `10kpps for any given + destination address' or `500pps from any given source address' + with a single rule. + config NETFILTER_XT_MATCH_HELPER tristate '"helper" match support' depends on NETFILTER_XTABLES @@ -671,6 +686,17 @@ config NETFILTER_XT_MATCH_MARK To compile it as a module, choose M here. If unsure, say N. +config NETFILTER_XT_MATCH_MULTIPORT + tristate '"multiport" Multiple port match support' + depends on NETFILTER_XTABLES + depends on NETFILTER_ADVANCED + help + Multiport matching allows you to match TCP or UDP packets based on + a series of source or destination ports: normally a rule can only + match a single range of ports. + + To compile it as a module, choose M here. If unsure, say N. + config NETFILTER_XT_MATCH_OWNER tristate '"owner" match support' depends on NETFILTER_XTABLES @@ -691,17 +717,6 @@ config NETFILTER_XT_MATCH_POLICY To compile it as a module, choose M here. If unsure, say N. -config NETFILTER_XT_MATCH_MULTIPORT - tristate '"multiport" Multiple port match support' - depends on NETFILTER_XTABLES - depends on NETFILTER_ADVANCED - help - Multiport matching allows you to match TCP or UDP packets based on - a series of source or destination ports: normally a rule can only - match a single range of ports. - - To compile it as a module, choose M here. If unsure, say N. - config NETFILTER_XT_MATCH_PHYSDEV tristate '"physdev" match support' depends on NETFILTER_XTABLES && BRIDGE && BRIDGE_NETFILTER @@ -884,20 +899,5 @@ config NETFILTER_XT_MATCH_U32 Details and examples are in the kernel module source. -config NETFILTER_XT_MATCH_HASHLIMIT - tristate '"hashlimit" match support' - depends on NETFILTER_XTABLES && (IP6_NF_IPTABLES || IP6_NF_IPTABLES=n) - depends on NETFILTER_ADVANCED - help - This option adds a `hashlimit' match. - - As opposed to `limit', this match dynamically creates a hash table - of limit buckets, based on your selection of source/destination - addresses and/or ports. - - It enables you to express policies like `10kpps for any given - destination address' or `500pps from any given source address' - with a single rule. - endmenu -- cgit v1.2.3 From c2df73de246ae75705af8ceed4f385b261dea108 Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Wed, 8 Oct 2008 11:35:18 +0200 Subject: netfilter: xtables: use "if" blocks in Kconfig Signed-off-by: Jan Engelhardt Signed-off-by: Patrick McHardy --- net/netfilter/Kconfig | 84 +++++++++++++-------------------------------------- 1 file changed, 21 insertions(+), 63 deletions(-) (limited to 'net/netfilter') diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 9ad74e8bc5b..899e78051d8 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -38,10 +38,11 @@ config NF_CONNTRACK To compile it as a module, choose M here. If unsure, say N. +if NF_CONNTRACK + config NF_CT_ACCT bool "Connection tracking flow accounting" depends on NETFILTER_ADVANCED - depends on NF_CONNTRACK help If this option is enabled, the connection tracking code will keep per-flow packet and byte counters. @@ -63,7 +64,6 @@ config NF_CT_ACCT config NF_CONNTRACK_MARK bool 'Connection mark tracking support' depends on NETFILTER_ADVANCED - depends on NF_CONNTRACK help This option enables support for connection marks, used by the `CONNMARK' target and `connmark' match. Similar to the mark value @@ -72,7 +72,7 @@ config NF_CONNTRACK_MARK config NF_CONNTRACK_SECMARK bool 'Connection tracking security mark support' - depends on NF_CONNTRACK && NETWORK_SECMARK + depends on NETWORK_SECMARK default m if NETFILTER_ADVANCED=n help This option enables security markings to be applied to @@ -85,7 +85,6 @@ config NF_CONNTRACK_SECMARK config NF_CONNTRACK_EVENTS bool "Connection tracking events" - depends on NF_CONNTRACK depends on NETFILTER_ADVANCED help If this option is enabled, the connection tracking code will @@ -96,7 +95,7 @@ config NF_CONNTRACK_EVENTS config NF_CT_PROTO_DCCP tristate 'DCCP protocol connection tracking support (EXPERIMENTAL)' - depends on EXPERIMENTAL && NF_CONNTRACK + depends on EXPERIMENTAL depends on NETFILTER_ADVANCED default IP_DCCP help @@ -107,11 +106,10 @@ config NF_CT_PROTO_DCCP config NF_CT_PROTO_GRE tristate - depends on NF_CONNTRACK config NF_CT_PROTO_SCTP tristate 'SCTP protocol connection tracking support (EXPERIMENTAL)' - depends on EXPERIMENTAL && NF_CONNTRACK + depends on EXPERIMENTAL depends on NETFILTER_ADVANCED default IP_SCTP help @@ -123,7 +121,6 @@ config NF_CT_PROTO_SCTP config NF_CT_PROTO_UDPLITE tristate 'UDP-Lite protocol connection tracking support' - depends on NF_CONNTRACK depends on NETFILTER_ADVANCED help With this option enabled, the layer 3 independent connection @@ -134,7 +131,6 @@ config NF_CT_PROTO_UDPLITE config NF_CONNTRACK_AMANDA tristate "Amanda backup protocol support" - depends on NF_CONNTRACK depends on NETFILTER_ADVANCED select TEXTSEARCH select TEXTSEARCH_KMP @@ -150,7 +146,6 @@ config NF_CONNTRACK_AMANDA config NF_CONNTRACK_FTP tristate "FTP protocol support" - depends on NF_CONNTRACK default m if NETFILTER_ADVANCED=n help Tracking FTP connections is problematic: special helpers are @@ -165,7 +160,7 @@ config NF_CONNTRACK_FTP config NF_CONNTRACK_H323 tristate "H.323 protocol support" - depends on NF_CONNTRACK && (IPV6 || IPV6=n) + depends on (IPV6 || IPV6=n) depends on NETFILTER_ADVANCED help H.323 is a VoIP signalling protocol from ITU-T. As one of the most @@ -185,7 +180,6 @@ config NF_CONNTRACK_H323 config NF_CONNTRACK_IRC tristate "IRC protocol support" - depends on NF_CONNTRACK default m if NETFILTER_ADVANCED=n help There is a commonly-used extension to IRC called @@ -201,7 +195,6 @@ config NF_CONNTRACK_IRC config NF_CONNTRACK_NETBIOS_NS tristate "NetBIOS name service protocol support" - depends on NF_CONNTRACK depends on NETFILTER_ADVANCED help NetBIOS name service requests are sent as broadcast messages from an @@ -221,7 +214,6 @@ config NF_CONNTRACK_NETBIOS_NS config NF_CONNTRACK_PPTP tristate "PPtP protocol support" - depends on NF_CONNTRACK depends on NETFILTER_ADVANCED select NF_CT_PROTO_GRE help @@ -241,7 +233,7 @@ config NF_CONNTRACK_PPTP config NF_CONNTRACK_SANE tristate "SANE protocol support (EXPERIMENTAL)" - depends on EXPERIMENTAL && NF_CONNTRACK + depends on EXPERIMENTAL depends on NETFILTER_ADVANCED help SANE is a protocol for remote access to scanners as implemented @@ -255,7 +247,6 @@ config NF_CONNTRACK_SANE config NF_CONNTRACK_SIP tristate "SIP protocol support" - depends on NF_CONNTRACK default m if NETFILTER_ADVANCED=n help SIP is an application-layer control protocol that can establish, @@ -268,7 +259,6 @@ config NF_CONNTRACK_SIP config NF_CONNTRACK_TFTP tristate "TFTP protocol support" - depends on NF_CONNTRACK depends on NETFILTER_ADVANCED help TFTP connection tracking helper, this is required depending @@ -280,7 +270,6 @@ config NF_CONNTRACK_TFTP config NF_CT_NETLINK tristate 'Connection tracking netlink interface' - depends on NF_CONNTRACK select NETFILTER_NETLINK depends on NF_NAT=n || NF_NAT default m if NETFILTER_ADVANCED=n @@ -302,6 +291,8 @@ config NETFILTER_TPROXY To compile it as a module, choose M here. If unsure, say N. +endif # NF_CONNTRACK + config NETFILTER_XTABLES tristate "Netfilter Xtables support (required for ip_tables)" default m if NETFILTER_ADVANCED=n @@ -309,11 +300,12 @@ config NETFILTER_XTABLES This is required if you intend to use any of ip_tables, ip6_tables or arp_tables. +if NETFILTER_XTABLES + # alphabetically ordered list of targets config NETFILTER_XT_TARGET_CLASSIFY tristate '"CLASSIFY" target support' - depends on NETFILTER_XTABLES depends on NETFILTER_ADVANCED help This option adds a `CLASSIFY' target, which enables the user to set @@ -326,7 +318,6 @@ config NETFILTER_XT_TARGET_CLASSIFY config NETFILTER_XT_TARGET_CONNMARK tristate '"CONNMARK" target support' - depends on NETFILTER_XTABLES depends on IP_NF_MANGLE || IP6_NF_MANGLE depends on NF_CONNTRACK depends on NETFILTER_ADVANCED @@ -342,7 +333,7 @@ config NETFILTER_XT_TARGET_CONNMARK config NETFILTER_XT_TARGET_CONNSECMARK tristate '"CONNSECMARK" target support' - depends on NETFILTER_XTABLES && NF_CONNTRACK && NF_CONNTRACK_SECMARK + depends on NF_CONNTRACK && NF_CONNTRACK_SECMARK default m if NETFILTER_ADVANCED=n help The CONNSECMARK target copies security markings from packets @@ -354,7 +345,6 @@ config NETFILTER_XT_TARGET_CONNSECMARK config NETFILTER_XT_TARGET_DSCP tristate '"DSCP" and "TOS" target support' - depends on NETFILTER_XTABLES depends on IP_NF_MANGLE || IP6_NF_MANGLE depends on NETFILTER_ADVANCED help @@ -371,7 +361,6 @@ config NETFILTER_XT_TARGET_DSCP config NETFILTER_XT_TARGET_MARK tristate '"MARK" target support' - depends on NETFILTER_XTABLES default m if NETFILTER_ADVANCED=n help This option adds a `MARK' target, which allows you to create rules @@ -385,7 +374,6 @@ config NETFILTER_XT_TARGET_MARK config NETFILTER_XT_TARGET_NFLOG tristate '"NFLOG" target support' - depends on NETFILTER_XTABLES default m if NETFILTER_ADVANCED=n help This option enables the NFLOG target, which allows to LOG @@ -397,7 +385,6 @@ config NETFILTER_XT_TARGET_NFLOG config NETFILTER_XT_TARGET_NFQUEUE tristate '"NFQUEUE" target Support' - depends on NETFILTER_XTABLES depends on NETFILTER_ADVANCED help This target replaced the old obsolete QUEUE target. @@ -409,7 +396,6 @@ config NETFILTER_XT_TARGET_NFQUEUE config NETFILTER_XT_TARGET_NOTRACK tristate '"NOTRACK" target support' - depends on NETFILTER_XTABLES depends on IP_NF_RAW || IP6_NF_RAW depends on NF_CONNTRACK depends on NETFILTER_ADVANCED @@ -424,7 +410,6 @@ config NETFILTER_XT_TARGET_NOTRACK config NETFILTER_XT_TARGET_RATEEST tristate '"RATEEST" target support' - depends on NETFILTER_XTABLES depends on NETFILTER_ADVANCED help This option adds a `RATEEST' target, which allows to measure @@ -450,7 +435,6 @@ config NETFILTER_XT_TARGET_TPROXY config NETFILTER_XT_TARGET_TRACE tristate '"TRACE" target support' - depends on NETFILTER_XTABLES depends on IP_NF_RAW || IP6_NF_RAW depends on NETFILTER_ADVANCED help @@ -463,7 +447,7 @@ config NETFILTER_XT_TARGET_TRACE config NETFILTER_XT_TARGET_SECMARK tristate '"SECMARK" target support' - depends on NETFILTER_XTABLES && NETWORK_SECMARK + depends on NETWORK_SECMARK default m if NETFILTER_ADVANCED=n help The SECMARK target allows security marking of network @@ -473,7 +457,7 @@ config NETFILTER_XT_TARGET_SECMARK config NETFILTER_XT_TARGET_TCPMSS tristate '"TCPMSS" target support' - depends on NETFILTER_XTABLES && (IPV6 || IPV6=n) + depends on (IPV6 || IPV6=n) default m if NETFILTER_ADVANCED=n ---help--- This option adds a `TCPMSS' target, which allows you to alter the @@ -500,7 +484,7 @@ config NETFILTER_XT_TARGET_TCPMSS config NETFILTER_XT_TARGET_TCPOPTSTRIP tristate '"TCPOPTSTRIP" target support (EXPERIMENTAL)' - depends on EXPERIMENTAL && NETFILTER_XTABLES + depends on EXPERIMENTAL depends on IP_NF_MANGLE || IP6_NF_MANGLE depends on NETFILTER_ADVANCED help @@ -509,7 +493,6 @@ config NETFILTER_XT_TARGET_TCPOPTSTRIP config NETFILTER_XT_MATCH_COMMENT tristate '"comment" match support' - depends on NETFILTER_XTABLES depends on NETFILTER_ADVANCED help This option adds a `comment' dummy-match, which allows you to put @@ -520,7 +503,6 @@ config NETFILTER_XT_MATCH_COMMENT config NETFILTER_XT_MATCH_CONNBYTES tristate '"connbytes" per-connection counter match support' - depends on NETFILTER_XTABLES depends on NF_CONNTRACK depends on NETFILTER_ADVANCED select NF_CT_ACCT @@ -533,7 +515,6 @@ config NETFILTER_XT_MATCH_CONNBYTES config NETFILTER_XT_MATCH_CONNLIMIT tristate '"connlimit" match support"' - depends on NETFILTER_XTABLES depends on NF_CONNTRACK depends on NETFILTER_ADVANCED ---help--- @@ -542,7 +523,6 @@ config NETFILTER_XT_MATCH_CONNLIMIT config NETFILTER_XT_MATCH_CONNMARK tristate '"connmark" connection mark match support' - depends on NETFILTER_XTABLES depends on NF_CONNTRACK depends on NETFILTER_ADVANCED select NF_CONNTRACK_MARK @@ -556,7 +536,6 @@ config NETFILTER_XT_MATCH_CONNMARK config NETFILTER_XT_MATCH_CONNTRACK tristate '"conntrack" connection tracking match support' - depends on NETFILTER_XTABLES depends on NF_CONNTRACK default m if NETFILTER_ADVANCED=n help @@ -570,7 +549,6 @@ config NETFILTER_XT_MATCH_CONNTRACK config NETFILTER_XT_MATCH_DCCP tristate '"dccp" protocol match support' - depends on NETFILTER_XTABLES depends on NETFILTER_ADVANCED default IP_DCCP help @@ -583,7 +561,6 @@ config NETFILTER_XT_MATCH_DCCP config NETFILTER_XT_MATCH_DSCP tristate '"dscp" and "tos" match support' - depends on NETFILTER_XTABLES depends on NETFILTER_ADVANCED help This option adds a `DSCP' match, which allows you to match against @@ -599,7 +576,6 @@ config NETFILTER_XT_MATCH_DSCP config NETFILTER_XT_MATCH_ESP tristate '"esp" match support' - depends on NETFILTER_XTABLES depends on NETFILTER_ADVANCED help This match extension allows you to match a range of SPIs @@ -609,7 +585,7 @@ config NETFILTER_XT_MATCH_ESP config NETFILTER_XT_MATCH_HASHLIMIT tristate '"hashlimit" match support' - depends on NETFILTER_XTABLES && (IP6_NF_IPTABLES || IP6_NF_IPTABLES=n) + depends on (IP6_NF_IPTABLES || IP6_NF_IPTABLES=n) depends on NETFILTER_ADVANCED help This option adds a `hashlimit' match. @@ -624,7 +600,6 @@ config NETFILTER_XT_MATCH_HASHLIMIT config NETFILTER_XT_MATCH_HELPER tristate '"helper" match support' - depends on NETFILTER_XTABLES depends on NF_CONNTRACK depends on NETFILTER_ADVANCED help @@ -635,7 +610,6 @@ config NETFILTER_XT_MATCH_HELPER config NETFILTER_XT_MATCH_IPRANGE tristate '"iprange" address range match support' - depends on NETFILTER_XTABLES depends on NETFILTER_ADVANCED ---help--- This option adds a "iprange" match, which allows you to match based on @@ -646,7 +620,6 @@ config NETFILTER_XT_MATCH_IPRANGE config NETFILTER_XT_MATCH_LENGTH tristate '"length" match support' - depends on NETFILTER_XTABLES depends on NETFILTER_ADVANCED help This option allows you to match the length of a packet against a @@ -656,7 +629,6 @@ config NETFILTER_XT_MATCH_LENGTH config NETFILTER_XT_MATCH_LIMIT tristate '"limit" match support' - depends on NETFILTER_XTABLES depends on NETFILTER_ADVANCED help limit matching allows you to control the rate at which a rule can be @@ -667,7 +639,6 @@ config NETFILTER_XT_MATCH_LIMIT config NETFILTER_XT_MATCH_MAC tristate '"mac" address match support' - depends on NETFILTER_XTABLES depends on NETFILTER_ADVANCED help MAC matching allows you to match packets based on the source @@ -677,7 +648,6 @@ config NETFILTER_XT_MATCH_MAC config NETFILTER_XT_MATCH_MARK tristate '"mark" match support' - depends on NETFILTER_XTABLES default m if NETFILTER_ADVANCED=n help Netfilter mark matching allows you to match packets based on the @@ -688,7 +658,6 @@ config NETFILTER_XT_MATCH_MARK config NETFILTER_XT_MATCH_MULTIPORT tristate '"multiport" Multiple port match support' - depends on NETFILTER_XTABLES depends on NETFILTER_ADVANCED help Multiport matching allows you to match TCP or UDP packets based on @@ -699,7 +668,6 @@ config NETFILTER_XT_MATCH_MULTIPORT config NETFILTER_XT_MATCH_OWNER tristate '"owner" match support' - depends on NETFILTER_XTABLES depends on NETFILTER_ADVANCED ---help--- Socket owner matching allows you to match locally-generated packets @@ -708,7 +676,7 @@ config NETFILTER_XT_MATCH_OWNER config NETFILTER_XT_MATCH_POLICY tristate 'IPsec "policy" match support' - depends on NETFILTER_XTABLES && XFRM + depends on XFRM default m if NETFILTER_ADVANCED=n help Policy matching allows you to match packets based on the @@ -719,7 +687,7 @@ config NETFILTER_XT_MATCH_POLICY config NETFILTER_XT_MATCH_PHYSDEV tristate '"physdev" match support' - depends on NETFILTER_XTABLES && BRIDGE && BRIDGE_NETFILTER + depends on BRIDGE && BRIDGE_NETFILTER depends on NETFILTER_ADVANCED help Physdev packet matching matches against the physical bridge ports @@ -729,7 +697,6 @@ config NETFILTER_XT_MATCH_PHYSDEV config NETFILTER_XT_MATCH_PKTTYPE tristate '"pkttype" packet type match support' - depends on NETFILTER_XTABLES depends on NETFILTER_ADVANCED help Packet type matching allows you to match a packet by @@ -742,7 +709,6 @@ config NETFILTER_XT_MATCH_PKTTYPE config NETFILTER_XT_MATCH_QUOTA tristate '"quota" match support' - depends on NETFILTER_XTABLES depends on NETFILTER_ADVANCED help This option adds a `quota' match, which allows to match on a @@ -753,7 +719,6 @@ config NETFILTER_XT_MATCH_QUOTA config NETFILTER_XT_MATCH_RATEEST tristate '"rateest" match support' - depends on NETFILTER_XTABLES depends on NETFILTER_ADVANCED select NETFILTER_XT_TARGET_RATEEST help @@ -764,7 +729,6 @@ config NETFILTER_XT_MATCH_RATEEST config NETFILTER_XT_MATCH_REALM tristate '"realm" match support' - depends on NETFILTER_XTABLES depends on NETFILTER_ADVANCED select NET_CLS_ROUTE help @@ -779,7 +743,6 @@ config NETFILTER_XT_MATCH_REALM config NETFILTER_XT_MATCH_RECENT tristate '"recent" match support' - depends on NETFILTER_XTABLES depends on NETFILTER_ADVANCED ---help--- This match is used for creating one or many lists of recently @@ -797,7 +760,7 @@ config NETFILTER_XT_MATCH_RECENT_PROC_COMPAT config NETFILTER_XT_MATCH_SCTP tristate '"sctp" protocol match support (EXPERIMENTAL)' - depends on NETFILTER_XTABLES && EXPERIMENTAL + depends on EXPERIMENTAL depends on NETFILTER_ADVANCED default IP_SCTP help @@ -825,7 +788,6 @@ config NETFILTER_XT_MATCH_SOCKET config NETFILTER_XT_MATCH_STATE tristate '"state" match support' - depends on NETFILTER_XTABLES depends on NF_CONNTRACK default m if NETFILTER_ADVANCED=n help @@ -837,7 +799,6 @@ config NETFILTER_XT_MATCH_STATE config NETFILTER_XT_MATCH_STATISTIC tristate '"statistic" match support' - depends on NETFILTER_XTABLES depends on NETFILTER_ADVANCED help This option adds a `statistic' match, which allows you to match @@ -847,7 +808,6 @@ config NETFILTER_XT_MATCH_STATISTIC config NETFILTER_XT_MATCH_STRING tristate '"string" match support' - depends on NETFILTER_XTABLES depends on NETFILTER_ADVANCED select TEXTSEARCH select TEXTSEARCH_KMP @@ -861,7 +821,6 @@ config NETFILTER_XT_MATCH_STRING config NETFILTER_XT_MATCH_TCPMSS tristate '"tcpmss" match support' - depends on NETFILTER_XTABLES depends on NETFILTER_ADVANCED help This option adds a `tcpmss' match, which allows you to examine the @@ -872,7 +831,6 @@ config NETFILTER_XT_MATCH_TCPMSS config NETFILTER_XT_MATCH_TIME tristate '"time" match support' - depends on NETFILTER_XTABLES depends on NETFILTER_ADVANCED ---help--- This option adds a "time" match, which allows you to match based on @@ -887,7 +845,6 @@ config NETFILTER_XT_MATCH_TIME config NETFILTER_XT_MATCH_U32 tristate '"u32" match support' - depends on NETFILTER_XTABLES depends on NETFILTER_ADVANCED ---help--- u32 allows you to extract quantities of up to 4 bytes from a packet, @@ -899,5 +856,6 @@ config NETFILTER_XT_MATCH_U32 Details and examples are in the kernel module source. -endmenu +endif # NETFILTER_XTABLES +endmenu -- cgit v1.2.3 From f7108a20dee44e5bb037f9e48f6a207b42e6ae1c Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Wed, 8 Oct 2008 11:35:18 +0200 Subject: netfilter: xtables: move extension arguments into compound structure (1/6) The function signatures for Xtables extensions have grown over time. It involves a lot of typing/replication, and also a bit of stack space even if they are not used. Realize an NFWS2008 idea and pack them into structs. The skb remains outside of the struct so gcc can continue to apply its optimizations. This patch does this for match extensions' match functions. A few ambiguities have also been addressed. The "offset" parameter for example has been renamed to "fragoff" (there are so many different offsets already) and "protoff" to "thoff" (there is more than just one protocol here, so clarify). Signed-off-by: Jan Engelhardt Signed-off-by: Patrick McHardy --- net/netfilter/xt_comment.c | 5 +---- net/netfilter/xt_connbytes.c | 7 ++----- net/netfilter/xt_connlimit.c | 17 +++++++---------- net/netfilter/xt_connmark.c | 14 ++++---------- net/netfilter/xt_conntrack.c | 22 ++++++++-------------- net/netfilter/xt_dccp.c | 16 +++++++--------- net/netfilter/xt_dscp.c | 30 ++++++++++-------------------- net/netfilter/xt_esp.c | 13 +++++-------- net/netfilter/xt_hashlimit.c | 22 ++++++++-------------- net/netfilter/xt_helper.c | 7 ++----- net/netfilter/xt_iprange.c | 21 ++++++--------------- net/netfilter/xt_length.c | 14 ++++---------- net/netfilter/xt_limit.c | 7 ++----- net/netfilter/xt_mac.c | 7 ++----- net/netfilter/xt_mark.c | 13 ++++--------- net/netfilter/xt_multiport.c | 26 ++++++++++---------------- net/netfilter/xt_owner.c | 21 ++++++--------------- net/netfilter/xt_physdev.c | 7 ++----- net/netfilter/xt_pkttype.c | 11 ++++------- net/netfilter/xt_policy.c | 11 ++++------- net/netfilter/xt_quota.c | 7 ++----- net/netfilter/xt_rateest.c | 12 +++--------- net/netfilter/xt_realm.c | 7 ++----- net/netfilter/xt_recent.c | 17 +++++++---------- net/netfilter/xt_sctp.c | 16 +++++++--------- net/netfilter/xt_socket.c | 11 ++--------- net/netfilter/xt_state.c | 7 ++----- net/netfilter/xt_statistic.c | 7 ++----- net/netfilter/xt_string.c | 9 +++------ net/netfilter/xt_tcpmss.c | 13 +++++-------- net/netfilter/xt_tcpudp.c | 36 +++++++++++++++--------------------- net/netfilter/xt_time.c | 6 ++---- net/netfilter/xt_u32.c | 7 ++----- 33 files changed, 152 insertions(+), 294 deletions(-) (limited to 'net/netfilter') diff --git a/net/netfilter/xt_comment.c b/net/netfilter/xt_comment.c index fa211b2ab87..bd7aa57af42 100644 --- a/net/netfilter/xt_comment.c +++ b/net/netfilter/xt_comment.c @@ -16,10 +16,7 @@ MODULE_ALIAS("ipt_comment"); MODULE_ALIAS("ip6t_comment"); static bool -comment_mt(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protooff, - bool *hotdrop) +comment_mt(const struct sk_buff *skb, const struct xt_match_param *par) { /* We always match */ return true; diff --git a/net/netfilter/xt_connbytes.c b/net/netfilter/xt_connbytes.c index d2cd22a49c9..30c19b5fe90 100644 --- a/net/netfilter/xt_connbytes.c +++ b/net/netfilter/xt_connbytes.c @@ -17,12 +17,9 @@ MODULE_ALIAS("ipt_connbytes"); MODULE_ALIAS("ip6t_connbytes"); static bool -connbytes_mt(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, - bool *hotdrop) +connbytes_mt(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct xt_connbytes_info *sinfo = matchinfo; + const struct xt_connbytes_info *sinfo = par->matchinfo; const struct nf_conn *ct; enum ip_conntrack_info ctinfo; u_int64_t what = 0; /* initialize to make gcc happy */ diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c index bd00830ff69..8b8f70e7664 100644 --- a/net/netfilter/xt_connlimit.c +++ b/net/netfilter/xt_connlimit.c @@ -178,12 +178,9 @@ static int count_them(struct xt_connlimit_data *data, } static bool -connlimit_mt(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, - bool *hotdrop) +connlimit_mt(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct xt_connlimit_info *info = matchinfo; + const struct xt_connlimit_info *info = par->matchinfo; union nf_inet_addr addr; struct nf_conntrack_tuple tuple; const struct nf_conntrack_tuple *tuple_ptr = &tuple; @@ -195,10 +192,10 @@ connlimit_mt(const struct sk_buff *skb, const struct net_device *in, if (ct != NULL) tuple_ptr = &ct->tuplehash[0].tuple; else if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), - match->family, &tuple)) + par->match->family, &tuple)) goto hotdrop; - if (match->family == NFPROTO_IPV6) { + if (par->match->family == NFPROTO_IPV6) { const struct ipv6hdr *iph = ipv6_hdr(skb); memcpy(&addr.ip6, &iph->saddr, sizeof(iph->saddr)); } else { @@ -208,19 +205,19 @@ connlimit_mt(const struct sk_buff *skb, const struct net_device *in, spin_lock_bh(&info->data->lock); connections = count_them(info->data, tuple_ptr, &addr, - &info->mask, match); + &info->mask, par->match); spin_unlock_bh(&info->data->lock); if (connections < 0) { /* kmalloc failed, drop it entirely */ - *hotdrop = true; + *par->hotdrop = true; return false; } return (connections > info->limit) ^ info->inverse; hotdrop: - *hotdrop = true; + *par->hotdrop = true; return false; } diff --git a/net/netfilter/xt_connmark.c b/net/netfilter/xt_connmark.c index 0577b8ff4e1..df4f4a865a5 100644 --- a/net/netfilter/xt_connmark.c +++ b/net/netfilter/xt_connmark.c @@ -34,12 +34,9 @@ MODULE_ALIAS("ipt_connmark"); MODULE_ALIAS("ip6t_connmark"); static bool -connmark_mt(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, - bool *hotdrop) +connmark_mt(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct xt_connmark_mtinfo1 *info = matchinfo; + const struct xt_connmark_mtinfo1 *info = par->matchinfo; enum ip_conntrack_info ctinfo; const struct nf_conn *ct; @@ -51,12 +48,9 @@ connmark_mt(const struct sk_buff *skb, const struct net_device *in, } static bool -connmark_mt_v0(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, - bool *hotdrop) +connmark_mt_v0(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct xt_connmark_info *info = matchinfo; + const struct xt_connmark_info *info = par->matchinfo; const struct nf_conn *ct; enum ip_conntrack_info ctinfo; diff --git a/net/netfilter/xt_conntrack.c b/net/netfilter/xt_conntrack.c index 392b457f9c2..13a7e4eacdf 100644 --- a/net/netfilter/xt_conntrack.c +++ b/net/netfilter/xt_conntrack.c @@ -25,12 +25,9 @@ MODULE_ALIAS("ipt_conntrack"); MODULE_ALIAS("ip6t_conntrack"); static bool -conntrack_mt_v0(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, - bool *hotdrop) +conntrack_mt_v0(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct xt_conntrack_info *sinfo = matchinfo; + const struct xt_conntrack_info *sinfo = par->matchinfo; const struct nf_conn *ct; enum ip_conntrack_info ctinfo; unsigned int statebit; @@ -205,12 +202,9 @@ ct_proto_port_check(const struct xt_conntrack_mtinfo1 *info, } static bool -conntrack_mt(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, - bool *hotdrop) +conntrack_mt(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct xt_conntrack_mtinfo1 *info = matchinfo; + const struct xt_conntrack_mtinfo1 *info = par->matchinfo; enum ip_conntrack_info ctinfo; const struct nf_conn *ct; unsigned int statebit; @@ -244,22 +238,22 @@ conntrack_mt(const struct sk_buff *skb, const struct net_device *in, return false; if (info->match_flags & XT_CONNTRACK_ORIGSRC) - if (conntrack_mt_origsrc(ct, info, match->family) ^ + if (conntrack_mt_origsrc(ct, info, par->match->family) ^ !(info->invert_flags & XT_CONNTRACK_ORIGSRC)) return false; if (info->match_flags & XT_CONNTRACK_ORIGDST) - if (conntrack_mt_origdst(ct, info, match->family) ^ + if (conntrack_mt_origdst(ct, info, par->match->family) ^ !(info->invert_flags & XT_CONNTRACK_ORIGDST)) return false; if (info->match_flags & XT_CONNTRACK_REPLSRC) - if (conntrack_mt_replsrc(ct, info, match->family) ^ + if (conntrack_mt_replsrc(ct, info, par->match->family) ^ !(info->invert_flags & XT_CONNTRACK_REPLSRC)) return false; if (info->match_flags & XT_CONNTRACK_REPLDST) - if (conntrack_mt_repldst(ct, info, match->family) ^ + if (conntrack_mt_repldst(ct, info, par->match->family) ^ !(info->invert_flags & XT_CONNTRACK_REPLDST)) return false; diff --git a/net/netfilter/xt_dccp.c b/net/netfilter/xt_dccp.c index 87971f47132..7aa30bb9105 100644 --- a/net/netfilter/xt_dccp.c +++ b/net/netfilter/xt_dccp.c @@ -93,20 +93,18 @@ match_option(u_int8_t option, const struct sk_buff *skb, unsigned int protoff, } static bool -dccp_mt(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, bool *hotdrop) +dccp_mt(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct xt_dccp_info *info = matchinfo; + const struct xt_dccp_info *info = par->matchinfo; const struct dccp_hdr *dh; struct dccp_hdr _dh; - if (offset) + if (par->fragoff != 0) return false; - dh = skb_header_pointer(skb, protoff, sizeof(_dh), &_dh); + dh = skb_header_pointer(skb, par->thoff, sizeof(_dh), &_dh); if (dh == NULL) { - *hotdrop = true; + *par->hotdrop = true; return false; } @@ -118,8 +116,8 @@ dccp_mt(const struct sk_buff *skb, const struct net_device *in, XT_DCCP_DEST_PORTS, info->flags, info->invflags) && DCCHECK(match_types(dh, info->typemask), XT_DCCP_TYPE, info->flags, info->invflags) - && DCCHECK(match_option(info->option, skb, protoff, dh, - hotdrop), + && DCCHECK(match_option(info->option, skb, par->thoff, dh, + par->hotdrop), XT_DCCP_OPTION, info->flags, info->invflags); } diff --git a/net/netfilter/xt_dscp.c b/net/netfilter/xt_dscp.c index 7f03aa13a95..57d61206135 100644 --- a/net/netfilter/xt_dscp.c +++ b/net/netfilter/xt_dscp.c @@ -26,23 +26,18 @@ MODULE_ALIAS("ipt_tos"); MODULE_ALIAS("ip6t_tos"); static bool -dscp_mt(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, bool *hotdrop) +dscp_mt(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct xt_dscp_info *info = matchinfo; + const struct xt_dscp_info *info = par->matchinfo; u_int8_t dscp = ipv4_get_dsfield(ip_hdr(skb)) >> XT_DSCP_SHIFT; return (dscp == info->dscp) ^ !!info->invert; } static bool -dscp_mt6(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, - bool *hotdrop) +dscp_mt6(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct xt_dscp_info *info = matchinfo; + const struct xt_dscp_info *info = par->matchinfo; u_int8_t dscp = ipv6_get_dsfield(ipv6_hdr(skb)) >> XT_DSCP_SHIFT; return (dscp == info->dscp) ^ !!info->invert; @@ -63,24 +58,19 @@ dscp_mt_check(const char *tablename, const void *info, return true; } -static bool tos_mt_v0(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, const void *matchinfo, - int offset, unsigned int protoff, bool *hotdrop) +static bool +tos_mt_v0(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct ipt_tos_info *info = matchinfo; + const struct ipt_tos_info *info = par->matchinfo; return (ip_hdr(skb)->tos == info->tos) ^ info->invert; } -static bool tos_mt(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, - bool *hotdrop) +static bool tos_mt(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct xt_tos_match_info *info = matchinfo; + const struct xt_tos_match_info *info = par->matchinfo; - if (match->family == NFPROTO_IPV4) + if (par->match->family == NFPROTO_IPV4) return ((ip_hdr(skb)->tos & info->tos_mask) == info->tos_value) ^ !!info->invert; else diff --git a/net/netfilter/xt_esp.c b/net/netfilter/xt_esp.c index 045c4deecaf..6d59f2e7c1c 100644 --- a/net/netfilter/xt_esp.c +++ b/net/netfilter/xt_esp.c @@ -42,26 +42,23 @@ spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, bool invert) return r; } -static bool -esp_mt(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, bool *hotdrop) +static bool esp_mt(const struct sk_buff *skb, const struct xt_match_param *par) { const struct ip_esp_hdr *eh; struct ip_esp_hdr _esp; - const struct xt_esp *espinfo = matchinfo; + const struct xt_esp *espinfo = par->matchinfo; /* Must not be a fragment. */ - if (offset) + if (par->fragoff != 0) return false; - eh = skb_header_pointer(skb, protoff, sizeof(_esp), &_esp); + eh = skb_header_pointer(skb, par->thoff, sizeof(_esp), &_esp); if (eh == NULL) { /* We've been asked to examine this packet, and we * can't. Hence, no choice but to drop. */ duprintf("Dropping evil ESP tinygram.\n"); - *hotdrop = true; + *par->hotdrop = true; return false; } diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index 7bae369603d..22a60a728cf 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -563,19 +563,16 @@ hashlimit_init_dst(const struct xt_hashlimit_htable *hinfo, } static bool -hashlimit_mt_v0(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, - bool *hotdrop) +hashlimit_mt_v0(const struct sk_buff *skb, const struct xt_match_param *par) { const struct xt_hashlimit_info *r = - ((const struct xt_hashlimit_info *)matchinfo)->u.master; + ((const struct xt_hashlimit_info *)par->matchinfo)->u.master; struct xt_hashlimit_htable *hinfo = r->hinfo; unsigned long now = jiffies; struct dsthash_ent *dh; struct dsthash_dst dst; - if (hashlimit_init_dst(hinfo, &dst, skb, protoff) < 0) + if (hashlimit_init_dst(hinfo, &dst, skb, par->thoff) < 0) goto hotdrop; spin_lock_bh(&hinfo->lock); @@ -613,23 +610,20 @@ hashlimit_mt_v0(const struct sk_buff *skb, const struct net_device *in, return false; hotdrop: - *hotdrop = true; + *par->hotdrop = true; return false; } static bool -hashlimit_mt(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, - bool *hotdrop) +hashlimit_mt(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct xt_hashlimit_mtinfo1 *info = matchinfo; + const struct xt_hashlimit_mtinfo1 *info = par->matchinfo; struct xt_hashlimit_htable *hinfo = info->hinfo; unsigned long now = jiffies; struct dsthash_ent *dh; struct dsthash_dst dst; - if (hashlimit_init_dst(hinfo, &dst, skb, protoff) < 0) + if (hashlimit_init_dst(hinfo, &dst, skb, par->thoff) < 0) goto hotdrop; spin_lock_bh(&hinfo->lock); @@ -666,7 +660,7 @@ hashlimit_mt(const struct sk_buff *skb, const struct net_device *in, return info->cfg.mode & XT_HASHLIMIT_INVERT; hotdrop: - *hotdrop = true; + *par->hotdrop = true; return false; } diff --git a/net/netfilter/xt_helper.c b/net/netfilter/xt_helper.c index 134d94324eb..73bdc3ba13f 100644 --- a/net/netfilter/xt_helper.c +++ b/net/netfilter/xt_helper.c @@ -24,12 +24,9 @@ MODULE_ALIAS("ip6t_helper"); static bool -helper_mt(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, - bool *hotdrop) +helper_mt(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct xt_helper_info *info = matchinfo; + const struct xt_helper_info *info = par->matchinfo; const struct nf_conn *ct; const struct nf_conn_help *master_help; const struct nf_conntrack_helper *helper; diff --git a/net/netfilter/xt_iprange.c b/net/netfilter/xt_iprange.c index a7498cc48dc..6f62c36948d 100644 --- a/net/netfilter/xt_iprange.c +++ b/net/netfilter/xt_iprange.c @@ -17,12 +17,9 @@ #include static bool -iprange_mt_v0(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, - bool *hotdrop) +iprange_mt_v0(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct ipt_iprange_info *info = matchinfo; + const struct ipt_iprange_info *info = par->matchinfo; const struct iphdr *iph = ip_hdr(skb); if (info->flags & IPRANGE_SRC) { @@ -55,12 +52,9 @@ iprange_mt_v0(const struct sk_buff *skb, const struct net_device *in, } static bool -iprange_mt4(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, - bool *hotdrop) +iprange_mt4(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct xt_iprange_mtinfo *info = matchinfo; + const struct xt_iprange_mtinfo *info = par->matchinfo; const struct iphdr *iph = ip_hdr(skb); bool m; @@ -111,12 +105,9 @@ iprange_ipv6_sub(const struct in6_addr *a, const struct in6_addr *b) } static bool -iprange_mt6(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, - bool *hotdrop) +iprange_mt6(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct xt_iprange_mtinfo *info = matchinfo; + const struct xt_iprange_mtinfo *info = par->matchinfo; const struct ipv6hdr *iph = ipv6_hdr(skb); bool m; diff --git a/net/netfilter/xt_length.c b/net/netfilter/xt_length.c index b8612d1914b..c4871ca6c86 100644 --- a/net/netfilter/xt_length.c +++ b/net/netfilter/xt_length.c @@ -21,24 +21,18 @@ MODULE_ALIAS("ipt_length"); MODULE_ALIAS("ip6t_length"); static bool -length_mt(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, - bool *hotdrop) +length_mt(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct xt_length_info *info = matchinfo; + const struct xt_length_info *info = par->matchinfo; u_int16_t pktlen = ntohs(ip_hdr(skb)->tot_len); return (pktlen >= info->min && pktlen <= info->max) ^ info->invert; } static bool -length_mt6(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, - bool *hotdrop) +length_mt6(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct xt_length_info *info = matchinfo; + const struct xt_length_info *info = par->matchinfo; const u_int16_t pktlen = ntohs(ipv6_hdr(skb)->payload_len) + sizeof(struct ipv6hdr); diff --git a/net/netfilter/xt_limit.c b/net/netfilter/xt_limit.c index 00247bd1095..c475eac5dbe 100644 --- a/net/netfilter/xt_limit.c +++ b/net/netfilter/xt_limit.c @@ -58,13 +58,10 @@ static DEFINE_SPINLOCK(limit_lock); #define CREDITS_PER_JIFFY POW2_BELOW32(MAX_CPJ) static bool -limit_mt(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, - bool *hotdrop) +limit_mt(const struct sk_buff *skb, const struct xt_match_param *par) { struct xt_rateinfo *r = - ((const struct xt_rateinfo *)matchinfo)->master; + ((const struct xt_rateinfo *)par->matchinfo)->master; unsigned long now = jiffies; spin_lock_bh(&limit_lock); diff --git a/net/netfilter/xt_mac.c b/net/netfilter/xt_mac.c index 60db240098a..269f9d8aef5 100644 --- a/net/netfilter/xt_mac.c +++ b/net/netfilter/xt_mac.c @@ -24,12 +24,9 @@ MODULE_DESCRIPTION("Xtables: MAC address match"); MODULE_ALIAS("ipt_mac"); MODULE_ALIAS("ip6t_mac"); -static bool -mac_mt(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, bool *hotdrop) +static bool mac_mt(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct xt_mac_info *info = matchinfo; + const struct xt_mac_info *info = par->matchinfo; /* Is mac pointer valid? */ return skb_mac_header(skb) >= skb->head && diff --git a/net/netfilter/xt_mark.c b/net/netfilter/xt_mark.c index 96dd2b63b6b..88547614653 100644 --- a/net/netfilter/xt_mark.c +++ b/net/netfilter/xt_mark.c @@ -23,22 +23,17 @@ MODULE_ALIAS("ipt_mark"); MODULE_ALIAS("ip6t_mark"); static bool -mark_mt_v0(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, - bool *hotdrop) +mark_mt_v0(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct xt_mark_info *info = matchinfo; + const struct xt_mark_info *info = par->matchinfo; return ((skb->mark & info->mask) == info->mark) ^ info->invert; } static bool -mark_mt(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, bool *hotdrop) +mark_mt(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct xt_mark_mtinfo1 *info = matchinfo; + const struct xt_mark_mtinfo1 *info = par->matchinfo; return ((skb->mark & info->mask) == info->mark) ^ info->invert; } diff --git a/net/netfilter/xt_multiport.c b/net/netfilter/xt_multiport.c index f6fe008ab8c..7087e291528 100644 --- a/net/netfilter/xt_multiport.c +++ b/net/netfilter/xt_multiport.c @@ -95,25 +95,22 @@ ports_match_v1(const struct xt_multiport_v1 *minfo, } static bool -multiport_mt_v0(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, - bool *hotdrop) +multiport_mt_v0(const struct sk_buff *skb, const struct xt_match_param *par) { const __be16 *pptr; __be16 _ports[2]; - const struct xt_multiport *multiinfo = matchinfo; + const struct xt_multiport *multiinfo = par->matchinfo; - if (offset) + if (par->fragoff != 0) return false; - pptr = skb_header_pointer(skb, protoff, sizeof(_ports), _ports); + pptr = skb_header_pointer(skb, par->thoff, sizeof(_ports), _ports); if (pptr == NULL) { /* We've been asked to examine this packet, and we * can't. Hence, no choice but to drop. */ duprintf("xt_multiport: Dropping evil offset=0 tinygram.\n"); - *hotdrop = true; + *par->hotdrop = true; return false; } @@ -122,25 +119,22 @@ multiport_mt_v0(const struct sk_buff *skb, const struct net_device *in, } static bool -multiport_mt(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, - bool *hotdrop) +multiport_mt(const struct sk_buff *skb, const struct xt_match_param *par) { const __be16 *pptr; __be16 _ports[2]; - const struct xt_multiport_v1 *multiinfo = matchinfo; + const struct xt_multiport_v1 *multiinfo = par->matchinfo; - if (offset) + if (par->fragoff != 0) return false; - pptr = skb_header_pointer(skb, protoff, sizeof(_ports), _ports); + pptr = skb_header_pointer(skb, par->thoff, sizeof(_ports), _ports); if (pptr == NULL) { /* We've been asked to examine this packet, and we * can't. Hence, no choice but to drop. */ duprintf("xt_multiport: Dropping evil offset=0 tinygram.\n"); - *hotdrop = true; + *par->hotdrop = true; return false; } diff --git a/net/netfilter/xt_owner.c b/net/netfilter/xt_owner.c index d1c3b7ae9b4..493b5eb8d14 100644 --- a/net/netfilter/xt_owner.c +++ b/net/netfilter/xt_owner.c @@ -21,12 +21,9 @@ #include static bool -owner_mt_v0(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, - bool *hotdrop) +owner_mt_v0(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct ipt_owner_info *info = matchinfo; + const struct ipt_owner_info *info = par->matchinfo; const struct file *filp; if (skb->sk == NULL || skb->sk->sk_socket == NULL) @@ -50,12 +47,9 @@ owner_mt_v0(const struct sk_buff *skb, const struct net_device *in, } static bool -owner_mt6_v0(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, - bool *hotdrop) +owner_mt6_v0(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct ip6t_owner_info *info = matchinfo; + const struct ip6t_owner_info *info = par->matchinfo; const struct file *filp; if (skb->sk == NULL || skb->sk->sk_socket == NULL) @@ -79,12 +73,9 @@ owner_mt6_v0(const struct sk_buff *skb, const struct net_device *in, } static bool -owner_mt(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, - bool *hotdrop) +owner_mt(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct xt_owner_match_info *info = matchinfo; + const struct xt_owner_match_info *info = par->matchinfo; const struct file *filp; if (skb->sk == NULL || skb->sk->sk_socket == NULL) diff --git a/net/netfilter/xt_physdev.c b/net/netfilter/xt_physdev.c index 72a0bdd53fa..e980e179d4f 100644 --- a/net/netfilter/xt_physdev.c +++ b/net/netfilter/xt_physdev.c @@ -21,14 +21,11 @@ MODULE_ALIAS("ipt_physdev"); MODULE_ALIAS("ip6t_physdev"); static bool -physdev_mt(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, - bool *hotdrop) +physdev_mt(const struct sk_buff *skb, const struct xt_match_param *par) { int i; static const char nulldevname[IFNAMSIZ]; - const struct xt_physdev_info *info = matchinfo; + const struct xt_physdev_info *info = par->matchinfo; bool ret; const char *indev, *outdev; const struct nf_bridge_info *nf_bridge; diff --git a/net/netfilter/xt_pkttype.c b/net/netfilter/xt_pkttype.c index 81e86d319a8..37753a37760 100644 --- a/net/netfilter/xt_pkttype.c +++ b/net/netfilter/xt_pkttype.c @@ -23,20 +23,17 @@ MODULE_ALIAS("ipt_pkttype"); MODULE_ALIAS("ip6t_pkttype"); static bool -pkttype_mt(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, - bool *hotdrop) +pkttype_mt(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct xt_pkttype_info *info = matchinfo; + const struct xt_pkttype_info *info = par->matchinfo; u_int8_t type; if (skb->pkt_type != PACKET_LOOPBACK) type = skb->pkt_type; - else if (match->family == NFPROTO_IPV4 && + else if (par->match->family == NFPROTO_IPV4 && ipv4_is_multicast(ip_hdr(skb)->daddr)) type = PACKET_MULTICAST; - else if (match->family == NFPROTO_IPV6 && + else if (par->match->family == NFPROTO_IPV6 && ipv6_hdr(skb)->daddr.s6_addr[0] == 0xFF) type = PACKET_MULTICAST; else diff --git a/net/netfilter/xt_policy.c b/net/netfilter/xt_policy.c index f1d514e9d0a..b0a00fb0511 100644 --- a/net/netfilter/xt_policy.c +++ b/net/netfilter/xt_policy.c @@ -110,18 +110,15 @@ match_policy_out(const struct sk_buff *skb, const struct xt_policy_info *info, } static bool -policy_mt(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, - bool *hotdrop) +policy_mt(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct xt_policy_info *info = matchinfo; + const struct xt_policy_info *info = par->matchinfo; int ret; if (info->flags & XT_POLICY_MATCH_IN) - ret = match_policy_in(skb, info, match->family); + ret = match_policy_in(skb, info, par->match->family); else - ret = match_policy_out(skb, info, match->family); + ret = match_policy_out(skb, info, par->match->family); if (ret < 0) ret = info->flags & XT_POLICY_MATCH_NONE ? true : false; diff --git a/net/netfilter/xt_quota.c b/net/netfilter/xt_quota.c index a3c8798f0cc..3ab92666c14 100644 --- a/net/netfilter/xt_quota.c +++ b/net/netfilter/xt_quota.c @@ -18,13 +18,10 @@ MODULE_ALIAS("ip6t_quota"); static DEFINE_SPINLOCK(quota_lock); static bool -quota_mt(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, - bool *hotdrop) +quota_mt(const struct sk_buff *skb, const struct xt_match_param *par) { struct xt_quota_info *q = - ((const struct xt_quota_info *)matchinfo)->master; + ((const struct xt_quota_info *)par->matchinfo)->master; bool ret = q->flags & XT_QUOTA_INVERT; spin_lock_bh("a_lock); diff --git a/net/netfilter/xt_rateest.c b/net/netfilter/xt_rateest.c index 4dcfd7353db..e9f64ef4565 100644 --- a/net/netfilter/xt_rateest.c +++ b/net/netfilter/xt_rateest.c @@ -14,16 +14,10 @@ #include -static bool xt_rateest_mt(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, - unsigned int protoff, - bool *hotdrop) +static bool +xt_rateest_mt(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct xt_rateest_match_info *info = matchinfo; + const struct xt_rateest_match_info *info = par->matchinfo; struct gnet_stats_rate_est *r; u_int32_t bps1, bps2, pps1, pps2; bool ret = true; diff --git a/net/netfilter/xt_realm.c b/net/netfilter/xt_realm.c index ef65756d489..b25942110ed 100644 --- a/net/netfilter/xt_realm.c +++ b/net/netfilter/xt_realm.c @@ -22,12 +22,9 @@ MODULE_DESCRIPTION("Xtables: Routing realm match"); MODULE_ALIAS("ipt_realm"); static bool -realm_mt(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, - bool *hotdrop) +realm_mt(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct xt_realm_info *info = matchinfo; + const struct xt_realm_info *info = par->matchinfo; const struct dst_entry *dst = skb->dst; return (info->id == (dst->tclassid & info->mask)) ^ info->invert; diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c index 4a916e2624d..baeb90a5623 100644 --- a/net/netfilter/xt_recent.c +++ b/net/netfilter/xt_recent.c @@ -204,19 +204,16 @@ static void recent_table_flush(struct recent_table *t) } static bool -recent_mt(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, - bool *hotdrop) +recent_mt(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct xt_recent_mtinfo *info = matchinfo; + const struct xt_recent_mtinfo *info = par->matchinfo; struct recent_table *t; struct recent_entry *e; union nf_inet_addr addr = {}; u_int8_t ttl; bool ret = info->invert; - if (match->family == NFPROTO_IPV4) { + if (par->match->family == NFPROTO_IPV4) { const struct iphdr *iph = ip_hdr(skb); if (info->side == XT_RECENT_DEST) @@ -237,19 +234,19 @@ recent_mt(const struct sk_buff *skb, const struct net_device *in, } /* use TTL as seen before forwarding */ - if (out && !skb->sk) + if (par->out != NULL && skb->sk == NULL) ttl++; spin_lock_bh(&recent_lock); t = recent_table_lookup(info->name); - e = recent_entry_lookup(t, &addr, match->family, + e = recent_entry_lookup(t, &addr, par->match->family, (info->check_set & XT_RECENT_TTL) ? ttl : 0); if (e == NULL) { if (!(info->check_set & XT_RECENT_SET)) goto out; - e = recent_entry_init(t, &addr, match->family, ttl); + e = recent_entry_init(t, &addr, par->match->family, ttl); if (e == NULL) - *hotdrop = true; + *par->hotdrop = true; ret = !ret; goto out; } diff --git a/net/netfilter/xt_sctp.c b/net/netfilter/xt_sctp.c index ab67aca4d8f..b0014ab65da 100644 --- a/net/netfilter/xt_sctp.c +++ b/net/netfilter/xt_sctp.c @@ -117,23 +117,21 @@ match_packet(const struct sk_buff *skb, } static bool -sctp_mt(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, bool *hotdrop) +sctp_mt(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct xt_sctp_info *info = matchinfo; + const struct xt_sctp_info *info = par->matchinfo; const sctp_sctphdr_t *sh; sctp_sctphdr_t _sh; - if (offset) { + if (par->fragoff != 0) { duprintf("Dropping non-first fragment.. FIXME\n"); return false; } - sh = skb_header_pointer(skb, protoff, sizeof(_sh), &_sh); + sh = skb_header_pointer(skb, par->thoff, sizeof(_sh), &_sh); if (sh == NULL) { duprintf("Dropping evil TCP offset=0 tinygram.\n"); - *hotdrop = true; + *par->hotdrop = true; return false; } duprintf("spt: %d\tdpt: %d\n", ntohs(sh->source), ntohs(sh->dest)); @@ -144,8 +142,8 @@ sctp_mt(const struct sk_buff *skb, const struct net_device *in, && SCCHECK(ntohs(sh->dest) >= info->dpts[0] && ntohs(sh->dest) <= info->dpts[1], XT_SCTP_DEST_PORTS, info->flags, info->invflags) - && SCCHECK(match_packet(skb, protoff + sizeof (sctp_sctphdr_t), - info, hotdrop), + && SCCHECK(match_packet(skb, par->thoff + sizeof(sctp_sctphdr_t), + info, par->hotdrop), XT_SCTP_CHUNK_TYPES, info->flags, info->invflags); } diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c index ac9db17c7b9..02a8fed2108 100644 --- a/net/netfilter/xt_socket.c +++ b/net/netfilter/xt_socket.c @@ -86,14 +86,7 @@ extract_icmp_fields(const struct sk_buff *skb, static bool -socket_mt(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, - unsigned int protoff, - bool *hotdrop) +socket_mt(const struct sk_buff *skb, const struct xt_match_param *par) { const struct iphdr *iph = ip_hdr(skb); struct udphdr _hdr, *hp = NULL; @@ -146,7 +139,7 @@ socket_mt(const struct sk_buff *skb, #endif sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), protocol, - saddr, daddr, sport, dport, in, false); + saddr, daddr, sport, dport, par->in, false); if (sk != NULL) { bool wildcard = (inet_sk(sk)->rcv_saddr == 0); diff --git a/net/netfilter/xt_state.c b/net/netfilter/xt_state.c index f92f8bcc1e3..29f5a8a1b02 100644 --- a/net/netfilter/xt_state.c +++ b/net/netfilter/xt_state.c @@ -21,12 +21,9 @@ MODULE_ALIAS("ipt_state"); MODULE_ALIAS("ip6t_state"); static bool -state_mt(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, - bool *hotdrop) +state_mt(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct xt_state_info *sinfo = matchinfo; + const struct xt_state_info *sinfo = par->matchinfo; enum ip_conntrack_info ctinfo; unsigned int statebit; diff --git a/net/netfilter/xt_statistic.c b/net/netfilter/xt_statistic.c index f41a92322e6..dcadc491db2 100644 --- a/net/netfilter/xt_statistic.c +++ b/net/netfilter/xt_statistic.c @@ -25,12 +25,9 @@ MODULE_ALIAS("ip6t_statistic"); static DEFINE_SPINLOCK(nth_lock); static bool -statistic_mt(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, - bool *hotdrop) +statistic_mt(const struct sk_buff *skb, const struct xt_match_param *par) { - struct xt_statistic_info *info = (struct xt_statistic_info *)matchinfo; + struct xt_statistic_info *info = (void *)par->matchinfo; bool ret = info->flags & XT_STATISTIC_INVERT; switch (info->mode) { diff --git a/net/netfilter/xt_string.c b/net/netfilter/xt_string.c index 18d8884e737..33f2d29ca4f 100644 --- a/net/netfilter/xt_string.c +++ b/net/netfilter/xt_string.c @@ -22,18 +22,15 @@ MODULE_ALIAS("ipt_string"); MODULE_ALIAS("ip6t_string"); static bool -string_mt(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, - bool *hotdrop) +string_mt(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct xt_string_info *conf = matchinfo; + const struct xt_string_info *conf = par->matchinfo; struct ts_state state; int invert; memset(&state, 0, sizeof(struct ts_state)); - invert = (match->revision == 0 ? conf->u.v0.invert : + invert = (par->match->revision == 0 ? conf->u.v0.invert : conf->u.v1.flags & XT_STRING_FLAG_INVERT); return (skb_find_text((struct sk_buff *)skb, conf->from_offset, diff --git a/net/netfilter/xt_tcpmss.c b/net/netfilter/xt_tcpmss.c index 4791c7cbe5a..4809b34b10f 100644 --- a/net/netfilter/xt_tcpmss.c +++ b/net/netfilter/xt_tcpmss.c @@ -25,12 +25,9 @@ MODULE_ALIAS("ipt_tcpmss"); MODULE_ALIAS("ip6t_tcpmss"); static bool -tcpmss_mt(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, - bool *hotdrop) +tcpmss_mt(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct xt_tcpmss_match_info *info = matchinfo; + const struct xt_tcpmss_match_info *info = par->matchinfo; const struct tcphdr *th; struct tcphdr _tcph; /* tcp.doff is only 4 bits, ie. max 15 * 4 bytes */ @@ -39,7 +36,7 @@ tcpmss_mt(const struct sk_buff *skb, const struct net_device *in, unsigned int i, optlen; /* If we don't have the whole header, drop packet. */ - th = skb_header_pointer(skb, protoff, sizeof(_tcph), &_tcph); + th = skb_header_pointer(skb, par->thoff, sizeof(_tcph), &_tcph); if (th == NULL) goto dropit; @@ -52,7 +49,7 @@ tcpmss_mt(const struct sk_buff *skb, const struct net_device *in, goto out; /* Truncated options. */ - op = skb_header_pointer(skb, protoff + sizeof(*th), optlen, _opt); + op = skb_header_pointer(skb, par->thoff + sizeof(*th), optlen, _opt); if (op == NULL) goto dropit; @@ -76,7 +73,7 @@ out: return info->invert; dropit: - *hotdrop = true; + *par->hotdrop = true; return false; } diff --git a/net/netfilter/xt_tcpudp.c b/net/netfilter/xt_tcpudp.c index 5a6268cbb9f..66cf71b1d59 100644 --- a/net/netfilter/xt_tcpudp.c +++ b/net/netfilter/xt_tcpudp.c @@ -68,25 +68,22 @@ tcp_find_option(u_int8_t option, return invert; } -static bool -tcp_mt(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, bool *hotdrop) +static bool tcp_mt(const struct sk_buff *skb, const struct xt_match_param *par) { const struct tcphdr *th; struct tcphdr _tcph; - const struct xt_tcp *tcpinfo = matchinfo; + const struct xt_tcp *tcpinfo = par->matchinfo; - if (offset) { + if (par->fragoff != 0) { /* To quote Alan: Don't allow a fragment of TCP 8 bytes in. Nobody normal causes this. Its a cracker trying to break in by doing a flag overwrite to pass the direction checks. */ - if (offset == 1) { + if (par->fragoff == 1) { duprintf("Dropping evil TCP offset=1 frag.\n"); - *hotdrop = true; + *par->hotdrop = true; } /* Must not be a fragment. */ return false; @@ -94,12 +91,12 @@ tcp_mt(const struct sk_buff *skb, const struct net_device *in, #define FWINVTCP(bool, invflg) ((bool) ^ !!(tcpinfo->invflags & (invflg))) - th = skb_header_pointer(skb, protoff, sizeof(_tcph), &_tcph); + th = skb_header_pointer(skb, par->thoff, sizeof(_tcph), &_tcph); if (th == NULL) { /* We've been asked to examine this packet, and we can't. Hence, no choice but to drop. */ duprintf("Dropping evil TCP offset=0 tinygram.\n"); - *hotdrop = true; + *par->hotdrop = true; return false; } @@ -117,13 +114,13 @@ tcp_mt(const struct sk_buff *skb, const struct net_device *in, return false; if (tcpinfo->option) { if (th->doff * 4 < sizeof(_tcph)) { - *hotdrop = true; + *par->hotdrop = true; return false; } - if (!tcp_find_option(tcpinfo->option, skb, protoff, + if (!tcp_find_option(tcpinfo->option, skb, par->thoff, th->doff*4 - sizeof(_tcph), tcpinfo->invflags & XT_TCP_INV_OPTION, - hotdrop)) + par->hotdrop)) return false; } return true; @@ -141,25 +138,22 @@ tcp_mt_check(const char *tablename, const void *info, return !(tcpinfo->invflags & ~XT_TCP_INV_MASK); } -static bool -udp_mt(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, bool *hotdrop) +static bool udp_mt(const struct sk_buff *skb, const struct xt_match_param *par) { const struct udphdr *uh; struct udphdr _udph; - const struct xt_udp *udpinfo = matchinfo; + const struct xt_udp *udpinfo = par->matchinfo; /* Must not be a fragment. */ - if (offset) + if (par->fragoff != 0) return false; - uh = skb_header_pointer(skb, protoff, sizeof(_udph), &_udph); + uh = skb_header_pointer(skb, par->thoff, sizeof(_udph), &_udph); if (uh == NULL) { /* We've been asked to examine this packet, and we can't. Hence, no choice but to drop. */ duprintf("Dropping evil UDP tinygram.\n"); - *hotdrop = true; + *par->hotdrop = true; return false; } diff --git a/net/netfilter/xt_time.c b/net/netfilter/xt_time.c index 32d4c769caa..28599d3979c 100644 --- a/net/netfilter/xt_time.c +++ b/net/netfilter/xt_time.c @@ -153,11 +153,9 @@ static void localtime_3(struct xtm *r, time_t time) } static bool -time_mt(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, bool *hotdrop) +time_mt(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct xt_time_info *info = matchinfo; + const struct xt_time_info *info = par->matchinfo; unsigned int packet_time; struct xtm current_time; s64 stamp; diff --git a/net/netfilter/xt_u32.c b/net/netfilter/xt_u32.c index a6b971dc5d3..24a52762450 100644 --- a/net/netfilter/xt_u32.c +++ b/net/netfilter/xt_u32.c @@ -87,12 +87,9 @@ static bool u32_match_it(const struct xt_u32 *data, return true; } -static bool -u32_mt(const struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, const struct xt_match *match, - const void *matchinfo, int offset, unsigned int protoff, bool *hotdrop) +static bool u32_mt(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct xt_u32 *data = matchinfo; + const struct xt_u32 *data = par->matchinfo; bool ret; ret = u32_match_it(data, skb); -- cgit v1.2.3 From 9b4fce7a3508a9776534188b6065b206a9608ccf Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Wed, 8 Oct 2008 11:35:18 +0200 Subject: netfilter: xtables: move extension arguments into compound structure (2/6) This patch does this for match extensions' checkentry functions. Signed-off-by: Jan Engelhardt Signed-off-by: Patrick McHardy --- net/netfilter/x_tables.c | 32 ++++++++++++++++---------------- net/netfilter/xt_connbytes.c | 14 +++++--------- net/netfilter/xt_connlimit.c | 13 +++++-------- net/netfilter/xt_connmark.c | 20 +++++++------------- net/netfilter/xt_conntrack.c | 9 +++------ net/netfilter/xt_dccp.c | 7 ++----- net/netfilter/xt_dscp.c | 11 ++++------- net/netfilter/xt_esp.c | 8 ++------ net/netfilter/xt_hashlimit.c | 24 +++++++++--------------- net/netfilter/xt_helper.c | 11 ++++------- net/netfilter/xt_limit.c | 7 ++----- net/netfilter/xt_mark.c | 7 ++----- net/netfilter/xt_multiport.c | 37 ++++++++++++------------------------- net/netfilter/xt_owner.c | 14 ++++---------- net/netfilter/xt_physdev.c | 13 +++++-------- net/netfilter/xt_policy.c | 15 ++++++--------- net/netfilter/xt_quota.c | 7 ++----- net/netfilter/xt_rateest.c | 8 ++------ net/netfilter/xt_recent.c | 7 ++----- net/netfilter/xt_sctp.c | 7 ++----- net/netfilter/xt_state.c | 9 +++------ net/netfilter/xt_statistic.c | 7 ++----- net/netfilter/xt_string.c | 9 +++------ net/netfilter/xt_tcpudp.c | 16 ++++------------ net/netfilter/xt_time.c | 7 ++----- 25 files changed, 110 insertions(+), 209 deletions(-) (limited to 'net/netfilter') diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index d1f2fb3e8f2..817ab14f7cd 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -321,39 +321,39 @@ int xt_find_revision(u8 af, const char *name, u8 revision, int target, } EXPORT_SYMBOL_GPL(xt_find_revision); -int xt_check_match(const struct xt_match *match, unsigned short family, - unsigned int size, const char *table, unsigned int hook_mask, - unsigned short proto, int inv_proto, const void *entry, - void *matchinfo) +int xt_check_match(struct xt_mtchk_param *par, u_int8_t family, + unsigned int size, u_int8_t proto, bool inv_proto) { - if (XT_ALIGN(match->matchsize) != size && - match->matchsize != -1) { + if (XT_ALIGN(par->match->matchsize) != size && + par->match->matchsize != -1) { /* * ebt_among is exempt from centralized matchsize checking * because it uses a dynamic-size data set. */ printk("%s_tables: %s match: invalid size %Zu != %u\n", - xt_prefix[family], match->name, - XT_ALIGN(match->matchsize), size); + xt_prefix[family], par->match->name, + XT_ALIGN(par->match->matchsize), size); return -EINVAL; } - if (match->table && strcmp(match->table, table)) { + if (par->match->table != NULL && + strcmp(par->match->table, par->table) != 0) { printk("%s_tables: %s match: only valid in %s table, not %s\n", - xt_prefix[family], match->name, match->table, table); + xt_prefix[family], par->match->name, + par->match->table, par->table); return -EINVAL; } - if (match->hooks && (hook_mask & ~match->hooks) != 0) { + if (par->match->hooks && (par->hook_mask & ~par->match->hooks) != 0) { printk("%s_tables: %s match: bad hook_mask %#x/%#x\n", - xt_prefix[family], match->name, hook_mask, match->hooks); + xt_prefix[family], par->match->name, + par->hook_mask, par->match->hooks); return -EINVAL; } - if (match->proto && (match->proto != proto || inv_proto)) { + if (par->match->proto && (par->match->proto != proto || inv_proto)) { printk("%s_tables: %s match: only valid for protocol %u\n", - xt_prefix[family], match->name, match->proto); + xt_prefix[family], par->match->name, par->match->proto); return -EINVAL; } - if (match->checkentry != NULL && - !match->checkentry(table, entry, match, matchinfo, hook_mask)) + if (par->match->checkentry != NULL && !par->match->checkentry(par)) return -EINVAL; return 0; } diff --git a/net/netfilter/xt_connbytes.c b/net/netfilter/xt_connbytes.c index 30c19b5fe90..43a36c728e5 100644 --- a/net/netfilter/xt_connbytes.c +++ b/net/netfilter/xt_connbytes.c @@ -92,12 +92,9 @@ connbytes_mt(const struct sk_buff *skb, const struct xt_match_param *par) return what >= sinfo->count.from; } -static bool -connbytes_mt_check(const char *tablename, const void *ip, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) +static bool connbytes_mt_check(const struct xt_mtchk_param *par) { - const struct xt_connbytes_info *sinfo = matchinfo; + const struct xt_connbytes_info *sinfo = par->matchinfo; if (sinfo->what != XT_CONNBYTES_PKTS && sinfo->what != XT_CONNBYTES_BYTES && @@ -109,17 +106,16 @@ connbytes_mt_check(const char *tablename, const void *ip, sinfo->direction != XT_CONNBYTES_DIR_BOTH) return false; - if (nf_ct_l3proto_try_module_get(match->family) < 0) { + if (nf_ct_l3proto_try_module_get(par->match->family) < 0) { printk(KERN_WARNING "can't load conntrack support for " - "proto=%u\n", match->family); + "proto=%u\n", par->match->family); return false; } return true; } -static void -connbytes_mt_destroy(const struct xt_match *match, void *matchinfo) +static void connbytes_mt_destroy(const struct xt_match *match, void *matchinfo) { nf_ct_l3proto_module_put(match->family); } diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c index 8b8f70e7664..1361e9919cf 100644 --- a/net/netfilter/xt_connlimit.c +++ b/net/netfilter/xt_connlimit.c @@ -221,24 +221,21 @@ connlimit_mt(const struct sk_buff *skb, const struct xt_match_param *par) return false; } -static bool -connlimit_mt_check(const char *tablename, const void *ip, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) +static bool connlimit_mt_check(const struct xt_mtchk_param *par) { - struct xt_connlimit_info *info = matchinfo; + struct xt_connlimit_info *info = par->matchinfo; unsigned int i; - if (nf_ct_l3proto_try_module_get(match->family) < 0) { + if (nf_ct_l3proto_try_module_get(par->match->family) < 0) { printk(KERN_WARNING "cannot load conntrack support for " - "address family %u\n", match->family); + "address family %u\n", par->match->family); return false; } /* init private data */ info->data = kmalloc(sizeof(struct xt_connlimit_data), GFP_KERNEL); if (info->data == NULL) { - nf_ct_l3proto_module_put(match->family); + nf_ct_l3proto_module_put(par->match->family); return false; } diff --git a/net/netfilter/xt_connmark.c b/net/netfilter/xt_connmark.c index df4f4a865a5..b935b7888a9 100644 --- a/net/netfilter/xt_connmark.c +++ b/net/netfilter/xt_connmark.c @@ -61,33 +61,27 @@ connmark_mt_v0(const struct sk_buff *skb, const struct xt_match_param *par) return ((ct->mark & info->mask) == info->mark) ^ info->invert; } -static bool -connmark_mt_check_v0(const char *tablename, const void *ip, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) +static bool connmark_mt_check_v0(const struct xt_mtchk_param *par) { - const struct xt_connmark_info *cm = matchinfo; + const struct xt_connmark_info *cm = par->matchinfo; if (cm->mark > 0xffffffff || cm->mask > 0xffffffff) { printk(KERN_WARNING "connmark: only support 32bit mark\n"); return false; } - if (nf_ct_l3proto_try_module_get(match->family) < 0) { + if (nf_ct_l3proto_try_module_get(par->match->family) < 0) { printk(KERN_WARNING "can't load conntrack support for " - "proto=%u\n", match->family); + "proto=%u\n", par->match->family); return false; } return true; } -static bool -connmark_mt_check(const char *tablename, const void *ip, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) +static bool connmark_mt_check(const struct xt_mtchk_param *par) { - if (nf_ct_l3proto_try_module_get(match->family) < 0) { + if (nf_ct_l3proto_try_module_get(par->match->family) < 0) { printk(KERN_WARNING "cannot load conntrack support for " - "proto=%u\n", match->family); + "proto=%u\n", par->match->family); return false; } return true; diff --git a/net/netfilter/xt_conntrack.c b/net/netfilter/xt_conntrack.c index 13a7e4eacdf..f04c46a02ce 100644 --- a/net/netfilter/xt_conntrack.c +++ b/net/netfilter/xt_conntrack.c @@ -278,14 +278,11 @@ conntrack_mt(const struct sk_buff *skb, const struct xt_match_param *par) return true; } -static bool -conntrack_mt_check(const char *tablename, const void *ip, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) +static bool conntrack_mt_check(const struct xt_mtchk_param *par) { - if (nf_ct_l3proto_try_module_get(match->family) < 0) { + if (nf_ct_l3proto_try_module_get(par->match->family) < 0) { printk(KERN_WARNING "can't load conntrack support for " - "proto=%u\n", match->family); + "proto=%u\n", par->match->family); return false; } return true; diff --git a/net/netfilter/xt_dccp.c b/net/netfilter/xt_dccp.c index 7aa30bb9105..e5d3e867328 100644 --- a/net/netfilter/xt_dccp.c +++ b/net/netfilter/xt_dccp.c @@ -121,12 +121,9 @@ dccp_mt(const struct sk_buff *skb, const struct xt_match_param *par) XT_DCCP_OPTION, info->flags, info->invflags); } -static bool -dccp_mt_check(const char *tablename, const void *inf, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) +static bool dccp_mt_check(const struct xt_mtchk_param *par) { - const struct xt_dccp_info *info = matchinfo; + const struct xt_dccp_info *info = par->matchinfo; return !(info->flags & ~XT_DCCP_VALID_FLAGS) && !(info->invflags & ~XT_DCCP_VALID_FLAGS) diff --git a/net/netfilter/xt_dscp.c b/net/netfilter/xt_dscp.c index 57d61206135..c3f8085460d 100644 --- a/net/netfilter/xt_dscp.c +++ b/net/netfilter/xt_dscp.c @@ -43,15 +43,12 @@ dscp_mt6(const struct sk_buff *skb, const struct xt_match_param *par) return (dscp == info->dscp) ^ !!info->invert; } -static bool -dscp_mt_check(const char *tablename, const void *info, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) +static bool dscp_mt_check(const struct xt_mtchk_param *par) { - const u_int8_t dscp = ((struct xt_dscp_info *)matchinfo)->dscp; + const struct xt_dscp_info *info = par->matchinfo; - if (dscp > XT_DSCP_MAX) { - printk(KERN_ERR "xt_dscp: dscp %x out of range\n", dscp); + if (info->dscp > XT_DSCP_MAX) { + printk(KERN_ERR "xt_dscp: dscp %x out of range\n", info->dscp); return false; } diff --git a/net/netfilter/xt_esp.c b/net/netfilter/xt_esp.c index 6d59f2e7c1c..609439967c2 100644 --- a/net/netfilter/xt_esp.c +++ b/net/netfilter/xt_esp.c @@ -66,13 +66,9 @@ static bool esp_mt(const struct sk_buff *skb, const struct xt_match_param *par) !!(espinfo->invflags & XT_ESP_INV_SPI)); } -/* Called when user tries to insert an entry of this type. */ -static bool -esp_mt_check(const char *tablename, const void *ip_void, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) +static bool esp_mt_check(const struct xt_mtchk_param *par) { - const struct xt_esp *espinfo = matchinfo; + const struct xt_esp *espinfo = par->matchinfo; if (espinfo->invflags & ~XT_ESP_INV_MASK) { duprintf("xt_esp: unknown flags %X\n", espinfo->invflags); diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index 22a60a728cf..2f73820e46d 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -664,12 +664,9 @@ hashlimit_mt(const struct sk_buff *skb, const struct xt_match_param *par) return false; } -static bool -hashlimit_mt_check_v0(const char *tablename, const void *inf, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) +static bool hashlimit_mt_check_v0(const struct xt_mtchk_param *par) { - struct xt_hashlimit_info *r = matchinfo; + struct xt_hashlimit_info *r = par->matchinfo; /* Check for overflow. */ if (r->cfg.burst == 0 || @@ -698,8 +695,8 @@ hashlimit_mt_check_v0(const char *tablename, const void *inf, * the list of htable's in htable_create(), since then we would * create duplicate proc files. -HW */ mutex_lock(&hlimit_mutex); - r->hinfo = htable_find_get(r->name, match->family); - if (!r->hinfo && htable_create_v0(r, match->family) != 0) { + r->hinfo = htable_find_get(r->name, par->match->family); + if (!r->hinfo && htable_create_v0(r, par->match->family) != 0) { mutex_unlock(&hlimit_mutex); return false; } @@ -710,12 +707,9 @@ hashlimit_mt_check_v0(const char *tablename, const void *inf, return true; } -static bool -hashlimit_mt_check(const char *tablename, const void *inf, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) +static bool hashlimit_mt_check(const struct xt_mtchk_param *par) { - struct xt_hashlimit_mtinfo1 *info = matchinfo; + struct xt_hashlimit_mtinfo1 *info = par->matchinfo; /* Check for overflow. */ if (info->cfg.burst == 0 || @@ -729,7 +723,7 @@ hashlimit_mt_check(const char *tablename, const void *inf, return false; if (info->name[sizeof(info->name)-1] != '\0') return false; - if (match->family == NFPROTO_IPV4) { + if (par->match->family == NFPROTO_IPV4) { if (info->cfg.srcmask > 32 || info->cfg.dstmask > 32) return false; } else { @@ -744,8 +738,8 @@ hashlimit_mt_check(const char *tablename, const void *inf, * the list of htable's in htable_create(), since then we would * create duplicate proc files. -HW */ mutex_lock(&hlimit_mutex); - info->hinfo = htable_find_get(info->name, match->family); - if (!info->hinfo && htable_create(info, match->family) != 0) { + info->hinfo = htable_find_get(info->name, par->match->family); + if (!info->hinfo && htable_create(info, par->match->family) != 0) { mutex_unlock(&hlimit_mutex); return false; } diff --git a/net/netfilter/xt_helper.c b/net/netfilter/xt_helper.c index 73bdc3ba13f..86d3c332fcb 100644 --- a/net/netfilter/xt_helper.c +++ b/net/netfilter/xt_helper.c @@ -54,16 +54,13 @@ helper_mt(const struct sk_buff *skb, const struct xt_match_param *par) return ret; } -static bool -helper_mt_check(const char *tablename, const void *inf, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) +static bool helper_mt_check(const struct xt_mtchk_param *par) { - struct xt_helper_info *info = matchinfo; + struct xt_helper_info *info = par->matchinfo; - if (nf_ct_l3proto_try_module_get(match->family) < 0) { + if (nf_ct_l3proto_try_module_get(par->match->family) < 0) { printk(KERN_WARNING "can't load conntrack support for " - "proto=%u\n", match->family); + "proto=%u\n", par->match->family); return false; } info->name[29] = '\0'; diff --git a/net/netfilter/xt_limit.c b/net/netfilter/xt_limit.c index c475eac5dbe..c908d69a559 100644 --- a/net/netfilter/xt_limit.c +++ b/net/netfilter/xt_limit.c @@ -92,12 +92,9 @@ user2credits(u_int32_t user) return (user * HZ * CREDITS_PER_JIFFY) / XT_LIMIT_SCALE; } -static bool -limit_mt_check(const char *tablename, const void *inf, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) +static bool limit_mt_check(const struct xt_mtchk_param *par) { - struct xt_rateinfo *r = matchinfo; + struct xt_rateinfo *r = par->matchinfo; /* Check for overflow. */ if (r->burst == 0 diff --git a/net/netfilter/xt_mark.c b/net/netfilter/xt_mark.c index 88547614653..10b9e34bbc5 100644 --- a/net/netfilter/xt_mark.c +++ b/net/netfilter/xt_mark.c @@ -38,12 +38,9 @@ mark_mt(const struct sk_buff *skb, const struct xt_match_param *par) return ((skb->mark & info->mask) == info->mark) ^ info->invert; } -static bool -mark_mt_check_v0(const char *tablename, const void *entry, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) +static bool mark_mt_check_v0(const struct xt_mtchk_param *par) { - const struct xt_mark_info *minfo = matchinfo; + const struct xt_mark_info *minfo = par->matchinfo; if (minfo->mark > 0xffffffff || minfo->mask > 0xffffffff) { printk(KERN_WARNING "mark: only supports 32bit mark\n"); diff --git a/net/netfilter/xt_multiport.c b/net/netfilter/xt_multiport.c index 7087e291528..d06bb2dd390 100644 --- a/net/netfilter/xt_multiport.c +++ b/net/netfilter/xt_multiport.c @@ -158,50 +158,37 @@ check(u_int16_t proto, && count <= XT_MULTI_PORTS; } -/* Called when user tries to insert an entry of this type. */ -static bool -multiport_mt_check_v0(const char *tablename, const void *info, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) +static bool multiport_mt_check_v0(const struct xt_mtchk_param *par) { - const struct ipt_ip *ip = info; - const struct xt_multiport *multiinfo = matchinfo; + const struct ipt_ip *ip = par->entryinfo; + const struct xt_multiport *multiinfo = par->matchinfo; return check(ip->proto, ip->invflags, multiinfo->flags, multiinfo->count); } -static bool -multiport_mt_check(const char *tablename, const void *info, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) +static bool multiport_mt_check(const struct xt_mtchk_param *par) { - const struct ipt_ip *ip = info; - const struct xt_multiport_v1 *multiinfo = matchinfo; + const struct ipt_ip *ip = par->entryinfo; + const struct xt_multiport_v1 *multiinfo = par->matchinfo; return check(ip->proto, ip->invflags, multiinfo->flags, multiinfo->count); } -static bool -multiport_mt6_check_v0(const char *tablename, const void *info, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) +static bool multiport_mt6_check_v0(const struct xt_mtchk_param *par) { - const struct ip6t_ip6 *ip = info; - const struct xt_multiport *multiinfo = matchinfo; + const struct ip6t_ip6 *ip = par->entryinfo; + const struct xt_multiport *multiinfo = par->matchinfo; return check(ip->proto, ip->invflags, multiinfo->flags, multiinfo->count); } -static bool -multiport_mt6_check(const char *tablename, const void *info, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) +static bool multiport_mt6_check(const struct xt_mtchk_param *par) { - const struct ip6t_ip6 *ip = info; - const struct xt_multiport_v1 *multiinfo = matchinfo; + const struct ip6t_ip6 *ip = par->entryinfo; + const struct xt_multiport_v1 *multiinfo = par->matchinfo; return check(ip->proto, ip->invflags, multiinfo->flags, multiinfo->count); diff --git a/net/netfilter/xt_owner.c b/net/netfilter/xt_owner.c index 493b5eb8d14..32f84e84d9e 100644 --- a/net/netfilter/xt_owner.c +++ b/net/netfilter/xt_owner.c @@ -107,12 +107,9 @@ owner_mt(const struct sk_buff *skb, const struct xt_match_param *par) return true; } -static bool -owner_mt_check_v0(const char *tablename, const void *ip, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) +static bool owner_mt_check_v0(const struct xt_mtchk_param *par) { - const struct ipt_owner_info *info = matchinfo; + const struct ipt_owner_info *info = par->matchinfo; if (info->match & (IPT_OWNER_PID | IPT_OWNER_SID | IPT_OWNER_COMM)) { printk(KERN_WARNING KBUILD_MODNAME @@ -124,12 +121,9 @@ owner_mt_check_v0(const char *tablename, const void *ip, return true; } -static bool -owner_mt6_check_v0(const char *tablename, const void *ip, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) +static bool owner_mt6_check_v0(const struct xt_mtchk_param *par) { - const struct ip6t_owner_info *info = matchinfo; + const struct ip6t_owner_info *info = par->matchinfo; if (info->match & (IP6T_OWNER_PID | IP6T_OWNER_SID)) { printk(KERN_WARNING KBUILD_MODNAME diff --git a/net/netfilter/xt_physdev.c b/net/netfilter/xt_physdev.c index e980e179d4f..b01786d2dd9 100644 --- a/net/netfilter/xt_physdev.c +++ b/net/netfilter/xt_physdev.c @@ -91,12 +91,9 @@ match_outdev: return ret ^ !(info->invert & XT_PHYSDEV_OP_OUT); } -static bool -physdev_mt_check(const char *tablename, const void *ip, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) +static bool physdev_mt_check(const struct xt_mtchk_param *par) { - const struct xt_physdev_info *info = matchinfo; + const struct xt_physdev_info *info = par->matchinfo; if (!(info->bitmask & XT_PHYSDEV_OP_MASK) || info->bitmask & ~XT_PHYSDEV_OP_MASK) @@ -104,12 +101,12 @@ physdev_mt_check(const char *tablename, const void *ip, if (info->bitmask & XT_PHYSDEV_OP_OUT && (!(info->bitmask & XT_PHYSDEV_OP_BRIDGED) || info->invert & XT_PHYSDEV_OP_BRIDGED) && - hook_mask & ((1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_FORWARD) | - (1 << NF_INET_POST_ROUTING))) { + par->hook_mask & ((1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_FORWARD) | (1 << NF_INET_POST_ROUTING))) { printk(KERN_WARNING "physdev match: using --physdev-out in the " "OUTPUT, FORWARD and POSTROUTING chains for non-bridged " "traffic is not supported anymore.\n"); - if (hook_mask & (1 << NF_INET_LOCAL_OUT)) + if (par->hook_mask & (1 << NF_INET_LOCAL_OUT)) return false; } return true; diff --git a/net/netfilter/xt_policy.c b/net/netfilter/xt_policy.c index b0a00fb0511..328bd20ddd2 100644 --- a/net/netfilter/xt_policy.c +++ b/net/netfilter/xt_policy.c @@ -128,26 +128,23 @@ policy_mt(const struct sk_buff *skb, const struct xt_match_param *par) return ret; } -static bool -policy_mt_check(const char *tablename, const void *ip_void, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) +static bool policy_mt_check(const struct xt_mtchk_param *par) { - const struct xt_policy_info *info = matchinfo; + const struct xt_policy_info *info = par->matchinfo; if (!(info->flags & (XT_POLICY_MATCH_IN|XT_POLICY_MATCH_OUT))) { printk(KERN_ERR "xt_policy: neither incoming nor " "outgoing policy selected\n"); return false; } - if (hook_mask & (1 << NF_INET_PRE_ROUTING | 1 << NF_INET_LOCAL_IN) - && info->flags & XT_POLICY_MATCH_OUT) { + if (par->hook_mask & ((1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_IN)) && info->flags & XT_POLICY_MATCH_OUT) { printk(KERN_ERR "xt_policy: output policy not valid in " "PRE_ROUTING and INPUT\n"); return false; } - if (hook_mask & (1 << NF_INET_POST_ROUTING | 1 << NF_INET_LOCAL_OUT) - && info->flags & XT_POLICY_MATCH_IN) { + if (par->hook_mask & ((1 << NF_INET_POST_ROUTING) | + (1 << NF_INET_LOCAL_OUT)) && info->flags & XT_POLICY_MATCH_IN) { printk(KERN_ERR "xt_policy: input policy not valid in " "POST_ROUTING and OUTPUT\n"); return false; diff --git a/net/netfilter/xt_quota.c b/net/netfilter/xt_quota.c index 3ab92666c14..c84fce5e0f3 100644 --- a/net/netfilter/xt_quota.c +++ b/net/netfilter/xt_quota.c @@ -37,12 +37,9 @@ quota_mt(const struct sk_buff *skb, const struct xt_match_param *par) return ret; } -static bool -quota_mt_check(const char *tablename, const void *entry, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) +static bool quota_mt_check(const struct xt_mtchk_param *par) { - struct xt_quota_info *q = matchinfo; + struct xt_quota_info *q = par->matchinfo; if (q->flags & ~XT_QUOTA_MASK) return false; diff --git a/net/netfilter/xt_rateest.c b/net/netfilter/xt_rateest.c index e9f64ef4565..4b05ce168a7 100644 --- a/net/netfilter/xt_rateest.c +++ b/net/netfilter/xt_rateest.c @@ -74,13 +74,9 @@ xt_rateest_mt(const struct sk_buff *skb, const struct xt_match_param *par) return ret; } -static bool xt_rateest_mt_checkentry(const char *tablename, - const void *ip, - const struct xt_match *match, - void *matchinfo, - unsigned int hook_mask) +static bool xt_rateest_mt_checkentry(const struct xt_mtchk_param *par) { - struct xt_rateest_match_info *info = matchinfo; + struct xt_rateest_match_info *info = par->matchinfo; struct xt_rateest *est1, *est2; if (hweight32(info->flags & (XT_RATEEST_MATCH_ABS | diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c index baeb90a5623..a512b49f3fe 100644 --- a/net/netfilter/xt_recent.c +++ b/net/netfilter/xt_recent.c @@ -280,12 +280,9 @@ out: return ret; } -static bool -recent_mt_check(const char *tablename, const void *ip, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) +static bool recent_mt_check(const struct xt_mtchk_param *par) { - const struct xt_recent_mtinfo *info = matchinfo; + const struct xt_recent_mtinfo *info = par->matchinfo; struct recent_table *t; unsigned i; bool ret = false; diff --git a/net/netfilter/xt_sctp.c b/net/netfilter/xt_sctp.c index b0014ab65da..e223cb43ae8 100644 --- a/net/netfilter/xt_sctp.c +++ b/net/netfilter/xt_sctp.c @@ -147,12 +147,9 @@ sctp_mt(const struct sk_buff *skb, const struct xt_match_param *par) XT_SCTP_CHUNK_TYPES, info->flags, info->invflags); } -static bool -sctp_mt_check(const char *tablename, const void *inf, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) +static bool sctp_mt_check(const struct xt_mtchk_param *par) { - const struct xt_sctp_info *info = matchinfo; + const struct xt_sctp_info *info = par->matchinfo; return !(info->flags & ~XT_SCTP_VALID_FLAGS) && !(info->invflags & ~XT_SCTP_VALID_FLAGS) diff --git a/net/netfilter/xt_state.c b/net/netfilter/xt_state.c index 29f5a8a1b02..88b1235519d 100644 --- a/net/netfilter/xt_state.c +++ b/net/netfilter/xt_state.c @@ -37,14 +37,11 @@ state_mt(const struct sk_buff *skb, const struct xt_match_param *par) return (sinfo->statemask & statebit); } -static bool -state_mt_check(const char *tablename, const void *inf, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) +static bool state_mt_check(const struct xt_mtchk_param *par) { - if (nf_ct_l3proto_try_module_get(match->family) < 0) { + if (nf_ct_l3proto_try_module_get(par->match->family) < 0) { printk(KERN_WARNING "can't load conntrack support for " - "proto=%u\n", match->family); + "proto=%u\n", par->match->family); return false; } return true; diff --git a/net/netfilter/xt_statistic.c b/net/netfilter/xt_statistic.c index dcadc491db2..0d75141139d 100644 --- a/net/netfilter/xt_statistic.c +++ b/net/netfilter/xt_statistic.c @@ -49,12 +49,9 @@ statistic_mt(const struct sk_buff *skb, const struct xt_match_param *par) return ret; } -static bool -statistic_mt_check(const char *tablename, const void *entry, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) +static bool statistic_mt_check(const struct xt_mtchk_param *par) { - struct xt_statistic_info *info = matchinfo; + struct xt_statistic_info *info = par->matchinfo; if (info->mode > XT_STATISTIC_MODE_MAX || info->flags & ~XT_STATISTIC_MASK) diff --git a/net/netfilter/xt_string.c b/net/netfilter/xt_string.c index 33f2d29ca4f..c9407aa78f7 100644 --- a/net/netfilter/xt_string.c +++ b/net/netfilter/xt_string.c @@ -40,12 +40,9 @@ string_mt(const struct sk_buff *skb, const struct xt_match_param *par) #define STRING_TEXT_PRIV(m) ((struct xt_string_info *)(m)) -static bool -string_mt_check(const char *tablename, const void *ip, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) +static bool string_mt_check(const struct xt_mtchk_param *par) { - struct xt_string_info *conf = matchinfo; + struct xt_string_info *conf = par->matchinfo; struct ts_config *ts_conf; int flags = TS_AUTOLOAD; @@ -56,7 +53,7 @@ string_mt_check(const char *tablename, const void *ip, return false; if (conf->patlen > XT_STRING_MAX_PATTERN_SIZE) return false; - if (match->revision == 1) { + if (par->match->revision == 1) { if (conf->u.v1.flags & ~(XT_STRING_FLAG_IGNORECASE | XT_STRING_FLAG_INVERT)) return false; diff --git a/net/netfilter/xt_tcpudp.c b/net/netfilter/xt_tcpudp.c index 66cf71b1d59..1ebdc4934ee 100644 --- a/net/netfilter/xt_tcpudp.c +++ b/net/netfilter/xt_tcpudp.c @@ -126,13 +126,9 @@ static bool tcp_mt(const struct sk_buff *skb, const struct xt_match_param *par) return true; } -/* Called when user tries to insert an entry of this type. */ -static bool -tcp_mt_check(const char *tablename, const void *info, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) +static bool tcp_mt_check(const struct xt_mtchk_param *par) { - const struct xt_tcp *tcpinfo = matchinfo; + const struct xt_tcp *tcpinfo = par->matchinfo; /* Must specify no unknown invflags */ return !(tcpinfo->invflags & ~XT_TCP_INV_MASK); @@ -165,13 +161,9 @@ static bool udp_mt(const struct sk_buff *skb, const struct xt_match_param *par) !!(udpinfo->invflags & XT_UDP_INV_DSTPT)); } -/* Called when user tries to insert an entry of this type. */ -static bool -udp_mt_check(const char *tablename, const void *info, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) +static bool udp_mt_check(const struct xt_mtchk_param *par) { - const struct xt_udp *udpinfo = matchinfo; + const struct xt_udp *udpinfo = par->matchinfo; /* Must specify no unknown invflags */ return !(udpinfo->invflags & ~XT_UDP_INV_MASK); diff --git a/net/netfilter/xt_time.c b/net/netfilter/xt_time.c index 28599d3979c..29375ba8db7 100644 --- a/net/netfilter/xt_time.c +++ b/net/netfilter/xt_time.c @@ -218,12 +218,9 @@ time_mt(const struct sk_buff *skb, const struct xt_match_param *par) return true; } -static bool -time_mt_check(const char *tablename, const void *ip, - const struct xt_match *match, void *matchinfo, - unsigned int hook_mask) +static bool time_mt_check(const struct xt_mtchk_param *par) { - const struct xt_time_info *info = matchinfo; + const struct xt_time_info *info = par->matchinfo; if (info->daytime_start > XT_TIME_MAX_DAYTIME || info->daytime_stop > XT_TIME_MAX_DAYTIME) { -- cgit v1.2.3 From 6be3d8598e883fb632edf059ba2f8d1b9f4da138 Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Wed, 8 Oct 2008 11:35:19 +0200 Subject: netfilter: xtables: move extension arguments into compound structure (3/6) This patch does this for match extensions' destroy functions. Signed-off-by: Jan Engelhardt Signed-off-by: Patrick McHardy --- net/netfilter/xt_connbytes.c | 4 ++-- net/netfilter/xt_connlimit.c | 7 +++---- net/netfilter/xt_connmark.c | 5 ++--- net/netfilter/xt_conntrack.c | 5 ++--- net/netfilter/xt_hashlimit.c | 9 ++++----- net/netfilter/xt_helper.c | 4 ++-- net/netfilter/xt_rateest.c | 5 ++--- net/netfilter/xt_recent.c | 4 ++-- net/netfilter/xt_state.c | 4 ++-- net/netfilter/xt_string.c | 4 ++-- 10 files changed, 23 insertions(+), 28 deletions(-) (limited to 'net/netfilter') diff --git a/net/netfilter/xt_connbytes.c b/net/netfilter/xt_connbytes.c index 43a36c728e5..5bf4aa08b0f 100644 --- a/net/netfilter/xt_connbytes.c +++ b/net/netfilter/xt_connbytes.c @@ -115,9 +115,9 @@ static bool connbytes_mt_check(const struct xt_mtchk_param *par) return true; } -static void connbytes_mt_destroy(const struct xt_match *match, void *matchinfo) +static void connbytes_mt_destroy(const struct xt_mtdtor_param *par) { - nf_ct_l3proto_module_put(match->family); + nf_ct_l3proto_module_put(par->match->family); } static struct xt_match connbytes_mt_reg[] __read_mostly = { diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c index 1361e9919cf..bfb3ee6c712 100644 --- a/net/netfilter/xt_connlimit.c +++ b/net/netfilter/xt_connlimit.c @@ -246,16 +246,15 @@ static bool connlimit_mt_check(const struct xt_mtchk_param *par) return true; } -static void -connlimit_mt_destroy(const struct xt_match *match, void *matchinfo) +static void connlimit_mt_destroy(const struct xt_mtdtor_param *par) { - const struct xt_connlimit_info *info = matchinfo; + const struct xt_connlimit_info *info = par->matchinfo; struct xt_connlimit_conn *conn; struct xt_connlimit_conn *tmp; struct list_head *hash = info->data->iphash; unsigned int i; - nf_ct_l3proto_module_put(match->family); + nf_ct_l3proto_module_put(par->match->family); for (i = 0; i < ARRAY_SIZE(info->data->iphash); ++i) { list_for_each_entry_safe(conn, tmp, &hash[i], list) { diff --git a/net/netfilter/xt_connmark.c b/net/netfilter/xt_connmark.c index b935b7888a9..c708577ea1b 100644 --- a/net/netfilter/xt_connmark.c +++ b/net/netfilter/xt_connmark.c @@ -87,10 +87,9 @@ static bool connmark_mt_check(const struct xt_mtchk_param *par) return true; } -static void -connmark_mt_destroy(const struct xt_match *match, void *matchinfo) +static void connmark_mt_destroy(const struct xt_mtdtor_param *par) { - nf_ct_l3proto_module_put(match->family); + nf_ct_l3proto_module_put(par->match->family); } #ifdef CONFIG_COMPAT diff --git a/net/netfilter/xt_conntrack.c b/net/netfilter/xt_conntrack.c index f04c46a02ce..5cd58d7fcb1 100644 --- a/net/netfilter/xt_conntrack.c +++ b/net/netfilter/xt_conntrack.c @@ -288,10 +288,9 @@ static bool conntrack_mt_check(const struct xt_mtchk_param *par) return true; } -static void -conntrack_mt_destroy(const struct xt_match *match, void *matchinfo) +static void conntrack_mt_destroy(const struct xt_mtdtor_param *par) { - nf_ct_l3proto_module_put(match->family); + nf_ct_l3proto_module_put(par->match->family); } #ifdef CONFIG_COMPAT diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index 2f73820e46d..6fc4292d46e 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -748,17 +748,16 @@ static bool hashlimit_mt_check(const struct xt_mtchk_param *par) } static void -hashlimit_mt_destroy_v0(const struct xt_match *match, void *matchinfo) +hashlimit_mt_destroy_v0(const struct xt_mtdtor_param *par) { - const struct xt_hashlimit_info *r = matchinfo; + const struct xt_hashlimit_info *r = par->matchinfo; htable_put(r->hinfo); } -static void -hashlimit_mt_destroy(const struct xt_match *match, void *matchinfo) +static void hashlimit_mt_destroy(const struct xt_mtdtor_param *par) { - const struct xt_hashlimit_mtinfo1 *info = matchinfo; + const struct xt_hashlimit_mtinfo1 *info = par->matchinfo; htable_put(info->hinfo); } diff --git a/net/netfilter/xt_helper.c b/net/netfilter/xt_helper.c index 86d3c332fcb..280c984349f 100644 --- a/net/netfilter/xt_helper.c +++ b/net/netfilter/xt_helper.c @@ -67,9 +67,9 @@ static bool helper_mt_check(const struct xt_mtchk_param *par) return true; } -static void helper_mt_destroy(const struct xt_match *match, void *matchinfo) +static void helper_mt_destroy(const struct xt_mtdtor_param *par) { - nf_ct_l3proto_module_put(match->family); + nf_ct_l3proto_module_put(par->match->family); } static struct xt_match helper_mt_reg[] __read_mostly = { diff --git a/net/netfilter/xt_rateest.c b/net/netfilter/xt_rateest.c index 4b05ce168a7..220a1d588ee 100644 --- a/net/netfilter/xt_rateest.c +++ b/net/netfilter/xt_rateest.c @@ -117,10 +117,9 @@ err1: return false; } -static void xt_rateest_mt_destroy(const struct xt_match *match, - void *matchinfo) +static void xt_rateest_mt_destroy(const struct xt_mtdtor_param *par) { - struct xt_rateest_match_info *info = matchinfo; + struct xt_rateest_match_info *info = par->matchinfo; xt_rateest_put(info->est1); if (info->est2) diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c index a512b49f3fe..4ebd4ca9a99 100644 --- a/net/netfilter/xt_recent.c +++ b/net/netfilter/xt_recent.c @@ -349,9 +349,9 @@ out: return ret; } -static void recent_mt_destroy(const struct xt_match *match, void *matchinfo) +static void recent_mt_destroy(const struct xt_mtdtor_param *par) { - const struct xt_recent_mtinfo *info = matchinfo; + const struct xt_recent_mtinfo *info = par->matchinfo; struct recent_table *t; mutex_lock(&recent_mutex); diff --git a/net/netfilter/xt_state.c b/net/netfilter/xt_state.c index 88b1235519d..4c946cbd731 100644 --- a/net/netfilter/xt_state.c +++ b/net/netfilter/xt_state.c @@ -47,9 +47,9 @@ static bool state_mt_check(const struct xt_mtchk_param *par) return true; } -static void state_mt_destroy(const struct xt_match *match, void *matchinfo) +static void state_mt_destroy(const struct xt_mtdtor_param *par) { - nf_ct_l3proto_module_put(match->family); + nf_ct_l3proto_module_put(par->match->family); } static struct xt_match state_mt_reg[] __read_mostly = { diff --git a/net/netfilter/xt_string.c b/net/netfilter/xt_string.c index c9407aa78f7..b4d77411131 100644 --- a/net/netfilter/xt_string.c +++ b/net/netfilter/xt_string.c @@ -70,9 +70,9 @@ static bool string_mt_check(const struct xt_mtchk_param *par) return true; } -static void string_mt_destroy(const struct xt_match *match, void *matchinfo) +static void string_mt_destroy(const struct xt_mtdtor_param *par) { - textsearch_destroy(STRING_TEXT_PRIV(matchinfo)->config); + textsearch_destroy(STRING_TEXT_PRIV(par->matchinfo)->config); } static struct xt_match xt_string_mt_reg[] __read_mostly = { -- cgit v1.2.3 From 7eb3558655aaa87a3e71a0c065dfaddda521fa6d Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Wed, 8 Oct 2008 11:35:19 +0200 Subject: netfilter: xtables: move extension arguments into compound structure (4/6) This patch does this for target extensions' target functions. Signed-off-by: Jan Engelhardt Signed-off-by: Patrick McHardy --- net/netfilter/xt_CLASSIFY.c | 6 ++---- net/netfilter/xt_CONNMARK.c | 12 ++++-------- net/netfilter/xt_CONNSECMARK.c | 6 ++---- net/netfilter/xt_DSCP.c | 30 ++++++++++-------------------- net/netfilter/xt_MARK.c | 18 ++++++------------ net/netfilter/xt_NFLOG.c | 10 ++++------ net/netfilter/xt_NFQUEUE.c | 6 ++---- net/netfilter/xt_NOTRACK.c | 4 +--- net/netfilter/xt_RATEEST.c | 9 ++------- net/netfilter/xt_SECMARK.c | 6 ++---- net/netfilter/xt_TCPMSS.c | 12 ++++-------- net/netfilter/xt_TCPOPTSTRIP.c | 12 ++++-------- net/netfilter/xt_TPROXY.c | 11 +++-------- net/netfilter/xt_TRACE.c | 4 +--- 14 files changed, 47 insertions(+), 99 deletions(-) (limited to 'net/netfilter') diff --git a/net/netfilter/xt_CLASSIFY.c b/net/netfilter/xt_CLASSIFY.c index 8cffa295dd3..011bc80dd2a 100644 --- a/net/netfilter/xt_CLASSIFY.c +++ b/net/netfilter/xt_CLASSIFY.c @@ -27,11 +27,9 @@ MODULE_ALIAS("ipt_CLASSIFY"); MODULE_ALIAS("ip6t_CLASSIFY"); static unsigned int -classify_tg(struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, unsigned int hooknum, - const struct xt_target *target, const void *targinfo) +classify_tg(struct sk_buff *skb, const struct xt_target_param *par) { - const struct xt_classify_target_info *clinfo = targinfo; + const struct xt_classify_target_info *clinfo = par->targinfo; skb->priority = clinfo->priority; return XT_CONTINUE; diff --git a/net/netfilter/xt_CONNMARK.c b/net/netfilter/xt_CONNMARK.c index e1415c3f5c9..95ed267328a 100644 --- a/net/netfilter/xt_CONNMARK.c +++ b/net/netfilter/xt_CONNMARK.c @@ -36,11 +36,9 @@ MODULE_ALIAS("ip6t_CONNMARK"); #include static unsigned int -connmark_tg_v0(struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, unsigned int hooknum, - const struct xt_target *target, const void *targinfo) +connmark_tg_v0(struct sk_buff *skb, const struct xt_target_param *par) { - const struct xt_connmark_target_info *markinfo = targinfo; + const struct xt_connmark_target_info *markinfo = par->targinfo; struct nf_conn *ct; enum ip_conntrack_info ctinfo; u_int32_t diff; @@ -77,11 +75,9 @@ connmark_tg_v0(struct sk_buff *skb, const struct net_device *in, } static unsigned int -connmark_tg(struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, unsigned int hooknum, - const struct xt_target *target, const void *targinfo) +connmark_tg(struct sk_buff *skb, const struct xt_target_param *par) { - const struct xt_connmark_tginfo1 *info = targinfo; + const struct xt_connmark_tginfo1 *info = par->targinfo; enum ip_conntrack_info ctinfo; struct nf_conn *ct; u_int32_t newmark; diff --git a/net/netfilter/xt_CONNSECMARK.c b/net/netfilter/xt_CONNSECMARK.c index 5f221c3bd35..2211a2cef28 100644 --- a/net/netfilter/xt_CONNSECMARK.c +++ b/net/netfilter/xt_CONNSECMARK.c @@ -65,11 +65,9 @@ static void secmark_restore(struct sk_buff *skb) } static unsigned int -connsecmark_tg(struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, unsigned int hooknum, - const struct xt_target *target, const void *targinfo) +connsecmark_tg(struct sk_buff *skb, const struct xt_target_param *par) { - const struct xt_connsecmark_target_info *info = targinfo; + const struct xt_connsecmark_target_info *info = par->targinfo; switch (info->mode) { case CONNSECMARK_SAVE: diff --git a/net/netfilter/xt_DSCP.c b/net/netfilter/xt_DSCP.c index f0b4958528e..c78e80afdf3 100644 --- a/net/netfilter/xt_DSCP.c +++ b/net/netfilter/xt_DSCP.c @@ -29,11 +29,9 @@ MODULE_ALIAS("ipt_TOS"); MODULE_ALIAS("ip6t_TOS"); static unsigned int -dscp_tg(struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, unsigned int hooknum, - const struct xt_target *target, const void *targinfo) +dscp_tg(struct sk_buff *skb, const struct xt_target_param *par) { - const struct xt_DSCP_info *dinfo = targinfo; + const struct xt_DSCP_info *dinfo = par->targinfo; u_int8_t dscp = ipv4_get_dsfield(ip_hdr(skb)) >> XT_DSCP_SHIFT; if (dscp != dinfo->dscp) { @@ -48,11 +46,9 @@ dscp_tg(struct sk_buff *skb, const struct net_device *in, } static unsigned int -dscp_tg6(struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, unsigned int hooknum, - const struct xt_target *target, const void *targinfo) +dscp_tg6(struct sk_buff *skb, const struct xt_target_param *par) { - const struct xt_DSCP_info *dinfo = targinfo; + const struct xt_DSCP_info *dinfo = par->targinfo; u_int8_t dscp = ipv6_get_dsfield(ipv6_hdr(skb)) >> XT_DSCP_SHIFT; if (dscp != dinfo->dscp) { @@ -80,11 +76,9 @@ dscp_tg_check(const char *tablename, const void *e_void, } static unsigned int -tos_tg_v0(struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, unsigned int hooknum, - const struct xt_target *target, const void *targinfo) +tos_tg_v0(struct sk_buff *skb, const struct xt_target_param *par) { - const struct ipt_tos_target_info *info = targinfo; + const struct ipt_tos_target_info *info = par->targinfo; struct iphdr *iph = ip_hdr(skb); u_int8_t oldtos; @@ -119,11 +113,9 @@ tos_tg_check_v0(const char *tablename, const void *e_void, } static unsigned int -tos_tg(struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, unsigned int hooknum, - const struct xt_target *target, const void *targinfo) +tos_tg(struct sk_buff *skb, const struct xt_target_param *par) { - const struct xt_tos_target_info *info = targinfo; + const struct xt_tos_target_info *info = par->targinfo; struct iphdr *iph = ip_hdr(skb); u_int8_t orig, nv; @@ -141,11 +133,9 @@ tos_tg(struct sk_buff *skb, const struct net_device *in, } static unsigned int -tos_tg6(struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, unsigned int hooknum, - const struct xt_target *target, const void *targinfo) +tos_tg6(struct sk_buff *skb, const struct xt_target_param *par) { - const struct xt_tos_target_info *info = targinfo; + const struct xt_tos_target_info *info = par->targinfo; struct ipv6hdr *iph = ipv6_hdr(skb); u_int8_t orig, nv; diff --git a/net/netfilter/xt_MARK.c b/net/netfilter/xt_MARK.c index c8ea7a80970..27d03f39611 100644 --- a/net/netfilter/xt_MARK.c +++ b/net/netfilter/xt_MARK.c @@ -25,22 +25,18 @@ MODULE_ALIAS("ipt_MARK"); MODULE_ALIAS("ip6t_MARK"); static unsigned int -mark_tg_v0(struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, unsigned int hooknum, - const struct xt_target *target, const void *targinfo) +mark_tg_v0(struct sk_buff *skb, const struct xt_target_param *par) { - const struct xt_mark_target_info *markinfo = targinfo; + const struct xt_mark_target_info *markinfo = par->targinfo; skb->mark = markinfo->mark; return XT_CONTINUE; } static unsigned int -mark_tg_v1(struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, unsigned int hooknum, - const struct xt_target *target, const void *targinfo) +mark_tg_v1(struct sk_buff *skb, const struct xt_target_param *par) { - const struct xt_mark_target_info_v1 *markinfo = targinfo; + const struct xt_mark_target_info_v1 *markinfo = par->targinfo; int mark = 0; switch (markinfo->mode) { @@ -62,11 +58,9 @@ mark_tg_v1(struct sk_buff *skb, const struct net_device *in, } static unsigned int -mark_tg(struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, unsigned int hooknum, - const struct xt_target *target, const void *targinfo) +mark_tg(struct sk_buff *skb, const struct xt_target_param *par) { - const struct xt_mark_tginfo2 *info = targinfo; + const struct xt_mark_tginfo2 *info = par->targinfo; skb->mark = (skb->mark & ~info->mask) ^ info->mark; return XT_CONTINUE; diff --git a/net/netfilter/xt_NFLOG.c b/net/netfilter/xt_NFLOG.c index 9b095520176..3218ad63bd1 100644 --- a/net/netfilter/xt_NFLOG.c +++ b/net/netfilter/xt_NFLOG.c @@ -21,11 +21,9 @@ MODULE_ALIAS("ipt_NFLOG"); MODULE_ALIAS("ip6t_NFLOG"); static unsigned int -nflog_tg(struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, unsigned int hooknum, - const struct xt_target *target, const void *targinfo) +nflog_tg(struct sk_buff *skb, const struct xt_target_param *par) { - const struct xt_nflog_info *info = targinfo; + const struct xt_nflog_info *info = par->targinfo; struct nf_loginfo li; li.type = NF_LOG_TYPE_ULOG; @@ -33,8 +31,8 @@ nflog_tg(struct sk_buff *skb, const struct net_device *in, li.u.ulog.group = info->group; li.u.ulog.qthreshold = info->threshold; - nf_log_packet(target->family, hooknum, skb, in, out, &li, - "%s", info->prefix); + nf_log_packet(par->target->family, par->hooknum, skb, par->in, + par->out, &li, "%s", info->prefix); return XT_CONTINUE; } diff --git a/net/netfilter/xt_NFQUEUE.c b/net/netfilter/xt_NFQUEUE.c index c03c2e8d06f..2cc1fff4930 100644 --- a/net/netfilter/xt_NFQUEUE.c +++ b/net/netfilter/xt_NFQUEUE.c @@ -24,11 +24,9 @@ MODULE_ALIAS("ip6t_NFQUEUE"); MODULE_ALIAS("arpt_NFQUEUE"); static unsigned int -nfqueue_tg(struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, unsigned int hooknum, - const struct xt_target *target, const void *targinfo) +nfqueue_tg(struct sk_buff *skb, const struct xt_target_param *par) { - const struct xt_NFQ_info *tinfo = targinfo; + const struct xt_NFQ_info *tinfo = par->targinfo; return NF_QUEUE_NR(tinfo->queuenum); } diff --git a/net/netfilter/xt_NOTRACK.c b/net/netfilter/xt_NOTRACK.c index b9ee268b37c..cc50295cd11 100644 --- a/net/netfilter/xt_NOTRACK.c +++ b/net/netfilter/xt_NOTRACK.c @@ -13,9 +13,7 @@ MODULE_ALIAS("ipt_NOTRACK"); MODULE_ALIAS("ip6t_NOTRACK"); static unsigned int -notrack_tg(struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, unsigned int hooknum, - const struct xt_target *target, const void *targinfo) +notrack_tg(struct sk_buff *skb, const struct xt_target_param *par) { /* Previously seen (loopback)? Ignore. */ if (skb->nfct != NULL) diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c index da7946e6ecb..92e33524f78 100644 --- a/net/netfilter/xt_RATEEST.c +++ b/net/netfilter/xt_RATEEST.c @@ -71,14 +71,9 @@ void xt_rateest_put(struct xt_rateest *est) EXPORT_SYMBOL_GPL(xt_rateest_put); static unsigned int -xt_rateest_tg(struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - unsigned int hooknum, - const struct xt_target *target, - const void *targinfo) +xt_rateest_tg(struct sk_buff *skb, const struct xt_target_param *par) { - const struct xt_rateest_target_info *info = targinfo; + const struct xt_rateest_target_info *info = par->targinfo; struct gnet_stats_basic *stats = &info->est->bstats; spin_lock_bh(&info->est->lock); diff --git a/net/netfilter/xt_SECMARK.c b/net/netfilter/xt_SECMARK.c index 2a2ab833481..ad05214e380 100644 --- a/net/netfilter/xt_SECMARK.c +++ b/net/netfilter/xt_SECMARK.c @@ -29,12 +29,10 @@ MODULE_ALIAS("ip6t_SECMARK"); static u8 mode; static unsigned int -secmark_tg(struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, unsigned int hooknum, - const struct xt_target *target, const void *targinfo) +secmark_tg(struct sk_buff *skb, const struct xt_target_param *par) { u32 secmark = 0; - const struct xt_secmark_target_info *info = targinfo; + const struct xt_secmark_target_info *info = par->targinfo; BUG_ON(info->mode != mode); diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c index b868f995239..e08762d9b0f 100644 --- a/net/netfilter/xt_TCPMSS.c +++ b/net/netfilter/xt_TCPMSS.c @@ -174,15 +174,13 @@ static u_int32_t tcpmss_reverse_mtu(const struct sk_buff *skb, } static unsigned int -tcpmss_tg4(struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, unsigned int hooknum, - const struct xt_target *target, const void *targinfo) +tcpmss_tg4(struct sk_buff *skb, const struct xt_target_param *par) { struct iphdr *iph = ip_hdr(skb); __be16 newlen; int ret; - ret = tcpmss_mangle_packet(skb, targinfo, + ret = tcpmss_mangle_packet(skb, par->targinfo, tcpmss_reverse_mtu(skb, PF_INET), iph->ihl * 4, sizeof(*iph) + sizeof(struct tcphdr)); @@ -199,9 +197,7 @@ tcpmss_tg4(struct sk_buff *skb, const struct net_device *in, #if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) static unsigned int -tcpmss_tg6(struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, unsigned int hooknum, - const struct xt_target *target, const void *targinfo) +tcpmss_tg6(struct sk_buff *skb, const struct xt_target_param *par) { struct ipv6hdr *ipv6h = ipv6_hdr(skb); u8 nexthdr; @@ -212,7 +208,7 @@ tcpmss_tg6(struct sk_buff *skb, const struct net_device *in, tcphoff = ipv6_skip_exthdr(skb, sizeof(*ipv6h), &nexthdr); if (tcphoff < 0) return NF_DROP; - ret = tcpmss_mangle_packet(skb, targinfo, + ret = tcpmss_mangle_packet(skb, par->targinfo, tcpmss_reverse_mtu(skb, PF_INET6), tcphoff, sizeof(*ipv6h) + sizeof(struct tcphdr)); diff --git a/net/netfilter/xt_TCPOPTSTRIP.c b/net/netfilter/xt_TCPOPTSTRIP.c index 2e0ae6cc5d9..9dd8c8ef63e 100644 --- a/net/netfilter/xt_TCPOPTSTRIP.c +++ b/net/netfilter/xt_TCPOPTSTRIP.c @@ -75,19 +75,15 @@ tcpoptstrip_mangle_packet(struct sk_buff *skb, } static unsigned int -tcpoptstrip_tg4(struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, unsigned int hooknum, - const struct xt_target *target, const void *targinfo) +tcpoptstrip_tg4(struct sk_buff *skb, const struct xt_target_param *par) { - return tcpoptstrip_mangle_packet(skb, targinfo, ip_hdrlen(skb), + return tcpoptstrip_mangle_packet(skb, par->targinfo, ip_hdrlen(skb), sizeof(struct iphdr) + sizeof(struct tcphdr)); } #if defined(CONFIG_IP6_NF_MANGLE) || defined(CONFIG_IP6_NF_MANGLE_MODULE) static unsigned int -tcpoptstrip_tg6(struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, unsigned int hooknum, - const struct xt_target *target, const void *targinfo) +tcpoptstrip_tg6(struct sk_buff *skb, const struct xt_target_param *par) { struct ipv6hdr *ipv6h = ipv6_hdr(skb); int tcphoff; @@ -98,7 +94,7 @@ tcpoptstrip_tg6(struct sk_buff *skb, const struct net_device *in, if (tcphoff < 0) return NF_DROP; - return tcpoptstrip_mangle_packet(skb, targinfo, tcphoff, + return tcpoptstrip_mangle_packet(skb, par->targinfo, tcphoff, sizeof(*ipv6h) + sizeof(struct tcphdr)); } #endif diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c index 183f251d2f0..f08c49ea4bd 100644 --- a/net/netfilter/xt_TPROXY.c +++ b/net/netfilter/xt_TPROXY.c @@ -25,15 +25,10 @@ #include static unsigned int -tproxy_tg(struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - unsigned int hooknum, - const struct xt_target *target, - const void *targinfo) +tproxy_tg(struct sk_buff *skb, const struct xt_target_param *par) { const struct iphdr *iph = ip_hdr(skb); - const struct xt_tproxy_target_info *tgi = targinfo; + const struct xt_tproxy_target_info *tgi = par->targinfo; struct udphdr _hdr, *hp; struct sock *sk; @@ -44,7 +39,7 @@ tproxy_tg(struct sk_buff *skb, sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), iph->protocol, iph->saddr, tgi->laddr ? tgi->laddr : iph->daddr, hp->source, tgi->lport ? tgi->lport : hp->dest, - in, true); + par->in, true); /* NOTE: assign_sock consumes our sk reference */ if (sk && nf_tproxy_assign_sock(skb, sk)) { diff --git a/net/netfilter/xt_TRACE.c b/net/netfilter/xt_TRACE.c index da35f9f1cd7..fbb04b86c46 100644 --- a/net/netfilter/xt_TRACE.c +++ b/net/netfilter/xt_TRACE.c @@ -11,9 +11,7 @@ MODULE_ALIAS("ipt_TRACE"); MODULE_ALIAS("ip6t_TRACE"); static unsigned int -trace_tg(struct sk_buff *skb, const struct net_device *in, - const struct net_device *out, unsigned int hooknum, - const struct xt_target *target, const void *targinfo) +trace_tg(struct sk_buff *skb, const struct xt_target_param *par) { skb->nf_trace = 1; return XT_CONTINUE; -- cgit v1.2.3 From af5d6dc200eb0fcc6fbd3df1ab4d8969004cb37f Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Wed, 8 Oct 2008 11:35:19 +0200 Subject: netfilter: xtables: move extension arguments into compound structure (5/6) This patch does this for target extensions' checkentry functions. Signed-off-by: Jan Engelhardt Signed-off-by: Patrick McHardy --- net/netfilter/x_tables.c | 32 ++++++++++++++++---------------- net/netfilter/xt_CONNMARK.c | 24 +++++++++--------------- net/netfilter/xt_CONNSECMARK.c | 16 +++++++--------- net/netfilter/xt_DSCP.c | 19 +++++++------------ net/netfilter/xt_MARK.c | 14 ++++---------- net/netfilter/xt_NFLOG.c | 7 ++----- net/netfilter/xt_RATEEST.c | 9 ++------- net/netfilter/xt_SECMARK.c | 12 +++++------- net/netfilter/xt_TCPMSS.c | 22 ++++++++-------------- net/netfilter/xt_TPROXY.c | 9 ++------- 10 files changed, 62 insertions(+), 102 deletions(-) (limited to 'net/netfilter') diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 817ab14f7cd..f29513cd139 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -471,35 +471,35 @@ int xt_compat_match_to_user(struct xt_entry_match *m, void __user **dstptr, EXPORT_SYMBOL_GPL(xt_compat_match_to_user); #endif /* CONFIG_COMPAT */ -int xt_check_target(const struct xt_target *target, unsigned short family, - unsigned int size, const char *table, unsigned int hook_mask, - unsigned short proto, int inv_proto, const void *entry, - void *targinfo) +int xt_check_target(struct xt_tgchk_param *par, u_int8_t family, + unsigned int size, u_int8_t proto, bool inv_proto) { - if (XT_ALIGN(target->targetsize) != size) { + if (XT_ALIGN(par->target->targetsize) != size) { printk("%s_tables: %s target: invalid size %Zu != %u\n", - xt_prefix[family], target->name, - XT_ALIGN(target->targetsize), size); + xt_prefix[family], par->target->name, + XT_ALIGN(par->target->targetsize), size); return -EINVAL; } - if (target->table && strcmp(target->table, table)) { + if (par->target->table != NULL && + strcmp(par->target->table, par->table) != 0) { printk("%s_tables: %s target: only valid in %s table, not %s\n", - xt_prefix[family], target->name, target->table, table); + xt_prefix[family], par->target->name, + par->target->table, par->table); return -EINVAL; } - if (target->hooks && (hook_mask & ~target->hooks) != 0) { + if (par->target->hooks && (par->hook_mask & ~par->target->hooks) != 0) { printk("%s_tables: %s target: bad hook_mask %#x/%#x\n", - xt_prefix[family], target->name, hook_mask, - target->hooks); + xt_prefix[family], par->target->name, par->hook_mask, + par->target->hooks); return -EINVAL; } - if (target->proto && (target->proto != proto || inv_proto)) { + if (par->target->proto && (par->target->proto != proto || inv_proto)) { printk("%s_tables: %s target: only valid for protocol %u\n", - xt_prefix[family], target->name, target->proto); + xt_prefix[family], par->target->name, + par->target->proto); return -EINVAL; } - if (target->checkentry != NULL && - !target->checkentry(table, entry, target, targinfo, hook_mask)) + if (par->target->checkentry != NULL && !par->target->checkentry(par)) return -EINVAL; return 0; } diff --git a/net/netfilter/xt_CONNMARK.c b/net/netfilter/xt_CONNMARK.c index 95ed267328a..8fc9f35e67d 100644 --- a/net/netfilter/xt_CONNMARK.c +++ b/net/netfilter/xt_CONNMARK.c @@ -112,18 +112,15 @@ connmark_tg(struct sk_buff *skb, const struct xt_target_param *par) return XT_CONTINUE; } -static bool -connmark_tg_check_v0(const char *tablename, const void *entry, - const struct xt_target *target, void *targinfo, - unsigned int hook_mask) +static bool connmark_tg_check_v0(const struct xt_tgchk_param *par) { - const struct xt_connmark_target_info *matchinfo = targinfo; + const struct xt_connmark_target_info *matchinfo = par->targinfo; if (matchinfo->mode == XT_CONNMARK_RESTORE) { - if (strcmp(tablename, "mangle") != 0) { + if (strcmp(par->table, "mangle") != 0) { printk(KERN_WARNING "CONNMARK: restore can only be " "called from \"mangle\" table, not \"%s\"\n", - tablename); + par->table); return false; } } @@ -131,22 +128,19 @@ connmark_tg_check_v0(const char *tablename, const void *entry, printk(KERN_WARNING "CONNMARK: Only supports 32bit mark\n"); return false; } - if (nf_ct_l3proto_try_module_get(target->family) < 0) { + if (nf_ct_l3proto_try_module_get(par->target->family) < 0) { printk(KERN_WARNING "can't load conntrack support for " - "proto=%u\n", target->family); + "proto=%u\n", par->target->family); return false; } return true; } -static bool -connmark_tg_check(const char *tablename, const void *entry, - const struct xt_target *target, void *targinfo, - unsigned int hook_mask) +static bool connmark_tg_check(const struct xt_tgchk_param *par) { - if (nf_ct_l3proto_try_module_get(target->family) < 0) { + if (nf_ct_l3proto_try_module_get(par->target->family) < 0) { printk(KERN_WARNING "cannot load conntrack support for " - "proto=%u\n", target->family); + "proto=%u\n", par->target->family); return false; } return true; diff --git a/net/netfilter/xt_CONNSECMARK.c b/net/netfilter/xt_CONNSECMARK.c index 2211a2cef28..2041a3d4b4d 100644 --- a/net/netfilter/xt_CONNSECMARK.c +++ b/net/netfilter/xt_CONNSECMARK.c @@ -85,16 +85,14 @@ connsecmark_tg(struct sk_buff *skb, const struct xt_target_param *par) return XT_CONTINUE; } -static bool -connsecmark_tg_check(const char *tablename, const void *entry, - const struct xt_target *target, void *targinfo, - unsigned int hook_mask) +static bool connsecmark_tg_check(const struct xt_tgchk_param *par) { - const struct xt_connsecmark_target_info *info = targinfo; + const struct xt_connsecmark_target_info *info = par->targinfo; - if (strcmp(tablename, "mangle") && strcmp(tablename, "security")) { + if (strcmp(par->table, "mangle") != 0 && + strcmp(par->table, "security") != 0) { printk(KERN_INFO PFX "target only valid in the \'mangle\' " - "or \'security\' tables, not \'%s\'.\n", tablename); + "or \'security\' tables, not \'%s\'.\n", par->table); return false; } @@ -108,9 +106,9 @@ connsecmark_tg_check(const char *tablename, const void *entry, return false; } - if (nf_ct_l3proto_try_module_get(target->family) < 0) { + if (nf_ct_l3proto_try_module_get(par->target->family) < 0) { printk(KERN_WARNING "can't load conntrack support for " - "proto=%u\n", target->family); + "proto=%u\n", par->target->family); return false; } return true; diff --git a/net/netfilter/xt_DSCP.c b/net/netfilter/xt_DSCP.c index c78e80afdf3..6a347e768f8 100644 --- a/net/netfilter/xt_DSCP.c +++ b/net/netfilter/xt_DSCP.c @@ -61,15 +61,12 @@ dscp_tg6(struct sk_buff *skb, const struct xt_target_param *par) return XT_CONTINUE; } -static bool -dscp_tg_check(const char *tablename, const void *e_void, - const struct xt_target *target, void *targinfo, - unsigned int hook_mask) +static bool dscp_tg_check(const struct xt_tgchk_param *par) { - const u_int8_t dscp = ((struct xt_DSCP_info *)targinfo)->dscp; + const struct xt_DSCP_info *info = par->targinfo; - if (dscp > XT_DSCP_MAX) { - printk(KERN_WARNING "DSCP: dscp %x out of range\n", dscp); + if (info->dscp > XT_DSCP_MAX) { + printk(KERN_WARNING "DSCP: dscp %x out of range\n", info->dscp); return false; } return true; @@ -95,12 +92,10 @@ tos_tg_v0(struct sk_buff *skb, const struct xt_target_param *par) return XT_CONTINUE; } -static bool -tos_tg_check_v0(const char *tablename, const void *e_void, - const struct xt_target *target, void *targinfo, - unsigned int hook_mask) +static bool tos_tg_check_v0(const struct xt_tgchk_param *par) { - const u_int8_t tos = ((struct ipt_tos_target_info *)targinfo)->tos; + const struct ipt_tos_target_info *info = par->targinfo; + const uint8_t tos = info->tos; if (tos != IPTOS_LOWDELAY && tos != IPTOS_THROUGHPUT && tos != IPTOS_RELIABILITY && tos != IPTOS_MINCOST && diff --git a/net/netfilter/xt_MARK.c b/net/netfilter/xt_MARK.c index 27d03f39611..123ee0ba78c 100644 --- a/net/netfilter/xt_MARK.c +++ b/net/netfilter/xt_MARK.c @@ -66,12 +66,9 @@ mark_tg(struct sk_buff *skb, const struct xt_target_param *par) return XT_CONTINUE; } -static bool -mark_tg_check_v0(const char *tablename, const void *entry, - const struct xt_target *target, void *targinfo, - unsigned int hook_mask) +static bool mark_tg_check_v0(const struct xt_tgchk_param *par) { - const struct xt_mark_target_info *markinfo = targinfo; + const struct xt_mark_target_info *markinfo = par->targinfo; if (markinfo->mark > 0xffffffff) { printk(KERN_WARNING "MARK: Only supports 32bit wide mark\n"); @@ -80,12 +77,9 @@ mark_tg_check_v0(const char *tablename, const void *entry, return true; } -static bool -mark_tg_check_v1(const char *tablename, const void *entry, - const struct xt_target *target, void *targinfo, - unsigned int hook_mask) +static bool mark_tg_check_v1(const struct xt_tgchk_param *par) { - const struct xt_mark_target_info_v1 *markinfo = targinfo; + const struct xt_mark_target_info_v1 *markinfo = par->targinfo; if (markinfo->mode != XT_MARK_SET && markinfo->mode != XT_MARK_AND diff --git a/net/netfilter/xt_NFLOG.c b/net/netfilter/xt_NFLOG.c index 3218ad63bd1..56ee4f118b5 100644 --- a/net/netfilter/xt_NFLOG.c +++ b/net/netfilter/xt_NFLOG.c @@ -36,12 +36,9 @@ nflog_tg(struct sk_buff *skb, const struct xt_target_param *par) return XT_CONTINUE; } -static bool -nflog_tg_check(const char *tablename, const void *entry, - const struct xt_target *target, void *targetinfo, - unsigned int hookmask) +static bool nflog_tg_check(const struct xt_tgchk_param *par) { - const struct xt_nflog_info *info = targetinfo; + const struct xt_nflog_info *info = par->targinfo; if (info->flags & ~XT_NFLOG_MASK) return false; diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c index 92e33524f78..edf4ab1f30f 100644 --- a/net/netfilter/xt_RATEEST.c +++ b/net/netfilter/xt_RATEEST.c @@ -84,14 +84,9 @@ xt_rateest_tg(struct sk_buff *skb, const struct xt_target_param *par) return XT_CONTINUE; } -static bool -xt_rateest_tg_checkentry(const char *tablename, - const void *entry, - const struct xt_target *target, - void *targinfo, - unsigned int hook_mask) +static bool xt_rateest_tg_checkentry(const struct xt_tgchk_param *par) { - struct xt_rateest_target_info *info = targinfo; + struct xt_rateest_target_info *info = par->targinfo; struct xt_rateest *est; struct { struct nlattr opt; diff --git a/net/netfilter/xt_SECMARK.c b/net/netfilter/xt_SECMARK.c index ad05214e380..e5777227192 100644 --- a/net/netfilter/xt_SECMARK.c +++ b/net/netfilter/xt_SECMARK.c @@ -80,16 +80,14 @@ static bool checkentry_selinux(struct xt_secmark_target_info *info) return true; } -static bool -secmark_tg_check(const char *tablename, const void *entry, - const struct xt_target *target, void *targinfo, - unsigned int hook_mask) +static bool secmark_tg_check(const struct xt_tgchk_param *par) { - struct xt_secmark_target_info *info = targinfo; + struct xt_secmark_target_info *info = par->targinfo; - if (strcmp(tablename, "mangle") && strcmp(tablename, "security")) { + if (strcmp(par->table, "mangle") != 0 && + strcmp(par->table, "security") != 0) { printk(KERN_INFO PFX "target only valid in the \'mangle\' " - "or \'security\' tables, not \'%s\'.\n", tablename); + "or \'security\' tables, not \'%s\'.\n", par->table); return false; } diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c index e08762d9b0f..4f3b1f80879 100644 --- a/net/netfilter/xt_TCPMSS.c +++ b/net/netfilter/xt_TCPMSS.c @@ -237,16 +237,13 @@ static inline bool find_syn_match(const struct xt_entry_match *m) return false; } -static bool -tcpmss_tg4_check(const char *tablename, const void *entry, - const struct xt_target *target, void *targinfo, - unsigned int hook_mask) +static bool tcpmss_tg4_check(const struct xt_tgchk_param *par) { - const struct xt_tcpmss_info *info = targinfo; - const struct ipt_entry *e = entry; + const struct xt_tcpmss_info *info = par->targinfo; + const struct ipt_entry *e = par->entryinfo; if (info->mss == XT_TCPMSS_CLAMP_PMTU && - (hook_mask & ~((1 << NF_INET_FORWARD) | + (par->hook_mask & ~((1 << NF_INET_FORWARD) | (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_POST_ROUTING))) != 0) { printk("xt_TCPMSS: path-MTU clamping only supported in " @@ -260,16 +257,13 @@ tcpmss_tg4_check(const char *tablename, const void *entry, } #if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) -static bool -tcpmss_tg6_check(const char *tablename, const void *entry, - const struct xt_target *target, void *targinfo, - unsigned int hook_mask) +static bool tcpmss_tg6_check(const struct xt_tgchk_param *par) { - const struct xt_tcpmss_info *info = targinfo; - const struct ip6t_entry *e = entry; + const struct xt_tcpmss_info *info = par->targinfo; + const struct ip6t_entry *e = par->entryinfo; if (info->mss == XT_TCPMSS_CLAMP_PMTU && - (hook_mask & ~((1 << NF_INET_FORWARD) | + (par->hook_mask & ~((1 << NF_INET_FORWARD) | (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_POST_ROUTING))) != 0) { printk("xt_TCPMSS: path-MTU clamping only supported in " diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c index f08c49ea4bd..1340c2fa362 100644 --- a/net/netfilter/xt_TPROXY.c +++ b/net/netfilter/xt_TPROXY.c @@ -59,14 +59,9 @@ tproxy_tg(struct sk_buff *skb, const struct xt_target_param *par) return NF_DROP; } -static bool -tproxy_tg_check(const char *tablename, - const void *entry, - const struct xt_target *target, - void *targetinfo, - unsigned int hook_mask) +static bool tproxy_tg_check(const struct xt_tgchk_param *par) { - const struct ipt_ip *i = entry; + const struct ipt_ip *i = par->entryinfo; if ((i->proto == IPPROTO_TCP || i->proto == IPPROTO_UDP) && !(i->invflags & IPT_INV_PROTO)) -- cgit v1.2.3 From a2df1648ba615dd5908e9a1fa7b2f133fa302487 Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Wed, 8 Oct 2008 11:35:19 +0200 Subject: netfilter: xtables: move extension arguments into compound structure (6/6) This patch does this for target extensions' destroy functions. Signed-off-by: Jan Engelhardt Signed-off-by: Patrick McHardy --- net/netfilter/xt_CONNMARK.c | 5 ++--- net/netfilter/xt_CONNSECMARK.c | 5 ++--- net/netfilter/xt_RATEEST.c | 5 ++--- net/netfilter/xt_SECMARK.c | 2 +- 4 files changed, 7 insertions(+), 10 deletions(-) (limited to 'net/netfilter') diff --git a/net/netfilter/xt_CONNMARK.c b/net/netfilter/xt_CONNMARK.c index 8fc9f35e67d..c5a5072e005 100644 --- a/net/netfilter/xt_CONNMARK.c +++ b/net/netfilter/xt_CONNMARK.c @@ -146,10 +146,9 @@ static bool connmark_tg_check(const struct xt_tgchk_param *par) return true; } -static void -connmark_tg_destroy(const struct xt_target *target, void *targinfo) +static void connmark_tg_destroy(const struct xt_tgdtor_param *par) { - nf_ct_l3proto_module_put(target->family); + nf_ct_l3proto_module_put(par->target->family); } #ifdef CONFIG_COMPAT diff --git a/net/netfilter/xt_CONNSECMARK.c b/net/netfilter/xt_CONNSECMARK.c index 2041a3d4b4d..b6e3f3f125f 100644 --- a/net/netfilter/xt_CONNSECMARK.c +++ b/net/netfilter/xt_CONNSECMARK.c @@ -114,10 +114,9 @@ static bool connsecmark_tg_check(const struct xt_tgchk_param *par) return true; } -static void -connsecmark_tg_destroy(const struct xt_target *target, void *targinfo) +static void connsecmark_tg_destroy(const struct xt_tgdtor_param *par) { - nf_ct_l3proto_module_put(target->family); + nf_ct_l3proto_module_put(par->target->family); } static struct xt_target connsecmark_tg_reg[] __read_mostly = { diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c index edf4ab1f30f..43f5676b1af 100644 --- a/net/netfilter/xt_RATEEST.c +++ b/net/netfilter/xt_RATEEST.c @@ -139,10 +139,9 @@ err1: return false; } -static void xt_rateest_tg_destroy(const struct xt_target *target, - void *targinfo) +static void xt_rateest_tg_destroy(const struct xt_tgdtor_param *par) { - struct xt_rateest_target_info *info = targinfo; + struct xt_rateest_target_info *info = par->targinfo; xt_rateest_put(info->est); } diff --git a/net/netfilter/xt_SECMARK.c b/net/netfilter/xt_SECMARK.c index e5777227192..7a6f9e6f5df 100644 --- a/net/netfilter/xt_SECMARK.c +++ b/net/netfilter/xt_SECMARK.c @@ -113,7 +113,7 @@ static bool secmark_tg_check(const struct xt_tgchk_param *par) return true; } -static void secmark_tg_destroy(const struct xt_target *target, void *targinfo) +static void secmark_tg_destroy(const struct xt_tgdtor_param *par) { switch (mode) { case SECMARK_MODE_SEL: -- cgit v1.2.3 From 916a917dfec18535ff9e2afdafba82e6279eb4f4 Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Wed, 8 Oct 2008 11:35:20 +0200 Subject: netfilter: xtables: provide invoked family value to extensions By passing in the family through which extensions were invoked, a bit of data space can be reclaimed. The "family" member will be added to the parameter structures and the check functions be adjusted. Signed-off-by: Jan Engelhardt Signed-off-by: Patrick McHardy --- net/netfilter/x_tables.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) (limited to 'net/netfilter') diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index f29513cd139..89837a4eef7 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -321,7 +321,7 @@ int xt_find_revision(u8 af, const char *name, u8 revision, int target, } EXPORT_SYMBOL_GPL(xt_find_revision); -int xt_check_match(struct xt_mtchk_param *par, u_int8_t family, +int xt_check_match(struct xt_mtchk_param *par, unsigned int size, u_int8_t proto, bool inv_proto) { if (XT_ALIGN(par->match->matchsize) != size && @@ -331,26 +331,27 @@ int xt_check_match(struct xt_mtchk_param *par, u_int8_t family, * because it uses a dynamic-size data set. */ printk("%s_tables: %s match: invalid size %Zu != %u\n", - xt_prefix[family], par->match->name, + xt_prefix[par->family], par->match->name, XT_ALIGN(par->match->matchsize), size); return -EINVAL; } if (par->match->table != NULL && strcmp(par->match->table, par->table) != 0) { printk("%s_tables: %s match: only valid in %s table, not %s\n", - xt_prefix[family], par->match->name, + xt_prefix[par->family], par->match->name, par->match->table, par->table); return -EINVAL; } if (par->match->hooks && (par->hook_mask & ~par->match->hooks) != 0) { printk("%s_tables: %s match: bad hook_mask %#x/%#x\n", - xt_prefix[family], par->match->name, + xt_prefix[par->family], par->match->name, par->hook_mask, par->match->hooks); return -EINVAL; } if (par->match->proto && (par->match->proto != proto || inv_proto)) { printk("%s_tables: %s match: only valid for protocol %u\n", - xt_prefix[family], par->match->name, par->match->proto); + xt_prefix[par->family], par->match->name, + par->match->proto); return -EINVAL; } if (par->match->checkentry != NULL && !par->match->checkentry(par)) @@ -471,31 +472,31 @@ int xt_compat_match_to_user(struct xt_entry_match *m, void __user **dstptr, EXPORT_SYMBOL_GPL(xt_compat_match_to_user); #endif /* CONFIG_COMPAT */ -int xt_check_target(struct xt_tgchk_param *par, u_int8_t family, +int xt_check_target(struct xt_tgchk_param *par, unsigned int size, u_int8_t proto, bool inv_proto) { if (XT_ALIGN(par->target->targetsize) != size) { printk("%s_tables: %s target: invalid size %Zu != %u\n", - xt_prefix[family], par->target->name, + xt_prefix[par->family], par->target->name, XT_ALIGN(par->target->targetsize), size); return -EINVAL; } if (par->target->table != NULL && strcmp(par->target->table, par->table) != 0) { printk("%s_tables: %s target: only valid in %s table, not %s\n", - xt_prefix[family], par->target->name, + xt_prefix[par->family], par->target->name, par->target->table, par->table); return -EINVAL; } if (par->target->hooks && (par->hook_mask & ~par->target->hooks) != 0) { printk("%s_tables: %s target: bad hook_mask %#x/%#x\n", - xt_prefix[family], par->target->name, par->hook_mask, - par->target->hooks); + xt_prefix[par->family], par->target->name, + par->hook_mask, par->target->hooks); return -EINVAL; } if (par->target->proto && (par->target->proto != proto || inv_proto)) { printk("%s_tables: %s target: only valid for protocol %u\n", - xt_prefix[family], par->target->name, + xt_prefix[par->family], par->target->name, par->target->proto); return -EINVAL; } -- cgit v1.2.3 From 92f3b2b1bc968caaabee8cd78bee75ab7c4af74e Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Wed, 8 Oct 2008 11:35:20 +0200 Subject: netfilter: xtables: cut down on static data for family-independent extensions Using ->family in struct xt_*_param, multiple struct xt_{match,target} can be squashed together. Signed-off-by: Jan Engelhardt Signed-off-by: Patrick McHardy --- net/netfilter/xt_CONNMARK.c | 39 +++++++---------------------------- net/netfilter/xt_CONNSECMARK.c | 40 +++++++++++++---------------------- net/netfilter/xt_NFLOG.c | 31 ++++++++++------------------ net/netfilter/xt_connbytes.c | 39 +++++++++++++---------------------- net/netfilter/xt_connlimit.c | 47 ++++++++++++++++-------------------------- net/netfilter/xt_connmark.c | 39 +++++++---------------------------- net/netfilter/xt_conntrack.c | 26 +++++++---------------- net/netfilter/xt_helper.c | 38 +++++++++++++--------------------- net/netfilter/xt_pkttype.c | 30 ++++++++++----------------- 9 files changed, 104 insertions(+), 225 deletions(-) (limited to 'net/netfilter') diff --git a/net/netfilter/xt_CONNMARK.c b/net/netfilter/xt_CONNMARK.c index c5a5072e005..d6e5ab46327 100644 --- a/net/netfilter/xt_CONNMARK.c +++ b/net/netfilter/xt_CONNMARK.c @@ -128,9 +128,9 @@ static bool connmark_tg_check_v0(const struct xt_tgchk_param *par) printk(KERN_WARNING "CONNMARK: Only supports 32bit mark\n"); return false; } - if (nf_ct_l3proto_try_module_get(par->target->family) < 0) { + if (nf_ct_l3proto_try_module_get(par->family) < 0) { printk(KERN_WARNING "can't load conntrack support for " - "proto=%u\n", par->target->family); + "proto=%u\n", par->family); return false; } return true; @@ -138,9 +138,9 @@ static bool connmark_tg_check_v0(const struct xt_tgchk_param *par) static bool connmark_tg_check(const struct xt_tgchk_param *par) { - if (nf_ct_l3proto_try_module_get(par->target->family) < 0) { + if (nf_ct_l3proto_try_module_get(par->family) < 0) { printk(KERN_WARNING "cannot load conntrack support for " - "proto=%u\n", par->target->family); + "proto=%u\n", par->family); return false; } return true; @@ -148,7 +148,7 @@ static bool connmark_tg_check(const struct xt_tgchk_param *par) static void connmark_tg_destroy(const struct xt_tgdtor_param *par) { - nf_ct_l3proto_module_put(par->target->family); + nf_ct_l3proto_module_put(par->family); } #ifdef CONFIG_COMPAT @@ -186,7 +186,7 @@ static struct xt_target connmark_tg_reg[] __read_mostly = { { .name = "CONNMARK", .revision = 0, - .family = NFPROTO_IPV4, + .family = NFPROTO_UNSPEC, .checkentry = connmark_tg_check_v0, .destroy = connmark_tg_destroy, .target = connmark_tg_v0, @@ -198,35 +198,10 @@ static struct xt_target connmark_tg_reg[] __read_mostly = { #endif .me = THIS_MODULE }, - { - .name = "CONNMARK", - .revision = 0, - .family = NFPROTO_IPV6, - .checkentry = connmark_tg_check_v0, - .destroy = connmark_tg_destroy, - .target = connmark_tg_v0, - .targetsize = sizeof(struct xt_connmark_target_info), -#ifdef CONFIG_COMPAT - .compatsize = sizeof(struct compat_xt_connmark_target_info), - .compat_from_user = connmark_tg_compat_from_user_v0, - .compat_to_user = connmark_tg_compat_to_user_v0, -#endif - .me = THIS_MODULE - }, - { - .name = "CONNMARK", - .revision = 1, - .family = NFPROTO_IPV4, - .checkentry = connmark_tg_check, - .target = connmark_tg, - .targetsize = sizeof(struct xt_connmark_tginfo1), - .destroy = connmark_tg_destroy, - .me = THIS_MODULE, - }, { .name = "CONNMARK", .revision = 1, - .family = NFPROTO_IPV6, + .family = NFPROTO_UNSPEC, .checkentry = connmark_tg_check, .target = connmark_tg, .targetsize = sizeof(struct xt_connmark_tginfo1), diff --git a/net/netfilter/xt_CONNSECMARK.c b/net/netfilter/xt_CONNSECMARK.c index b6e3f3f125f..b54c3756fdc 100644 --- a/net/netfilter/xt_CONNSECMARK.c +++ b/net/netfilter/xt_CONNSECMARK.c @@ -106,9 +106,9 @@ static bool connsecmark_tg_check(const struct xt_tgchk_param *par) return false; } - if (nf_ct_l3proto_try_module_get(par->target->family) < 0) { + if (nf_ct_l3proto_try_module_get(par->family) < 0) { printk(KERN_WARNING "can't load conntrack support for " - "proto=%u\n", par->target->family); + "proto=%u\n", par->family); return false; } return true; @@ -116,40 +116,28 @@ static bool connsecmark_tg_check(const struct xt_tgchk_param *par) static void connsecmark_tg_destroy(const struct xt_tgdtor_param *par) { - nf_ct_l3proto_module_put(par->target->family); + nf_ct_l3proto_module_put(par->family); } -static struct xt_target connsecmark_tg_reg[] __read_mostly = { - { - .name = "CONNSECMARK", - .family = NFPROTO_IPV4, - .checkentry = connsecmark_tg_check, - .destroy = connsecmark_tg_destroy, - .target = connsecmark_tg, - .targetsize = sizeof(struct xt_connsecmark_target_info), - .me = THIS_MODULE, - }, - { - .name = "CONNSECMARK", - .family = NFPROTO_IPV6, - .checkentry = connsecmark_tg_check, - .destroy = connsecmark_tg_destroy, - .target = connsecmark_tg, - .targetsize = sizeof(struct xt_connsecmark_target_info), - .me = THIS_MODULE, - }, +static struct xt_target connsecmark_tg_reg __read_mostly = { + .name = "CONNSECMARK", + .revision = 0, + .family = NFPROTO_UNSPEC, + .checkentry = connsecmark_tg_check, + .destroy = connsecmark_tg_destroy, + .target = connsecmark_tg, + .targetsize = sizeof(struct xt_connsecmark_target_info), + .me = THIS_MODULE, }; static int __init connsecmark_tg_init(void) { - return xt_register_targets(connsecmark_tg_reg, - ARRAY_SIZE(connsecmark_tg_reg)); + return xt_register_target(&connsecmark_tg_reg); } static void __exit connsecmark_tg_exit(void) { - xt_unregister_targets(connsecmark_tg_reg, - ARRAY_SIZE(connsecmark_tg_reg)); + xt_unregister_target(&connsecmark_tg_reg); } module_init(connsecmark_tg_init); diff --git a/net/netfilter/xt_NFLOG.c b/net/netfilter/xt_NFLOG.c index 56ee4f118b5..50e3a52d3b3 100644 --- a/net/netfilter/xt_NFLOG.c +++ b/net/netfilter/xt_NFLOG.c @@ -31,7 +31,7 @@ nflog_tg(struct sk_buff *skb, const struct xt_target_param *par) li.u.ulog.group = info->group; li.u.ulog.qthreshold = info->threshold; - nf_log_packet(par->target->family, par->hooknum, skb, par->in, + nf_log_packet(par->family, par->hooknum, skb, par->in, par->out, &li, "%s", info->prefix); return XT_CONTINUE; } @@ -47,33 +47,24 @@ static bool nflog_tg_check(const struct xt_tgchk_param *par) return true; } -static struct xt_target nflog_tg_reg[] __read_mostly = { - { - .name = "NFLOG", - .family = NFPROTO_IPV4, - .checkentry = nflog_tg_check, - .target = nflog_tg, - .targetsize = sizeof(struct xt_nflog_info), - .me = THIS_MODULE, - }, - { - .name = "NFLOG", - .family = NFPROTO_IPV6, - .checkentry = nflog_tg_check, - .target = nflog_tg, - .targetsize = sizeof(struct xt_nflog_info), - .me = THIS_MODULE, - }, +static struct xt_target nflog_tg_reg __read_mostly = { + .name = "NFLOG", + .revision = 0, + .family = NFPROTO_UNSPEC, + .checkentry = nflog_tg_check, + .target = nflog_tg, + .targetsize = sizeof(struct xt_nflog_info), + .me = THIS_MODULE, }; static int __init nflog_tg_init(void) { - return xt_register_targets(nflog_tg_reg, ARRAY_SIZE(nflog_tg_reg)); + return xt_register_target(&nflog_tg_reg); } static void __exit nflog_tg_exit(void) { - xt_unregister_targets(nflog_tg_reg, ARRAY_SIZE(nflog_tg_reg)); + xt_unregister_target(&nflog_tg_reg); } module_init(nflog_tg_init); diff --git a/net/netfilter/xt_connbytes.c b/net/netfilter/xt_connbytes.c index 5bf4aa08b0f..955e6598a7f 100644 --- a/net/netfilter/xt_connbytes.c +++ b/net/netfilter/xt_connbytes.c @@ -106,9 +106,9 @@ static bool connbytes_mt_check(const struct xt_mtchk_param *par) sinfo->direction != XT_CONNBYTES_DIR_BOTH) return false; - if (nf_ct_l3proto_try_module_get(par->match->family) < 0) { + if (nf_ct_l3proto_try_module_get(par->family) < 0) { printk(KERN_WARNING "can't load conntrack support for " - "proto=%u\n", par->match->family); + "proto=%u\n", par->family); return false; } @@ -117,39 +117,28 @@ static bool connbytes_mt_check(const struct xt_mtchk_param *par) static void connbytes_mt_destroy(const struct xt_mtdtor_param *par) { - nf_ct_l3proto_module_put(par->match->family); + nf_ct_l3proto_module_put(par->family); } -static struct xt_match connbytes_mt_reg[] __read_mostly = { - { - .name = "connbytes", - .family = NFPROTO_IPV4, - .checkentry = connbytes_mt_check, - .match = connbytes_mt, - .destroy = connbytes_mt_destroy, - .matchsize = sizeof(struct xt_connbytes_info), - .me = THIS_MODULE - }, - { - .name = "connbytes", - .family = NFPROTO_IPV6, - .checkentry = connbytes_mt_check, - .match = connbytes_mt, - .destroy = connbytes_mt_destroy, - .matchsize = sizeof(struct xt_connbytes_info), - .me = THIS_MODULE - }, +static struct xt_match connbytes_mt_reg __read_mostly = { + .name = "connbytes", + .revision = 0, + .family = NFPROTO_UNSPEC, + .checkentry = connbytes_mt_check, + .match = connbytes_mt, + .destroy = connbytes_mt_destroy, + .matchsize = sizeof(struct xt_connbytes_info), + .me = THIS_MODULE, }; static int __init connbytes_mt_init(void) { - return xt_register_matches(connbytes_mt_reg, - ARRAY_SIZE(connbytes_mt_reg)); + return xt_register_match(&connbytes_mt_reg); } static void __exit connbytes_mt_exit(void) { - xt_unregister_matches(connbytes_mt_reg, ARRAY_SIZE(connbytes_mt_reg)); + xt_unregister_match(&connbytes_mt_reg); } module_init(connbytes_mt_init); diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c index bfb3ee6c712..7f404cc64c8 100644 --- a/net/netfilter/xt_connlimit.c +++ b/net/netfilter/xt_connlimit.c @@ -192,10 +192,10 @@ connlimit_mt(const struct sk_buff *skb, const struct xt_match_param *par) if (ct != NULL) tuple_ptr = &ct->tuplehash[0].tuple; else if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), - par->match->family, &tuple)) + par->family, &tuple)) goto hotdrop; - if (par->match->family == NFPROTO_IPV6) { + if (par->family == NFPROTO_IPV6) { const struct ipv6hdr *iph = ipv6_hdr(skb); memcpy(&addr.ip6, &iph->saddr, sizeof(iph->saddr)); } else { @@ -226,16 +226,16 @@ static bool connlimit_mt_check(const struct xt_mtchk_param *par) struct xt_connlimit_info *info = par->matchinfo; unsigned int i; - if (nf_ct_l3proto_try_module_get(par->match->family) < 0) { + if (nf_ct_l3proto_try_module_get(par->family) < 0) { printk(KERN_WARNING "cannot load conntrack support for " - "address family %u\n", par->match->family); + "address family %u\n", par->family); return false; } /* init private data */ info->data = kmalloc(sizeof(struct xt_connlimit_data), GFP_KERNEL); if (info->data == NULL) { - nf_ct_l3proto_module_put(par->match->family); + nf_ct_l3proto_module_put(par->family); return false; } @@ -254,7 +254,7 @@ static void connlimit_mt_destroy(const struct xt_mtdtor_param *par) struct list_head *hash = info->data->iphash; unsigned int i; - nf_ct_l3proto_module_put(par->match->family); + nf_ct_l3proto_module_put(par->family); for (i = 0; i < ARRAY_SIZE(info->data->iphash); ++i) { list_for_each_entry_safe(conn, tmp, &hash[i], list) { @@ -266,41 +266,30 @@ static void connlimit_mt_destroy(const struct xt_mtdtor_param *par) kfree(info->data); } -static struct xt_match connlimit_mt_reg[] __read_mostly = { - { - .name = "connlimit", - .family = NFPROTO_IPV4, - .checkentry = connlimit_mt_check, - .match = connlimit_mt, - .matchsize = sizeof(struct xt_connlimit_info), - .destroy = connlimit_mt_destroy, - .me = THIS_MODULE, - }, - { - .name = "connlimit", - .family = NFPROTO_IPV6, - .checkentry = connlimit_mt_check, - .match = connlimit_mt, - .matchsize = sizeof(struct xt_connlimit_info), - .destroy = connlimit_mt_destroy, - .me = THIS_MODULE, - }, +static struct xt_match connlimit_mt_reg __read_mostly = { + .name = "connlimit", + .revision = 0, + .family = NFPROTO_UNSPEC, + .checkentry = connlimit_mt_check, + .match = connlimit_mt, + .matchsize = sizeof(struct xt_connlimit_info), + .destroy = connlimit_mt_destroy, + .me = THIS_MODULE, }; static int __init connlimit_mt_init(void) { - return xt_register_matches(connlimit_mt_reg, - ARRAY_SIZE(connlimit_mt_reg)); + return xt_register_match(&connlimit_mt_reg); } static void __exit connlimit_mt_exit(void) { - xt_unregister_matches(connlimit_mt_reg, ARRAY_SIZE(connlimit_mt_reg)); + xt_unregister_match(&connlimit_mt_reg); } module_init(connlimit_mt_init); module_exit(connlimit_mt_exit); -MODULE_AUTHOR("Jan Engelhardt "); +MODULE_AUTHOR("Jan Engelhardt "); MODULE_DESCRIPTION("Xtables: Number of connections matching"); MODULE_LICENSE("GPL"); MODULE_ALIAS("ipt_connlimit"); diff --git a/net/netfilter/xt_connmark.c b/net/netfilter/xt_connmark.c index c708577ea1b..86cacab7a4a 100644 --- a/net/netfilter/xt_connmark.c +++ b/net/netfilter/xt_connmark.c @@ -69,9 +69,9 @@ static bool connmark_mt_check_v0(const struct xt_mtchk_param *par) printk(KERN_WARNING "connmark: only support 32bit mark\n"); return false; } - if (nf_ct_l3proto_try_module_get(par->match->family) < 0) { + if (nf_ct_l3proto_try_module_get(par->family) < 0) { printk(KERN_WARNING "can't load conntrack support for " - "proto=%u\n", par->match->family); + "proto=%u\n", par->family); return false; } return true; @@ -79,9 +79,9 @@ static bool connmark_mt_check_v0(const struct xt_mtchk_param *par) static bool connmark_mt_check(const struct xt_mtchk_param *par) { - if (nf_ct_l3proto_try_module_get(par->match->family) < 0) { + if (nf_ct_l3proto_try_module_get(par->family) < 0) { printk(KERN_WARNING "cannot load conntrack support for " - "proto=%u\n", par->match->family); + "proto=%u\n", par->family); return false; } return true; @@ -89,7 +89,7 @@ static bool connmark_mt_check(const struct xt_mtchk_param *par) static void connmark_mt_destroy(const struct xt_mtdtor_param *par) { - nf_ct_l3proto_module_put(par->match->family); + nf_ct_l3proto_module_put(par->family); } #ifdef CONFIG_COMPAT @@ -127,7 +127,7 @@ static struct xt_match connmark_mt_reg[] __read_mostly = { { .name = "connmark", .revision = 0, - .family = NFPROTO_IPV4, + .family = NFPROTO_UNSPEC, .checkentry = connmark_mt_check_v0, .match = connmark_mt_v0, .destroy = connmark_mt_destroy, @@ -139,35 +139,10 @@ static struct xt_match connmark_mt_reg[] __read_mostly = { #endif .me = THIS_MODULE }, - { - .name = "connmark", - .revision = 0, - .family = NFPROTO_IPV6, - .checkentry = connmark_mt_check_v0, - .match = connmark_mt_v0, - .destroy = connmark_mt_destroy, - .matchsize = sizeof(struct xt_connmark_info), -#ifdef CONFIG_COMPAT - .compatsize = sizeof(struct compat_xt_connmark_info), - .compat_from_user = connmark_mt_compat_from_user_v0, - .compat_to_user = connmark_mt_compat_to_user_v0, -#endif - .me = THIS_MODULE - }, - { - .name = "connmark", - .revision = 1, - .family = NFPROTO_IPV4, - .checkentry = connmark_mt_check, - .match = connmark_mt, - .matchsize = sizeof(struct xt_connmark_mtinfo1), - .destroy = connmark_mt_destroy, - .me = THIS_MODULE, - }, { .name = "connmark", .revision = 1, - .family = NFPROTO_IPV6, + .family = NFPROTO_UNSPEC, .checkentry = connmark_mt_check, .match = connmark_mt, .matchsize = sizeof(struct xt_connmark_mtinfo1), diff --git a/net/netfilter/xt_conntrack.c b/net/netfilter/xt_conntrack.c index 5cd58d7fcb1..0b7139f3dd7 100644 --- a/net/netfilter/xt_conntrack.c +++ b/net/netfilter/xt_conntrack.c @@ -238,22 +238,22 @@ conntrack_mt(const struct sk_buff *skb, const struct xt_match_param *par) return false; if (info->match_flags & XT_CONNTRACK_ORIGSRC) - if (conntrack_mt_origsrc(ct, info, par->match->family) ^ + if (conntrack_mt_origsrc(ct, info, par->family) ^ !(info->invert_flags & XT_CONNTRACK_ORIGSRC)) return false; if (info->match_flags & XT_CONNTRACK_ORIGDST) - if (conntrack_mt_origdst(ct, info, par->match->family) ^ + if (conntrack_mt_origdst(ct, info, par->family) ^ !(info->invert_flags & XT_CONNTRACK_ORIGDST)) return false; if (info->match_flags & XT_CONNTRACK_REPLSRC) - if (conntrack_mt_replsrc(ct, info, par->match->family) ^ + if (conntrack_mt_replsrc(ct, info, par->family) ^ !(info->invert_flags & XT_CONNTRACK_REPLSRC)) return false; if (info->match_flags & XT_CONNTRACK_REPLDST) - if (conntrack_mt_repldst(ct, info, par->match->family) ^ + if (conntrack_mt_repldst(ct, info, par->family) ^ !(info->invert_flags & XT_CONNTRACK_REPLDST)) return false; @@ -280,9 +280,9 @@ conntrack_mt(const struct sk_buff *skb, const struct xt_match_param *par) static bool conntrack_mt_check(const struct xt_mtchk_param *par) { - if (nf_ct_l3proto_try_module_get(par->match->family) < 0) { + if (nf_ct_l3proto_try_module_get(par->family) < 0) { printk(KERN_WARNING "can't load conntrack support for " - "proto=%u\n", par->match->family); + "proto=%u\n", par->family); return false; } return true; @@ -290,7 +290,7 @@ static bool conntrack_mt_check(const struct xt_mtchk_param *par) static void conntrack_mt_destroy(const struct xt_mtdtor_param *par) { - nf_ct_l3proto_module_put(par->match->family); + nf_ct_l3proto_module_put(par->family); } #ifdef CONFIG_COMPAT @@ -361,17 +361,7 @@ static struct xt_match conntrack_mt_reg[] __read_mostly = { { .name = "conntrack", .revision = 1, - .family = NFPROTO_IPV4, - .matchsize = sizeof(struct xt_conntrack_mtinfo1), - .match = conntrack_mt, - .checkentry = conntrack_mt_check, - .destroy = conntrack_mt_destroy, - .me = THIS_MODULE, - }, - { - .name = "conntrack", - .revision = 1, - .family = NFPROTO_IPV6, + .family = NFPROTO_UNSPEC, .matchsize = sizeof(struct xt_conntrack_mtinfo1), .match = conntrack_mt, .checkentry = conntrack_mt_check, diff --git a/net/netfilter/xt_helper.c b/net/netfilter/xt_helper.c index 280c984349f..64fc7f27722 100644 --- a/net/netfilter/xt_helper.c +++ b/net/netfilter/xt_helper.c @@ -58,9 +58,9 @@ static bool helper_mt_check(const struct xt_mtchk_param *par) { struct xt_helper_info *info = par->matchinfo; - if (nf_ct_l3proto_try_module_get(par->match->family) < 0) { + if (nf_ct_l3proto_try_module_get(par->family) < 0) { printk(KERN_WARNING "can't load conntrack support for " - "proto=%u\n", par->match->family); + "proto=%u\n", par->family); return false; } info->name[29] = '\0'; @@ -69,38 +69,28 @@ static bool helper_mt_check(const struct xt_mtchk_param *par) static void helper_mt_destroy(const struct xt_mtdtor_param *par) { - nf_ct_l3proto_module_put(par->match->family); + nf_ct_l3proto_module_put(par->family); } -static struct xt_match helper_mt_reg[] __read_mostly = { - { - .name = "helper", - .family = NFPROTO_IPV4, - .checkentry = helper_mt_check, - .match = helper_mt, - .destroy = helper_mt_destroy, - .matchsize = sizeof(struct xt_helper_info), - .me = THIS_MODULE, - }, - { - .name = "helper", - .family = NFPROTO_IPV6, - .checkentry = helper_mt_check, - .match = helper_mt, - .destroy = helper_mt_destroy, - .matchsize = sizeof(struct xt_helper_info), - .me = THIS_MODULE, - }, +static struct xt_match helper_mt_reg __read_mostly = { + .name = "helper", + .revision = 0, + .family = NFPROTO_UNSPEC, + .checkentry = helper_mt_check, + .match = helper_mt, + .destroy = helper_mt_destroy, + .matchsize = sizeof(struct xt_helper_info), + .me = THIS_MODULE, }; static int __init helper_mt_init(void) { - return xt_register_matches(helper_mt_reg, ARRAY_SIZE(helper_mt_reg)); + return xt_register_match(&helper_mt_reg); } static void __exit helper_mt_exit(void) { - xt_unregister_matches(helper_mt_reg, ARRAY_SIZE(helper_mt_reg)); + xt_unregister_match(&helper_mt_reg); } module_init(helper_mt_init); diff --git a/net/netfilter/xt_pkttype.c b/net/netfilter/xt_pkttype.c index 37753a37760..69da1d3a1d8 100644 --- a/net/netfilter/xt_pkttype.c +++ b/net/netfilter/xt_pkttype.c @@ -30,10 +30,10 @@ pkttype_mt(const struct sk_buff *skb, const struct xt_match_param *par) if (skb->pkt_type != PACKET_LOOPBACK) type = skb->pkt_type; - else if (par->match->family == NFPROTO_IPV4 && + else if (par->family == NFPROTO_IPV4 && ipv4_is_multicast(ip_hdr(skb)->daddr)) type = PACKET_MULTICAST; - else if (par->match->family == NFPROTO_IPV6 && + else if (par->family == NFPROTO_IPV6 && ipv6_hdr(skb)->daddr.s6_addr[0] == 0xFF) type = PACKET_MULTICAST; else @@ -42,31 +42,23 @@ pkttype_mt(const struct sk_buff *skb, const struct xt_match_param *par) return (type == info->pkttype) ^ info->invert; } -static struct xt_match pkttype_mt_reg[] __read_mostly = { - { - .name = "pkttype", - .family = NFPROTO_IPV4, - .match = pkttype_mt, - .matchsize = sizeof(struct xt_pkttype_info), - .me = THIS_MODULE, - }, - { - .name = "pkttype", - .family = NFPROTO_IPV6, - .match = pkttype_mt, - .matchsize = sizeof(struct xt_pkttype_info), - .me = THIS_MODULE, - }, +static struct xt_match pkttype_mt_reg __read_mostly = { + .name = "pkttype", + .revision = 0, + .family = NFPROTO_UNSPEC, + .match = pkttype_mt, + .matchsize = sizeof(struct xt_pkttype_info), + .me = THIS_MODULE, }; static int __init pkttype_mt_init(void) { - return xt_register_matches(pkttype_mt_reg, ARRAY_SIZE(pkttype_mt_reg)); + return xt_register_match(&pkttype_mt_reg); } static void __exit pkttype_mt_exit(void) { - xt_unregister_matches(pkttype_mt_reg, ARRAY_SIZE(pkttype_mt_reg)); + xt_unregister_match(&pkttype_mt_reg); } module_init(pkttype_mt_init); -- cgit v1.2.3 From ab4f21e6fb1c09b13c4c3cb8357babe8223471bd Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Wed, 8 Oct 2008 11:35:20 +0200 Subject: netfilter: xtables: use NFPROTO_UNSPEC in more extensions Lots of extensions are completely family-independent, so squash some code. Signed-off-by: Jan Engelhardt Signed-off-by: Patrick McHardy --- net/netfilter/xt_MARK.c | 34 ++-------------------------------- net/netfilter/xt_NOTRACK.c | 26 +++++++++----------------- net/netfilter/xt_comment.c | 26 +++++++++----------------- net/netfilter/xt_mac.c | 34 +++++++++++----------------------- net/netfilter/xt_owner.c | 12 +----------- net/netfilter/xt_physdev.c | 29 ++++++++++------------------- net/netfilter/xt_realm.c | 2 +- 7 files changed, 43 insertions(+), 120 deletions(-) (limited to 'net/netfilter') diff --git a/net/netfilter/xt_MARK.c b/net/netfilter/xt_MARK.c index 123ee0ba78c..67574bcfb8a 100644 --- a/net/netfilter/xt_MARK.c +++ b/net/netfilter/xt_MARK.c @@ -149,7 +149,7 @@ static int mark_tg_compat_to_user_v1(void __user *dst, void *src) static struct xt_target mark_tg_reg[] __read_mostly = { { .name = "MARK", - .family = NFPROTO_IPV4, + .family = NFPROTO_UNSPEC, .revision = 0, .checkentry = mark_tg_check_v0, .target = mark_tg_v0, @@ -164,37 +164,7 @@ static struct xt_target mark_tg_reg[] __read_mostly = { }, { .name = "MARK", - .family = NFPROTO_IPV4, - .revision = 1, - .checkentry = mark_tg_check_v1, - .target = mark_tg_v1, - .targetsize = sizeof(struct xt_mark_target_info_v1), -#ifdef CONFIG_COMPAT - .compatsize = sizeof(struct compat_xt_mark_target_info_v1), - .compat_from_user = mark_tg_compat_from_user_v1, - .compat_to_user = mark_tg_compat_to_user_v1, -#endif - .table = "mangle", - .me = THIS_MODULE, - }, - { - .name = "MARK", - .family = NFPROTO_IPV6, - .revision = 0, - .checkentry = mark_tg_check_v0, - .target = mark_tg_v0, - .targetsize = sizeof(struct xt_mark_target_info), -#ifdef CONFIG_COMPAT - .compatsize = sizeof(struct compat_xt_mark_target_info), - .compat_from_user = mark_tg_compat_from_user_v0, - .compat_to_user = mark_tg_compat_to_user_v0, -#endif - .table = "mangle", - .me = THIS_MODULE, - }, - { - .name = "MARK", - .family = NFPROTO_IPV6, + .family = NFPROTO_UNSPEC, .revision = 1, .checkentry = mark_tg_check_v1, .target = mark_tg_v1, diff --git a/net/netfilter/xt_NOTRACK.c b/net/netfilter/xt_NOTRACK.c index cc50295cd11..e7a0a54fd4e 100644 --- a/net/netfilter/xt_NOTRACK.c +++ b/net/netfilter/xt_NOTRACK.c @@ -30,31 +30,23 @@ notrack_tg(struct sk_buff *skb, const struct xt_target_param *par) return XT_CONTINUE; } -static struct xt_target notrack_tg_reg[] __read_mostly = { - { - .name = "NOTRACK", - .family = NFPROTO_IPV4, - .target = notrack_tg, - .table = "raw", - .me = THIS_MODULE, - }, - { - .name = "NOTRACK", - .family = NFPROTO_IPV6, - .target = notrack_tg, - .table = "raw", - .me = THIS_MODULE, - }, +static struct xt_target notrack_tg_reg __read_mostly = { + .name = "NOTRACK", + .revision = 0, + .family = NFPROTO_UNSPEC, + .target = notrack_tg, + .table = "raw", + .me = THIS_MODULE, }; static int __init notrack_tg_init(void) { - return xt_register_targets(notrack_tg_reg, ARRAY_SIZE(notrack_tg_reg)); + return xt_register_target(¬rack_tg_reg); } static void __exit notrack_tg_exit(void) { - xt_unregister_targets(notrack_tg_reg, ARRAY_SIZE(notrack_tg_reg)); + xt_unregister_target(¬rack_tg_reg); } module_init(notrack_tg_init); diff --git a/net/netfilter/xt_comment.c b/net/netfilter/xt_comment.c index bd7aa57af42..e82179832ac 100644 --- a/net/netfilter/xt_comment.c +++ b/net/netfilter/xt_comment.c @@ -22,31 +22,23 @@ comment_mt(const struct sk_buff *skb, const struct xt_match_param *par) return true; } -static struct xt_match comment_mt_reg[] __read_mostly = { - { - .name = "comment", - .family = NFPROTO_IPV4, - .match = comment_mt, - .matchsize = sizeof(struct xt_comment_info), - .me = THIS_MODULE - }, - { - .name = "comment", - .family = NFPROTO_IPV6, - .match = comment_mt, - .matchsize = sizeof(struct xt_comment_info), - .me = THIS_MODULE - }, +static struct xt_match comment_mt_reg __read_mostly = { + .name = "comment", + .revision = 0, + .family = NFPROTO_UNSPEC, + .match = comment_mt, + .matchsize = sizeof(struct xt_comment_info), + .me = THIS_MODULE, }; static int __init comment_mt_init(void) { - return xt_register_matches(comment_mt_reg, ARRAY_SIZE(comment_mt_reg)); + return xt_register_match(&comment_mt_reg); } static void __exit comment_mt_exit(void) { - xt_unregister_matches(comment_mt_reg, ARRAY_SIZE(comment_mt_reg)); + xt_unregister_match(&comment_mt_reg); } module_init(comment_mt_init); diff --git a/net/netfilter/xt_mac.c b/net/netfilter/xt_mac.c index 269f9d8aef5..c2007116ce5 100644 --- a/net/netfilter/xt_mac.c +++ b/net/netfilter/xt_mac.c @@ -36,37 +36,25 @@ static bool mac_mt(const struct sk_buff *skb, const struct xt_match_param *par) ^ info->invert); } -static struct xt_match mac_mt_reg[] __read_mostly = { - { - .name = "mac", - .family = NFPROTO_IPV4, - .match = mac_mt, - .matchsize = sizeof(struct xt_mac_info), - .hooks = (1 << NF_INET_PRE_ROUTING) | - (1 << NF_INET_LOCAL_IN) | - (1 << NF_INET_FORWARD), - .me = THIS_MODULE, - }, - { - .name = "mac", - .family = NFPROTO_IPV6, - .match = mac_mt, - .matchsize = sizeof(struct xt_mac_info), - .hooks = (1 << NF_INET_PRE_ROUTING) | - (1 << NF_INET_LOCAL_IN) | - (1 << NF_INET_FORWARD), - .me = THIS_MODULE, - }, +static struct xt_match mac_mt_reg __read_mostly = { + .name = "mac", + .revision = 0, + .family = NFPROTO_UNSPEC, + .match = mac_mt, + .matchsize = sizeof(struct xt_mac_info), + .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_FORWARD), + .me = THIS_MODULE, }; static int __init mac_mt_init(void) { - return xt_register_matches(mac_mt_reg, ARRAY_SIZE(mac_mt_reg)); + return xt_register_match(&mac_mt_reg); } static void __exit mac_mt_exit(void) { - xt_unregister_matches(mac_mt_reg, ARRAY_SIZE(mac_mt_reg)); + xt_unregister_match(&mac_mt_reg); } module_init(mac_mt_init); diff --git a/net/netfilter/xt_owner.c b/net/netfilter/xt_owner.c index 32f84e84d9e..f19ebd9b78f 100644 --- a/net/netfilter/xt_owner.c +++ b/net/netfilter/xt_owner.c @@ -160,17 +160,7 @@ static struct xt_match owner_mt_reg[] __read_mostly = { { .name = "owner", .revision = 1, - .family = NFPROTO_IPV4, - .match = owner_mt, - .matchsize = sizeof(struct xt_owner_match_info), - .hooks = (1 << NF_INET_LOCAL_OUT) | - (1 << NF_INET_POST_ROUTING), - .me = THIS_MODULE, - }, - { - .name = "owner", - .revision = 1, - .family = NFPROTO_IPV6, + .family = NFPROTO_UNSPEC, .match = owner_mt, .matchsize = sizeof(struct xt_owner_match_info), .hooks = (1 << NF_INET_LOCAL_OUT) | diff --git a/net/netfilter/xt_physdev.c b/net/netfilter/xt_physdev.c index b01786d2dd9..1bcdfc12cf5 100644 --- a/net/netfilter/xt_physdev.c +++ b/net/netfilter/xt_physdev.c @@ -112,33 +112,24 @@ static bool physdev_mt_check(const struct xt_mtchk_param *par) return true; } -static struct xt_match physdev_mt_reg[] __read_mostly = { - { - .name = "physdev", - .family = NFPROTO_IPV4, - .checkentry = physdev_mt_check, - .match = physdev_mt, - .matchsize = sizeof(struct xt_physdev_info), - .me = THIS_MODULE, - }, - { - .name = "physdev", - .family = NFPROTO_IPV6, - .checkentry = physdev_mt_check, - .match = physdev_mt, - .matchsize = sizeof(struct xt_physdev_info), - .me = THIS_MODULE, - }, +static struct xt_match physdev_mt_reg __read_mostly = { + .name = "physdev", + .revision = 0, + .family = NFPROTO_UNSPEC, + .checkentry = physdev_mt_check, + .match = physdev_mt, + .matchsize = sizeof(struct xt_physdev_info), + .me = THIS_MODULE, }; static int __init physdev_mt_init(void) { - return xt_register_matches(physdev_mt_reg, ARRAY_SIZE(physdev_mt_reg)); + return xt_register_match(&physdev_mt_reg); } static void __exit physdev_mt_exit(void) { - xt_unregister_matches(physdev_mt_reg, ARRAY_SIZE(physdev_mt_reg)); + xt_unregister_match(&physdev_mt_reg); } module_init(physdev_mt_init); diff --git a/net/netfilter/xt_realm.c b/net/netfilter/xt_realm.c index b25942110ed..67419287bc7 100644 --- a/net/netfilter/xt_realm.c +++ b/net/netfilter/xt_realm.c @@ -36,7 +36,7 @@ static struct xt_match realm_mt_reg __read_mostly = { .matchsize = sizeof(struct xt_realm_info), .hooks = (1 << NF_INET_POST_ROUTING) | (1 << NF_INET_FORWARD) | (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_LOCAL_IN), - .family = NFPROTO_IPV4, + .family = NFPROTO_UNSPEC, .me = THIS_MODULE }; -- cgit v1.2.3 From f39a9410ed0503278fd5edc559fa019051413039 Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Wed, 8 Oct 2008 11:35:20 +0200 Subject: netfilter: xtables: remove bogus mangle table dependency of connmark Signed-off-by: Jan Engelhardt Signed-off-by: Patrick McHardy --- net/netfilter/Kconfig | 1 - 1 file changed, 1 deletion(-) (limited to 'net/netfilter') diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 899e78051d8..f70b4145ffc 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -318,7 +318,6 @@ config NETFILTER_XT_TARGET_CLASSIFY config NETFILTER_XT_TARGET_CONNMARK tristate '"CONNMARK" target support' - depends on IP_NF_MANGLE || IP6_NF_MANGLE depends on NF_CONNTRACK depends on NETFILTER_ADVANCED select NF_CONNTRACK_MARK -- cgit v1.2.3 From f901b64472fdabc72eca2b9426fa4e96972b64c4 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sat, 11 Oct 2008 12:18:04 -0700 Subject: ipvs: Add proper dependencies on IP_VS, and fix description header line. Linus noted a build failure case: net/netfilter/ipvs/ip_vs_xmit.c: In function 'ip_vs_tunnel_xmit': net/netfilter/ipvs/ip_vs_xmit.c:616: error: implicit declaration of function 'ip_select_ident' The proper include file (net/ip.h) is being included in ip_vs_xmit.c to get that declaration. So the only possible case where this can happen is if CONFIG_INET is not enabled. This seems to be purely a missing dependency in the ipvs/Kconfig file IP_VS entry. Also, while we're here, remove the out of date "EXPERIMENTAL" string in the IP_VS config help header line. IP_VS no longer depends upon CONFIG_EXPERIMENTAL Signed-off-by: David S. Miller --- net/netfilter/ipvs/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/netfilter') diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig index de6004de80b..05048e40326 100644 --- a/net/netfilter/ipvs/Kconfig +++ b/net/netfilter/ipvs/Kconfig @@ -2,8 +2,8 @@ # IP Virtual Server configuration # menuconfig IP_VS - tristate "IP virtual server support (EXPERIMENTAL)" - depends on NETFILTER + tristate "IP virtual server support" + depends on NET && INET && NETFILTER ---help--- IP Virtual Server support will let you build a high-performance virtual server based on cluster of two or more real servers. This -- cgit v1.2.3