From 318360646941d6f3d4c6e4ee99107392728a4079 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 13 Dec 2005 23:13:48 -0800 Subject: [NETFILTER] ip_tables: NUMA-aware allocation Part of a performance problem with ip_tables is that memory allocation is not NUMA aware, but 'only' SMP aware (ie each CPU normally touch separate cache lines) Even with small iptables rules, the cost of this misplacement can be high on common workloads. Instead of using one vmalloc() area (located in the node of the iptables process), we now allocate an area for each possible CPU, using vmalloc_node() so that memory should be allocated in the CPU's node if possible. Port to arp_tables and ip6_tables by Harald Welte. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/netfilter/ip6_tables.c | 190 ++++++++++++++++++++++++++-------------- 1 file changed, 126 insertions(+), 64 deletions(-) (limited to 'net/ipv6/netfilter/ip6_tables.c') diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 95d469271c4..dd80020d874 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -86,11 +86,6 @@ static DECLARE_MUTEX(ip6t_mutex); context stops packets coming through and allows user context to read the counters or update the rules. - To be cache friendly on SMP, we arrange them like so: - [ n-entries ] - ... cache-align padding ... - [ n-entries ] - Hence the start of any table is given by get_table() below. */ /* The table itself */ @@ -108,20 +103,15 @@ struct ip6t_table_info unsigned int underflow[NF_IP6_NUMHOOKS]; /* ip6t_entry tables: one per CPU */ - char entries[0] ____cacheline_aligned; + void *entries[NR_CPUS]; }; static LIST_HEAD(ip6t_target); static LIST_HEAD(ip6t_match); static LIST_HEAD(ip6t_tables); +#define SET_COUNTER(c,b,p) do { (c).bcnt = (b); (c).pcnt = (p); } while(0) #define ADD_COUNTER(c,b,p) do { (c).bcnt += (b); (c).pcnt += (p); } while(0) -#ifdef CONFIG_SMP -#define TABLE_OFFSET(t,p) (SMP_ALIGN((t)->size)*(p)) -#else -#define TABLE_OFFSET(t,p) 0 -#endif - #if 0 #define down(x) do { printk("DOWN:%u:" #x "\n", __LINE__); down(x); } while(0) #define down_interruptible(x) ({ int __r; printk("DOWNi:%u:" #x "\n", __LINE__); __r = down_interruptible(x); if (__r != 0) printk("ABORT-DOWNi:%u\n", __LINE__); __r; }) @@ -376,8 +366,7 @@ ip6t_do_table(struct sk_buff **pskb, read_lock_bh(&table->lock); IP_NF_ASSERT(table->valid_hooks & (1 << hook)); - table_base = (void *)table->private->entries - + TABLE_OFFSET(table->private, smp_processor_id()); + table_base = (void *)table->private->entries[smp_processor_id()]; e = get_entry(table_base, table->private->hook_entry[hook]); #ifdef CONFIG_NETFILTER_DEBUG @@ -649,7 +638,8 @@ unconditional(const struct ip6t_ip6 *ipv6) /* Figures out from what hook each rule can be called: returns 0 if there are loops. Puts hook bitmask in comefrom. */ static int -mark_source_chains(struct ip6t_table_info *newinfo, unsigned int valid_hooks) +mark_source_chains(struct ip6t_table_info *newinfo, + unsigned int valid_hooks, void *entry0) { unsigned int hook; @@ -658,7 +648,7 @@ mark_source_chains(struct ip6t_table_info *newinfo, unsigned int valid_hooks) for (hook = 0; hook < NF_IP6_NUMHOOKS; hook++) { unsigned int pos = newinfo->hook_entry[hook]; struct ip6t_entry *e - = (struct ip6t_entry *)(newinfo->entries + pos); + = (struct ip6t_entry *)(entry0 + pos); if (!(valid_hooks & (1 << hook))) continue; @@ -708,13 +698,13 @@ mark_source_chains(struct ip6t_table_info *newinfo, unsigned int valid_hooks) goto next; e = (struct ip6t_entry *) - (newinfo->entries + pos); + (entry0 + pos); } while (oldpos == pos + e->next_offset); /* Move along one */ size = e->next_offset; e = (struct ip6t_entry *) - (newinfo->entries + pos + size); + (entry0 + pos + size); e->counters.pcnt = pos; pos += size; } else { @@ -731,7 +721,7 @@ mark_source_chains(struct ip6t_table_info *newinfo, unsigned int valid_hooks) newpos = pos + e->next_offset; } e = (struct ip6t_entry *) - (newinfo->entries + newpos); + (entry0 + newpos); e->counters.pcnt = pos; pos = newpos; } @@ -941,6 +931,7 @@ static int translate_table(const char *name, unsigned int valid_hooks, struct ip6t_table_info *newinfo, + void *entry0, unsigned int size, unsigned int number, const unsigned int *hook_entries, @@ -961,11 +952,11 @@ translate_table(const char *name, duprintf("translate_table: size %u\n", newinfo->size); i = 0; /* Walk through entries, checking offsets. */ - ret = IP6T_ENTRY_ITERATE(newinfo->entries, newinfo->size, + ret = IP6T_ENTRY_ITERATE(entry0, newinfo->size, check_entry_size_and_hooks, newinfo, - newinfo->entries, - newinfo->entries + size, + entry0, + entry0 + size, hook_entries, underflows, &i); if (ret != 0) return ret; @@ -993,27 +984,24 @@ translate_table(const char *name, } } - if (!mark_source_chains(newinfo, valid_hooks)) + if (!mark_source_chains(newinfo, valid_hooks, entry0)) return -ELOOP; /* Finally, each sanity check must pass */ i = 0; - ret = IP6T_ENTRY_ITERATE(newinfo->entries, newinfo->size, + ret = IP6T_ENTRY_ITERATE(entry0, newinfo->size, check_entry, name, size, &i); if (ret != 0) { - IP6T_ENTRY_ITERATE(newinfo->entries, newinfo->size, + IP6T_ENTRY_ITERATE(entry0, newinfo->size, cleanup_entry, &i); return ret; } /* And one copy for every other CPU */ for_each_cpu(i) { - if (i == 0) - continue; - memcpy(newinfo->entries + SMP_ALIGN(newinfo->size) * i, - newinfo->entries, - SMP_ALIGN(newinfo->size)); + if (newinfo->entries[i] && newinfo->entries[i] != entry0) + memcpy(newinfo->entries[i], entry0, newinfo->size); } return ret; @@ -1029,15 +1017,12 @@ replace_table(struct ip6t_table *table, #ifdef CONFIG_NETFILTER_DEBUG { - struct ip6t_entry *table_base; - unsigned int i; + int cpu; - for_each_cpu(i) { - table_base = - (void *)newinfo->entries - + TABLE_OFFSET(newinfo, i); - - table_base->comefrom = 0xdead57ac; + for_each_cpu(cpu) { + struct ip6t_entry *table_base = newinfo->entries[cpu]; + if (table_base) + table_base->comefrom = 0xdead57ac; } } #endif @@ -1072,16 +1057,44 @@ add_entry_to_counter(const struct ip6t_entry *e, return 0; } +static inline int +set_entry_to_counter(const struct ip6t_entry *e, + struct ip6t_counters total[], + unsigned int *i) +{ + SET_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt); + + (*i)++; + return 0; +} + static void get_counters(const struct ip6t_table_info *t, struct ip6t_counters counters[]) { unsigned int cpu; unsigned int i; + unsigned int curcpu; + + /* Instead of clearing (by a previous call to memset()) + * the counters and using adds, we set the counters + * with data used by 'current' CPU + * We dont care about preemption here. + */ + curcpu = raw_smp_processor_id(); + + i = 0; + IP6T_ENTRY_ITERATE(t->entries[curcpu], + t->size, + set_entry_to_counter, + counters, + &i); for_each_cpu(cpu) { + if (cpu == curcpu) + continue; i = 0; - IP6T_ENTRY_ITERATE(t->entries + TABLE_OFFSET(t, cpu), + IP6T_ENTRY_ITERATE(t->entries[cpu], t->size, add_entry_to_counter, counters, @@ -1098,6 +1111,7 @@ copy_entries_to_user(unsigned int total_size, struct ip6t_entry *e; struct ip6t_counters *counters; int ret = 0; + void *loc_cpu_entry; /* We need atomic snapshot of counters: rest doesn't change (other than comefrom, which userspace doesn't care @@ -1109,13 +1123,13 @@ copy_entries_to_user(unsigned int total_size, return -ENOMEM; /* First, sum counters... */ - memset(counters, 0, countersize); write_lock_bh(&table->lock); get_counters(table->private, counters); write_unlock_bh(&table->lock); - /* ... then copy entire thing from CPU 0... */ - if (copy_to_user(userptr, table->private->entries, total_size) != 0) { + /* choose the copy that is on ourc node/cpu */ + loc_cpu_entry = table->private->entries[raw_smp_processor_id()]; + if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) { ret = -EFAULT; goto free_counters; } @@ -1127,7 +1141,7 @@ copy_entries_to_user(unsigned int total_size, struct ip6t_entry_match *m; struct ip6t_entry_target *t; - e = (struct ip6t_entry *)(table->private->entries + off); + e = (struct ip6t_entry *)(loc_cpu_entry + off); if (copy_to_user(userptr + off + offsetof(struct ip6t_entry, counters), &counters[num], @@ -1196,6 +1210,46 @@ get_entries(const struct ip6t_get_entries *entries, return ret; } +static void free_table_info(struct ip6t_table_info *info) +{ + int cpu; + for_each_cpu(cpu) { + if (info->size <= PAGE_SIZE) + kfree(info->entries[cpu]); + else + vfree(info->entries[cpu]); + } + kfree(info); +} + +static struct ip6t_table_info *alloc_table_info(unsigned int size) +{ + struct ip6t_table_info *newinfo; + int cpu; + + newinfo = kzalloc(sizeof(struct ip6t_table_info), GFP_KERNEL); + if (!newinfo) + return NULL; + + newinfo->size = size; + + for_each_cpu(cpu) { + if (size <= PAGE_SIZE) + newinfo->entries[cpu] = kmalloc_node(size, + GFP_KERNEL, + cpu_to_node(cpu)); + else + newinfo->entries[cpu] = vmalloc_node(size, + cpu_to_node(cpu)); + if (newinfo->entries[cpu] == NULL) { + free_table_info(newinfo); + return NULL; + } + } + + return newinfo; +} + static int do_replace(void __user *user, unsigned int len) { @@ -1204,6 +1258,7 @@ do_replace(void __user *user, unsigned int len) struct ip6t_table *t; struct ip6t_table_info *newinfo, *oldinfo; struct ip6t_counters *counters; + void *loc_cpu_entry, *loc_cpu_old_entry; if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) return -EFAULT; @@ -1212,13 +1267,13 @@ do_replace(void __user *user, unsigned int len) if ((SMP_ALIGN(tmp.size) >> PAGE_SHIFT) + 2 > num_physpages) return -ENOMEM; - newinfo = vmalloc(sizeof(struct ip6t_table_info) - + SMP_ALIGN(tmp.size) * - (highest_possible_processor_id()+1)); + newinfo = alloc_table_info(tmp.size); if (!newinfo) return -ENOMEM; - if (copy_from_user(newinfo->entries, user + sizeof(tmp), + /* choose the copy that is on our node/cpu */ + loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), tmp.size) != 0) { ret = -EFAULT; goto free_newinfo; @@ -1229,10 +1284,9 @@ do_replace(void __user *user, unsigned int len) ret = -ENOMEM; goto free_newinfo; } - memset(counters, 0, tmp.num_counters * sizeof(struct ip6t_counters)); ret = translate_table(tmp.name, tmp.valid_hooks, - newinfo, tmp.size, tmp.num_entries, + newinfo, loc_cpu_entry, tmp.size, tmp.num_entries, tmp.hook_entry, tmp.underflow); if (ret != 0) goto free_newinfo_counters; @@ -1271,8 +1325,9 @@ do_replace(void __user *user, unsigned int len) /* Get the old counters. */ get_counters(oldinfo, counters); /* Decrease module usage counts and free resource */ - IP6T_ENTRY_ITERATE(oldinfo->entries, oldinfo->size, cleanup_entry,NULL); - vfree(oldinfo); + loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; + IP6T_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,NULL); + free_table_info(oldinfo); if (copy_to_user(tmp.counters, counters, sizeof(struct ip6t_counters) * tmp.num_counters) != 0) ret = -EFAULT; @@ -1284,11 +1339,11 @@ do_replace(void __user *user, unsigned int len) module_put(t->me); up(&ip6t_mutex); free_newinfo_counters_untrans: - IP6T_ENTRY_ITERATE(newinfo->entries, newinfo->size, cleanup_entry,NULL); + IP6T_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry,NULL); free_newinfo_counters: vfree(counters); free_newinfo: - vfree(newinfo); + free_table_info(newinfo); return ret; } @@ -1321,6 +1376,7 @@ do_add_counters(void __user *user, unsigned int len) struct ip6t_counters_info tmp, *paddc; struct ip6t_table *t; int ret = 0; + void *loc_cpu_entry; if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) return -EFAULT; @@ -1350,7 +1406,9 @@ do_add_counters(void __user *user, unsigned int len) } i = 0; - IP6T_ENTRY_ITERATE(t->private->entries, + /* Choose the copy that is on our node */ + loc_cpu_entry = t->private->entries[smp_processor_id()]; + IP6T_ENTRY_ITERATE(loc_cpu_entry, t->private->size, add_counter_to_entry, paddc->counters, @@ -1543,28 +1601,29 @@ int ip6t_register_table(struct ip6t_table *table, struct ip6t_table_info *newinfo; static struct ip6t_table_info bootstrap = { 0, 0, 0, { 0 }, { 0 }, { } }; + void *loc_cpu_entry; - newinfo = vmalloc(sizeof(struct ip6t_table_info) - + SMP_ALIGN(repl->size) * - (highest_possible_processor_id()+1)); + newinfo = alloc_table_info(repl->size); if (!newinfo) return -ENOMEM; - memcpy(newinfo->entries, repl->entries, repl->size); + /* choose the copy on our node/cpu */ + loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + memcpy(loc_cpu_entry, repl->entries, repl->size); ret = translate_table(table->name, table->valid_hooks, - newinfo, repl->size, + newinfo, loc_cpu_entry, repl->size, repl->num_entries, repl->hook_entry, repl->underflow); if (ret != 0) { - vfree(newinfo); + free_table_info(newinfo); return ret; } ret = down_interruptible(&ip6t_mutex); if (ret != 0) { - vfree(newinfo); + free_table_info(newinfo); return ret; } @@ -1593,20 +1652,23 @@ int ip6t_register_table(struct ip6t_table *table, return ret; free_unlock: - vfree(newinfo); + free_table_info(newinfo); goto unlock; } void ip6t_unregister_table(struct ip6t_table *table) { + void *loc_cpu_entry; + down(&ip6t_mutex); LIST_DELETE(&ip6t_tables, table); up(&ip6t_mutex); /* Decrease module usage counts and free resources */ - IP6T_ENTRY_ITERATE(table->private->entries, table->private->size, + loc_cpu_entry = table->private->entries[raw_smp_processor_id()]; + IP6T_ENTRY_ITERATE(loc_cpu_entry, table->private->size, cleanup_entry, NULL); - vfree(table->private); + free_table_info(table->private); } /* Returns 1 if the port is matched by the range, 0 otherwise */ -- cgit v1.2.3 From 14c850212ed8f8cbb5972ad6b8812e08a0bc901c Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 27 Dec 2005 02:43:12 -0200 Subject: [INET_SOCK]: Move struct inet_sock & helper functions to net/inet_sock.h To help in reducing the number of include dependencies, several files were touched as they were getting needed headers indirectly for stuff they use. Thanks also to Alan Menegotto for pointing out that net/dccp/proto.c had linux/dccp.h include twice. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- net/ipv6/netfilter/ip6_tables.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/ipv6/netfilter/ip6_tables.c') diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index dd80020d874..ea43ef1d94a 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -15,6 +15,7 @@ * - new extension header parser code */ #include +#include #include #include #include -- cgit v1.2.3