diff options
Diffstat (limited to 'net')
35 files changed, 2646 insertions, 1015 deletions
diff --git a/net/core/dev.c b/net/core/dev.c index ab935778ce8..7016e0c36b3 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -115,18 +115,6 @@ #endif /* CONFIG_NET_RADIO */ #include <asm/current.h> -/* This define, if set, will randomly drop a packet when congestion - * is more than moderate. It helps fairness in the multi-interface - * case when one of them is a hog, but it kills performance for the - * single interface case so it is off now by default. - */ -#undef RAND_LIE - -/* Setting this will sample the queue lengths and thus congestion - * via a timer instead of as each packet is received. - */ -#undef OFFLINE_SAMPLE - /* * The list of packet types we will receive (as opposed to discard) * and the routines to invoke. @@ -159,11 +147,6 @@ static DEFINE_SPINLOCK(ptype_lock); static struct list_head ptype_base[16]; /* 16 way hashed list */ static struct list_head ptype_all; /* Taps */ -#ifdef OFFLINE_SAMPLE -static void sample_queue(unsigned long dummy); -static struct timer_list samp_timer = TIMER_INITIALIZER(sample_queue, 0, 0); -#endif - /* * The @dev_base list is protected by @dev_base_lock and the rtln * semaphore. @@ -215,7 +198,7 @@ static struct notifier_block *netdev_chain; * Device drivers call our routines to queue packets here. We empty the * queue in the local softnet handler. */ -DEFINE_PER_CPU(struct softnet_data, softnet_data) = { 0, }; +DEFINE_PER_CPU(struct softnet_data, softnet_data) = { NULL }; #ifdef CONFIG_SYSFS extern int netdev_sysfs_init(void); @@ -1363,71 +1346,13 @@ out: Receiver routines =======================================================================*/ -int netdev_max_backlog = 300; +int netdev_max_backlog = 1000; +int netdev_budget = 300; int weight_p = 64; /* old backlog weight */ -/* These numbers are selected based on intuition and some - * experimentatiom, if you have more scientific way of doing this - * please go ahead and fix things. - */ -int no_cong_thresh = 10; -int no_cong = 20; -int lo_cong = 100; -int mod_cong = 290; DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, }; -static void get_sample_stats(int cpu) -{ -#ifdef RAND_LIE - unsigned long rd; - int rq; -#endif - struct softnet_data *sd = &per_cpu(softnet_data, cpu); - int blog = sd->input_pkt_queue.qlen; - int avg_blog = sd->avg_blog; - - avg_blog = (avg_blog >> 1) + (blog >> 1); - - if (avg_blog > mod_cong) { - /* Above moderate congestion levels. */ - sd->cng_level = NET_RX_CN_HIGH; -#ifdef RAND_LIE - rd = net_random(); - rq = rd % netdev_max_backlog; - if (rq < avg_blog) /* unlucky bastard */ - sd->cng_level = NET_RX_DROP; -#endif - } else if (avg_blog > lo_cong) { - sd->cng_level = NET_RX_CN_MOD; -#ifdef RAND_LIE - rd = net_random(); - rq = rd % netdev_max_backlog; - if (rq < avg_blog) /* unlucky bastard */ - sd->cng_level = NET_RX_CN_HIGH; -#endif - } else if (avg_blog > no_cong) - sd->cng_level = NET_RX_CN_LOW; - else /* no congestion */ - sd->cng_level = NET_RX_SUCCESS; - - sd->avg_blog = avg_blog; -} - -#ifdef OFFLINE_SAMPLE -static void sample_queue(unsigned long dummy) -{ -/* 10 ms 0r 1ms -- i don't care -- JHS */ - int next_tick = 1; - int cpu = smp_processor_id(); - - get_sample_stats(cpu); - next_tick += jiffies; - mod_timer(&samp_timer, next_tick); -} -#endif - - /** * netif_rx - post buffer to the network code * @skb: buffer to post @@ -1448,7 +1373,6 @@ static void sample_queue(unsigned long dummy) int netif_rx(struct sk_buff *skb) { - int this_cpu; struct softnet_data *queue; unsigned long flags; @@ -1464,38 +1388,22 @@ int netif_rx(struct sk_buff *skb) * short when CPU is congested, but is still operating. */ local_irq_save(flags); - this_cpu = smp_processor_id(); queue = &__get_cpu_var(softnet_data); __get_cpu_var(netdev_rx_stat).total++; if (queue->input_pkt_queue.qlen <= netdev_max_backlog) { if (queue->input_pkt_queue.qlen) { - if (queue->throttle) - goto drop; - enqueue: dev_hold(skb->dev); __skb_queue_tail(&queue->input_pkt_queue, skb); -#ifndef OFFLINE_SAMPLE - get_sample_stats(this_cpu); -#endif local_irq_restore(flags); - return queue->cng_level; + return NET_RX_SUCCESS; } - if (queue->throttle) - queue->throttle = 0; - netif_rx_schedule(&queue->backlog_dev); goto enqueue; } - if (!queue->throttle) { - queue->throttle = 1; - __get_cpu_var(netdev_rx_stat).throttled++; - } - -drop: __get_cpu_var(netdev_rx_stat).dropped++; local_irq_restore(flags); @@ -1780,8 +1688,6 @@ job_done: smp_mb__before_clear_bit(); netif_poll_enable(backlog_dev); - if (queue->throttle) - queue->throttle = 0; local_irq_enable(); return 0; } @@ -1790,8 +1696,7 @@ static void net_rx_action(struct softirq_action *h) { struct softnet_data *queue = &__get_cpu_var(softnet_data); unsigned long start_time = jiffies; - int budget = netdev_max_backlog; - + int budget = netdev_budget; local_irq_disable(); @@ -2055,15 +1960,9 @@ static int softnet_seq_show(struct seq_file *seq, void *v) struct netif_rx_stats *s = v; seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n", - s->total, s->dropped, s->time_squeeze, s->throttled, - s->fastroute_hit, s->fastroute_success, s->fastroute_defer, - s->fastroute_deferred_out, -#if 0 - s->fastroute_latency_reduction -#else - s->cpu_collision -#endif - ); + s->total, s->dropped, s->time_squeeze, 0, + 0, 0, 0, 0, /* was fastroute */ + s->cpu_collision ); return 0; } @@ -3305,9 +3204,6 @@ static int __init net_dev_init(void) queue = &per_cpu(softnet_data, i); skb_queue_head_init(&queue->input_pkt_queue); - queue->throttle = 0; - queue->cng_level = 0; - queue->avg_blog = 10; /* arbitrary non-zero */ queue->completion_queue = NULL; INIT_LIST_HEAD(&queue->poll_list); set_bit(__LINK_STATE_START, &queue->backlog_dev.state); @@ -3316,11 +3212,6 @@ static int __init net_dev_init(void) atomic_set(&queue->backlog_dev.refcnt, 1); } -#ifdef OFFLINE_SAMPLE - samp_timer.expires = jiffies + (10 * HZ); - add_timer(&samp_timer); -#endif - dev_boot_phase = 0; open_softirq(NET_TX_SOFTIRQ, net_tx_action, NULL); diff --git a/net/core/neighbour.c b/net/core/neighbour.c index f6bdcad47da..851eb927ed9 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -32,6 +32,7 @@ #include <net/sock.h> #include <linux/rtnetlink.h> #include <linux/random.h> +#include <linux/string.h> #define NEIGH_DEBUG 1 @@ -2592,7 +2593,7 @@ int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p, t->neigh_vars[17].extra1 = dev; } - dev_name = net_sysctl_strdup(dev_name_source); + dev_name = kstrdup(dev_name_source, GFP_KERNEL); if (!dev_name) { err = -ENOBUFS; goto free; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 6d68c03bc05..bb73b2190ec 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1500,6 +1500,159 @@ void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len) skb_split_no_header(skb, skb1, len, pos); } +/** + * skb_prepare_seq_read - Prepare a sequential read of skb data + * @skb: the buffer to read + * @from: lower offset of data to be read + * @to: upper offset of data to be read + * @st: state variable + * + * Initializes the specified state variable. Must be called before + * invoking skb_seq_read() for the first time. + */ +void skb_prepare_seq_read(struct sk_buff *skb, unsigned int from, + unsigned int to, struct skb_seq_state *st) +{ + st->lower_offset = from; + st->upper_offset = to; + st->root_skb = st->cur_skb = skb; + st->frag_idx = st->stepped_offset = 0; + st->frag_data = NULL; +} + +/** + * skb_seq_read - Sequentially read skb data + * @consumed: number of bytes consumed by the caller so far + * @data: destination pointer for data to be returned + * @st: state variable + * + * Reads a block of skb data at &consumed relative to the + * lower offset specified to skb_prepare_seq_read(). Assigns + * the head of the data block to &data and returns the length + * of the block or 0 if the end of the skb data or the upper + * offset has been reached. + * + * The caller is not required to consume all of the data + * returned, i.e. &consumed is typically set to the number + * of bytes already consumed and the next call to + * skb_seq_read() will return the remaining part of the block. + * + * Note: The size of each block of data returned can be arbitary, + * this limitation is the cost for zerocopy seqeuental + * reads of potentially non linear data. + * + * Note: Fragment lists within fragments are not implemented + * at the moment, state->root_skb could be replaced with + * a stack for this purpose. + */ +unsigned int skb_seq_read(unsigned int consumed, const u8 **data, + struct skb_seq_state *st) +{ + unsigned int block_limit, abs_offset = consumed + st->lower_offset; + skb_frag_t *frag; + + if (unlikely(abs_offset >= st->upper_offset)) + return 0; + +next_skb: + block_limit = skb_headlen(st->cur_skb); + + if (abs_offset < block_limit) { + *data = st->cur_skb->data + abs_offset; + return block_limit - abs_offset; + } + + if (st->frag_idx == 0 && !st->frag_data) + st->stepped_offset += skb_headlen(st->cur_skb); + + while (st->frag_idx < skb_shinfo(st->cur_skb)->nr_frags) { + frag = &skb_shinfo(st->cur_skb)->frags[st->frag_idx]; + block_limit = frag->size + st->stepped_offset; + + if (abs_offset < block_limit) { + if (!st->frag_data) + st->frag_data = kmap_skb_frag(frag); + + *data = (u8 *) st->frag_data + frag->page_offset + + (abs_offset - st->stepped_offset); + + return block_limit - abs_offset; + } + + if (st->frag_data) { + kunmap_skb_frag(st->frag_data); + st->frag_data = NULL; + } + + st->frag_idx++; + st->stepped_offset += frag->size; + } + + if (st->cur_skb->next) { + st->cur_skb = st->cur_skb->next; + st->frag_idx = 0; + goto next_skb; + } else if (st->root_skb == st->cur_skb && + skb_shinfo(st->root_skb)->frag_list) { + st->cur_skb = skb_shinfo(st->root_skb)->frag_list; + goto next_skb; + } + + return 0; +} + +/** + * skb_abort_seq_read - Abort a sequential read of skb data + * @st: state variable + * + * Must be called if skb_seq_read() was not called until it + * returned 0. + */ +void skb_abort_seq_read(struct skb_seq_state *st) +{ + if (st->frag_data) + kunmap_skb_frag(st->frag_data); +} + +#define TS_SKB_CB(state) ((struct skb_seq_state *) &((state)->cb)) + +static unsigned int skb_ts_get_next_block(unsigned int offset, const u8 **text, + struct ts_config *conf, + struct ts_state *state) +{ + return skb_seq_read(offset, text, TS_SKB_CB(state)); +} + +static void skb_ts_finish(struct ts_config *conf, struct ts_state *state) +{ + skb_abort_seq_read(TS_SKB_CB(state)); +} + +/** + * skb_find_text - Find a text pattern in skb data + * @skb: the buffer to look in + * @from: search offset + * @to: search limit + * @config: textsearch configuration + * @state: uninitialized textsearch state variable + * + * Finds a pattern in the skb data according to the specified + * textsearch configuration. Use textsearch_next() to retrieve + * subsequent occurrences of the pattern. Returns the offset + * to the first occurrence or UINT_MAX if no match was found. + */ +unsigned int skb_find_text(struct sk_buff *skb, unsigned int from, + unsigned int to, struct ts_config *config, + struct ts_state *state) +{ + config->get_next_block = skb_ts_get_next_block; + config->finish = skb_ts_finish; + + skb_prepare_seq_read(skb, from, to, TS_SKB_CB(state)); + + return textsearch_find(config, state); +} + void __init skb_init(void) { skbuff_head_cache = kmem_cache_create("skbuff_head_cache", @@ -1538,3 +1691,7 @@ EXPORT_SYMBOL(skb_queue_tail); EXPORT_SYMBOL(skb_unlink); EXPORT_SYMBOL(skb_append); EXPORT_SYMBOL(skb_split); +EXPORT_SYMBOL(skb_prepare_seq_read); +EXPORT_SYMBOL(skb_seq_read); +EXPORT_SYMBOL(skb_abort_seq_read); +EXPORT_SYMBOL(skb_find_text); diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index c8be646cb19..8f817ad9f54 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -13,12 +13,8 @@ #ifdef CONFIG_SYSCTL extern int netdev_max_backlog; +extern int netdev_budget; extern int weight_p; -extern int no_cong_thresh; -extern int no_cong; -extern int lo_cong; -extern int mod_cong; -extern int netdev_fastroute; extern int net_msg_cost; extern int net_msg_burst; @@ -35,19 +31,6 @@ extern int sysctl_somaxconn; extern char sysctl_divert_version[]; #endif /* CONFIG_NET_DIVERT */ -/* - * This strdup() is used for creating copies of network - * device names to be handed over to sysctl. - */ - -char *net_sysctl_strdup(const char *s) -{ - char *rv = kmalloc(strlen(s)+1, GFP_KERNEL); - if (rv) - strcpy(rv, s); - return rv; -} - ctl_table core_table[] = { #ifdef CONFIG_NET { @@ -99,38 +82,6 @@ ctl_table core_table[] = { .proc_handler = &proc_dointvec }, { - .ctl_name = NET_CORE_NO_CONG_THRESH, - .procname = "no_cong_thresh", - .data = &no_cong_thresh, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec - }, - { - .ctl_name = NET_CORE_NO_CONG, - .procname = "no_cong", - .data = &no_cong, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec - }, - { - .ctl_name = NET_CORE_LO_CONG, - .procname = "lo_cong", - .data = &lo_cong, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec - }, - { - .ctl_name = NET_CORE_MOD_CONG, - .procname = "mod_cong", - .data = &mod_cong, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec - }, - { .ctl_name = NET_CORE_MSG_COST, .procname = "message_cost", .data = &net_msg_cost, @@ -174,9 +125,15 @@ ctl_table core_table[] = { .mode = 0644, .proc_handler = &proc_dointvec }, + { + .ctl_name = NET_CORE_BUDGET, + .procname = "netdev_budget", + .data = &netdev_budget, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, { .ctl_name = 0 } }; -EXPORT_SYMBOL(net_sysctl_strdup); - #endif diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 567b03b1c34..34708343312 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -1,32 +1,6 @@ # # IP configuration # -choice - prompt "Choose IP: FIB lookup" - depends on INET - default IP_FIB_HASH - -config IP_FIB_HASH - bool "FIB_HASH" - ---help--- - Current FIB is very proven and good enough for most users. - -config IP_FIB_TRIE - bool "FIB_TRIE" - ---help--- - Use new experimental LC-trie as FIB lookup algoritm. - This improves lookup performance - - LC-trie is described in: - - IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson - IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999 - An experimental study of compression methods for dynamic tries - Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002. - http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/ - -endchoice - config IP_MULTICAST bool "IP: multicasting" depends on INET @@ -79,6 +53,44 @@ config IP_ADVANCED_ROUTER If unsure, say N here. +choice + prompt "Choose IP: FIB lookup algorithm (choose FIB_HASH if unsure)" + depends on IP_ADVANCED_ROUTER + default IP_FIB_HASH + +config IP_FIB_HASH + bool "FIB_HASH" + ---help--- + Current FIB is very proven and good enough for most users. + +config IP_FIB_TRIE + bool "FIB_TRIE" + ---help--- + Use new experimental LC-trie as FIB lookup algoritm. + This improves lookup performance if you have a large + number of routes. + + LC-trie is a longest matching prefix lookup algorithm which + performs better than FIB_HASH for large routing tables. + But, it consumes more memory and is more complex. + + LC-trie is described in: + + IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson + IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999 + An experimental study of compression methods for dynamic tries + Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002. + http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/ + +endchoice + +# If the user does not enable advanced routing, he gets the safe +# default of the fib-hash algorithm. +config IP_FIB_HASH + bool + depends on !IP_ADVANCED_ROUTER + default y + config IP_MULTIPLE_TABLES bool "IP: policy routing" depends on IP_ADVANCED_ROUTER @@ -433,5 +445,113 @@ config IP_TCPDIAG config IP_TCPDIAG_IPV6 def_bool (IP_TCPDIAG=y && IPV6=y) || (IP_TCPDIAG=m && IPV6) +config TCP_CONG_ADVANCED + bool "TCP: advanced congestion control" + depends on INET + default y + ---help--- + Support for selection of various TCP congestion control + modules. + + Nearly all users can safely say no here, and a safe default + selection will be made (BIC-TCP with new Reno as a fallback). + + If unsure, say N. + +# TCP Reno is builtin (required as fallback) +menu "TCP congestion control" + depends on TCP_CONG_ADVANCED + +config TCP_CONG_BIC + tristate "Binary Increase Congestion (BIC) control" + depends on INET + default y + ---help--- + BIC-TCP is a sender-side only change that ensures a linear RTT + fairness under large windows while offering both scalability and + bounded TCP-friendliness. The protocol combines two schemes + called additive increase and binary search increase. When the + congestion window is large, additive increase with a large + increment ensures linear RTT fairness as well as good + scalability. Under small congestion windows, binary search + increase provides TCP friendliness. + See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/ + +config TCP_CONG_WESTWOOD + tristate "TCP Westwood+" + depends on INET + default m + ---help--- + TCP Westwood+ is a sender-side only modification of the TCP Reno + protocol stack that optimizes the performance of TCP congestion + control. It is based on end-to-end bandwidth estimation to set + congestion window and slow start threshold after a congestion + episode. Using this estimation, TCP Westwood+ adaptively sets a + slow start threshold and a congestion window which takes into + account the bandwidth used at the time congestion is experienced. + TCP Westwood+ significantly increases fairness wrt TCP Reno in + wired networks and throughput over wireless links. + +config TCP_CONG_HTCP + tristate "H-TCP" + depends on INET + default m + ---help--- + H-TCP is a send-side only modifications of the TCP Reno + protocol stack that optimizes the performance of TCP + congestion control for high speed network links. It uses a + modeswitch to change the alpha and beta parameters of TCP Reno + based on network conditions and in a way so as to be fair with + other Reno and H-TCP flows. + +config TCP_CONG_HSTCP + tristate "High Speed TCP" + depends on INET && EXPERIMENTAL + default n + ---help--- + Sally Floyd's High Speed TCP (RFC 3649) congestion control. + A modification to TCP's congestion control mechanism for use + with large congestion windows. A table indicates how much to + increase the congestion window by when an ACK is received. + For more detail see http://www.icir.org/floyd/hstcp.html + +config TCP_CONG_HYBLA + tristate "TCP-Hybla congestion control algorithm" + depends on INET && EXPERIMENTAL + default n + ---help--- + TCP-Hybla is a sender-side only change that eliminates penalization of + long-RTT, large-bandwidth connections, like when satellite legs are + involved, expecially when sharing a common bottleneck with normal + terrestrial connections. + +config TCP_CONG_VEGAS + tristate "TCP Vegas" + depends on INET && EXPERIMENTAL + default n + ---help--- + TCP Vegas is a sender-side only change to TCP that anticipates + the onset of congestion by estimating the bandwidth. TCP Vegas + adjusts the sending rate by modifying the congestion + window. TCP Vegas should provide less packet loss, but it is + not as aggressive as TCP Reno. + +config TCP_CONG_SCALABLE + tristate "Scalable TCP" + depends on INET && EXPERIMENTAL + default n + ---help--- + Scalable TCP is a sender-side only change to TCP which uses a + MIMD congestion control algorithm which has some nice scaling + properties, though is known to have fairness issues. + See http://www-lce.eng.cam.ac.uk/~ctk21/scalable/ + +endmenu + +config TCP_CONG_BIC + boolean + depends on !TCP_CONG_ADVANCED + default y + source "net/ipv4/ipvs/Kconfig" diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 65d57d8e1ad..5718cdb3a61 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -5,7 +5,8 @@ obj-y := utils.o route.o inetpeer.o protocol.o \ ip_input.o ip_fragment.o ip_forward.o ip_options.o \ ip_output.o ip_sockglue.o \ - tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o tcp_minisocks.o \ + tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \ + tcp_minisocks.o tcp_cong.o \ datagram.o raw.o udp.o arp.o icmp.o devinet.o af_inet.o igmp.o \ sysctl_net_ipv4.o fib_frontend.o fib_semantics.o @@ -30,6 +31,13 @@ obj-$(CONFIG_NETFILTER) += netfilter/ obj-$(CONFIG_IP_VS) += ipvs/ obj-$(CONFIG_IP_TCPDIAG) += tcp_diag.o obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o +obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o +obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o +obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o +obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o +obj-$(CONFIG_TCP_CONG_HTCP) += tcp_htcp.o +obj-$(CONFIG_TCP_CONG_VEGAS) += tcp_vegas.o +obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ xfrm4_output.o diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 650dcb12d9a..d8a10e3dd77 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1471,7 +1471,7 @@ static void devinet_sysctl_register(struct in_device *in_dev, * by sysctl and we wouldn't want anyone to change it under our feet * (see SIOCSIFNAME). */ - dev_name = net_sysctl_strdup(dev_name); + dev_name = kstrdup(dev_name, GFP_KERNEL); if (!dev_name) goto free; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 23068bddbf0..e3289453241 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -118,6 +118,45 @@ static int ipv4_sysctl_forward_strategy(ctl_table *table, return 1; } +static int proc_tcp_congestion_control(ctl_table *ctl, int write, struct file * filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + char val[TCP_CA_NAME_MAX]; + ctl_table tbl = { + .data = val, + .maxlen = TCP_CA_NAME_MAX, + }; + int ret; + + tcp_get_default_congestion_control(val); + + ret = proc_dostring(&tbl, write, filp, buffer, lenp, ppos); + if (write && ret == 0) + ret = tcp_set_default_congestion_control(val); + return ret; +} + +int sysctl_tcp_congestion_control(ctl_table *table, int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen, + void **context) +{ + char val[TCP_CA_NAME_MAX]; + ctl_table tbl = { + .data = val, + .maxlen = TCP_CA_NAME_MAX, + }; + int ret; + + tcp_get_default_congestion_control(val); + ret = sysctl_string(&tbl, name, nlen, oldval, oldlenp, newval, newlen, + context); + if (ret == 0 && newval && newlen) + ret = tcp_set_default_congestion_control(val); + return ret; +} + + ctl_table ipv4_table[] = { { .ctl_name = NET_IPV4_TCP_TIMESTAMPS, @@ -612,70 +651,6 @@ ctl_table ipv4_table[] = { .proc_handler = &proc_dointvec, }, { - .ctl_name = NET_TCP_WESTWOOD, - .procname = "tcp_westwood", - .data = &sysctl_tcp_westwood, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = NET_TCP_VEGAS, - .procname = "tcp_vegas_cong_avoid", - .data = &sysctl_tcp_vegas_cong_avoid, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = NET_TCP_VEGAS_ALPHA, - .procname = "tcp_vegas_alpha", - .data = &sysctl_tcp_vegas_alpha, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = NET_TCP_VEGAS_BETA, - .procname = "tcp_vegas_beta", - .data = &sysctl_tcp_vegas_beta, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = NET_TCP_VEGAS_GAMMA, - .procname = "tcp_vegas_gamma", - .data = &sysctl_tcp_vegas_gamma, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = NET_TCP_BIC, - .procname = "tcp_bic", - .data = &sysctl_tcp_bic, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = NET_TCP_BIC_FAST_CONVERGENCE, - .procname = "tcp_bic_fast_convergence", - .data = &sysctl_tcp_bic_fast_convergence, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = NET_TCP_BIC_LOW_WINDOW, - .procname = "tcp_bic_low_window", - .data = &sysctl_tcp_bic_low_window, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { .ctl_name = NET_TCP_MODERATE_RCVBUF, .procname = "tcp_moderate_rcvbuf", .data = &sysctl_tcp_moderate_rcvbuf, @@ -692,13 +667,14 @@ ctl_table ipv4_table[] = { .proc_handler = &proc_dointvec, }, { - .ctl_name = NET_TCP_BIC_BETA, - .procname = "tcp_bic_beta", - .data = &sysctl_tcp_bic_beta, - .maxlen = sizeof(int), + .ctl_name = NET_TCP_CONG_CONTROL, + .procname = "tcp_congestion_control", .mode = 0644, - .proc_handler = &proc_dointvec, + .maxlen = TCP_CA_NAME_MAX, + .proc_handler = &proc_tcp_congestion_control, + .strategy = &sysctl_tcp_congestion_control, }, + { .ctl_name = 0 } }; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 674bbd8cfd3..882436da9a3 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1927,6 +1927,25 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, return tp->af_specific->setsockopt(sk, level, optname, optval, optlen); + /* This is a string value all the others are int's */ + if (optname == TCP_CONGESTION) { + char name[TCP_CA_NAME_MAX]; + + if (optlen < 1) + return -EINVAL; + + val = strncpy_from_user(name, optval, + min(TCP_CA_NAME_MAX-1, optlen)); + if (val < 0) + return -EFAULT; + name[val] = 0; + + lock_sock(sk); + err = tcp_set_congestion_control(tp, name); + release_sock(sk); + return err; + } + if (optlen < sizeof(int)) return -EINVAL; @@ -2211,6 +2230,16 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, case TCP_QUICKACK: val = !tp->ack.pingpong; break; + + case TCP_CONGESTION: + if (get_user(len, optlen)) + return -EFAULT; + len = min_t(unsigned int, len, TCP_CA_NAME_MAX); + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, tp->ca_ops->name, len)) + return -EFAULT; + return 0; default: return -ENOPROTOOPT; }; @@ -2224,7 +2253,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, extern void __skb_cb_too_small_for_tcp(int, int); -extern void tcpdiag_init(void); +extern struct tcp_congestion_ops tcp_reno; static __initdata unsigned long thash_entries; static int __init set_thash_entries(char *str) @@ -2333,6 +2362,8 @@ void __init tcp_init(void) printk(KERN_INFO "TCP: Hash tables configured " "(established %d bind %d)\n", tcp_ehash_size << 1, tcp_bhash_size); + + tcp_register_congestion_control(&tcp_reno); } EXPORT_SYMBOL(tcp_accept); diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c new file mode 100644 index 00000000000..ec38d45d664 --- /dev/null +++ b/net/ipv4/tcp_bic.c @@ -0,0 +1,331 @@ +/* + * Binary Increase Congestion control for TCP + * + * This is from the implementation of BICTCP in + * Lison-Xu, Kahaled Harfoush, and Injong Rhee. + * "Binary Increase Congestion Control for Fast, Long Distance + * Networks" in InfoComm 2004 + * Available from: + * http://www.csc.ncsu.edu/faculty/rhee/export/bitcp.pdf + * + * Unless BIC is enabled and congestion window is large + * this behaves the same as the original Reno. + */ + +#include <linux/config.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <net/tcp.h> + + +#define BICTCP_BETA_SCALE 1024 /* Scale factor beta calculation + * max_cwnd = snd_cwnd * beta + */ +#define BICTCP_B 4 /* + * In binary search, + * go to point (max+min)/N + */ + +static int fast_convergence = 1; +static int max_increment = 32; +static int low_window = 14; +static int beta = 819; /* = 819/1024 (BICTCP_BETA_SCALE) */ +static int low_utilization_threshold = 153; +static int low_utilization_period = 2; +static int initial_ssthresh = 100; +static int smooth_part = 20; + +module_param(fast_convergence, int, 0644); +MODULE_PARM_DESC(fast_convergence, "turn on/off fast convergence"); +module_param(max_increment, int, 0644); +MODULE_PARM_DESC(max_increment, "Limit on increment allowed during binary search"); +module_param(low_window, int, 0644); +MODULE_PARM_DESC(low_window, "lower bound on congestion window (for TCP friendliness)"); +module_param(beta, int, 0644); +MODULE_PARM_DESC(beta, "beta for multiplicative increase"); +module_param(low_utilization_threshold, int, 0644); +MODULE_PARM_DESC(low_utilization_threshold, "percent (scaled by 1024) for low utilization mode"); +module_param(low_utilization_period, int, 0644); +MODULE_PARM_DESC(low_utilization_period, "if average delay exceeds then goto to low utilization mode (seconds)"); +module_param(initial_ssthresh, int, 0644); +MODULE_PARM_DESC(initial_ssthresh, "initial value of slow start threshold"); +module_param(smooth_part, int, 0644); +MODULE_PARM_DESC(smooth_part, "log(B/(B*Smin))/log(B/(B-1))+B, # of RTT from Wmax-B to Wmax"); + + +/* BIC TCP Parameters */ +struct bictcp { + u32 cnt; /* increase cwnd by 1 after ACKs */ + u32 last_max_cwnd; /* last maximum snd_cwnd */ + u32 loss_cwnd; /* congestion window at last loss */ + u32 last_cwnd; /* the last snd_cwnd */ + u32 last_time; /* time when updated last_cwnd */ + u32 delay_min; /* min delay */ + u32 delay_max; /* max delay */ + u32 last_delay; + u8 low_utilization;/* 0: high; 1: low */ + u32 low_utilization_start; /* starting time of low utilization detection*/ + u32 epoch_start; /* beginning of an epoch */ +#define ACK_RATIO_SHIFT 4 + u32 delayed_ack; /* estimate the ratio of Packets/ACKs << 4 */ +}; + +static inline void bictcp_reset(struct bictcp *ca) +{ + ca->cnt = 0; + ca->last_max_cwnd = 0; + ca->loss_cwnd = 0; + ca->last_cwnd = 0; + ca->last_time = 0; + ca->delay_min = 0; + ca->delay_max = 0; + ca->last_delay = 0; + ca->low_utilization = 0; + ca->low_utilization_start = 0; + ca->epoch_start = 0; + ca->delayed_ack = 2 << ACK_RATIO_SHIFT; +} + +static void bictcp_init(struct tcp_sock *tp) +{ + bictcp_reset(tcp_ca(tp)); + if (initial_ssthresh) + tp->snd_ssthresh = initial_ssthresh; +} + +/* + * Compute congestion window to use. + */ +static inline void bictcp_update(struct bictcp *ca, u32 cwnd) +{ + if (ca->last_cwnd == cwnd && + (s32)(tcp_time_stamp - ca->last_time) <= HZ / 32) + return; + + ca->last_cwnd = cwnd; + ca->last_time = tcp_time_stamp; + + if (ca->epoch_start == 0) /* record the beginning of an epoch */ + ca->epoch_start = tcp_time_stamp; + + /* start off normal */ + if (cwnd <= low_window) { + ca->cnt = cwnd; + return; + } + + /* binary increase */ + if (cwnd < ca->last_max_cwnd) { + __u32 dist = (ca->last_max_cwnd - cwnd) + / BICTCP_B; + + if (dist > max_increment) + /* linear increase */ + ca->cnt = cwnd / max_increment; + else if (dist <= 1U) + /* binary search increase */ + ca->cnt = (cwnd * smooth_part) / BICTCP_B; + else + /* binary search increase */ + ca->cnt = cwnd / dist; + } else { + /* slow start AMD linear increase */ + if (cwnd < ca->last_max_cwnd + BICTCP_B) + /* slow start */ + ca->cnt = (cwnd * smooth_part) / BICTCP_B; + else if (cwnd < ca->last_max_cwnd + max_increment*(BICTCP_B-1)) + /* slow start */ + ca->cnt = (cwnd * (BICTCP_B-1)) + / cwnd-ca->last_max_cwnd; + else + /* linear increase */ + ca->cnt = cwnd / max_increment; + } + + /* if in slow start or link utilization is very low */ + if ( ca->loss_cwnd == 0 || + (cwnd > ca->loss_cwnd && ca->low_utilization)) { + if (ca->cnt > 20) /* increase cwnd 5% per RTT */ + ca->cnt = 20; + } + + ca->cnt = (ca->cnt << ACK_RATIO_SHIFT) / ca->delayed_ack; + if (ca->cnt == 0) /* cannot be zero */ + ca->cnt = 1; +} + + +/* Detect low utilization in congestion avoidance */ +static inline void bictcp_low_utilization(struct tcp_sock *tp, int flag) +{ + struct bictcp *ca = tcp_ca(tp); + u32 dist, delay; + + /* No time stamp */ + if (!(tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) || + /* Discard delay samples right after fast recovery */ + tcp_time_stamp < ca->epoch_start + HZ || + /* this delay samples may not be accurate */ + flag == 0) { + ca->last_delay = 0; + goto notlow; + } + + delay = ca->last_delay<<3; /* use the same scale as tp->srtt*/ + ca->last_delay = tcp_time_stamp - tp->rx_opt.rcv_tsecr; + if (delay == 0) /* no previous delay sample */ + goto notlow; + + /* first time call or link delay decreases */ + if (ca->delay_min == 0 || ca->delay_min > delay) { + ca->delay_min = ca->delay_max = delay; + goto notlow; + } + + if (ca->delay_max < delay) + ca->delay_max = delay; + + /* utilization is low, if avg delay < dist*threshold + for checking_period time */ + dist = ca->delay_max - ca->delay_min; + if (dist <= ca->delay_min>>6 || + tp->srtt - ca->delay_min >= (dist*low_utilization_threshold)>>10) + goto notlow; + + if (ca->low_utilization_start == 0) { + ca->low_utilization = 0; + ca->low_utilization_start = tcp_time_stamp; + } else if ((s32)(tcp_time_stamp - ca->low_utilization_start) + > low_utilization_period*HZ) { + ca->low_utilization = 1; + } + + return; + + notlow: + ca->low_utilization = 0; + ca->low_utilization_start = 0; + +} + +static void bictcp_cong_avoid(struct tcp_sock *tp, u32 ack, + u32 seq_rtt, u32 in_flight, int data_acked) +{ + struct bictcp *ca = tcp_ca(tp); + + bictcp_low_utilization(tp, data_acked); + + if (in_flight < tp->snd_cwnd) + return; + + if (tp->snd_cwnd <= tp->snd_ssthresh) { + /* In "safe" area, increase. */ + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + } else { + bictcp_update(ca, tp->snd_cwnd); + + /* In dangerous area, increase slowly. + * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd + */ + if (tp->snd_cwnd_cnt >= ca->cnt) { + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + tp->snd_cwnd_cnt = 0; + } else + tp->snd_cwnd_cnt++; + } + +} + +/* + * behave like Reno until low_window is reached, + * then increase congestion window slowly + */ +static u32 bictcp_recalc_ssthresh(struct tcp_sock *tp) +{ + struct bictcp *ca = tcp_ca(tp); + + ca->epoch_start = 0; /* end of epoch */ + + /* in case of wrong delay_max*/ + if (ca->delay_min > 0 && ca->delay_max > ca->delay_min) + ca->delay_max = ca->delay_min + + ((ca->delay_max - ca->delay_min)* 90) / 100; + + /* Wmax and fast convergence */ + if (tp->snd_cwnd < ca->last_max_cwnd && fast_convergence) + ca->last_max_cwnd = (tp->snd_cwnd * (BICTCP_BETA_SCALE + beta)) + / (2 * BICTCP_BETA_SCALE); + else + ca->last_max_cwnd = tp->snd_cwnd; + + ca->loss_cwnd = tp->snd_cwnd; + + + if (tp->snd_cwnd <= low_window) + return max(tp->snd_cwnd >> 1U, 2U); + else + return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U); +} + +static u32 bictcp_undo_cwnd(struct tcp_sock *tp) +{ + struct bictcp *ca = tcp_ca(tp); + + return max(tp->snd_cwnd, ca->last_max_cwnd); +} + +static u32 bictcp_min_cwnd(struct tcp_sock *tp) +{ + return tp->snd_ssthresh; +} + +static void bictcp_state(struct tcp_sock *tp, u8 new_state) +{ + if (new_state == TCP_CA_Loss) + bictcp_reset(tcp_ca(tp)); +} + +/* Track delayed acknowledgement ratio using sliding window + * ratio = (15*ratio + sample) / 16 + */ +static void bictcp_acked(struct tcp_sock *tp, u32 cnt) +{ + if (cnt > 0 && tp->ca_state == TCP_CA_Open) { + struct bictcp *ca = tcp_ca(tp); + cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT; + ca->delayed_ack += cnt; + } +} + + +static struct tcp_congestion_ops bictcp = { + .init = bictcp_init, + .ssthresh = bictcp_recalc_ssthresh, + .cong_avoid = bictcp_cong_avoid, + .set_state = bictcp_state, + .undo_cwnd = bictcp_undo_cwnd, + .min_cwnd = bictcp_min_cwnd, + .pkts_acked = bictcp_acked, + .owner = THIS_MODULE, + .name = "bic", +}; + +static int __init bictcp_register(void) +{ + BUG_ON(sizeof(struct bictcp) > TCP_CA_PRIV_SIZE); + return tcp_register_congestion_control(&bictcp); +} + +static void __exit bictcp_unregister(void) +{ + tcp_unregister_congestion_control(&bictcp); +} + +module_init(bictcp_register); +module_exit(bictcp_unregister); + +MODULE_AUTHOR("Stephen Hemminger"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("BIC TCP"); diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c new file mode 100644 index 00000000000..4970d10a778 --- /dev/null +++ b/net/ipv4/tcp_cong.c @@ -0,0 +1,237 @@ +/* + * Plugable TCP congestion control support and newReno + * congestion control. + * Based on ideas from I/O scheduler suport and Web100. + * + * Copyright (C) 2005 Stephen Hemminger <shemminger@osdl.org> + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/mm.h> +#include <linux/types.h> +#include <linux/list.h> +#include <net/tcp.h> + +static DEFINE_SPINLOCK(tcp_cong_list_lock); +static LIST_HEAD(tcp_cong_list); + +/* Simple linear search, don't expect many entries! */ +static struct tcp_congestion_ops *tcp_ca_find(const char *name) +{ + struct tcp_congestion_ops *e; + + list_for_each_entry_rcu(e, &tcp_cong_list, list) { + if (strcmp(e->name, name) == 0) + return e; + } + + return NULL; +} + +/* + * Attach new congestion control algorthim to the list + * of available options. + */ +int tcp_register_congestion_control(struct tcp_congestion_ops *ca) +{ + int ret = 0; + + /* all algorithms must implement ssthresh and cong_avoid ops */ + if (!ca->ssthresh || !ca->cong_avoid || !ca->min_cwnd) { + printk(KERN_ERR "TCP %s does not implement required ops\n", + ca->name); + return -EINVAL; + } + + spin_lock(&tcp_cong_list_lock); + if (tcp_ca_find(ca->name)) { + printk(KERN_NOTICE "TCP %s already registered\n", ca->name); + ret = -EEXIST; + } else { + list_add_rcu(&ca->list, &tcp_cong_list); + printk(KERN_INFO "TCP %s registered\n", ca->name); + } + spin_unlock(&tcp_cong_list_lock); + + return ret; +} +EXPORT_SYMBOL_GPL(tcp_register_congestion_control); + +/* + * Remove congestion control algorithm, called from + * the module's remove function. Module ref counts are used + * to ensure that this can't be done till all sockets using + * that method are closed. + */ +void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca) +{ + spin_lock(&tcp_cong_list_lock); + list_del_rcu(&ca->list); + spin_unlock(&tcp_cong_list_lock); +} +EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control); + +/* Assign choice of congestion control. */ +void tcp_init_congestion_control(struct tcp_sock *tp) +{ + struct tcp_congestion_ops *ca; + + if (tp->ca_ops != &tcp_init_congestion_ops) + return; + + rcu_read_lock(); + list_for_each_entry_rcu(ca, &tcp_cong_list, list) { + if (try_module_get(ca->owner)) { + tp->ca_ops = ca; + break; + } + + } + rcu_read_unlock(); + + if (tp->ca_ops->init) + tp->ca_ops->init(tp); +} + +/* Manage refcounts on socket close. */ +void tcp_cleanup_congestion_control(struct tcp_sock *tp) +{ + if (tp->ca_ops->release) + tp->ca_ops->release(tp); + module_put(tp->ca_ops->owner); +} + +/* Used by sysctl to change default congestion control */ +int tcp_set_default_congestion_control(const char *name) +{ + struct tcp_congestion_ops *ca; + int ret = -ENOENT; + + spin_lock(&tcp_cong_list_lock); + ca = tcp_ca_find(name); +#ifdef CONFIG_KMOD + if (!ca) { + spin_unlock(&tcp_cong_list_lock); + + request_module("tcp_%s", name); + spin_lock(&tcp_cong_list_lock); + ca = tcp_ca_find(name); + } +#endif + + if (ca) { + list_move(&ca->list, &tcp_cong_list); + ret = 0; + } + spin_unlock(&tcp_cong_list_lock); + + return ret; +} + +/* Get current default congestion control */ +void tcp_get_default_congestion_control(char *name) +{ + struct tcp_congestion_ops *ca; + /* We will always have reno... */ + BUG_ON(list_empty(&tcp_cong_list)); + + rcu_read_lock(); + ca = list_entry(tcp_cong_list.next, struct tcp_congestion_ops, list); + strncpy(name, ca->name, TCP_CA_NAME_MAX); + rcu_read_unlock(); +} + +/* Change congestion control for socket */ +int tcp_set_congestion_control(struct tcp_sock *tp, const char *name) +{ + struct tcp_congestion_ops *ca; + int err = 0; + + rcu_read_lock(); + ca = tcp_ca_find(name); + if (ca == tp->ca_ops) + goto out; + + if (!ca) + err = -ENOENT; + + else if (!try_module_get(ca->owner)) + err = -EBUSY; + + else { + tcp_cleanup_congestion_control(tp); + tp->ca_ops = ca; + if (tp->ca_ops->init) + tp->ca_ops->init(tp); + } + out: + rcu_read_unlock(); + return err; +} + +/* + * TCP Reno congestion control + * This is special case used for fallback as well. + */ +/* This is Jacobson's slow start and congestion avoidance. + * SIGCOMM '88, p. 328. + */ +void tcp_reno_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, u32 in_flight, + int flag) +{ + if (in_flight < tp->snd_cwnd) + return; + + if (tp->snd_cwnd <= tp->snd_ssthresh) { + /* In "safe" area, increase. */ + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + } else { + /* In dangerous area, increase slowly. + * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd + */ + if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + tp->snd_cwnd_cnt = 0; + } else + tp->snd_cwnd_cnt++; + } +} +EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid); + +/* Slow start threshold is half the congestion window (min 2) */ +u32 tcp_reno_ssthresh(struct tcp_sock *tp) +{ + return max(tp->snd_cwnd >> 1U, 2U); +} +EXPORT_SYMBOL_GPL(tcp_reno_ssthresh); + +/* Lower bound on congestion window. */ +u32 tcp_reno_min_cwnd(struct tcp_sock *tp) +{ + return tp->snd_ssthresh/2; +} +EXPORT_SYMBOL_GPL(tcp_reno_min_cwnd); + +struct tcp_congestion_ops tcp_reno = { + .name = "reno", + .owner = THIS_MODULE, + .ssthresh = tcp_reno_ssthresh, + .cong_avoid = tcp_reno_cong_avoid, + .min_cwnd = tcp_reno_min_cwnd, +}; + +/* Initial congestion control used (until SYN) + * really reno under another name so we can tell difference + * during tcp_set_default_congestion_control + */ +struct tcp_congestion_ops tcp_init_congestion_ops = { + .name = "", + .owner = THIS_MODULE, + .ssthresh = tcp_reno_ssthresh, + .cong_avoid = tcp_reno_cong_avoid, + .min_cwnd = tcp_reno_min_cwnd, +}; +EXPORT_SYMBOL_GPL(tcp_init_congestion_ops); diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index 634befc0792..f66945cb158 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -42,15 +42,8 @@ struct tcpdiag_entry static struct sock *tcpnl; - #define TCPDIAG_PUT(skb, attrtype, attrlen) \ -({ int rtalen = RTA_LENGTH(attrlen); \ - struct rtattr *rta; \ - if (skb_tailroom(skb) < RTA_ALIGN(rtalen)) goto nlmsg_failure; \ - rta = (void*)__skb_put(skb, RTA_ALIGN(rtalen)); \ - rta->rta_type = attrtype; \ - rta->rta_len = rtalen; \ - RTA_DATA(rta); }) + RTA_DATA(__RTA_PUT(skb, attrtype, attrlen)) static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, int ext, u32 pid, u32 seq, u16 nlmsg_flags) @@ -61,7 +54,6 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, struct nlmsghdr *nlh; struct tcp_info *info = NULL; struct tcpdiag_meminfo *minfo = NULL; - struct tcpvegas_info *vinfo = NULL; unsigned char *b = skb->tail; nlh = NLMSG_PUT(skb, pid, seq, TCPDIAG_GETSOCK, sizeof(*r)); @@ -73,9 +65,11 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, if (ext & (1<<(TCPDIAG_INFO-1))) info = TCPDIAG_PUT(skb, TCPDIAG_INFO, sizeof(*info)); - if ((tcp_is_westwood(tp) || tcp_is_vegas(tp)) - && (ext & (1<<(TCPDIAG_VEGASINFO-1)))) - vinfo = TCPDIAG_PUT(skb, TCPDIAG_VEGASINFO, sizeof(*vinfo)); + if (ext & (1<<(TCPDIAG_CONG-1))) { + size_t len = strlen(tp->ca_ops->name); + strcpy(TCPDIAG_PUT(skb, TCPDIAG_CONG, len+1), + tp->ca_ops->name); + } } r->tcpdiag_family = sk->sk_family; r->tcpdiag_state = sk->sk_state; @@ -166,23 +160,13 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, if (info) tcp_get_info(sk, info); - if (vinfo) { - if (tcp_is_vegas(tp)) { - vinfo->tcpv_enabled = tp->vegas.doing_vegas_now; - vinfo->tcpv_rttcnt = tp->vegas.cntRTT; - vinfo->tcpv_rtt = jiffies_to_usecs(tp->vegas.baseRTT); - vinfo->tcpv_minrtt = jiffies_to_usecs(tp->vegas.minRTT); - } else { - vinfo->tcpv_enabled = 0; - vinfo->tcpv_rttcnt = 0; - vinfo->tcpv_rtt = jiffies_to_usecs(tp->westwood.rtt); - vinfo->tcpv_minrtt = jiffies_to_usecs(tp->westwood.rtt_min); - } - } + if (sk->sk_state < TCP_TIME_WAIT && tp->ca_ops->get_info) + tp->ca_ops->get_info(tp, ext, skb); nlh->nlmsg_len = skb->tail - b; return skb->len; +rtattr_failure: nlmsg_failure: skb_trim(skb, b - skb->data); return -1; diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c new file mode 100644 index 00000000000..36c51f8136b --- /dev/null +++ b/net/ipv4/tcp_highspeed.c @@ -0,0 +1,181 @@ +/* + * Sally Floyd's High Speed TCP (RFC 3649) congestion control + * + * See http://www.icir.org/floyd/hstcp.html + * + * John Heffner <jheffner@psc.edu> + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <net/tcp.h> + + +/* From AIMD tables from RFC 3649 appendix B, + * with fixed-point MD scaled <<8. + */ +static const struct hstcp_aimd_val { + unsigned int cwnd; + unsigned int md; +} hstcp_aimd_vals[] = { + { 38, 128, /* 0.50 */ }, + { 118, 112, /* 0.44 */ }, + { 221, 104, /* 0.41 */ }, + { 347, 98, /* 0.38 */ }, + { 495, 93, /* 0.37 */ }, + { 663, 89, /* 0.35 */ }, + { 851, 86, /* 0.34 */ }, + { 1058, 83, /* 0.33 */ }, + { 1284, 81, /* 0.32 */ }, + { 1529, 78, /* 0.31 */ }, + { 1793, 76, /* 0.30 */ }, + { 2076, 74, /* 0.29 */ }, + { 2378, 72, /* 0.28 */ }, + { 2699, 71, /* 0.28 */ }, + { 3039, 69, /* 0.27 */ }, + { 3399, 68, /* 0.27 */ }, + { 3778, 66, /* 0.26 */ }, + { 4177, 65, /* 0.26 */ }, + { 4596, 64, /* 0.25 */ }, + { 5036, 62, /* 0.25 */ }, + { 5497, 61, /* 0.24 */ }, + { 5979, 60, /* 0.24 */ }, + { 6483, 59, /* 0.23 */ }, + { 7009, 58, /* 0.23 */ }, + { 7558, 57, /* 0.22 */ }, + { 8130, 56, /* 0.22 */ }, + { 8726, 55, /* 0.22 */ }, + { 9346, 54, /* 0.21 */ }, + { 9991, 53, /* 0.21 */ }, + { 10661, 52, /* 0.21 */ }, + { 11358, 52, /* 0.20 */ }, + { 12082, 51, /* 0.20 */ }, + { 12834, 50, /* 0.20 */ }, + { 13614, 49, /* 0.19 */ }, + { 14424, 48, /* 0.19 */ }, + { 15265, 48, /* 0.19 */ }, + { 16137, 47, /* 0.19 */ }, + { 17042, 46, /* 0.18 */ }, + { 17981, 45, /* 0.18 */ }, + { 18955, 45, /* 0.18 */ }, + { 19965, 44, /* 0.17 */ }, + { 21013, 43, /* 0.17 */ }, + { 22101, 43, /* 0.17 */ }, + { 23230, 42, /* 0.17 */ }, + { 24402, 41, /* 0.16 */ }, + { 25618, 41, /* 0.16 */ }, + { 26881, 40, /* 0.16 */ }, + { 28193, 39, /* 0.16 */ }, + { 29557, 39, /* 0.15 */ }, + { 30975, 38, /* 0.15 */ }, + { 32450, 38, /* 0.15 */ }, + { 33986, 37, /* 0.15 */ }, + { 35586, 36, /* 0.14 */ }, + { 37253, 36, /* 0.14 */ }, + { 38992, 35, /* 0.14 */ }, + { 40808, 35, /* 0.14 */ }, + { 42707, 34, /* 0.13 */ }, + { 44694, 33, /* 0.13 */ }, + { 46776, 33, /* 0.13 */ }, + { 48961, 32, /* 0.13 */ }, + { 51258, 32, /* 0.13 */ }, + { 53677, 31, /* 0.12 */ }, + { 56230, 30, /* 0.12 */ }, + { 58932, 30, /* 0.12 */ }, + { 61799, 29, /* 0.12 */ }, + { 64851, 28, /* 0.11 */ }, + { 68113, 28, /* 0.11 */ }, + { 71617, 27, /* 0.11 */ }, + { 75401, 26, /* 0.10 */ }, + { 79517, 26, /* 0.10 */ }, + { 84035, 25, /* 0.10 */ }, + { 89053, 24, /* 0.10 */ }, +}; + +#define HSTCP_AIMD_MAX ARRAY_SIZE(hstcp_aimd_vals) + +struct hstcp { + u32 ai; +}; + +static void hstcp_init(struct tcp_sock *tp) +{ + struct hstcp *ca = tcp_ca(tp); + + ca->ai = 0; + + /* Ensure the MD arithmetic works. This is somewhat pedantic, + * since I don't think we will see a cwnd this large. :) */ + tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128); +} + +static void hstcp_cong_avoid(struct tcp_sock *tp, u32 adk, u32 rtt, + u32 in_flight, int good) +{ + struct hstcp *ca = tcp_ca(tp); + + if (in_flight < tp->snd_cwnd) + return; + + if (tp->snd_cwnd <= tp->snd_ssthresh) { + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + } else { + /* Update AIMD parameters */ + if (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd) { + while (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd && + ca->ai < HSTCP_AIMD_MAX) + ca->ai++; + } else if (tp->snd_cwnd < hstcp_aimd_vals[ca->ai].cwnd) { + while (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd && + ca->ai > 0) + ca->ai--; + } + + /* Do additive increase */ + if (tp->snd_cwnd < tp->snd_cwnd_clamp) { + tp->snd_cwnd_cnt += ca->ai; + if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { + tp->snd_cwnd++; + tp->snd_cwnd_cnt -= tp->snd_cwnd; + } + } + } +} + +static u32 hstcp_ssthresh(struct tcp_sock *tp) +{ + struct hstcp *ca = tcp_ca(tp); + + /* Do multiplicative decrease */ + return max(tp->snd_cwnd - ((tp->snd_cwnd * hstcp_aimd_vals[ca->ai].md) >> 8), 2U); +} + + +static struct tcp_congestion_ops tcp_highspeed = { + .init = hstcp_init, + .ssthresh = hstcp_ssthresh, + .cong_avoid = hstcp_cong_avoid, + .min_cwnd = tcp_reno_min_cwnd, + + .owner = THIS_MODULE, + .name = "highspeed" +}; + +static int __init hstcp_register(void) +{ + BUG_ON(sizeof(struct hstcp) > TCP_CA_PRIV_SIZE); + return tcp_register_congestion_control(&tcp_highspeed); +} + +static void __exit hstcp_unregister(void) +{ + tcp_unregister_congestion_control(&tcp_highspeed); +} + +module_init(hstcp_register); +module_exit(hstcp_unregister); + +MODULE_AUTHOR("John Heffner"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("High Speed TCP"); diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c new file mode 100644 index 00000000000..40168275acf --- /dev/null +++ b/net/ipv4/tcp_htcp.c @@ -0,0 +1,289 @@ +/* + * H-TCP congestion control. The algorithm is detailed in: + * R.N.Shorten, D.J.Leith: + * "H-TCP: TCP for high-speed and long-distance networks" + * Proc. PFLDnet, Argonne, 2004. + * http://www.hamilton.ie/net/htcp3.pdf + */ + +#include <linux/config.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <net/tcp.h> + +#define ALPHA_BASE (1<<7) /* 1.0 with shift << 7 */ +#define BETA_MIN (1<<6) /* 0.5 with shift << 7 */ +#define BETA_MAX 102 /* 0.8 with shift << 7 */ + +static int use_rtt_scaling = 1; +module_param(use_rtt_scaling, int, 0644); +MODULE_PARM_DESC(use_rtt_scaling, "turn on/off RTT scaling"); + +static int use_bandwidth_switch = 1; +module_param(use_bandwidth_switch, int, 0644); +MODULE_PARM_DESC(use_bandwidth_switch, "turn on/off bandwidth switcher"); + +struct htcp { + u16 alpha; /* Fixed point arith, << 7 */ + u8 beta; /* Fixed point arith, << 7 */ + u8 modeswitch; /* Delay modeswitch until we had at least one congestion event */ + u8 ccount; /* Number of RTTs since last congestion event */ + u8 undo_ccount; + u16 packetcount; + u32 minRTT; + u32 maxRTT; + u32 snd_cwnd_cnt2; + + u32 undo_maxRTT; + u32 undo_old_maxB; + + /* Bandwidth estimation */ + u32 minB; + u32 maxB; + u32 old_maxB; + u32 Bi; + u32 lasttime; +}; + +static inline void htcp_reset(struct htcp *ca) +{ + ca->undo_ccount = ca->ccount; + ca->undo_maxRTT = ca->maxRTT; + ca->undo_old_maxB = ca->old_maxB; + + ca->ccount = 0; + ca->snd_cwnd_cnt2 = 0; +} + +static u32 htcp_cwnd_undo(struct tcp_sock *tp) +{ + struct htcp *ca = tcp_ca(tp); + ca->ccount = ca->undo_ccount; + ca->maxRTT = ca->undo_maxRTT; + ca->old_maxB = ca->undo_old_maxB; + return max(tp->snd_cwnd, (tp->snd_ssthresh<<7)/ca->beta); +} + +static inline void measure_rtt(struct tcp_sock *tp) +{ + struct htcp *ca = tcp_ca(tp); + u32 srtt = tp->srtt>>3; + + /* keep track of minimum RTT seen so far, minRTT is zero at first */ + if (ca->minRTT > srtt || !ca->minRTT) + ca->minRTT = srtt; + + /* max RTT */ + if (tp->ca_state == TCP_CA_Open && tp->snd_ssthresh < 0xFFFF && ca->ccount > 3) { + if (ca->maxRTT < ca->minRTT) + ca->maxRTT = ca->minRTT; + if (ca->maxRTT < srtt && srtt <= ca->maxRTT+HZ/50) + ca->maxRTT = srtt; + } +} + +static void measure_achieved_throughput(struct tcp_sock *tp, u32 pkts_acked) +{ + struct htcp *ca = tcp_ca(tp); + u32 now = tcp_time_stamp; + + /* achieved throughput calculations */ + if (tp->ca_state != TCP_CA_Open && tp->ca_state != TCP_CA_Disorder) { + ca->packetcount = 0; + ca->lasttime = now; + return; + } + + ca->packetcount += pkts_acked; + + if (ca->packetcount >= tp->snd_cwnd - (ca->alpha>>7? : 1) + && now - ca->lasttime >= ca->minRTT + && ca->minRTT > 0) { + __u32 cur_Bi = ca->packetcount*HZ/(now - ca->lasttime); + if (ca->ccount <= 3) { + /* just after backoff */ + ca->minB = ca->maxB = ca->Bi = cur_Bi; + } else { + ca->Bi = (3*ca->Bi + cur_Bi)/4; + if (ca->Bi > ca->maxB) + ca->maxB = ca->Bi; + if (ca->minB > ca->maxB) + ca->minB = ca->maxB; + } + ca->packetcount = 0; + ca->lasttime = now; + } +} + +static inline void htcp_beta_update(struct htcp *ca, u32 minRTT, u32 maxRTT) +{ + if (use_bandwidth_switch) { + u32 maxB = ca->maxB; + u32 old_maxB = ca->old_maxB; + ca->old_maxB = ca->maxB; + + if (!between(5*maxB, 4*old_maxB, 6*old_maxB)) { + ca->beta = BETA_MIN; + ca->modeswitch = 0; + return; + } + } + + if (ca->modeswitch && minRTT > max(HZ/100, 1) && maxRTT) { + ca->beta = (minRTT<<7)/maxRTT; + if (ca->beta < BETA_MIN) + ca->beta = BETA_MIN; + else if (ca->beta > BETA_MAX) + ca->beta = BETA_MAX; + } else { + ca->beta = BETA_MIN; + ca->modeswitch = 1; + } +} + +static inline void htcp_alpha_update(struct htcp *ca) +{ + u32 minRTT = ca->minRTT; + u32 factor = 1; + u32 diff = ca->ccount * minRTT; /* time since last backoff */ + + if (diff > HZ) { + diff -= HZ; + factor = 1+ ( 10*diff + ((diff/2)*(diff/2)/HZ) )/HZ; + } + + if (use_rtt_scaling && minRTT) { + u32 scale = (HZ<<3)/(10*minRTT); + scale = min(max(scale, 1U<<2), 10U<<3); /* clamping ratio to interval [0.5,10]<<3 */ + factor = (factor<<3)/scale; + if (!factor) + factor = 1; + } + + ca->alpha = 2*factor*((1<<7)-ca->beta); + if (!ca->alpha) + ca->alpha = ALPHA_BASE; +} + +/* After we have the rtt data to calculate beta, we'd still prefer to wait one + * rtt before we adjust our beta to ensure we are working from a consistent + * data. + * + * This function should be called when we hit a congestion event since only at + * that point do we really have a real sense of maxRTT (the queues en route + * were getting just too full now). + */ +static void htcp_param_update(struct tcp_sock *tp) +{ + struct htcp *ca = tcp_ca(tp); + u32 minRTT = ca->minRTT; + u32 maxRTT = ca->maxRTT; + + htcp_beta_update(ca, minRTT, maxRTT); + htcp_alpha_update(ca); + + /* add slowly fading memory for maxRTT to accommodate routing changes etc */ + if (minRTT > 0 && maxRTT > minRTT) + ca->maxRTT = minRTT + ((maxRTT-minRTT)*95)/100; +} + +static u32 htcp_recalc_ssthresh(struct tcp_sock *tp) +{ + struct htcp *ca = tcp_ca(tp); + htcp_param_update(tp); + return max((tp->snd_cwnd * ca->beta) >> 7, 2U); +} + +static void htcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, + u32 in_flight, int data_acked) +{ + struct htcp *ca = tcp_ca(tp); + + if (in_flight < tp->snd_cwnd) + return; + + if (tp->snd_cwnd <= tp->snd_ssthresh) { + /* In "safe" area, increase. */ + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + } else { + measure_rtt(tp); + + /* keep track of number of round-trip times since last backoff event */ + if (ca->snd_cwnd_cnt2++ > tp->snd_cwnd) { + ca->ccount++; + ca->snd_cwnd_cnt2 = 0; + htcp_alpha_update(ca); + } + + /* In dangerous area, increase slowly. + * In theory this is tp->snd_cwnd += alpha / tp->snd_cwnd + */ + if ((tp->snd_cwnd_cnt++ * ca->alpha)>>7 >= tp->snd_cwnd) { + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + tp->snd_cwnd_cnt = 0; + ca->ccount++; + } + } +} + +/* Lower bound on congestion window. */ +static u32 htcp_min_cwnd(struct tcp_sock *tp) +{ + return tp->snd_ssthresh; +} + + +static void htcp_init(struct tcp_sock *tp) +{ + struct htcp *ca = tcp_ca(tp); + + memset(ca, 0, sizeof(struct htcp)); + ca->alpha = ALPHA_BASE; + ca->beta = BETA_MIN; +} + +static void htcp_state(struct tcp_sock *tp, u8 new_state) +{ + switch (new_state) { + case TCP_CA_CWR: + case TCP_CA_Recovery: + case TCP_CA_Loss: + htcp_reset(tcp_ca(tp)); + break; + } +} + +static struct tcp_congestion_ops htcp = { + .init = htcp_init, + .ssthresh = htcp_recalc_ssthresh, + .min_cwnd = htcp_min_cwnd, + .cong_avoid = htcp_cong_avoid, + .set_state = htcp_state, + .undo_cwnd = htcp_cwnd_undo, + .pkts_acked = measure_achieved_throughput, + .owner = THIS_MODULE, + .name = "htcp", +}; + +static int __init htcp_register(void) +{ + BUG_ON(sizeof(struct htcp) > TCP_CA_PRIV_SIZE); + BUILD_BUG_ON(BETA_MIN >= BETA_MAX); + if (!use_bandwidth_switch) + htcp.pkts_acked = NULL; + return tcp_register_congestion_control(&htcp); +} + +static void __exit htcp_unregister(void) +{ + tcp_unregister_congestion_control(&htcp); +} + +module_init(htcp_register); +module_exit(htcp_unregister); + +MODULE_AUTHOR("Baruch Even"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("H-TCP"); diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c new file mode 100644 index 00000000000..13a66342c30 --- /dev/null +++ b/net/ipv4/tcp_hybla.c @@ -0,0 +1,187 @@ +/* + * TCP HYBLA + * + * TCP-HYBLA Congestion control algorithm, based on: + * C.Caini, R.Firrincieli, "TCP-Hybla: A TCP Enhancement + * for Heterogeneous Networks", + * International Journal on satellite Communications, + * September 2004 + * Daniele Lacamera + * root at danielinux.net + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <net/tcp.h> + +/* Tcp Hybla structure. */ +struct hybla { + u8 hybla_en; + u32 snd_cwnd_cents; /* Keeps increment values when it is <1, <<7 */ + u32 rho; /* Rho parameter, integer part */ + u32 rho2; /* Rho * Rho, integer part */ + u32 rho_3ls; /* Rho parameter, <<3 */ + u32 rho2_7ls; /* Rho^2, <<7 */ + u32 minrtt; /* Minimum smoothed round trip time value seen */ +}; + +/* Hybla reference round trip time (default= 1/40 sec = 25 ms), + expressed in jiffies */ +static int rtt0 = 25; +module_param(rtt0, int, 0644); +MODULE_PARM_DESC(rtt0, "reference rout trip time (ms)"); + + +/* This is called to refresh values for hybla parameters */ +static inline void hybla_recalc_param (struct tcp_sock *tp) +{ + struct hybla *ca = tcp_ca(tp); + + ca->rho_3ls = max_t(u32, tp->srtt / msecs_to_jiffies(rtt0), 8); + ca->rho = ca->rho_3ls >> 3; + ca->rho2_7ls = (ca->rho_3ls * ca->rho_3ls) << 1; + ca->rho2 = ca->rho2_7ls >>7; +} + +static void hybla_init(struct tcp_sock *tp) +{ + struct hybla *ca = tcp_ca(tp); + + ca->rho = 0; + ca->rho2 = 0; + ca->rho_3ls = 0; + ca->rho2_7ls = 0; + ca->snd_cwnd_cents = 0; + ca->hybla_en = 1; + tp->snd_cwnd = 2; + tp->snd_cwnd_clamp = 65535; + + /* 1st Rho measurement based on initial srtt */ + hybla_recalc_param(tp); + + /* set minimum rtt as this is the 1st ever seen */ + ca->minrtt = tp->srtt; + tp->snd_cwnd = ca->rho; +} + +static void hybla_state(struct tcp_sock *tp, u8 ca_state) +{ + struct hybla *ca = tcp_ca(tp); + + ca->hybla_en = (ca_state == TCP_CA_Open); +} + +static inline u32 hybla_fraction(u32 odds) +{ + static const u32 fractions[] = { + 128, 139, 152, 165, 181, 197, 215, 234, + }; + + return (odds < ARRAY_SIZE(fractions)) ? fractions[odds] : 128; +} + +/* TCP Hybla main routine. + * This is the algorithm behavior: + * o Recalc Hybla parameters if min_rtt has changed + * o Give cwnd a new value based on the model proposed + * o remember increments <1 + */ +static void hybla_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, + u32 in_flight, int flag) +{ + struct hybla *ca = tcp_ca(tp); + u32 increment, odd, rho_fractions; + int is_slowstart = 0; + + /* Recalculate rho only if this srtt is the lowest */ + if (tp->srtt < ca->minrtt){ + hybla_recalc_param(tp); + ca->minrtt = tp->srtt; + } + + if (!ca->hybla_en) + return tcp_reno_cong_avoid(tp, ack, rtt, in_flight, flag); + + if (in_flight < tp->snd_cwnd) + return; + + if (ca->rho == 0) + hybla_recalc_param(tp); + + rho_fractions = ca->rho_3ls - (ca->rho << 3); + + if (tp->snd_cwnd < tp->snd_ssthresh) { + /* + * slow start + * INC = 2^RHO - 1 + * This is done by splitting the rho parameter + * into 2 parts: an integer part and a fraction part. + * Inrement<<7 is estimated by doing: + * [2^(int+fract)]<<7 + * that is equal to: + * (2^int) * [(2^fract) <<7] + * 2^int is straightly computed as 1<<int, + * while we will use hybla_slowstart_fraction_increment() to + * calculate 2^fract in a <<7 value. + */ + is_slowstart = 1; + increment = ((1 << ca->rho) * hybla_fraction(rho_fractions)) + - 128; + } else { + /* + * congestion avoidance + * INC = RHO^2 / W + * as long as increment is estimated as (rho<<7)/window + * it already is <<7 and we can easily count its fractions. + */ + increment = ca->rho2_7ls / tp->snd_cwnd; + if (increment < 128) + tp->snd_cwnd_cnt++; + } + + odd = increment % 128; + tp->snd_cwnd += increment >> 7; + ca->snd_cwnd_cents += odd; + + /* check when fractions goes >=128 and increase cwnd by 1. */ + while(ca->snd_cwnd_cents >= 128) { + tp->snd_cwnd++; + ca->snd_cwnd_cents -= 128; + tp->snd_cwnd_cnt = 0; + } + + /* clamp down slowstart cwnd to ssthresh value. */ + if (is_slowstart) + tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); + + tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp); +} + +static struct tcp_congestion_ops tcp_hybla = { + .init = hybla_init, + .ssthresh = tcp_reno_ssthresh, + .min_cwnd = tcp_reno_min_cwnd, + .cong_avoid = hybla_cong_avoid, + .set_state = hybla_state, + + .owner = THIS_MODULE, + .name = "hybla" +}; + +static int __init hybla_register(void) +{ + BUG_ON(sizeof(struct hybla) > TCP_CA_PRIV_SIZE); + return tcp_register_congestion_control(&tcp_hybla); +} + +static void __exit hybla_unregister(void) +{ + tcp_unregister_congestion_control(&tcp_hybla); +} + +module_init(hybla_register); +module_exit(hybla_unregister); + +MODULE_AUTHOR("Daniele Lacamera"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("TCP Hybla"); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 5bad504630a..7bbbbc33eb4 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -61,7 +61,6 @@ * Panu Kuhlberg: Experimental audit of TCP (re)transmission * engine. Lots of bugs are found. * Pasi Sarolahti: F-RTO for dealing with spurious RTOs - * Angelo Dell'Aera: TCP Westwood+ support */ #include <linux/config.h> @@ -88,23 +87,9 @@ int sysctl_tcp_rfc1337; int sysctl_tcp_max_orphans = NR_FILE; int sysctl_tcp_frto; int sysctl_tcp_nometrics_save; -int sysctl_tcp_westwood; -int sysctl_tcp_vegas_cong_avoid; int sysctl_tcp_moderate_rcvbuf = 1; -/* Default values of the Vegas variables, in fixed-point representation - * with V_PARAM_SHIFT bits to the right of the binary point. - */ -#define V_PARAM_SHIFT 1 -int sysctl_tcp_vegas_alpha = 1<<V_PARAM_SHIFT; -int sysctl_tcp_vegas_beta = 3<<V_PARAM_SHIFT; -int sysctl_tcp_vegas_gamma = 1<<V_PARAM_SHIFT; -int sysctl_tcp_bic = 1; -int sysctl_tcp_bic_fast_convergence = 1; -int sysctl_tcp_bic_low_window = 14; -int sysctl_tcp_bic_beta = 819; /* = 819/1024 (BICTCP_BETA_SCALE) */ - #define FLAG_DATA 0x01 /* Incoming frame contained data. */ #define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ #define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */ @@ -333,15 +318,6 @@ static void tcp_init_buffer_space(struct sock *sk) tp->snd_cwnd_stamp = tcp_time_stamp; } -static void init_bictcp(struct tcp_sock *tp) -{ - tp->bictcp.cnt = 0; - - tp->bictcp.last_max_cwnd = 0; - tp->bictcp.last_cwnd = 0; - tp->bictcp.last_stamp = 0; -} - /* 5. Recalculate window clamp after socket hit its memory bounds. */ static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp) { @@ -558,45 +534,6 @@ static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_ tcp_grow_window(sk, tp, skb); } -/* When starting a new connection, pin down the current choice of - * congestion algorithm. - */ -void tcp_ca_init(struct tcp_sock *tp) -{ - if (sysctl_tcp_westwood) - tp->adv_cong = TCP_WESTWOOD; - else if (sysctl_tcp_bic) - tp->adv_cong = TCP_BIC; - else if (sysctl_tcp_vegas_cong_avoid) { - tp->adv_cong = TCP_VEGAS; - tp->vegas.baseRTT = 0x7fffffff; - tcp_vegas_enable(tp); - } -} - -/* Do RTT sampling needed for Vegas. - * Basically we: - * o min-filter RTT samples from within an RTT to get the current - * propagation delay + queuing delay (we are min-filtering to try to - * avoid the effects of delayed ACKs) - * o min-filter RTT samples from a much longer window (forever for now) - * to find the propagation delay (baseRTT) - */ -static inline void vegas_rtt_calc(struct tcp_sock *tp, __u32 rtt) -{ - __u32 vrtt = rtt + 1; /* Never allow zero rtt or baseRTT */ - - /* Filter to find propagation delay: */ - if (vrtt < tp->vegas.baseRTT) - tp->vegas.baseRTT = vrtt; - - /* Find the min RTT during the last RTT to find - * the current prop. delay + queuing delay: - */ - tp->vegas.minRTT = min(tp->vegas.minRTT, vrtt); - tp->vegas.cntRTT++; -} - /* Called to compute a smoothed rtt estimate. The data fed to this * routine either comes from timestamps, or from segments that were * known _not_ to have been retransmitted [see Karn/Partridge @@ -606,13 +543,10 @@ static inline void vegas_rtt_calc(struct tcp_sock *tp, __u32 rtt) * To save cycles in the RFC 1323 implementation it was better to break * it up into three procedures. -- erics */ -static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt) +static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt, u32 *usrtt) { long m = mrtt; /* RTT */ - if (tcp_vegas_enabled(tp)) - vegas_rtt_calc(tp, mrtt); - /* The following amusing code comes from Jacobson's * article in SIGCOMM '88. Note that rtt and mdev * are scaled versions of rtt and mean deviation. @@ -670,7 +604,8 @@ static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt) tp->rtt_seq = tp->snd_nxt; } - tcp_westwood_update_rtt(tp, tp->srtt >> 3); + if (tp->ca_ops->rtt_sample) + tp->ca_ops->rtt_sample(tp, *usrtt); } /* Calculate rto without backoff. This is the second half of Van Jacobson's @@ -1185,8 +1120,8 @@ void tcp_enter_frto(struct sock *sk) tp->snd_una == tp->high_seq || (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) { tp->prior_ssthresh = tcp_current_ssthresh(tp); - if (!tcp_westwood_ssthresh(tp)) - tp->snd_ssthresh = tcp_recalc_ssthresh(tp); + tp->snd_ssthresh = tp->ca_ops->ssthresh(tp); + tcp_ca_event(tp, CA_EVENT_FRTO); } /* Have to clear retransmission markers here to keep the bookkeeping @@ -1252,8 +1187,6 @@ static void tcp_enter_frto_loss(struct sock *sk) tcp_set_ca_state(tp, TCP_CA_Loss); tp->high_seq = tp->frto_highmark; TCP_ECN_queue_cwr(tp); - - init_bictcp(tp); } void tcp_clear_retrans(struct tcp_sock *tp) @@ -1283,7 +1216,8 @@ void tcp_enter_loss(struct sock *sk, int how) if (tp->ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq || (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) { tp->prior_ssthresh = tcp_current_ssthresh(tp); - tp->snd_ssthresh = tcp_recalc_ssthresh(tp); + tp->snd_ssthresh = tp->ca_ops->ssthresh(tp); + tcp_ca_event(tp, CA_EVENT_LOSS); } tp->snd_cwnd = 1; tp->snd_cwnd_cnt = 0; @@ -1596,28 +1530,14 @@ static inline void tcp_moderate_cwnd(struct tcp_sock *tp) } /* Decrease cwnd each second ack. */ - static void tcp_cwnd_down(struct tcp_sock *tp) { int decr = tp->snd_cwnd_cnt + 1; - __u32 limit; - - /* - * TCP Westwood - * Here limit is evaluated as BWestimation*RTTmin (for obtaining it - * in packets we use mss_cache). If sysctl_tcp_westwood is off - * tcp_westwood_bw_rttmin() returns 0. In such case snd_ssthresh is - * still used as usual. It prevents other strange cases in which - * BWE*RTTmin could assume value 0. It should not happen but... - */ - - if (!(limit = tcp_westwood_bw_rttmin(tp))) - limit = tp->snd_ssthresh/2; tp->snd_cwnd_cnt = decr&1; decr >>= 1; - if (decr && tp->snd_cwnd > limit) + if (decr && tp->snd_cwnd > tp->ca_ops->min_cwnd(tp)) tp->snd_cwnd -= decr; tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1); @@ -1654,8 +1574,8 @@ static void DBGUNDO(struct sock *sk, struct tcp_sock *tp, const char *msg) static void tcp_undo_cwr(struct tcp_sock *tp, int undo) { if (tp->prior_ssthresh) { - if (tcp_is_bic(tp)) - tp->snd_cwnd = max(tp->snd_cwnd, tp->bictcp.last_max_cwnd); + if (tp->ca_ops->undo_cwnd) + tp->snd_cwnd = tp->ca_ops->undo_cwnd(tp); else tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh<<1); @@ -1767,11 +1687,9 @@ static int tcp_try_undo_loss(struct sock *sk, struct tcp_sock *tp) static inline void tcp_complete_cwr(struct tcp_sock *tp) { - if (tcp_westwood_cwnd(tp)) - tp->snd_ssthresh = tp->snd_cwnd; - else - tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); + tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); tp->snd_cwnd_stamp = tcp_time_stamp; + tcp_ca_event(tp, CA_EVENT_COMPLETE_CWR); } static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag) @@ -1946,7 +1864,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, if (tp->ca_state < TCP_CA_CWR) { if (!(flag&FLAG_ECE)) tp->prior_ssthresh = tcp_current_ssthresh(tp); - tp->snd_ssthresh = tcp_recalc_ssthresh(tp); + tp->snd_ssthresh = tp->ca_ops->ssthresh(tp); TCP_ECN_queue_cwr(tp); } @@ -1963,7 +1881,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, /* Read draft-ietf-tcplw-high-performance before mucking * with this code. (Superceeds RFC1323) */ -static void tcp_ack_saw_tstamp(struct tcp_sock *tp, int flag) +static void tcp_ack_saw_tstamp(struct tcp_sock *tp, u32 *usrtt, int flag) { __u32 seq_rtt; @@ -1983,13 +1901,13 @@ static void tcp_ack_saw_tstamp(struct tcp_sock *tp, int flag) * in window is lost... Voila. --ANK (010210) */ seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr; - tcp_rtt_estimator(tp, seq_rtt); + tcp_rtt_estimator(tp, seq_rtt, usrtt); tcp_set_rto(tp); tp->backoff = 0; tcp_bound_rto(tp); } -static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, int flag) +static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, u32 *usrtt, int flag) { /* We don't have a timestamp. Can only use * packets that are not retransmitted to determine @@ -2003,338 +1921,29 @@ static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, int flag) if (flag & FLAG_RETRANS_DATA_ACKED) return; - tcp_rtt_estimator(tp, seq_rtt); + tcp_rtt_estimator(tp, seq_rtt, usrtt); tcp_set_rto(tp); tp->backoff = 0; tcp_bound_rto(tp); } static inline void tcp_ack_update_rtt(struct tcp_sock *tp, - int flag, s32 seq_rtt) + int flag, s32 seq_rtt, u32 *usrtt) { /* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */ if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) - tcp_ack_saw_tstamp(tp, flag); + tcp_ack_saw_tstamp(tp, usrtt, flag); else if (seq_rtt >= 0) - tcp_ack_no_tstamp(tp, seq_rtt, flag); + tcp_ack_no_tstamp(tp, seq_rtt, usrtt, flag); } -/* - * Compute congestion window to use. - * - * This is from the implementation of BICTCP in - * Lison-Xu, Kahaled Harfoush, and Injog Rhee. - * "Binary Increase Congestion Control for Fast, Long Distance - * Networks" in InfoComm 2004 - * Available from: - * http://www.csc.ncsu.edu/faculty/rhee/export/bitcp.pdf - * - * Unless BIC is enabled and congestion window is large - * this behaves the same as the original Reno. - */ -static inline __u32 bictcp_cwnd(struct tcp_sock *tp) -{ - /* orignal Reno behaviour */ - if (!tcp_is_bic(tp)) - return tp->snd_cwnd; - - if (tp->bictcp.last_cwnd == tp->snd_cwnd && - (s32)(tcp_time_stamp - tp->bictcp.last_stamp) <= (HZ>>5)) - return tp->bictcp.cnt; - - tp->bictcp.last_cwnd = tp->snd_cwnd; - tp->bictcp.last_stamp = tcp_time_stamp; - - /* start off normal */ - if (tp->snd_cwnd <= sysctl_tcp_bic_low_window) - tp->bictcp.cnt = tp->snd_cwnd; - - /* binary increase */ - else if (tp->snd_cwnd < tp->bictcp.last_max_cwnd) { - __u32 dist = (tp->bictcp.last_max_cwnd - tp->snd_cwnd) - / BICTCP_B; - - if (dist > BICTCP_MAX_INCREMENT) - /* linear increase */ - tp->bictcp.cnt = tp->snd_cwnd / BICTCP_MAX_INCREMENT; - else if (dist <= 1U) - /* binary search increase */ - tp->bictcp.cnt = tp->snd_cwnd * BICTCP_FUNC_OF_MIN_INCR - / BICTCP_B; - else - /* binary search increase */ - tp->bictcp.cnt = tp->snd_cwnd / dist; - } else { - /* slow start amd linear increase */ - if (tp->snd_cwnd < tp->bictcp.last_max_cwnd + BICTCP_B) - /* slow start */ - tp->bictcp.cnt = tp->snd_cwnd * BICTCP_FUNC_OF_MIN_INCR - / BICTCP_B; - else if (tp->snd_cwnd < tp->bictcp.last_max_cwnd - + BICTCP_MAX_INCREMENT*(BICTCP_B-1)) - /* slow start */ - tp->bictcp.cnt = tp->snd_cwnd * (BICTCP_B-1) - / (tp->snd_cwnd-tp->bictcp.last_max_cwnd); - else - /* linear increase */ - tp->bictcp.cnt = tp->snd_cwnd / BICTCP_MAX_INCREMENT; - } - return tp->bictcp.cnt; -} - -/* This is Jacobson's slow start and congestion avoidance. - * SIGCOMM '88, p. 328. - */ -static inline void reno_cong_avoid(struct tcp_sock *tp) +static inline void tcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, + u32 in_flight, int good) { - if (tp->snd_cwnd <= tp->snd_ssthresh) { - /* In "safe" area, increase. */ - if (tp->snd_cwnd < tp->snd_cwnd_clamp) - tp->snd_cwnd++; - } else { - /* In dangerous area, increase slowly. - * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd - */ - if (tp->snd_cwnd_cnt >= bictcp_cwnd(tp)) { - if (tp->snd_cwnd < tp->snd_cwnd_clamp) - tp->snd_cwnd++; - tp->snd_cwnd_cnt=0; - } else - tp->snd_cwnd_cnt++; - } + tp->ca_ops->cong_avoid(tp, ack, rtt, in_flight, good); tp->snd_cwnd_stamp = tcp_time_stamp; } -/* This is based on the congestion detection/avoidance scheme described in - * Lawrence S. Brakmo and Larry L. Peterson. - * "TCP Vegas: End to end congestion avoidance on a global internet." - * IEEE Journal on Selected Areas in Communication, 13(8):1465--1480, - * October 1995. Available from: - * ftp://ftp.cs.arizona.edu/xkernel/Papers/jsac.ps - * - * See http://www.cs.arizona.edu/xkernel/ for their implementation. - * The main aspects that distinguish this implementation from the - * Arizona Vegas implementation are: - * o We do not change the loss detection or recovery mechanisms of - * Linux in any way. Linux already recovers from losses quite well, - * using fine-grained timers, NewReno, and FACK. - * o To avoid the performance penalty imposed by increasing cwnd - * only every-other RTT during slow start, we increase during - * every RTT during slow start, just like Reno. - * o Largely to allow continuous cwnd growth during slow start, - * we use the rate at which ACKs come back as the "actual" - * rate, rather than the rate at which data is sent. - * o To speed convergence to the right rate, we set the cwnd - * to achieve the right ("actual") rate when we exit slow start. - * o To filter out the noise caused by delayed ACKs, we use the - * minimum RTT sample observed during the last RTT to calculate - * the actual rate. - * o When the sender re-starts from idle, it waits until it has - * received ACKs for an entire flight of new data before making - * a cwnd adjustment decision. The original Vegas implementation - * assumed senders never went idle. - */ -static void vegas_cong_avoid(struct tcp_sock *tp, u32 ack, u32 seq_rtt) -{ - /* The key players are v_beg_snd_una and v_beg_snd_nxt. - * - * These are so named because they represent the approximate values - * of snd_una and snd_nxt at the beginning of the current RTT. More - * precisely, they represent the amount of data sent during the RTT. - * At the end of the RTT, when we receive an ACK for v_beg_snd_nxt, - * we will calculate that (v_beg_snd_nxt - v_beg_snd_una) outstanding - * bytes of data have been ACKed during the course of the RTT, giving - * an "actual" rate of: - * - * (v_beg_snd_nxt - v_beg_snd_una) / (rtt duration) - * - * Unfortunately, v_beg_snd_una is not exactly equal to snd_una, - * because delayed ACKs can cover more than one segment, so they - * don't line up nicely with the boundaries of RTTs. - * - * Another unfortunate fact of life is that delayed ACKs delay the - * advance of the left edge of our send window, so that the number - * of bytes we send in an RTT is often less than our cwnd will allow. - * So we keep track of our cwnd separately, in v_beg_snd_cwnd. - */ - - if (after(ack, tp->vegas.beg_snd_nxt)) { - /* Do the Vegas once-per-RTT cwnd adjustment. */ - u32 old_wnd, old_snd_cwnd; - - - /* Here old_wnd is essentially the window of data that was - * sent during the previous RTT, and has all - * been acknowledged in the course of the RTT that ended - * with the ACK we just received. Likewise, old_snd_cwnd - * is the cwnd during the previous RTT. - */ - old_wnd = (tp->vegas.beg_snd_nxt - tp->vegas.beg_snd_una) / - tp->mss_cache_std; - old_snd_cwnd = tp->vegas.beg_snd_cwnd; - - /* Save the extent of the current window so we can use this - * at the end of the next RTT. - */ - tp->vegas.beg_snd_una = tp->vegas.beg_snd_nxt; - tp->vegas.beg_snd_nxt = tp->snd_nxt; - tp->vegas.beg_snd_cwnd = tp->snd_cwnd; - - /* Take into account the current RTT sample too, to - * decrease the impact of delayed acks. This double counts - * this sample since we count it for the next window as well, - * but that's not too awful, since we're taking the min, - * rather than averaging. - */ - vegas_rtt_calc(tp, seq_rtt); - - /* We do the Vegas calculations only if we got enough RTT - * samples that we can be reasonably sure that we got - * at least one RTT sample that wasn't from a delayed ACK. - * If we only had 2 samples total, - * then that means we're getting only 1 ACK per RTT, which - * means they're almost certainly delayed ACKs. - * If we have 3 samples, we should be OK. - */ - - if (tp->vegas.cntRTT <= 2) { - /* We don't have enough RTT samples to do the Vegas - * calculation, so we'll behave like Reno. - */ - if (tp->snd_cwnd > tp->snd_ssthresh) - tp->snd_cwnd++; - } else { - u32 rtt, target_cwnd, diff; - - /* We have enough RTT samples, so, using the Vegas - * algorithm, we determine if we should increase or - * decrease cwnd, and by how much. - */ - - /* Pluck out the RTT we are using for the Vegas - * calculations. This is the min RTT seen during the - * last RTT. Taking the min filters out the effects - * of delayed ACKs, at the cost of noticing congestion - * a bit later. - */ - rtt = tp->vegas.minRTT; - - /* Calculate the cwnd we should have, if we weren't - * going too fast. - * - * This is: - * (actual rate in segments) * baseRTT - * We keep it as a fixed point number with - * V_PARAM_SHIFT bits to the right of the binary point. - */ - target_cwnd = ((old_wnd * tp->vegas.baseRTT) - << V_PARAM_SHIFT) / rtt; - - /* Calculate the difference between the window we had, - * and the window we would like to have. This quantity - * is the "Diff" from the Arizona Vegas papers. - * - * Again, this is a fixed point number with - * V_PARAM_SHIFT bits to the right of the binary - * point. - */ - diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd; - - if (tp->snd_cwnd < tp->snd_ssthresh) { - /* Slow start. */ - if (diff > sysctl_tcp_vegas_gamma) { - /* Going too fast. Time to slow down - * and switch to congestion avoidance. - */ - tp->snd_ssthresh = 2; - - /* Set cwnd to match the actual rate - * exactly: - * cwnd = (actual rate) * baseRTT - * Then we add 1 because the integer - * truncation robs us of full link - * utilization. - */ - tp->snd_cwnd = min(tp->snd_cwnd, - (target_cwnd >> - V_PARAM_SHIFT)+1); - - } - } else { - /* Congestion avoidance. */ - u32 next_snd_cwnd; - - /* Figure out where we would like cwnd - * to be. - */ - if (diff > sysctl_tcp_vegas_beta) { - /* The old window was too fast, so - * we slow down. - */ - next_snd_cwnd = old_snd_cwnd - 1; - } else if (diff < sysctl_tcp_vegas_alpha) { - /* We don't have enough extra packets - * in the network, so speed up. - */ - next_snd_cwnd = old_snd_cwnd + 1; - } else { - /* Sending just as fast as we - * should be. - */ - next_snd_cwnd = old_snd_cwnd; - } - - /* Adjust cwnd upward or downward, toward the - * desired value. - */ - if (next_snd_cwnd > tp->snd_cwnd) - tp->snd_cwnd++; - else if (next_snd_cwnd < tp->snd_cwnd) - tp->snd_cwnd--; - } - } - - /* Wipe the slate clean for the next RTT. */ - tp->vegas.cntRTT = 0; - tp->vegas.minRTT = 0x7fffffff; - } - - /* The following code is executed for every ack we receive, - * except for conditions checked in should_advance_cwnd() - * before the call to tcp_cong_avoid(). Mainly this means that - * we only execute this code if the ack actually acked some - * data. - */ - - /* If we are in slow start, increase our cwnd in response to this ACK. - * (If we are not in slow start then we are in congestion avoidance, - * and adjust our congestion window only once per RTT. See the code - * above.) - */ - if (tp->snd_cwnd <= tp->snd_ssthresh) - tp->snd_cwnd++; - - /* to keep cwnd from growing without bound */ - tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp); - - /* Make sure that we are never so timid as to reduce our cwnd below - * 2 MSS. - * - * Going below 2 MSS would risk huge delayed ACKs from our receiver. - */ - tp->snd_cwnd = max(tp->snd_cwnd, 2U); - - tp->snd_cwnd_stamp = tcp_time_stamp; -} - -static inline void tcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 seq_rtt) -{ - if (tcp_vegas_enabled(tp)) - vegas_cong_avoid(tp, ack, seq_rtt); - else - reno_cong_avoid(tp); -} - /* Restart timer after forward progress on connection. * RFC2988 recommends to restart timer to now+rto. */ @@ -2415,13 +2024,18 @@ static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb, /* Remove acknowledged frames from the retransmission queue. */ -static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) +static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; __u32 now = tcp_time_stamp; int acked = 0; __s32 seq_rtt = -1; + struct timeval usnow; + u32 pkts_acked = 0; + + if (seq_usrtt) + do_gettimeofday(&usnow); while ((skb = skb_peek(&sk->sk_write_queue)) && skb != sk->sk_send_head) { @@ -2448,6 +2062,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) */ if (!(scb->flags & TCPCB_FLAG_SYN)) { acked |= FLAG_DATA_ACKED; + ++pkts_acked; } else { acked |= FLAG_SYN_ACKED; tp->retrans_stamp = 0; @@ -2461,6 +2076,10 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) seq_rtt = -1; } else if (seq_rtt < 0) seq_rtt = now - scb->when; + if (seq_usrtt) + *seq_usrtt = (usnow.tv_sec - skb->stamp.tv_sec) * 1000000 + + (usnow.tv_usec - skb->stamp.tv_usec); + if (sacked & TCPCB_SACKED_ACKED) tp->sacked_out -= tcp_skb_pcount(skb); if (sacked & TCPCB_LOST) @@ -2479,8 +2098,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) } if (acked&FLAG_ACKED) { - tcp_ack_update_rtt(tp, acked, seq_rtt); + tcp_ack_update_rtt(tp, acked, seq_rtt, seq_usrtt); tcp_ack_packets_out(sk, tp); + + if (tp->ca_ops->pkts_acked) + tp->ca_ops->pkts_acked(tp, pkts_acked); } #if FASTRETRANS_DEBUG > 0 @@ -2624,257 +2246,6 @@ static void tcp_process_frto(struct sock *sk, u32 prior_snd_una) tp->frto_counter = (tp->frto_counter + 1) % 3; } -/* - * TCP Westwood+ - */ - -/* - * @init_westwood - * This function initializes fields used in TCP Westwood+. We can't - * get no information about RTTmin at this time so we simply set it to - * TCP_WESTWOOD_INIT_RTT. This value was chosen to be too conservative - * since in this way we're sure it will be updated in a consistent - * way as soon as possible. It will reasonably happen within the first - * RTT period of the connection lifetime. - */ - -static void init_westwood(struct sock *sk) -{ - struct tcp_sock *tp = tcp_sk(sk); - - tp->westwood.bw_ns_est = 0; - tp->westwood.bw_est = 0; - tp->westwood.accounted = 0; - tp->westwood.cumul_ack = 0; - tp->westwood.rtt_win_sx = tcp_time_stamp; - tp->westwood.rtt = TCP_WESTWOOD_INIT_RTT; - tp->westwood.rtt_min = TCP_WESTWOOD_INIT_RTT; - tp->westwood.snd_una = tp->snd_una; -} - -/* - * @westwood_do_filter - * Low-pass filter. Implemented using constant coeffients. - */ - -static inline __u32 westwood_do_filter(__u32 a, __u32 b) -{ - return (((7 * a) + b) >> 3); -} - -static void westwood_filter(struct sock *sk, __u32 delta) -{ - struct tcp_sock *tp = tcp_sk(sk); - - tp->westwood.bw_ns_est = - westwood_do_filter(tp->westwood.bw_ns_est, - tp->westwood.bk / delta); - tp->westwood.bw_est = - westwood_do_filter(tp->westwood.bw_est, - tp->westwood.bw_ns_est); -} - -/* - * @westwood_update_rttmin - * It is used to update RTTmin. In this case we MUST NOT use - * WESTWOOD_RTT_MIN minimum bound since we could be on a LAN! - */ - -static inline __u32 westwood_update_rttmin(const struct sock *sk) -{ - const struct tcp_sock *tp = tcp_sk(sk); - __u32 rttmin = tp->westwood.rtt_min; - - if (tp->westwood.rtt != 0 && - (tp->westwood.rtt < tp->westwood.rtt_min || !rttmin)) - rttmin = tp->westwood.rtt; - - return rttmin; -} - -/* - * @westwood_acked - * Evaluate increases for dk. - */ - -static inline __u32 westwood_acked(const struct sock *sk) -{ - const struct tcp_sock *tp = tcp_sk(sk); - - return tp->snd_una - tp->westwood.snd_una; -} - -/* - * @westwood_new_window - * It evaluates if we are receiving data inside the same RTT window as - * when we started. - * Return value: - * It returns 0 if we are still evaluating samples in the same RTT - * window, 1 if the sample has to be considered in the next window. - */ - -static int westwood_new_window(const struct sock *sk) -{ - const struct tcp_sock *tp = tcp_sk(sk); - __u32 left_bound; - __u32 rtt; - int ret = 0; - - left_bound = tp->westwood.rtt_win_sx; - rtt = max(tp->westwood.rtt, (u32) TCP_WESTWOOD_RTT_MIN); - - /* - * A RTT-window has passed. Be careful since if RTT is less than - * 50ms we don't filter but we continue 'building the sample'. - * This minimum limit was choosen since an estimation on small - * time intervals is better to avoid... - * Obvioulsy on a LAN we reasonably will always have - * right_bound = left_bound + WESTWOOD_RTT_MIN - */ - - if ((left_bound + rtt) < tcp_time_stamp) - ret = 1; - - return ret; -} - -/* - * @westwood_update_window - * It updates RTT evaluation window if it is the right moment to do - * it. If so it calls filter for evaluating bandwidth. - */ - -static void __westwood_update_window(struct sock *sk, __u32 now) -{ - struct tcp_sock *tp = tcp_sk(sk); - __u32 delta = now - tp->westwood.rtt_win_sx; - - if (delta) { - if (tp->westwood.rtt) - westwood_filter(sk, delta); - - tp->westwood.bk = 0; - tp->westwood.rtt_win_sx = tcp_time_stamp; - } -} - - -static void westwood_update_window(struct sock *sk, __u32 now) -{ - if (westwood_new_window(sk)) - __westwood_update_window(sk, now); -} - -/* - * @__tcp_westwood_fast_bw - * It is called when we are in fast path. In particular it is called when - * header prediction is successfull. In such case infact update is - * straight forward and doesn't need any particular care. - */ - -static void __tcp_westwood_fast_bw(struct sock *sk, struct sk_buff *skb) -{ - struct tcp_sock *tp = tcp_sk(sk); - - westwood_update_window(sk, tcp_time_stamp); - - tp->westwood.bk += westwood_acked(sk); - tp->westwood.snd_una = tp->snd_una; - tp->westwood.rtt_min = westwood_update_rttmin(sk); -} - -static inline void tcp_westwood_fast_bw(struct sock *sk, struct sk_buff *skb) -{ - if (tcp_is_westwood(tcp_sk(sk))) - __tcp_westwood_fast_bw(sk, skb); -} - - -/* - * @westwood_dupack_update - * It updates accounted and cumul_ack when receiving a dupack. - */ - -static void westwood_dupack_update(struct sock *sk) -{ - struct tcp_sock *tp = tcp_sk(sk); - - tp->westwood.accounted += tp->mss_cache_std; - tp->westwood.cumul_ack = tp->mss_cache_std; -} - -static inline int westwood_may_change_cumul(struct tcp_sock *tp) -{ - return (tp->westwood.cumul_ack > tp->mss_cache_std); -} - -static inline void westwood_partial_update(struct tcp_sock *tp) -{ - tp->westwood.accounted -= tp->westwood.cumul_ack; - tp->westwood.cumul_ack = tp->mss_cache_std; -} - -static inline void westwood_complete_update(struct tcp_sock *tp) -{ - tp->westwood.cumul_ack -= tp->westwood.accounted; - tp->westwood.accounted = 0; -} - -/* - * @westwood_acked_count - * This function evaluates cumul_ack for evaluating dk in case of - * delayed or partial acks. - */ - -static inline __u32 westwood_acked_count(struct sock *sk) -{ - struct tcp_sock *tp = tcp_sk(sk); - - tp->westwood.cumul_ack = westwood_acked(sk); - - /* If cumul_ack is 0 this is a dupack since it's not moving - * tp->snd_una. - */ - if (!(tp->westwood.cumul_ack)) - westwood_dupack_update(sk); - - if (westwood_may_change_cumul(tp)) { - /* Partial or delayed ack */ - if (tp->westwood.accounted >= tp->westwood.cumul_ack) - westwood_partial_update(tp); - else - westwood_complete_update(tp); - } - - tp->westwood.snd_una = tp->snd_una; - - return tp->westwood.cumul_ack; -} - - -/* - * @__tcp_westwood_slow_bw - * It is called when something is going wrong..even if there could - * be no problems! Infact a simple delayed packet may trigger a - * dupack. But we need to be careful in such case. - */ - -static void __tcp_westwood_slow_bw(struct sock *sk, struct sk_buff *skb) -{ - struct tcp_sock *tp = tcp_sk(sk); - - westwood_update_window(sk, tcp_time_stamp); - - tp->westwood.bk += westwood_acked_count(sk); - tp->westwood.rtt_min = westwood_update_rttmin(sk); -} - -static inline void tcp_westwood_slow_bw(struct sock *sk, struct sk_buff *skb) -{ - if (tcp_is_westwood(tcp_sk(sk))) - __tcp_westwood_slow_bw(sk, skb); -} - /* This routine deals with incoming acks, but not outgoing ones. */ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) { @@ -2884,6 +2255,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) u32 ack = TCP_SKB_CB(skb)->ack_seq; u32 prior_in_flight; s32 seq_rtt; + s32 seq_usrtt = 0; int prior_packets; /* If the ack is newer than sent or older than previous acks @@ -2902,9 +2274,10 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) */ tcp_update_wl(tp, ack, ack_seq); tp->snd_una = ack; - tcp_westwood_fast_bw(sk, skb); flag |= FLAG_WIN_UPDATE; + tcp_ca_event(tp, CA_EVENT_FAST_ACK); + NET_INC_STATS_BH(LINUX_MIB_TCPHPACKS); } else { if (ack_seq != TCP_SKB_CB(skb)->end_seq) @@ -2920,7 +2293,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) if (TCP_ECN_rcv_ecn_echo(tp, skb->h.th)) flag |= FLAG_ECE; - tcp_westwood_slow_bw(sk,skb); + tcp_ca_event(tp, CA_EVENT_SLOW_ACK); } /* We passed data and got it acked, remove any soft error @@ -2935,22 +2308,20 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) prior_in_flight = tcp_packets_in_flight(tp); /* See if we can take anything off of the retransmit queue. */ - flag |= tcp_clean_rtx_queue(sk, &seq_rtt); + flag |= tcp_clean_rtx_queue(sk, &seq_rtt, + tp->ca_ops->rtt_sample ? &seq_usrtt : NULL); if (tp->frto_counter) tcp_process_frto(sk, prior_snd_una); if (tcp_ack_is_dubious(tp, flag)) { /* Advanve CWND, if state allows this. */ - if ((flag & FLAG_DATA_ACKED) && - (tcp_vegas_enabled(tp) || prior_in_flight >= tp->snd_cwnd) && - tcp_may_raise_cwnd(tp, flag)) - tcp_cong_avoid(tp, ack, seq_rtt); + if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(tp, flag)) + tcp_cong_avoid(tp, ack, seq_rtt, prior_in_flight, 0); tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag); } else { - if ((flag & FLAG_DATA_ACKED) && - (tcp_vegas_enabled(tp) || prior_in_flight >= tp->snd_cwnd)) - tcp_cong_avoid(tp, ack, seq_rtt); + if ((flag & FLAG_DATA_ACKED)) + tcp_cong_avoid(tp, ack, seq_rtt, prior_in_flight, 1); } if ((flag & FLAG_FORWARD_PROGRESS) || !(flag&FLAG_NOT_DUP)) @@ -4552,6 +3923,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, tcp_init_metrics(sk); + tcp_init_congestion_control(tp); + /* Prevent spurious tcp_cwnd_restart() on first data * packet. */ @@ -4708,9 +4081,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, if(tp->af_specific->conn_request(sk, skb) < 0) return 1; - init_westwood(sk); - init_bictcp(tp); - /* Now we have several options: In theory there is * nothing else in the frame. KA9Q has an option to * send data with the syn, BSD accepts data with the @@ -4732,9 +4102,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, goto discard; case TCP_SYN_SENT: - init_westwood(sk); - init_bictcp(tp); - queued = tcp_rcv_synsent_state_process(sk, skb, th, len); if (queued >= 0) return queued; @@ -4816,7 +4183,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, */ if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && !tp->srtt) - tcp_ack_saw_tstamp(tp, 0); + tcp_ack_saw_tstamp(tp, 0, 0); if (tp->rx_opt.tstamp_ok) tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; @@ -4828,6 +4195,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, tcp_init_metrics(sk); + tcp_init_congestion_control(tp); + /* Prevent spurious tcp_cwnd_restart() on * first data packet. */ diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 2d41d5d6ad1..ebf112347a9 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2048,6 +2048,7 @@ static int tcp_v4_init_sock(struct sock *sk) tp->mss_cache_std = tp->mss_cache = 536; tp->reordering = sysctl_tcp_reordering; + tp->ca_ops = &tcp_init_congestion_ops; sk->sk_state = TCP_CLOSE; @@ -2070,6 +2071,8 @@ int tcp_v4_destroy_sock(struct sock *sk) tcp_clear_xmit_timers(sk); + tcp_cleanup_congestion_control(tp); + /* Cleanup up the write buffer. */ sk_stream_writequeue_purge(sk); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index b3943e7562f..f42a284164b 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -774,6 +774,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, newtp->frto_counter = 0; newtp->frto_highmark = 0; + newtp->ca_ops = &tcp_reno; + tcp_set_ca_state(newtp, TCP_CA_Open); tcp_init_xmit_timers(newsk); skb_queue_head_init(&newtp->out_of_order_queue); @@ -842,8 +844,6 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, if (newtp->ecn_flags&TCP_ECN_OK) sock_set_flag(newsk, SOCK_NO_LARGESEND); - tcp_ca_init(newtp); - TCP_INC_STATS_BH(TCP_MIB_PASSIVEOPENS); } return newsk; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index f17c6577e33..0e17c244875 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -111,8 +111,7 @@ static void tcp_cwnd_restart(struct tcp_sock *tp, struct dst_entry *dst) u32 restart_cwnd = tcp_init_cwnd(tp, dst); u32 cwnd = tp->snd_cwnd; - if (tcp_is_vegas(tp)) - tcp_vegas_enable(tp); + tcp_ca_event(tp, CA_EVENT_CWND_RESTART); tp->snd_ssthresh = tcp_current_ssthresh(tp); restart_cwnd = min(restart_cwnd, cwnd); @@ -280,6 +279,10 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) #define SYSCTL_FLAG_WSCALE 0x2 #define SYSCTL_FLAG_SACK 0x4 + /* If congestion control is doing timestamping */ + if (tp->ca_ops->rtt_sample) + do_gettimeofday(&skb->stamp); + sysctl_flags = 0; if (tcb->flags & TCPCB_FLAG_SYN) { tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS; @@ -304,17 +307,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK)); } - /* - * If the connection is idle and we are restarting, - * then we don't want to do any Vegas calculations - * until we get fresh RTT samples. So when we - * restart, we reset our Vegas state to a clean - * slate. After we get acks for this flight of - * packets, _then_ we can make Vegas calculations - * again. - */ - if (tcp_is_vegas(tp) && tcp_packets_in_flight(tp) == 0) - tcp_vegas_enable(tp); + if (tcp_packets_in_flight(tp) == 0) + tcp_ca_event(tp, CA_EVENT_TX_START); th = (struct tcphdr *) skb_push(skb, tcp_header_size); skb->h.th = th; @@ -521,6 +515,7 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len) * skbs, which it never sent before. --ANK */ TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when; + buff->stamp = skb->stamp; if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) { tp->lost_out -= tcp_skb_pcount(skb); @@ -1449,7 +1444,6 @@ static inline void tcp_connect_init(struct sock *sk) tp->window_clamp = dst_metric(dst, RTAX_WINDOW); tp->advmss = dst_metric(dst, RTAX_ADVMSS); tcp_initialize_rcv_mss(sk); - tcp_ca_init(tp); tcp_select_initial_window(tcp_full_space(sk), tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0), @@ -1503,7 +1497,6 @@ int tcp_connect(struct sock *sk) TCP_SKB_CB(buff)->end_seq = tp->write_seq; tp->snd_nxt = tp->write_seq; tp->pushed_seq = tp->write_seq; - tcp_ca_init(tp); /* Send it off. */ TCP_SKB_CB(buff)->when = tcp_time_stamp; diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c new file mode 100644 index 00000000000..70e108e15c7 --- /dev/null +++ b/net/ipv4/tcp_scalable.c @@ -0,0 +1,68 @@ +/* Tom Kelly's Scalable TCP + * + * See htt://www-lce.eng.cam.ac.uk/~ctk21/scalable/ + * + * John Heffner <jheffner@sc.edu> + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <net/tcp.h> + +/* These factors derived from the recommended values in the aer: + * .01 and and 7/8. We use 50 instead of 100 to account for + * delayed ack. + */ +#define TCP_SCALABLE_AI_CNT 50U +#define TCP_SCALABLE_MD_SCALE 3 + +static void tcp_scalable_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, + u32 in_flight, int flag) +{ + if (in_flight < tp->snd_cwnd) + return; + + if (tp->snd_cwnd <= tp->snd_ssthresh) { + tp->snd_cwnd++; + } else { + tp->snd_cwnd_cnt++; + if (tp->snd_cwnd_cnt > min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)){ + tp->snd_cwnd++; + tp->snd_cwnd_cnt = 0; + } + } + tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp); + tp->snd_cwnd_stamp = tcp_time_stamp; +} + +static u32 tcp_scalable_ssthresh(struct tcp_sock *tp) +{ + return max(tp->snd_cwnd - (tp->snd_cwnd>>TCP_SCALABLE_MD_SCALE), 2U); +} + + +static struct tcp_congestion_ops tcp_scalable = { + .ssthresh = tcp_scalable_ssthresh, + .cong_avoid = tcp_scalable_cong_avoid, + .min_cwnd = tcp_reno_min_cwnd, + + .owner = THIS_MODULE, + .name = "scalable", +}; + +static int __init tcp_scalable_register(void) +{ + return tcp_register_congestion_control(&tcp_scalable); +} + +static void __exit tcp_scalable_unregister(void) +{ + tcp_unregister_congestion_control(&tcp_scalable); +} + +module_init(tcp_scalable_register); +module_exit(tcp_scalable_unregister); + +MODULE_AUTHOR("John Heffner"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Scalable TCP"); diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c new file mode 100644 index 00000000000..9bd443db519 --- /dev/null +++ b/net/ipv4/tcp_vegas.c @@ -0,0 +1,411 @@ +/* + * TCP Vegas congestion control + * + * This is based on the congestion detection/avoidance scheme described in + * Lawrence S. Brakmo and Larry L. Peterson. + * "TCP Vegas: End to end congestion avoidance on a global internet." + * IEEE Journal on Selected Areas in Communication, 13(8):1465--1480, + * October 1995. Available from: + * ftp://ftp.cs.arizona.edu/xkernel/Papers/jsac.ps + * + * See http://www.cs.arizona.edu/xkernel/ for their implementation. + * The main aspects that distinguish this implementation from the + * Arizona Vegas implementation are: + * o We do not change the loss detection or recovery mechanisms of + * Linux in any way. Linux already recovers from losses quite well, + * using fine-grained timers, NewReno, and FACK. + * o To avoid the performance penalty imposed by increasing cwnd + * only every-other RTT during slow start, we increase during + * every RTT during slow start, just like Reno. + * o Largely to allow continuous cwnd growth during slow start, + * we use the rate at which ACKs come back as the "actual" + * rate, rather than the rate at which data is sent. + * o To speed convergence to the right rate, we set the cwnd + * to achieve the right ("actual") rate when we exit slow start. + * o To filter out the noise caused by delayed ACKs, we use the + * minimum RTT sample observed during the last RTT to calculate + * the actual rate. + * o When the sender re-starts from idle, it waits until it has + * received ACKs for an entire flight of new data before making + * a cwnd adjustment decision. The original Vegas implementation + * assumed senders never went idle. + */ + +#include <linux/config.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/tcp_diag.h> + +#include <net/tcp.h> + +/* Default values of the Vegas variables, in fixed-point representation + * with V_PARAM_SHIFT bits to the right of the binary point. + */ +#define V_PARAM_SHIFT 1 +static int alpha = 1<<V_PARAM_SHIFT; +static int beta = 3<<V_PARAM_SHIFT; +static int gamma = 1<<V_PARAM_SHIFT; + +module_param(alpha, int, 0644); +MODULE_PARM_DESC(alpha, "lower bound of packets in network (scale by 2)"); +module_param(beta, int, 0644); +MODULE_PARM_DESC(beta, "upper bound of packets in network (scale by 2)"); +module_param(gamma, int, 0644); +MODULE_PARM_DESC(gamma, "limit on increase (scale by 2)"); + + +/* Vegas variables */ +struct vegas { + u32 beg_snd_nxt; /* right edge during last RTT */ + u32 beg_snd_una; /* left edge during last RTT */ + u32 beg_snd_cwnd; /* saves the size of the cwnd */ + u8 doing_vegas_now;/* if true, do vegas for this RTT */ + u16 cntRTT; /* # of RTTs measured within last RTT */ + u32 minRTT; /* min of RTTs measured within last RTT (in usec) */ + u32 baseRTT; /* the min of all Vegas RTT measurements seen (in usec) */ +}; + +/* There are several situations when we must "re-start" Vegas: + * + * o when a connection is established + * o after an RTO + * o after fast recovery + * o when we send a packet and there is no outstanding + * unacknowledged data (restarting an idle connection) + * + * In these circumstances we cannot do a Vegas calculation at the + * end of the first RTT, because any calculation we do is using + * stale info -- both the saved cwnd and congestion feedback are + * stale. + * + * Instead we must wait until the completion of an RTT during + * which we actually receive ACKs. + */ +static inline void vegas_enable(struct tcp_sock *tp) +{ + struct vegas *vegas = tcp_ca(tp); + + /* Begin taking Vegas samples next time we send something. */ + vegas->doing_vegas_now = 1; + + /* Set the beginning of the next send window. */ + vegas->beg_snd_nxt = tp->snd_nxt; + + vegas->cntRTT = 0; + vegas->minRTT = 0x7fffffff; +} + +/* Stop taking Vegas samples for now. */ +static inline void vegas_disable(struct tcp_sock *tp) +{ + struct vegas *vegas = tcp_ca(tp); + + vegas->doing_vegas_now = 0; +} + +static void tcp_vegas_init(struct tcp_sock *tp) +{ + struct vegas *vegas = tcp_ca(tp); + + vegas->baseRTT = 0x7fffffff; + vegas_enable(tp); +} + +/* Do RTT sampling needed for Vegas. + * Basically we: + * o min-filter RTT samples from within an RTT to get the current + * propagation delay + queuing delay (we are min-filtering to try to + * avoid the effects of delayed ACKs) + * o min-filter RTT samples from a much longer window (forever for now) + * to find the propagation delay (baseRTT) + */ +static void tcp_vegas_rtt_calc(struct tcp_sock *tp, u32 usrtt) +{ + struct vegas *vegas = tcp_ca(tp); + u32 vrtt = usrtt + 1; /* Never allow zero rtt or baseRTT */ + + /* Filter to find propagation delay: */ + if (vrtt < vegas->baseRTT) + vegas->baseRTT = vrtt; + + /* Find the min RTT during the last RTT to find + * the current prop. delay + queuing delay: + */ + vegas->minRTT = min(vegas->minRTT, vrtt); + vegas->cntRTT++; +} + +static void tcp_vegas_state(struct tcp_sock *tp, u8 ca_state) +{ + + if (ca_state == TCP_CA_Open) + vegas_enable(tp); + else + vegas_disable(tp); +} + +/* + * If the connection is idle and we are restarting, + * then we don't want to do any Vegas calculations + * until we get fresh RTT samples. So when we + * restart, we reset our Vegas state to a clean + * slate. After we get acks for this flight of + * packets, _then_ we can make Vegas calculations + * again. + */ +static void tcp_vegas_cwnd_event(struct tcp_sock *tp, enum tcp_ca_event event) +{ + if (event == CA_EVENT_CWND_RESTART || + event == CA_EVENT_TX_START) + tcp_vegas_init(tp); +} + +static void tcp_vegas_cong_avoid(struct tcp_sock *tp, u32 ack, + u32 seq_rtt, u32 in_flight, int flag) +{ + struct vegas *vegas = tcp_ca(tp); + + if (!vegas->doing_vegas_now) + return tcp_reno_cong_avoid(tp, ack, seq_rtt, in_flight, flag); + + /* The key players are v_beg_snd_una and v_beg_snd_nxt. + * + * These are so named because they represent the approximate values + * of snd_una and snd_nxt at the beginning of the current RTT. More + * precisely, they represent the amount of data sent during the RTT. + * At the end of the RTT, when we receive an ACK for v_beg_snd_nxt, + * we will calculate that (v_beg_snd_nxt - v_beg_snd_una) outstanding + * bytes of data have been ACKed during the course of the RTT, giving + * an "actual" rate of: + * + * (v_beg_snd_nxt - v_beg_snd_una) / (rtt duration) + * + * Unfortunately, v_beg_snd_una is not exactly equal to snd_una, + * because delayed ACKs can cover more than one segment, so they + * don't line up nicely with the boundaries of RTTs. + * + * Another unfortunate fact of life is that delayed ACKs delay the + * advance of the left edge of our send window, so that the number + * of bytes we send in an RTT is often less than our cwnd will allow. + * So we keep track of our cwnd separately, in v_beg_snd_cwnd. + */ + + if (after(ack, vegas->beg_snd_nxt)) { + /* Do the Vegas once-per-RTT cwnd adjustment. */ + u32 old_wnd, old_snd_cwnd; + + + /* Here old_wnd is essentially the window of data that was + * sent during the previous RTT, and has all + * been acknowledged in the course of the RTT that ended + * with the ACK we just received. Likewise, old_snd_cwnd + * is the cwnd during the previous RTT. + */ + old_wnd = (vegas->beg_snd_nxt - vegas->beg_snd_una) / + tp->mss_cache; + old_snd_cwnd = vegas->beg_snd_cwnd; + + /* Save the extent of the current window so we can use this + * at the end of the next RTT. + */ + vegas->beg_snd_una = vegas->beg_snd_nxt; + vegas->beg_snd_nxt = tp->snd_nxt; + vegas->beg_snd_cwnd = tp->snd_cwnd; + + /* Take into account the current RTT sample too, to + * decrease the impact of delayed acks. This double counts + * this sample since we count it for the next window as well, + * but that's not too awful, since we're taking the min, + * rather than averaging. + */ + tcp_vegas_rtt_calc(tp, seq_rtt*1000); + + /* We do the Vegas calculations only if we got enough RTT + * samples that we can be reasonably sure that we got + * at least one RTT sample that wasn't from a delayed ACK. + * If we only had 2 samples total, + * then that means we're getting only 1 ACK per RTT, which + * means they're almost certainly delayed ACKs. + * If we have 3 samples, we should be OK. + */ + + if (vegas->cntRTT <= 2) { + /* We don't have enough RTT samples to do the Vegas + * calculation, so we'll behave like Reno. + */ + if (tp->snd_cwnd > tp->snd_ssthresh) + tp->snd_cwnd++; + } else { + u32 rtt, target_cwnd, diff; + + /* We have enough RTT samples, so, using the Vegas + * algorithm, we determine if we should increase or + * decrease cwnd, and by how much. + */ + + /* Pluck out the RTT we are using for the Vegas + * calculations. This is the min RTT seen during the + * last RTT. Taking the min filters out the effects + * of delayed ACKs, at the cost of noticing congestion + * a bit later. + */ + rtt = vegas->minRTT; + + /* Calculate the cwnd we should have, if we weren't + * going too fast. + * + * This is: + * (actual rate in segments) * baseRTT + * We keep it as a fixed point number with + * V_PARAM_SHIFT bits to the right of the binary point. + */ + target_cwnd = ((old_wnd * vegas->baseRTT) + << V_PARAM_SHIFT) / rtt; + + /* Calculate the difference between the window we had, + * and the window we would like to have. This quantity + * is the "Diff" from the Arizona Vegas papers. + * + * Again, this is a fixed point number with + * V_PARAM_SHIFT bits to the right of the binary + * point. + */ + diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd; + + if (tp->snd_cwnd < tp->snd_ssthresh) { + /* Slow start. */ + if (diff > gamma) { + /* Going too fast. Time to slow down + * and switch to congestion avoidance. + */ + tp->snd_ssthresh = 2; + + /* Set cwnd to match the actual rate + * exactly: + * cwnd = (actual rate) * baseRTT + * Then we add 1 because the integer + * truncation robs us of full link + * utilization. + */ + tp->snd_cwnd = min(tp->snd_cwnd, + (target_cwnd >> + V_PARAM_SHIFT)+1); + + } + } else { + /* Congestion avoidance. */ + u32 next_snd_cwnd; + + /* Figure out where we would like cwnd + * to be. + */ + if (diff > beta) { + /* The old window was too fast, so + * we slow down. + */ + next_snd_cwnd = old_snd_cwnd - 1; + } else if (diff < alpha) { + /* We don't have enough extra packets + * in the network, so speed up. + */ + next_snd_cwnd = old_snd_cwnd + 1; + } else { + /* Sending just as fast as we + * should be. + */ + next_snd_cwnd = old_snd_cwnd; + } + + /* Adjust cwnd upward or downward, toward the + * desired value. + */ + if (next_snd_cwnd > tp->snd_cwnd) + tp->snd_cwnd++; + else if (next_snd_cwnd < tp->snd_cwnd) + tp->snd_cwnd--; + } + } + + /* Wipe the slate clean for the next RTT. */ + vegas->cntRTT = 0; + vegas->minRTT = 0x7fffffff; + } + + /* The following code is executed for every ack we receive, + * except for conditions checked in should_advance_cwnd() + * before the call to tcp_cong_avoid(). Mainly this means that + * we only execute this code if the ack actually acked some + * data. + */ + + /* If we are in slow start, increase our cwnd in response to this ACK. + * (If we are not in slow start then we are in congestion avoidance, + * and adjust our congestion window only once per RTT. See the code + * above.) + */ + if (tp->snd_cwnd <= tp->snd_ssthresh) + tp->snd_cwnd++; + + /* to keep cwnd from growing without bound */ + tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp); + + /* Make sure that we are never so timid as to reduce our cwnd below + * 2 MSS. + * + * Going below 2 MSS would risk huge delayed ACKs from our receiver. + */ + tp->snd_cwnd = max(tp->snd_cwnd, 2U); +} + +/* Extract info for Tcp socket info provided via netlink. */ +static void tcp_vegas_get_info(struct tcp_sock *tp, u32 ext, + struct sk_buff *skb) +{ + const struct vegas *ca = tcp_ca(tp); + if (ext & (1<<(TCPDIAG_VEGASINFO-1))) { + struct tcpvegas_info *info; + + info = RTA_DATA(__RTA_PUT(skb, TCPDIAG_VEGASINFO, + sizeof(*info))); + + info->tcpv_enabled = ca->doing_vegas_now; + info->tcpv_rttcnt = ca->cntRTT; + info->tcpv_rtt = ca->baseRTT; + info->tcpv_minrtt = ca->minRTT; + rtattr_failure: ; + } +} + +static struct tcp_congestion_ops tcp_vegas = { + .init = tcp_vegas_init, + .ssthresh = tcp_reno_ssthresh, + .cong_avoid = tcp_vegas_cong_avoid, + .min_cwnd = tcp_reno_min_cwnd, + .rtt_sample = tcp_vegas_rtt_calc, + .set_state = tcp_vegas_state, + .cwnd_event = tcp_vegas_cwnd_event, + .get_info = tcp_vegas_get_info, + + .owner = THIS_MODULE, + .name = "vegas", +}; + +static int __init tcp_vegas_register(void) +{ + BUG_ON(sizeof(struct vegas) > TCP_CA_PRIV_SIZE); + tcp_register_congestion_control(&tcp_vegas); + return 0; +} + +static void __exit tcp_vegas_unregister(void) +{ + tcp_unregister_congestion_control(&tcp_vegas); +} + +module_init(tcp_vegas_register); +module_exit(tcp_vegas_unregister); + +MODULE_AUTHOR("Stephen Hemminger"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("TCP Vegas"); diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c new file mode 100644 index 00000000000..ef827242c94 --- /dev/null +++ b/net/ipv4/tcp_westwood.c @@ -0,0 +1,259 @@ +/* + * TCP Westwood+ + * + * Angelo Dell'Aera: TCP Westwood+ support + */ + +#include <linux/config.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/tcp_diag.h> +#include <net/tcp.h> + +/* TCP Westwood structure */ +struct westwood { + u32 bw_ns_est; /* first bandwidth estimation..not too smoothed 8) */ + u32 bw_est; /* bandwidth estimate */ + u32 rtt_win_sx; /* here starts a new evaluation... */ + u32 bk; + u32 snd_una; /* used for evaluating the number of acked bytes */ + u32 cumul_ack; + u32 accounted; + u32 rtt; + u32 rtt_min; /* minimum observed RTT */ +}; + + +/* TCP Westwood functions and constants */ +#define TCP_WESTWOOD_RTT_MIN (HZ/20) /* 50ms */ +#define TCP_WESTWOOD_INIT_RTT (20*HZ) /* maybe too conservative?! */ + +/* + * @tcp_westwood_create + * This function initializes fields used in TCP Westwood+, + * it is called after the initial SYN, so the sequence numbers + * are correct but new passive connections we have no + * information about RTTmin at this time so we simply set it to + * TCP_WESTWOOD_INIT_RTT. This value was chosen to be too conservative + * since in this way we're sure it will be updated in a consistent + * way as soon as possible. It will reasonably happen within the first + * RTT period of the connection lifetime. + */ +static void tcp_westwood_init(struct tcp_sock *tp) +{ + struct westwood *w = tcp_ca(tp); + + w->bk = 0; + w->bw_ns_est = 0; + w->bw_est = 0; + w->accounted = 0; + w->cumul_ack = 0; + w->rtt_min = w->rtt = TCP_WESTWOOD_INIT_RTT; + w->rtt_win_sx = tcp_time_stamp; + w->snd_una = tp->snd_una; +} + +/* + * @westwood_do_filter + * Low-pass filter. Implemented using constant coefficients. + */ +static inline u32 westwood_do_filter(u32 a, u32 b) +{ + return (((7 * a) + b) >> 3); +} + +static inline void westwood_filter(struct westwood *w, u32 delta) +{ + w->bw_ns_est = westwood_do_filter(w->bw_ns_est, w->bk / delta); + w->bw_est = westwood_do_filter(w->bw_est, w->bw_ns_est); +} + +/* + * @westwood_pkts_acked + * Called after processing group of packets. + * but all westwood needs is the last sample of srtt. + */ +static void tcp_westwood_pkts_acked(struct tcp_sock *tp, u32 cnt) +{ + struct westwood *w = tcp_ca(tp); + if (cnt > 0) + w->rtt = tp->srtt >> 3; +} + +/* + * @westwood_update_window + * It updates RTT evaluation window if it is the right moment to do + * it. If so it calls filter for evaluating bandwidth. + */ +static void westwood_update_window(struct tcp_sock *tp) +{ + struct westwood *w = tcp_ca(tp); + s32 delta = tcp_time_stamp - w->rtt_win_sx; + + /* + * See if a RTT-window has passed. + * Be careful since if RTT is less than + * 50ms we don't filter but we continue 'building the sample'. + * This minimum limit was chosen since an estimation on small + * time intervals is better to avoid... + * Obviously on a LAN we reasonably will always have + * right_bound = left_bound + WESTWOOD_RTT_MIN + */ + if (w->rtt && delta > max_t(u32, w->rtt, TCP_WESTWOOD_RTT_MIN)) { + westwood_filter(w, delta); + + w->bk = 0; + w->rtt_win_sx = tcp_time_stamp; + } +} + +/* + * @westwood_fast_bw + * It is called when we are in fast path. In particular it is called when + * header prediction is successful. In such case in fact update is + * straight forward and doesn't need any particular care. + */ +static inline void westwood_fast_bw(struct tcp_sock *tp) +{ + struct westwood *w = tcp_ca(tp); + + westwood_update_window(tp); + + w->bk += tp->snd_una - w->snd_una; + w->snd_una = tp->snd_una; + w->rtt_min = min(w->rtt, w->rtt_min); +} + +/* + * @westwood_acked_count + * This function evaluates cumul_ack for evaluating bk in case of + * delayed or partial acks. + */ +static inline u32 westwood_acked_count(struct tcp_sock *tp) +{ + struct westwood *w = tcp_ca(tp); + + w->cumul_ack = tp->snd_una - w->snd_una; + + /* If cumul_ack is 0 this is a dupack since it's not moving + * tp->snd_una. + */ + if (!w->cumul_ack) { + w->accounted += tp->mss_cache; + w->cumul_ack = tp->mss_cache; + } + + if (w->cumul_ack > tp->mss_cache) { + /* Partial or delayed ack */ + if (w->accounted >= w->cumul_ack) { + w->accounted -= w->cumul_ack; + w->cumul_ack = tp->mss_cache; + } else { + w->cumul_ack -= w->accounted; + w->accounted = 0; + } + } + + w->snd_una = tp->snd_una; + + return w->cumul_ack; +} + +static inline u32 westwood_bw_rttmin(const struct tcp_sock *tp) +{ + struct westwood *w = tcp_ca(tp); + return max_t(u32, (w->bw_est * w->rtt_min) / tp->mss_cache, 2); +} + +/* + * TCP Westwood + * Here limit is evaluated as Bw estimation*RTTmin (for obtaining it + * in packets we use mss_cache). Rttmin is guaranteed to be >= 2 + * so avoids ever returning 0. + */ +static u32 tcp_westwood_cwnd_min(struct tcp_sock *tp) +{ + return westwood_bw_rttmin(tp); +} + +static void tcp_westwood_event(struct tcp_sock *tp, enum tcp_ca_event event) +{ + struct westwood *w = tcp_ca(tp); + + switch(event) { + case CA_EVENT_FAST_ACK: + westwood_fast_bw(tp); + break; + + case CA_EVENT_COMPLETE_CWR: + tp->snd_cwnd = tp->snd_ssthresh = westwood_bw_rttmin(tp); + break; + + case CA_EVENT_FRTO: + tp->snd_ssthresh = westwood_bw_rttmin(tp); + break; + + case CA_EVENT_SLOW_ACK: + westwood_update_window(tp); + w->bk += westwood_acked_count(tp); + w->rtt_min = min(w->rtt, w->rtt_min); + break; + + default: + /* don't care */ + break; + } +} + + +/* Extract info for Tcp socket info provided via netlink. */ +static void tcp_westwood_info(struct tcp_sock *tp, u32 ext, + struct sk_buff *skb) +{ + const struct westwood *ca = tcp_ca(tp); + if (ext & (1<<(TCPDIAG_VEGASINFO-1))) { + struct rtattr *rta; + struct tcpvegas_info *info; + + rta = __RTA_PUT(skb, TCPDIAG_VEGASINFO, sizeof(*info)); + info = RTA_DATA(rta); + info->tcpv_enabled = 1; + info->tcpv_rttcnt = 0; + info->tcpv_rtt = jiffies_to_usecs(ca->rtt); + info->tcpv_minrtt = jiffies_to_usecs(ca->rtt_min); + rtattr_failure: ; + } +} + + +static struct tcp_congestion_ops tcp_westwood = { + .init = tcp_westwood_init, + .ssthresh = tcp_reno_ssthresh, + .cong_avoid = tcp_reno_cong_avoid, + .min_cwnd = tcp_westwood_cwnd_min, + .cwnd_event = tcp_westwood_event, + .get_info = tcp_westwood_info, + .pkts_acked = tcp_westwood_pkts_acked, + + .owner = THIS_MODULE, + .name = "westwood" +}; + +static int __init tcp_westwood_register(void) +{ + BUG_ON(sizeof(struct westwood) > TCP_CA_PRIV_SIZE); + return tcp_register_congestion_control(&tcp_westwood); +} + +static void __exit tcp_westwood_unregister(void) +{ + tcp_unregister_congestion_control(&tcp_westwood); +} + +module_init(tcp_westwood_register); +module_exit(tcp_westwood_unregister); + +MODULE_AUTHOR("Stephen Hemminger, Angelo Dell'Aera"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("TCP Westwood+"); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 14f5c53235f..a54d4ef3fd3 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -57,6 +57,7 @@ #endif #include <linux/delay.h> #include <linux/notifier.h> +#include <linux/string.h> #include <net/sock.h> #include <net/snmp.h> @@ -3437,7 +3438,7 @@ static void addrconf_sysctl_register(struct inet6_dev *idev, struct ipv6_devconf * by sysctl and we wouldn't want anyone to change it under our feet * (see SIOCSIFNAME). */ - dev_name = net_sysctl_strdup(dev_name); + dev_name = kstrdup(dev_name, GFP_KERNEL); if (!dev_name) goto free; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 2414937f2a8..9dac7fdf472 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -2025,7 +2025,7 @@ static int tcp_v6_init_sock(struct sock *sk) sk->sk_state = TCP_CLOSE; tp->af_specific = &ipv6_specific; - + tp->ca_ops = &tcp_init_congestion_ops; sk->sk_write_space = sk_stream_write_space; sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); diff --git a/net/rxrpc/krxiod.c b/net/rxrpc/krxiod.c index 2b537f425a1..dada34a77b2 100644 --- a/net/rxrpc/krxiod.c +++ b/net/rxrpc/krxiod.c @@ -138,7 +138,7 @@ static int rxrpc_krxiod(void *arg) _debug("### End Work"); - try_to_freeze(PF_FREEZE); + try_to_freeze(); /* discard pending signals */ rxrpc_discard_my_signals(); diff --git a/net/rxrpc/krxsecd.c b/net/rxrpc/krxsecd.c index 6020c89d922..1aadd026d35 100644 --- a/net/rxrpc/krxsecd.c +++ b/net/rxrpc/krxsecd.c @@ -107,7 +107,7 @@ static int rxrpc_krxsecd(void *arg) _debug("### End Inbound Calls"); - try_to_freeze(PF_FREEZE); + try_to_freeze(); /* discard pending signals */ rxrpc_discard_my_signals(); diff --git a/net/rxrpc/krxtimod.c b/net/rxrpc/krxtimod.c index 249c2b0290b..3ac81cdd121 100644 --- a/net/rxrpc/krxtimod.c +++ b/net/rxrpc/krxtimod.c @@ -90,7 +90,7 @@ static int krxtimod(void *arg) complete_and_exit(&krxtimod_dead, 0); } - try_to_freeze(PF_FREEZE); + try_to_freeze(); /* discard pending signals */ rxrpc_discard_my_signals(); diff --git a/net/sched/Kconfig b/net/sched/Kconfig index b22c9beb604..7bac249258e 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -449,6 +449,19 @@ config NET_EMATCH_META To compile this code as a module, choose M here: the module will be called em_meta. +config NET_EMATCH_TEXT + tristate "Textsearch" + depends on NET_EMATCH + select TEXTSEARCH + select TEXTSEARCH_KMP + select TEXTSEARCH_FSM + ---help--- + Say Y here if you want to be ablt to classify packets based on + textsearch comparisons. + + To compile this code as a module, choose M here: the + module will be called em_text. + config NET_CLS_ACT bool "Packet ACTION" depends on EXPERIMENTAL && NET_CLS && NET_QOS diff --git a/net/sched/Makefile b/net/sched/Makefile index eb3fe583eba..8f58cecd626 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -40,3 +40,4 @@ obj-$(CONFIG_NET_EMATCH_CMP) += em_cmp.o obj-$(CONFIG_NET_EMATCH_NBYTE) += em_nbyte.o obj-$(CONFIG_NET_EMATCH_U32) += em_u32.o obj-$(CONFIG_NET_EMATCH_META) += em_meta.o +obj-$(CONFIG_NET_EMATCH_TEXT) += em_text.o diff --git a/net/sched/em_text.c b/net/sched/em_text.c new file mode 100644 index 00000000000..873840d8d07 --- /dev/null +++ b/net/sched/em_text.c @@ -0,0 +1,157 @@ +/* + * net/sched/em_text.c Textsearch ematch + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Thomas Graf <tgraf@suug.ch> + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/string.h> +#include <linux/skbuff.h> +#include <linux/textsearch.h> +#include <linux/tc_ematch/tc_em_text.h> +#include <net/pkt_cls.h> + +struct text_match +{ + u16 from_offset; + u16 to_offset; + u8 from_layer; + u8 to_layer; + struct ts_config *config; +}; + +#define EM_TEXT_PRIV(m) ((struct text_match *) (m)->data) + +static int em_text_match(struct sk_buff *skb, struct tcf_ematch *m, + struct tcf_pkt_info *info) +{ + struct text_match *tm = EM_TEXT_PRIV(m); + int from, to; + struct ts_state state; + + from = tcf_get_base_ptr(skb, tm->from_layer) - skb->data; + from += tm->from_offset; + + to = tcf_get_base_ptr(skb, tm->to_layer) - skb->data; + to += tm->to_offset; + + return skb_find_text(skb, from, to, tm->config, &state) != UINT_MAX; +} + +static int em_text_change(struct tcf_proto *tp, void *data, int len, + struct tcf_ematch *m) +{ + struct text_match *tm; + struct tcf_em_text *conf = data; + struct ts_config *ts_conf; + int flags = 0; + + printk("Configuring text: %s from %d:%d to %d:%d len %d\n", conf->algo, conf->from_offset, + conf->from_layer, conf->to_offset, conf->to_layer, conf->pattern_len); + + if (len < sizeof(*conf) || len < (sizeof(*conf) + conf->pattern_len)) + return -EINVAL; + + if (conf->from_layer > conf->to_layer) + return -EINVAL; + + if (conf->from_layer == conf->to_layer && + conf->from_offset > conf->to_offset) + return -EINVAL; + +retry: + ts_conf = textsearch_prepare(conf->algo, (u8 *) conf + sizeof(*conf), + conf->pattern_len, GFP_KERNEL, flags); + + if (flags & TS_AUTOLOAD) + rtnl_lock(); + + if (IS_ERR(ts_conf)) { + if (PTR_ERR(ts_conf) == -ENOENT && !(flags & TS_AUTOLOAD)) { + rtnl_unlock(); + flags |= TS_AUTOLOAD; + goto retry; + } else + return PTR_ERR(ts_conf); + } else if (flags & TS_AUTOLOAD) { + textsearch_destroy(ts_conf); + return -EAGAIN; + } + + tm = kmalloc(sizeof(*tm), GFP_KERNEL); + if (tm == NULL) { + textsearch_destroy(ts_conf); + return -ENOBUFS; + } + + tm->from_offset = conf->from_offset; + tm->to_offset = conf->to_offset; + tm->from_layer = conf->from_layer; + tm->to_layer = conf->to_layer; + tm->config = ts_conf; + + m->datalen = sizeof(*tm); + m->data = (unsigned long) tm; + + return 0; +} + +static void em_text_destroy(struct tcf_proto *tp, struct tcf_ematch *m) +{ + textsearch_destroy(EM_TEXT_PRIV(m)->config); +} + +static int em_text_dump(struct sk_buff *skb, struct tcf_ematch *m) +{ + struct text_match *tm = EM_TEXT_PRIV(m); + struct tcf_em_text conf; + + strncpy(conf.algo, tm->config->ops->name, sizeof(conf.algo) - 1); + conf.from_offset = tm->from_offset; + conf.to_offset = tm->to_offset; + conf.from_layer = tm->from_layer; + conf.to_layer = tm->to_layer; + conf.pattern_len = textsearch_get_pattern_len(tm->config); + conf.pad = 0; + + RTA_PUT_NOHDR(skb, sizeof(conf), &conf); + RTA_APPEND(skb, conf.pattern_len, textsearch_get_pattern(tm->config)); + return 0; + +rtattr_failure: + return -1; +} + +static struct tcf_ematch_ops em_text_ops = { + .kind = TCF_EM_TEXT, + .change = em_text_change, + .match = em_text_match, + .destroy = em_text_destroy, + .dump = em_text_dump, + .owner = THIS_MODULE, + .link = LIST_HEAD_INIT(em_text_ops.link) +}; + +static int __init init_em_text(void) +{ + return tcf_em_register(&em_text_ops); +} + +static void __exit exit_em_text(void) +{ + tcf_em_unregister(&em_text_ops); +} + +MODULE_LICENSE("GPL"); + +module_init(init_em_text); +module_exit(exit_em_text); diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 058189684c7..86073df418f 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -92,6 +92,17 @@ static sctp_disposition_t sctp_sf_shut_8_4_5(const struct sctp_endpoint *ep, sctp_cmd_seq_t *commands); static struct sctp_sackhdr *sctp_sm_pull_sack(struct sctp_chunk *chunk); +static sctp_disposition_t sctp_stop_t1_and_abort(sctp_cmd_seq_t *commands, + __u16 error, + const struct sctp_association *asoc, + struct sctp_transport *transport); + +static sctp_disposition_t sctp_sf_violation_chunklen( + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands); /* Small helper function that checks if the chunk length * is of the appropriate length. The 'required_length' argument @@ -2328,7 +2339,7 @@ sctp_disposition_t sctp_sf_cookie_echoed_abort(const struct sctp_endpoint *ep, * * This is common code called by several sctp_sf_*_abort() functions above. */ -sctp_disposition_t sctp_stop_t1_and_abort(sctp_cmd_seq_t *commands, +static sctp_disposition_t sctp_stop_t1_and_abort(sctp_cmd_seq_t *commands, __u16 error, const struct sctp_association *asoc, struct sctp_transport *transport) @@ -3687,7 +3698,8 @@ sctp_disposition_t sctp_sf_violation(const struct sctp_endpoint *ep, * * Generate an ABORT chunk and terminate the association. */ -sctp_disposition_t sctp_sf_violation_chunklen(const struct sctp_endpoint *ep, +static sctp_disposition_t sctp_sf_violation_chunklen( + const struct sctp_endpoint *ep, const struct sctp_association *asoc, const sctp_subtype_t type, void *arg, diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c index 32e8acbc60f..62a07349527 100644 --- a/net/sunrpc/sunrpc_syms.c +++ b/net/sunrpc/sunrpc_syms.c @@ -41,6 +41,7 @@ EXPORT_SYMBOL(rpc_release_task); /* RPC client functions */ EXPORT_SYMBOL(rpc_create_client); +EXPORT_SYMBOL(rpc_new_client); EXPORT_SYMBOL(rpc_clone_client); EXPORT_SYMBOL(rpc_bind_new_program); EXPORT_SYMBOL(rpc_destroy_client); diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c index 2b99b4028d3..d6baf6fdf8a 100644 --- a/net/sunrpc/svcauth_unix.c +++ b/net/sunrpc/svcauth_unix.c @@ -8,6 +8,7 @@ #include <linux/err.h> #include <linux/seq_file.h> #include <linux/hash.h> +#include <linux/string.h> #define RPCDBG_FACILITY RPCDBG_AUTH @@ -20,14 +21,6 @@ */ -static char *strdup(char *s) -{ - char *rv = kmalloc(strlen(s)+1, GFP_KERNEL); - if (rv) - strcpy(rv, s); - return rv; -} - struct unix_domain { struct auth_domain h; int addr_changes; @@ -55,7 +48,7 @@ struct auth_domain *unix_domain_find(char *name) if (new == NULL) return NULL; cache_init(&new->h.h); - new->h.name = strdup(name); + new->h.name = kstrdup(name, GFP_KERNEL); new->h.flavour = RPC_AUTH_UNIX; new->addr_changes = 0; new->h.h.expiry_time = NEVER; diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 05907035bc9..56db8f13e6c 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -1185,8 +1185,8 @@ svc_recv(struct svc_serv *serv, struct svc_rqst *rqstp, long timeout) arg->page_len = (pages-2)*PAGE_SIZE; arg->len = (pages-1)*PAGE_SIZE; arg->tail[0].iov_len = 0; - - try_to_freeze(PF_FREEZE); + + try_to_freeze(); if (signalled()) return -EINTR; @@ -1227,7 +1227,7 @@ svc_recv(struct svc_serv *serv, struct svc_rqst *rqstp, long timeout) schedule_timeout(timeout); - try_to_freeze(PF_FREEZE); + try_to_freeze(); spin_lock_bh(&serv->sv_lock); remove_wait_queue(&rqstp->rq_wait, &wait); diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index eca92405948..269f217918a 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -970,7 +970,7 @@ tcp_read_request(struct rpc_xprt *xprt, skb_reader_t *desc) goto out; } - dprintk("RPC: XID %08x read %u bytes\n", + dprintk("RPC: XID %08x read %Zd bytes\n", ntohl(xprt->tcp_xid), r); dprintk("RPC: xprt = %p, tcp_copied = %lu, tcp_offset = %u, tcp_reclen = %u\n", xprt, xprt->tcp_copied, xprt->tcp_offset, xprt->tcp_reclen); @@ -1006,7 +1006,7 @@ tcp_read_discard(struct rpc_xprt *xprt, skb_reader_t *desc) desc->count -= len; desc->offset += len; xprt->tcp_offset += len; - dprintk("RPC: discarded %u bytes\n", len); + dprintk("RPC: discarded %Zu bytes\n", len); tcp_check_recm(xprt); } |