From 317a76f9a44b437d6301718f4e5d08bd93f98da7 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 23 Jun 2005 12:19:55 -0700 Subject: [TCP]: Add pluggable congestion control algorithm infrastructure. Allow TCP to have multiple pluggable congestion control algorithms. Algorithms are defined by a set of operations and can be built in or modules. The legacy "new RENO" algorithm is used as a starting point and fallback. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/linux/sysctl.h | 9 +- include/linux/tcp.h | 49 +-- include/net/tcp.h | 237 +++++---------- net/ipv4/Makefile | 3 +- net/ipv4/sysctl_net_ipv4.c | 114 +++---- net/ipv4/tcp.c | 2 + net/ipv4/tcp_cong.c | 195 ++++++++++++ net/ipv4/tcp_diag.c | 20 +- net/ipv4/tcp_input.c | 737 ++++----------------------------------------- net/ipv4/tcp_ipv4.c | 3 + net/ipv4/tcp_minisocks.c | 4 +- net/ipv4/tcp_output.c | 23 +- net/ipv6/tcp_ipv6.c | 2 +- 13 files changed, 399 insertions(+), 999 deletions(-) create mode 100644 net/ipv4/tcp_cong.c diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 614e939c78a..72965bfe6cf 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -333,21 +333,14 @@ enum NET_TCP_FRTO=92, NET_TCP_LOW_LATENCY=93, NET_IPV4_IPFRAG_SECRET_INTERVAL=94, - NET_TCP_WESTWOOD=95, NET_IPV4_IGMP_MAX_MSF=96, NET_TCP_NO_METRICS_SAVE=97, - NET_TCP_VEGAS=98, - NET_TCP_VEGAS_ALPHA=99, - NET_TCP_VEGAS_BETA=100, - NET_TCP_VEGAS_GAMMA=101, - NET_TCP_BIC=102, - NET_TCP_BIC_FAST_CONVERGENCE=103, - NET_TCP_BIC_LOW_WINDOW=104, NET_TCP_DEFAULT_WIN_SCALE=105, NET_TCP_MODERATE_RCVBUF=106, NET_TCP_TSO_WIN_DIVISOR=107, NET_TCP_BIC_BETA=108, NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR=109, + NET_TCP_CONG_CONTROL=110, }; enum { diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 97a7c9e03df..3ea75dd6640 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -203,13 +203,6 @@ struct tcp_sack_block { __u32 end_seq; }; -enum tcp_congestion_algo { - TCP_RENO=0, - TCP_VEGAS, - TCP_WESTWOOD, - TCP_BIC, -}; - struct tcp_options_received { /* PAWS/RTTM data */ long ts_recent_stamp;/* Time we stored ts_recent (for aging) */ @@ -305,7 +298,7 @@ struct tcp_sock { __u8 reordering; /* Packet reordering metric. */ __u8 frto_counter; /* Number of new acks after RTO */ - __u8 adv_cong; /* Using Vegas, Westwood, or BIC */ + __u8 unused; __u8 defer_accept; /* User waits for some data after accept() */ /* RTT measurement */ @@ -401,37 +394,10 @@ struct tcp_sock { __u32 time; } rcvq_space; -/* TCP Westwood structure */ - struct { - __u32 bw_ns_est; /* first bandwidth estimation..not too smoothed 8) */ - __u32 bw_est; /* bandwidth estimate */ - __u32 rtt_win_sx; /* here starts a new evaluation... */ - __u32 bk; - __u32 snd_una; /* used for evaluating the number of acked bytes */ - __u32 cumul_ack; - __u32 accounted; - __u32 rtt; - __u32 rtt_min; /* minimum observed RTT */ - } westwood; - -/* Vegas variables */ - struct { - __u32 beg_snd_nxt; /* right edge during last RTT */ - __u32 beg_snd_una; /* left edge during last RTT */ - __u32 beg_snd_cwnd; /* saves the size of the cwnd */ - __u8 doing_vegas_now;/* if true, do vegas for this RTT */ - __u16 cntRTT; /* # of RTTs measured within last RTT */ - __u32 minRTT; /* min of RTTs measured within last RTT (in usec) */ - __u32 baseRTT; /* the min of all Vegas RTT measurements seen (in usec) */ - } vegas; - - /* BI TCP Parameters */ - struct { - __u32 cnt; /* increase cwnd by 1 after this number of ACKs */ - __u32 last_max_cwnd; /* last maximium snd_cwnd */ - __u32 last_cwnd; /* the last snd_cwnd */ - __u32 last_stamp; /* time when updated last_cwnd */ - } bictcp; + /* Pluggable TCP congestion control hook */ + struct tcp_congestion_ops *ca_ops; + u32 ca_priv[16]; +#define TCP_CA_PRIV_SIZE (16*sizeof(u32)) }; static inline struct tcp_sock *tcp_sk(const struct sock *sk) @@ -439,6 +405,11 @@ static inline struct tcp_sock *tcp_sk(const struct sock *sk) return (struct tcp_sock *)sk; } +static inline void *tcp_ca(const struct tcp_sock *tp) +{ + return (void *) tp->ca_priv; +} + #endif #endif /* _LINUX_TCP_H */ diff --git a/include/net/tcp.h b/include/net/tcp.h index f730935b824..e427cf35915 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -505,25 +505,6 @@ static __inline__ int tcp_sk_listen_hashfn(struct sock *sk) #else # define TCP_TW_RECYCLE_TICK (12+2-TCP_TW_RECYCLE_SLOTS_LOG) #endif - -#define BICTCP_BETA_SCALE 1024 /* Scale factor beta calculation - * max_cwnd = snd_cwnd * beta - */ -#define BICTCP_MAX_INCREMENT 32 /* - * Limit on the amount of - * increment allowed during - * binary search. - */ -#define BICTCP_FUNC_OF_MIN_INCR 11 /* - * log(B/Smin)/log(B/(B-1))+1, - * Smin:min increment - * B:log factor - */ -#define BICTCP_B 4 /* - * In binary search, - * go to point (max+min)/N - */ - /* * TCP option */ @@ -596,16 +577,7 @@ extern int sysctl_tcp_adv_win_scale; extern int sysctl_tcp_tw_reuse; extern int sysctl_tcp_frto; extern int sysctl_tcp_low_latency; -extern int sysctl_tcp_westwood; -extern int sysctl_tcp_vegas_cong_avoid; -extern int sysctl_tcp_vegas_alpha; -extern int sysctl_tcp_vegas_beta; -extern int sysctl_tcp_vegas_gamma; extern int sysctl_tcp_nometrics_save; -extern int sysctl_tcp_bic; -extern int sysctl_tcp_bic_fast_convergence; -extern int sysctl_tcp_bic_low_window; -extern int sysctl_tcp_bic_beta; extern int sysctl_tcp_moderate_rcvbuf; extern int sysctl_tcp_tso_win_divisor; @@ -1136,6 +1108,80 @@ static inline void tcp_packets_out_dec(struct tcp_sock *tp, tp->packets_out -= tcp_skb_pcount(skb); } +/* Events passed to congestion control interface */ +enum tcp_ca_event { + CA_EVENT_TX_START, /* first transmit when no packets in flight */ + CA_EVENT_CWND_RESTART, /* congestion window restart */ + CA_EVENT_COMPLETE_CWR, /* end of congestion recovery */ + CA_EVENT_FRTO, /* fast recovery timeout */ + CA_EVENT_LOSS, /* loss timeout */ + CA_EVENT_FAST_ACK, /* in sequence ack */ + CA_EVENT_SLOW_ACK, /* other ack */ +}; + +/* + * Interface for adding new TCP congestion control handlers + */ +#define TCP_CA_NAME_MAX 16 +struct tcp_congestion_ops { + struct list_head list; + + /* initialize private data (optional) */ + void (*init)(struct tcp_sock *tp); + /* cleanup private data (optional) */ + void (*release)(struct tcp_sock *tp); + + /* return slow start threshold (required) */ + u32 (*ssthresh)(struct tcp_sock *tp); + /* lower bound for congestion window (optional) */ + u32 (*min_cwnd)(struct tcp_sock *tp); + /* do new cwnd calculation (required) */ + void (*cong_avoid)(struct tcp_sock *tp, u32 ack, + u32 rtt, u32 in_flight, int good_ack); + /* round trip time sample per acked packet (optional) */ + void (*rtt_sample)(struct tcp_sock *tp, u32 usrtt); + /* call before changing ca_state (optional) */ + void (*set_state)(struct tcp_sock *tp, u8 new_state); + /* call when cwnd event occurs (optional) */ + void (*cwnd_event)(struct tcp_sock *tp, enum tcp_ca_event ev); + /* new value of cwnd after loss (optional) */ + u32 (*undo_cwnd)(struct tcp_sock *tp); + /* hook for packet ack accounting (optional) */ + void (*pkts_acked)(struct tcp_sock *tp, u32 num_acked); + /* get info for tcp_diag (optional) */ + void (*get_info)(struct tcp_sock *tp, u32 ext, struct sk_buff *skb); + + char name[TCP_CA_NAME_MAX]; + struct module *owner; +}; + +extern int tcp_register_congestion_control(struct tcp_congestion_ops *type); +extern void tcp_unregister_congestion_control(struct tcp_congestion_ops *type); + +extern void tcp_init_congestion_control(struct tcp_sock *tp); +extern void tcp_cleanup_congestion_control(struct tcp_sock *tp); +extern int tcp_set_default_congestion_control(const char *name); +extern void tcp_get_default_congestion_control(char *name); + +extern struct tcp_congestion_ops tcp_reno; +extern u32 tcp_reno_ssthresh(struct tcp_sock *tp); +extern void tcp_reno_cong_avoid(struct tcp_sock *tp, u32 ack, + u32 rtt, u32 in_flight, int flag); +extern u32 tcp_reno_min_cwnd(struct tcp_sock *tp); + +static inline void tcp_set_ca_state(struct tcp_sock *tp, u8 ca_state) +{ + if (tp->ca_ops->set_state) + tp->ca_ops->set_state(tp, ca_state); + tp->ca_state = ca_state; +} + +static inline void tcp_ca_event(struct tcp_sock *tp, enum tcp_ca_event event) +{ + if (tp->ca_ops->cwnd_event) + tp->ca_ops->cwnd_event(tp, event); +} + /* This determines how many packets are "in the network" to the best * of our knowledge. In many cases it is conservative, but where * detailed information is available from the receiver (via SACK @@ -1155,91 +1201,6 @@ static __inline__ unsigned int tcp_packets_in_flight(const struct tcp_sock *tp) return (tp->packets_out - tp->left_out + tp->retrans_out); } -/* - * Which congestion algorithim is in use on the connection. - */ -#define tcp_is_vegas(__tp) ((__tp)->adv_cong == TCP_VEGAS) -#define tcp_is_westwood(__tp) ((__tp)->adv_cong == TCP_WESTWOOD) -#define tcp_is_bic(__tp) ((__tp)->adv_cong == TCP_BIC) - -/* Recalculate snd_ssthresh, we want to set it to: - * - * Reno: - * one half the current congestion window, but no - * less than two segments - * - * BIC: - * behave like Reno until low_window is reached, - * then increase congestion window slowly - */ -static inline __u32 tcp_recalc_ssthresh(struct tcp_sock *tp) -{ - if (tcp_is_bic(tp)) { - if (sysctl_tcp_bic_fast_convergence && - tp->snd_cwnd < tp->bictcp.last_max_cwnd) - tp->bictcp.last_max_cwnd = (tp->snd_cwnd * - (BICTCP_BETA_SCALE - + sysctl_tcp_bic_beta)) - / (2 * BICTCP_BETA_SCALE); - else - tp->bictcp.last_max_cwnd = tp->snd_cwnd; - - if (tp->snd_cwnd > sysctl_tcp_bic_low_window) - return max((tp->snd_cwnd * sysctl_tcp_bic_beta) - / BICTCP_BETA_SCALE, 2U); - } - - return max(tp->snd_cwnd >> 1U, 2U); -} - -/* Stop taking Vegas samples for now. */ -#define tcp_vegas_disable(__tp) ((__tp)->vegas.doing_vegas_now = 0) - -static inline void tcp_vegas_enable(struct tcp_sock *tp) -{ - /* There are several situations when we must "re-start" Vegas: - * - * o when a connection is established - * o after an RTO - * o after fast recovery - * o when we send a packet and there is no outstanding - * unacknowledged data (restarting an idle connection) - * - * In these circumstances we cannot do a Vegas calculation at the - * end of the first RTT, because any calculation we do is using - * stale info -- both the saved cwnd and congestion feedback are - * stale. - * - * Instead we must wait until the completion of an RTT during - * which we actually receive ACKs. - */ - - /* Begin taking Vegas samples next time we send something. */ - tp->vegas.doing_vegas_now = 1; - - /* Set the beginning of the next send window. */ - tp->vegas.beg_snd_nxt = tp->snd_nxt; - - tp->vegas.cntRTT = 0; - tp->vegas.minRTT = 0x7fffffff; -} - -/* Should we be taking Vegas samples right now? */ -#define tcp_vegas_enabled(__tp) ((__tp)->vegas.doing_vegas_now) - -extern void tcp_ca_init(struct tcp_sock *tp); - -static inline void tcp_set_ca_state(struct tcp_sock *tp, u8 ca_state) -{ - if (tcp_is_vegas(tp)) { - if (ca_state == TCP_CA_Open) - tcp_vegas_enable(tp); - else - tcp_vegas_disable(tp); - } - tp->ca_state = ca_state; -} - /* If cwnd > ssthresh, we may raise ssthresh to be half-way to cwnd. * The exception is rate halving phase, when cwnd is decreasing towards * ssthresh. @@ -1288,7 +1249,7 @@ static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp) static inline void __tcp_enter_cwr(struct tcp_sock *tp) { tp->undo_marker = 0; - tp->snd_ssthresh = tcp_recalc_ssthresh(tp); + tp->snd_ssthresh = tp->ca_ops->ssthresh(tp); tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp) + 1U); tp->snd_cwnd_cnt = 0; @@ -1876,52 +1837,4 @@ struct tcp_iter_state { extern int tcp_proc_register(struct tcp_seq_afinfo *afinfo); extern void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo); -/* TCP Westwood functions and constants */ - -#define TCP_WESTWOOD_INIT_RTT (20*HZ) /* maybe too conservative?! */ -#define TCP_WESTWOOD_RTT_MIN (HZ/20) /* 50ms */ - -static inline void tcp_westwood_update_rtt(struct tcp_sock *tp, __u32 rtt_seq) -{ - if (tcp_is_westwood(tp)) - tp->westwood.rtt = rtt_seq; -} - -static inline __u32 __tcp_westwood_bw_rttmin(const struct tcp_sock *tp) -{ - return max((tp->westwood.bw_est) * (tp->westwood.rtt_min) / - (__u32) (tp->mss_cache_std), - 2U); -} - -static inline __u32 tcp_westwood_bw_rttmin(const struct tcp_sock *tp) -{ - return tcp_is_westwood(tp) ? __tcp_westwood_bw_rttmin(tp) : 0; -} - -static inline int tcp_westwood_ssthresh(struct tcp_sock *tp) -{ - __u32 ssthresh = 0; - - if (tcp_is_westwood(tp)) { - ssthresh = __tcp_westwood_bw_rttmin(tp); - if (ssthresh) - tp->snd_ssthresh = ssthresh; - } - - return (ssthresh != 0); -} - -static inline int tcp_westwood_cwnd(struct tcp_sock *tp) -{ - __u32 cwnd = 0; - - if (tcp_is_westwood(tp)) { - cwnd = __tcp_westwood_bw_rttmin(tp); - if (cwnd) - tp->snd_cwnd = cwnd; - } - - return (cwnd != 0); -} #endif /* _TCP_H */ diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 65d57d8e1ad..89c0b4cb470 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -5,7 +5,8 @@ obj-y := utils.o route.o inetpeer.o protocol.o \ ip_input.o ip_fragment.o ip_forward.o ip_options.o \ ip_output.o ip_sockglue.o \ - tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o tcp_minisocks.o \ + tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \ + tcp_minisocks.o tcp_cong.o \ datagram.o raw.o udp.o arp.o icmp.o devinet.o af_inet.o igmp.o \ sysctl_net_ipv4.o fib_frontend.o fib_semantics.o diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 23068bddbf0..e3289453241 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -118,6 +118,45 @@ static int ipv4_sysctl_forward_strategy(ctl_table *table, return 1; } +static int proc_tcp_congestion_control(ctl_table *ctl, int write, struct file * filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + char val[TCP_CA_NAME_MAX]; + ctl_table tbl = { + .data = val, + .maxlen = TCP_CA_NAME_MAX, + }; + int ret; + + tcp_get_default_congestion_control(val); + + ret = proc_dostring(&tbl, write, filp, buffer, lenp, ppos); + if (write && ret == 0) + ret = tcp_set_default_congestion_control(val); + return ret; +} + +int sysctl_tcp_congestion_control(ctl_table *table, int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen, + void **context) +{ + char val[TCP_CA_NAME_MAX]; + ctl_table tbl = { + .data = val, + .maxlen = TCP_CA_NAME_MAX, + }; + int ret; + + tcp_get_default_congestion_control(val); + ret = sysctl_string(&tbl, name, nlen, oldval, oldlenp, newval, newlen, + context); + if (ret == 0 && newval && newlen) + ret = tcp_set_default_congestion_control(val); + return ret; +} + + ctl_table ipv4_table[] = { { .ctl_name = NET_IPV4_TCP_TIMESTAMPS, @@ -611,70 +650,6 @@ ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, - { - .ctl_name = NET_TCP_WESTWOOD, - .procname = "tcp_westwood", - .data = &sysctl_tcp_westwood, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = NET_TCP_VEGAS, - .procname = "tcp_vegas_cong_avoid", - .data = &sysctl_tcp_vegas_cong_avoid, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = NET_TCP_VEGAS_ALPHA, - .procname = "tcp_vegas_alpha", - .data = &sysctl_tcp_vegas_alpha, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = NET_TCP_VEGAS_BETA, - .procname = "tcp_vegas_beta", - .data = &sysctl_tcp_vegas_beta, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = NET_TCP_VEGAS_GAMMA, - .procname = "tcp_vegas_gamma", - .data = &sysctl_tcp_vegas_gamma, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = NET_TCP_BIC, - .procname = "tcp_bic", - .data = &sysctl_tcp_bic, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = NET_TCP_BIC_FAST_CONVERGENCE, - .procname = "tcp_bic_fast_convergence", - .data = &sysctl_tcp_bic_fast_convergence, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = NET_TCP_BIC_LOW_WINDOW, - .procname = "tcp_bic_low_window", - .data = &sysctl_tcp_bic_low_window, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, { .ctl_name = NET_TCP_MODERATE_RCVBUF, .procname = "tcp_moderate_rcvbuf", @@ -692,13 +667,14 @@ ctl_table ipv4_table[] = { .proc_handler = &proc_dointvec, }, { - .ctl_name = NET_TCP_BIC_BETA, - .procname = "tcp_bic_beta", - .data = &sysctl_tcp_bic_beta, - .maxlen = sizeof(int), + .ctl_name = NET_TCP_CONG_CONTROL, + .procname = "tcp_congestion_control", .mode = 0644, - .proc_handler = &proc_dointvec, + .maxlen = TCP_CA_NAME_MAX, + .proc_handler = &proc_tcp_congestion_control, + .strategy = &sysctl_tcp_congestion_control, }, + { .ctl_name = 0 } }; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 674bbd8cfd3..f3dbc8dc126 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2333,6 +2333,8 @@ void __init tcp_init(void) printk(KERN_INFO "TCP: Hash tables configured " "(established %d bind %d)\n", tcp_ehash_size << 1, tcp_bhash_size); + + tcp_register_congestion_control(&tcp_reno); } EXPORT_SYMBOL(tcp_accept); diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c new file mode 100644 index 00000000000..665394a63ae --- /dev/null +++ b/net/ipv4/tcp_cong.c @@ -0,0 +1,195 @@ +/* + * Plugable TCP congestion control support and newReno + * congestion control. + * Based on ideas from I/O scheduler suport and Web100. + * + * Copyright (C) 2005 Stephen Hemminger + */ + +#include +#include +#include +#include +#include +#include + +static DEFINE_SPINLOCK(tcp_cong_list_lock); +static LIST_HEAD(tcp_cong_list); + +/* Simple linear search, don't expect many entries! */ +static struct tcp_congestion_ops *tcp_ca_find(const char *name) +{ + struct tcp_congestion_ops *e; + + list_for_each_entry(e, &tcp_cong_list, list) { + if (strcmp(e->name, name) == 0) + return e; + } + + return NULL; +} + +/* + * Attach new congestion control algorthim to the list + * of available options. + */ +int tcp_register_congestion_control(struct tcp_congestion_ops *ca) +{ + int ret = 0; + + /* all algorithms must implement ssthresh and cong_avoid ops */ + if (!ca->ssthresh || !ca->cong_avoid || !ca->min_cwnd) { + printk(KERN_ERR "TCP %s does not implement required ops\n", + ca->name); + return -EINVAL; + } + + spin_lock(&tcp_cong_list_lock); + if (tcp_ca_find(ca->name)) { + printk(KERN_NOTICE "TCP %s already registered\n", ca->name); + ret = -EEXIST; + } else { + list_add_rcu(&ca->list, &tcp_cong_list); + printk(KERN_INFO "TCP %s registered\n", ca->name); + } + spin_unlock(&tcp_cong_list_lock); + + return ret; +} +EXPORT_SYMBOL_GPL(tcp_register_congestion_control); + +/* + * Remove congestion control algorithm, called from + * the module's remove function. Module ref counts are used + * to ensure that this can't be done till all sockets using + * that method are closed. + */ +void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca) +{ + spin_lock(&tcp_cong_list_lock); + list_del_rcu(&ca->list); + spin_unlock(&tcp_cong_list_lock); +} +EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control); + +/* Assign choice of congestion control. */ +void tcp_init_congestion_control(struct tcp_sock *tp) +{ + struct tcp_congestion_ops *ca; + + rcu_read_lock(); + list_for_each_entry_rcu(ca, &tcp_cong_list, list) { + if (try_module_get(ca->owner)) { + tp->ca_ops = ca; + break; + } + + } + rcu_read_unlock(); + + if (tp->ca_ops->init) + tp->ca_ops->init(tp); +} + +/* Manage refcounts on socket close. */ +void tcp_cleanup_congestion_control(struct tcp_sock *tp) +{ + if (tp->ca_ops->release) + tp->ca_ops->release(tp); + module_put(tp->ca_ops->owner); +} + +/* Used by sysctl to change default congestion control */ +int tcp_set_default_congestion_control(const char *name) +{ + struct tcp_congestion_ops *ca; + int ret = -ENOENT; + + spin_lock(&tcp_cong_list_lock); + ca = tcp_ca_find(name); +#ifdef CONFIG_KMOD + if (!ca) { + spin_unlock(&tcp_cong_list_lock); + + request_module("tcp_%s", name); + spin_lock(&tcp_cong_list_lock); + ca = tcp_ca_find(name); + } +#endif + + if (ca) { + list_move(&ca->list, &tcp_cong_list); + ret = 0; + } + spin_unlock(&tcp_cong_list_lock); + + return ret; +} + +/* Get current default congestion control */ +void tcp_get_default_congestion_control(char *name) +{ + struct tcp_congestion_ops *ca; + /* We will always have reno... */ + BUG_ON(list_empty(&tcp_cong_list)); + + rcu_read_lock(); + ca = list_entry(tcp_cong_list.next, struct tcp_congestion_ops, list); + strncpy(name, ca->name, TCP_CA_NAME_MAX); + rcu_read_unlock(); +} + +/* + * TCP Reno congestion control + * This is special case used for fallback as well. + */ +/* This is Jacobson's slow start and congestion avoidance. + * SIGCOMM '88, p. 328. + */ +void tcp_reno_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, u32 in_flight, + int flag) +{ + if (in_flight < tp->snd_cwnd) + return; + + if (tp->snd_cwnd <= tp->snd_ssthresh) { + /* In "safe" area, increase. */ + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + } else { + /* In dangerous area, increase slowly. + * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd + */ + if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + tp->snd_cwnd_cnt = 0; + } else + tp->snd_cwnd_cnt++; + } +} +EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid); + +/* Slow start threshold is half the congestion window (min 2) */ +u32 tcp_reno_ssthresh(struct tcp_sock *tp) +{ + return max(tp->snd_cwnd >> 1U, 2U); +} +EXPORT_SYMBOL_GPL(tcp_reno_ssthresh); + +/* Lower bound on congestion window. */ +u32 tcp_reno_min_cwnd(struct tcp_sock *tp) +{ + return tp->snd_ssthresh/2; +} +EXPORT_SYMBOL_GPL(tcp_reno_min_cwnd); + +struct tcp_congestion_ops tcp_reno = { + .name = "reno", + .owner = THIS_MODULE, + .ssthresh = tcp_reno_ssthresh, + .cong_avoid = tcp_reno_cong_avoid, + .min_cwnd = tcp_reno_min_cwnd, +}; + +EXPORT_SYMBOL_GPL(tcp_reno); diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index 634befc0792..867acc0f79d 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -42,7 +42,6 @@ struct tcpdiag_entry static struct sock *tcpnl; - #define TCPDIAG_PUT(skb, attrtype, attrlen) \ ({ int rtalen = RTA_LENGTH(attrlen); \ struct rtattr *rta; \ @@ -61,7 +60,6 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, struct nlmsghdr *nlh; struct tcp_info *info = NULL; struct tcpdiag_meminfo *minfo = NULL; - struct tcpvegas_info *vinfo = NULL; unsigned char *b = skb->tail; nlh = NLMSG_PUT(skb, pid, seq, TCPDIAG_GETSOCK, sizeof(*r)); @@ -73,9 +71,6 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, if (ext & (1<<(TCPDIAG_INFO-1))) info = TCPDIAG_PUT(skb, TCPDIAG_INFO, sizeof(*info)); - if ((tcp_is_westwood(tp) || tcp_is_vegas(tp)) - && (ext & (1<<(TCPDIAG_VEGASINFO-1)))) - vinfo = TCPDIAG_PUT(skb, TCPDIAG_VEGASINFO, sizeof(*vinfo)); } r->tcpdiag_family = sk->sk_family; r->tcpdiag_state = sk->sk_state; @@ -166,19 +161,8 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, if (info) tcp_get_info(sk, info); - if (vinfo) { - if (tcp_is_vegas(tp)) { - vinfo->tcpv_enabled = tp->vegas.doing_vegas_now; - vinfo->tcpv_rttcnt = tp->vegas.cntRTT; - vinfo->tcpv_rtt = jiffies_to_usecs(tp->vegas.baseRTT); - vinfo->tcpv_minrtt = jiffies_to_usecs(tp->vegas.minRTT); - } else { - vinfo->tcpv_enabled = 0; - vinfo->tcpv_rttcnt = 0; - vinfo->tcpv_rtt = jiffies_to_usecs(tp->westwood.rtt); - vinfo->tcpv_minrtt = jiffies_to_usecs(tp->westwood.rtt_min); - } - } + if (sk->sk_state < TCP_TIME_WAIT && tp->ca_ops->get_info) + tp->ca_ops->get_info(tp, ext, skb); nlh->nlmsg_len = skb->tail - b; return skb->len; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 5bad504630a..7bbbbc33eb4 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -61,7 +61,6 @@ * Panu Kuhlberg: Experimental audit of TCP (re)transmission * engine. Lots of bugs are found. * Pasi Sarolahti: F-RTO for dealing with spurious RTOs - * Angelo Dell'Aera: TCP Westwood+ support */ #include @@ -88,23 +87,9 @@ int sysctl_tcp_rfc1337; int sysctl_tcp_max_orphans = NR_FILE; int sysctl_tcp_frto; int sysctl_tcp_nometrics_save; -int sysctl_tcp_westwood; -int sysctl_tcp_vegas_cong_avoid; int sysctl_tcp_moderate_rcvbuf = 1; -/* Default values of the Vegas variables, in fixed-point representation - * with V_PARAM_SHIFT bits to the right of the binary point. - */ -#define V_PARAM_SHIFT 1 -int sysctl_tcp_vegas_alpha = 1<snd_cwnd_stamp = tcp_time_stamp; } -static void init_bictcp(struct tcp_sock *tp) -{ - tp->bictcp.cnt = 0; - - tp->bictcp.last_max_cwnd = 0; - tp->bictcp.last_cwnd = 0; - tp->bictcp.last_stamp = 0; -} - /* 5. Recalculate window clamp after socket hit its memory bounds. */ static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp) { @@ -558,45 +534,6 @@ static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_ tcp_grow_window(sk, tp, skb); } -/* When starting a new connection, pin down the current choice of - * congestion algorithm. - */ -void tcp_ca_init(struct tcp_sock *tp) -{ - if (sysctl_tcp_westwood) - tp->adv_cong = TCP_WESTWOOD; - else if (sysctl_tcp_bic) - tp->adv_cong = TCP_BIC; - else if (sysctl_tcp_vegas_cong_avoid) { - tp->adv_cong = TCP_VEGAS; - tp->vegas.baseRTT = 0x7fffffff; - tcp_vegas_enable(tp); - } -} - -/* Do RTT sampling needed for Vegas. - * Basically we: - * o min-filter RTT samples from within an RTT to get the current - * propagation delay + queuing delay (we are min-filtering to try to - * avoid the effects of delayed ACKs) - * o min-filter RTT samples from a much longer window (forever for now) - * to find the propagation delay (baseRTT) - */ -static inline void vegas_rtt_calc(struct tcp_sock *tp, __u32 rtt) -{ - __u32 vrtt = rtt + 1; /* Never allow zero rtt or baseRTT */ - - /* Filter to find propagation delay: */ - if (vrtt < tp->vegas.baseRTT) - tp->vegas.baseRTT = vrtt; - - /* Find the min RTT during the last RTT to find - * the current prop. delay + queuing delay: - */ - tp->vegas.minRTT = min(tp->vegas.minRTT, vrtt); - tp->vegas.cntRTT++; -} - /* Called to compute a smoothed rtt estimate. The data fed to this * routine either comes from timestamps, or from segments that were * known _not_ to have been retransmitted [see Karn/Partridge @@ -606,13 +543,10 @@ static inline void vegas_rtt_calc(struct tcp_sock *tp, __u32 rtt) * To save cycles in the RFC 1323 implementation it was better to break * it up into three procedures. -- erics */ -static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt) +static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt, u32 *usrtt) { long m = mrtt; /* RTT */ - if (tcp_vegas_enabled(tp)) - vegas_rtt_calc(tp, mrtt); - /* The following amusing code comes from Jacobson's * article in SIGCOMM '88. Note that rtt and mdev * are scaled versions of rtt and mean deviation. @@ -670,7 +604,8 @@ static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt) tp->rtt_seq = tp->snd_nxt; } - tcp_westwood_update_rtt(tp, tp->srtt >> 3); + if (tp->ca_ops->rtt_sample) + tp->ca_ops->rtt_sample(tp, *usrtt); } /* Calculate rto without backoff. This is the second half of Van Jacobson's @@ -1185,8 +1120,8 @@ void tcp_enter_frto(struct sock *sk) tp->snd_una == tp->high_seq || (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) { tp->prior_ssthresh = tcp_current_ssthresh(tp); - if (!tcp_westwood_ssthresh(tp)) - tp->snd_ssthresh = tcp_recalc_ssthresh(tp); + tp->snd_ssthresh = tp->ca_ops->ssthresh(tp); + tcp_ca_event(tp, CA_EVENT_FRTO); } /* Have to clear retransmission markers here to keep the bookkeeping @@ -1252,8 +1187,6 @@ static void tcp_enter_frto_loss(struct sock *sk) tcp_set_ca_state(tp, TCP_CA_Loss); tp->high_seq = tp->frto_highmark; TCP_ECN_queue_cwr(tp); - - init_bictcp(tp); } void tcp_clear_retrans(struct tcp_sock *tp) @@ -1283,7 +1216,8 @@ void tcp_enter_loss(struct sock *sk, int how) if (tp->ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq || (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) { tp->prior_ssthresh = tcp_current_ssthresh(tp); - tp->snd_ssthresh = tcp_recalc_ssthresh(tp); + tp->snd_ssthresh = tp->ca_ops->ssthresh(tp); + tcp_ca_event(tp, CA_EVENT_LOSS); } tp->snd_cwnd = 1; tp->snd_cwnd_cnt = 0; @@ -1596,28 +1530,14 @@ static inline void tcp_moderate_cwnd(struct tcp_sock *tp) } /* Decrease cwnd each second ack. */ - static void tcp_cwnd_down(struct tcp_sock *tp) { int decr = tp->snd_cwnd_cnt + 1; - __u32 limit; - - /* - * TCP Westwood - * Here limit is evaluated as BWestimation*RTTmin (for obtaining it - * in packets we use mss_cache). If sysctl_tcp_westwood is off - * tcp_westwood_bw_rttmin() returns 0. In such case snd_ssthresh is - * still used as usual. It prevents other strange cases in which - * BWE*RTTmin could assume value 0. It should not happen but... - */ - - if (!(limit = tcp_westwood_bw_rttmin(tp))) - limit = tp->snd_ssthresh/2; tp->snd_cwnd_cnt = decr&1; decr >>= 1; - if (decr && tp->snd_cwnd > limit) + if (decr && tp->snd_cwnd > tp->ca_ops->min_cwnd(tp)) tp->snd_cwnd -= decr; tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1); @@ -1654,8 +1574,8 @@ static void DBGUNDO(struct sock *sk, struct tcp_sock *tp, const char *msg) static void tcp_undo_cwr(struct tcp_sock *tp, int undo) { if (tp->prior_ssthresh) { - if (tcp_is_bic(tp)) - tp->snd_cwnd = max(tp->snd_cwnd, tp->bictcp.last_max_cwnd); + if (tp->ca_ops->undo_cwnd) + tp->snd_cwnd = tp->ca_ops->undo_cwnd(tp); else tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh<<1); @@ -1767,11 +1687,9 @@ static int tcp_try_undo_loss(struct sock *sk, struct tcp_sock *tp) static inline void tcp_complete_cwr(struct tcp_sock *tp) { - if (tcp_westwood_cwnd(tp)) - tp->snd_ssthresh = tp->snd_cwnd; - else - tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); + tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); tp->snd_cwnd_stamp = tcp_time_stamp; + tcp_ca_event(tp, CA_EVENT_COMPLETE_CWR); } static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag) @@ -1946,7 +1864,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, if (tp->ca_state < TCP_CA_CWR) { if (!(flag&FLAG_ECE)) tp->prior_ssthresh = tcp_current_ssthresh(tp); - tp->snd_ssthresh = tcp_recalc_ssthresh(tp); + tp->snd_ssthresh = tp->ca_ops->ssthresh(tp); TCP_ECN_queue_cwr(tp); } @@ -1963,7 +1881,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, /* Read draft-ietf-tcplw-high-performance before mucking * with this code. (Superceeds RFC1323) */ -static void tcp_ack_saw_tstamp(struct tcp_sock *tp, int flag) +static void tcp_ack_saw_tstamp(struct tcp_sock *tp, u32 *usrtt, int flag) { __u32 seq_rtt; @@ -1983,13 +1901,13 @@ static void tcp_ack_saw_tstamp(struct tcp_sock *tp, int flag) * in window is lost... Voila. --ANK (010210) */ seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr; - tcp_rtt_estimator(tp, seq_rtt); + tcp_rtt_estimator(tp, seq_rtt, usrtt); tcp_set_rto(tp); tp->backoff = 0; tcp_bound_rto(tp); } -static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, int flag) +static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, u32 *usrtt, int flag) { /* We don't have a timestamp. Can only use * packets that are not retransmitted to determine @@ -2003,338 +1921,29 @@ static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, int flag) if (flag & FLAG_RETRANS_DATA_ACKED) return; - tcp_rtt_estimator(tp, seq_rtt); + tcp_rtt_estimator(tp, seq_rtt, usrtt); tcp_set_rto(tp); tp->backoff = 0; tcp_bound_rto(tp); } static inline void tcp_ack_update_rtt(struct tcp_sock *tp, - int flag, s32 seq_rtt) + int flag, s32 seq_rtt, u32 *usrtt) { /* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */ if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) - tcp_ack_saw_tstamp(tp, flag); + tcp_ack_saw_tstamp(tp, usrtt, flag); else if (seq_rtt >= 0) - tcp_ack_no_tstamp(tp, seq_rtt, flag); + tcp_ack_no_tstamp(tp, seq_rtt, usrtt, flag); } -/* - * Compute congestion window to use. - * - * This is from the implementation of BICTCP in - * Lison-Xu, Kahaled Harfoush, and Injog Rhee. - * "Binary Increase Congestion Control for Fast, Long Distance - * Networks" in InfoComm 2004 - * Available from: - * http://www.csc.ncsu.edu/faculty/rhee/export/bitcp.pdf - * - * Unless BIC is enabled and congestion window is large - * this behaves the same as the original Reno. - */ -static inline __u32 bictcp_cwnd(struct tcp_sock *tp) -{ - /* orignal Reno behaviour */ - if (!tcp_is_bic(tp)) - return tp->snd_cwnd; - - if (tp->bictcp.last_cwnd == tp->snd_cwnd && - (s32)(tcp_time_stamp - tp->bictcp.last_stamp) <= (HZ>>5)) - return tp->bictcp.cnt; - - tp->bictcp.last_cwnd = tp->snd_cwnd; - tp->bictcp.last_stamp = tcp_time_stamp; - - /* start off normal */ - if (tp->snd_cwnd <= sysctl_tcp_bic_low_window) - tp->bictcp.cnt = tp->snd_cwnd; - - /* binary increase */ - else if (tp->snd_cwnd < tp->bictcp.last_max_cwnd) { - __u32 dist = (tp->bictcp.last_max_cwnd - tp->snd_cwnd) - / BICTCP_B; - - if (dist > BICTCP_MAX_INCREMENT) - /* linear increase */ - tp->bictcp.cnt = tp->snd_cwnd / BICTCP_MAX_INCREMENT; - else if (dist <= 1U) - /* binary search increase */ - tp->bictcp.cnt = tp->snd_cwnd * BICTCP_FUNC_OF_MIN_INCR - / BICTCP_B; - else - /* binary search increase */ - tp->bictcp.cnt = tp->snd_cwnd / dist; - } else { - /* slow start amd linear increase */ - if (tp->snd_cwnd < tp->bictcp.last_max_cwnd + BICTCP_B) - /* slow start */ - tp->bictcp.cnt = tp->snd_cwnd * BICTCP_FUNC_OF_MIN_INCR - / BICTCP_B; - else if (tp->snd_cwnd < tp->bictcp.last_max_cwnd - + BICTCP_MAX_INCREMENT*(BICTCP_B-1)) - /* slow start */ - tp->bictcp.cnt = tp->snd_cwnd * (BICTCP_B-1) - / (tp->snd_cwnd-tp->bictcp.last_max_cwnd); - else - /* linear increase */ - tp->bictcp.cnt = tp->snd_cwnd / BICTCP_MAX_INCREMENT; - } - return tp->bictcp.cnt; -} - -/* This is Jacobson's slow start and congestion avoidance. - * SIGCOMM '88, p. 328. - */ -static inline void reno_cong_avoid(struct tcp_sock *tp) +static inline void tcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, + u32 in_flight, int good) { - if (tp->snd_cwnd <= tp->snd_ssthresh) { - /* In "safe" area, increase. */ - if (tp->snd_cwnd < tp->snd_cwnd_clamp) - tp->snd_cwnd++; - } else { - /* In dangerous area, increase slowly. - * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd - */ - if (tp->snd_cwnd_cnt >= bictcp_cwnd(tp)) { - if (tp->snd_cwnd < tp->snd_cwnd_clamp) - tp->snd_cwnd++; - tp->snd_cwnd_cnt=0; - } else - tp->snd_cwnd_cnt++; - } + tp->ca_ops->cong_avoid(tp, ack, rtt, in_flight, good); tp->snd_cwnd_stamp = tcp_time_stamp; } -/* This is based on the congestion detection/avoidance scheme described in - * Lawrence S. Brakmo and Larry L. Peterson. - * "TCP Vegas: End to end congestion avoidance on a global internet." - * IEEE Journal on Selected Areas in Communication, 13(8):1465--1480, - * October 1995. Available from: - * ftp://ftp.cs.arizona.edu/xkernel/Papers/jsac.ps - * - * See http://www.cs.arizona.edu/xkernel/ for their implementation. - * The main aspects that distinguish this implementation from the - * Arizona Vegas implementation are: - * o We do not change the loss detection or recovery mechanisms of - * Linux in any way. Linux already recovers from losses quite well, - * using fine-grained timers, NewReno, and FACK. - * o To avoid the performance penalty imposed by increasing cwnd - * only every-other RTT during slow start, we increase during - * every RTT during slow start, just like Reno. - * o Largely to allow continuous cwnd growth during slow start, - * we use the rate at which ACKs come back as the "actual" - * rate, rather than the rate at which data is sent. - * o To speed convergence to the right rate, we set the cwnd - * to achieve the right ("actual") rate when we exit slow start. - * o To filter out the noise caused by delayed ACKs, we use the - * minimum RTT sample observed during the last RTT to calculate - * the actual rate. - * o When the sender re-starts from idle, it waits until it has - * received ACKs for an entire flight of new data before making - * a cwnd adjustment decision. The original Vegas implementation - * assumed senders never went idle. - */ -static void vegas_cong_avoid(struct tcp_sock *tp, u32 ack, u32 seq_rtt) -{ - /* The key players are v_beg_snd_una and v_beg_snd_nxt. - * - * These are so named because they represent the approximate values - * of snd_una and snd_nxt at the beginning of the current RTT. More - * precisely, they represent the amount of data sent during the RTT. - * At the end of the RTT, when we receive an ACK for v_beg_snd_nxt, - * we will calculate that (v_beg_snd_nxt - v_beg_snd_una) outstanding - * bytes of data have been ACKed during the course of the RTT, giving - * an "actual" rate of: - * - * (v_beg_snd_nxt - v_beg_snd_una) / (rtt duration) - * - * Unfortunately, v_beg_snd_una is not exactly equal to snd_una, - * because delayed ACKs can cover more than one segment, so they - * don't line up nicely with the boundaries of RTTs. - * - * Another unfortunate fact of life is that delayed ACKs delay the - * advance of the left edge of our send window, so that the number - * of bytes we send in an RTT is often less than our cwnd will allow. - * So we keep track of our cwnd separately, in v_beg_snd_cwnd. - */ - - if (after(ack, tp->vegas.beg_snd_nxt)) { - /* Do the Vegas once-per-RTT cwnd adjustment. */ - u32 old_wnd, old_snd_cwnd; - - - /* Here old_wnd is essentially the window of data that was - * sent during the previous RTT, and has all - * been acknowledged in the course of the RTT that ended - * with the ACK we just received. Likewise, old_snd_cwnd - * is the cwnd during the previous RTT. - */ - old_wnd = (tp->vegas.beg_snd_nxt - tp->vegas.beg_snd_una) / - tp->mss_cache_std; - old_snd_cwnd = tp->vegas.beg_snd_cwnd; - - /* Save the extent of the current window so we can use this - * at the end of the next RTT. - */ - tp->vegas.beg_snd_una = tp->vegas.beg_snd_nxt; - tp->vegas.beg_snd_nxt = tp->snd_nxt; - tp->vegas.beg_snd_cwnd = tp->snd_cwnd; - - /* Take into account the current RTT sample too, to - * decrease the impact of delayed acks. This double counts - * this sample since we count it for the next window as well, - * but that's not too awful, since we're taking the min, - * rather than averaging. - */ - vegas_rtt_calc(tp, seq_rtt); - - /* We do the Vegas calculations only if we got enough RTT - * samples that we can be reasonably sure that we got - * at least one RTT sample that wasn't from a delayed ACK. - * If we only had 2 samples total, - * then that means we're getting only 1 ACK per RTT, which - * means they're almost certainly delayed ACKs. - * If we have 3 samples, we should be OK. - */ - - if (tp->vegas.cntRTT <= 2) { - /* We don't have enough RTT samples to do the Vegas - * calculation, so we'll behave like Reno. - */ - if (tp->snd_cwnd > tp->snd_ssthresh) - tp->snd_cwnd++; - } else { - u32 rtt, target_cwnd, diff; - - /* We have enough RTT samples, so, using the Vegas - * algorithm, we determine if we should increase or - * decrease cwnd, and by how much. - */ - - /* Pluck out the RTT we are using for the Vegas - * calculations. This is the min RTT seen during the - * last RTT. Taking the min filters out the effects - * of delayed ACKs, at the cost of noticing congestion - * a bit later. - */ - rtt = tp->vegas.minRTT; - - /* Calculate the cwnd we should have, if we weren't - * going too fast. - * - * This is: - * (actual rate in segments) * baseRTT - * We keep it as a fixed point number with - * V_PARAM_SHIFT bits to the right of the binary point. - */ - target_cwnd = ((old_wnd * tp->vegas.baseRTT) - << V_PARAM_SHIFT) / rtt; - - /* Calculate the difference between the window we had, - * and the window we would like to have. This quantity - * is the "Diff" from the Arizona Vegas papers. - * - * Again, this is a fixed point number with - * V_PARAM_SHIFT bits to the right of the binary - * point. - */ - diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd; - - if (tp->snd_cwnd < tp->snd_ssthresh) { - /* Slow start. */ - if (diff > sysctl_tcp_vegas_gamma) { - /* Going too fast. Time to slow down - * and switch to congestion avoidance. - */ - tp->snd_ssthresh = 2; - - /* Set cwnd to match the actual rate - * exactly: - * cwnd = (actual rate) * baseRTT - * Then we add 1 because the integer - * truncation robs us of full link - * utilization. - */ - tp->snd_cwnd = min(tp->snd_cwnd, - (target_cwnd >> - V_PARAM_SHIFT)+1); - - } - } else { - /* Congestion avoidance. */ - u32 next_snd_cwnd; - - /* Figure out where we would like cwnd - * to be. - */ - if (diff > sysctl_tcp_vegas_beta) { - /* The old window was too fast, so - * we slow down. - */ - next_snd_cwnd = old_snd_cwnd - 1; - } else if (diff < sysctl_tcp_vegas_alpha) { - /* We don't have enough extra packets - * in the network, so speed up. - */ - next_snd_cwnd = old_snd_cwnd + 1; - } else { - /* Sending just as fast as we - * should be. - */ - next_snd_cwnd = old_snd_cwnd; - } - - /* Adjust cwnd upward or downward, toward the - * desired value. - */ - if (next_snd_cwnd > tp->snd_cwnd) - tp->snd_cwnd++; - else if (next_snd_cwnd < tp->snd_cwnd) - tp->snd_cwnd--; - } - } - - /* Wipe the slate clean for the next RTT. */ - tp->vegas.cntRTT = 0; - tp->vegas.minRTT = 0x7fffffff; - } - - /* The following code is executed for every ack we receive, - * except for conditions checked in should_advance_cwnd() - * before the call to tcp_cong_avoid(). Mainly this means that - * we only execute this code if the ack actually acked some - * data. - */ - - /* If we are in slow start, increase our cwnd in response to this ACK. - * (If we are not in slow start then we are in congestion avoidance, - * and adjust our congestion window only once per RTT. See the code - * above.) - */ - if (tp->snd_cwnd <= tp->snd_ssthresh) - tp->snd_cwnd++; - - /* to keep cwnd from growing without bound */ - tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp); - - /* Make sure that we are never so timid as to reduce our cwnd below - * 2 MSS. - * - * Going below 2 MSS would risk huge delayed ACKs from our receiver. - */ - tp->snd_cwnd = max(tp->snd_cwnd, 2U); - - tp->snd_cwnd_stamp = tcp_time_stamp; -} - -static inline void tcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 seq_rtt) -{ - if (tcp_vegas_enabled(tp)) - vegas_cong_avoid(tp, ack, seq_rtt); - else - reno_cong_avoid(tp); -} - /* Restart timer after forward progress on connection. * RFC2988 recommends to restart timer to now+rto. */ @@ -2415,13 +2024,18 @@ static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb, /* Remove acknowledged frames from the retransmission queue. */ -static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) +static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; __u32 now = tcp_time_stamp; int acked = 0; __s32 seq_rtt = -1; + struct timeval usnow; + u32 pkts_acked = 0; + + if (seq_usrtt) + do_gettimeofday(&usnow); while ((skb = skb_peek(&sk->sk_write_queue)) && skb != sk->sk_send_head) { @@ -2448,6 +2062,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) */ if (!(scb->flags & TCPCB_FLAG_SYN)) { acked |= FLAG_DATA_ACKED; + ++pkts_acked; } else { acked |= FLAG_SYN_ACKED; tp->retrans_stamp = 0; @@ -2461,6 +2076,10 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) seq_rtt = -1; } else if (seq_rtt < 0) seq_rtt = now - scb->when; + if (seq_usrtt) + *seq_usrtt = (usnow.tv_sec - skb->stamp.tv_sec) * 1000000 + + (usnow.tv_usec - skb->stamp.tv_usec); + if (sacked & TCPCB_SACKED_ACKED) tp->sacked_out -= tcp_skb_pcount(skb); if (sacked & TCPCB_LOST) @@ -2479,8 +2098,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) } if (acked&FLAG_ACKED) { - tcp_ack_update_rtt(tp, acked, seq_rtt); + tcp_ack_update_rtt(tp, acked, seq_rtt, seq_usrtt); tcp_ack_packets_out(sk, tp); + + if (tp->ca_ops->pkts_acked) + tp->ca_ops->pkts_acked(tp, pkts_acked); } #if FASTRETRANS_DEBUG > 0 @@ -2624,257 +2246,6 @@ static void tcp_process_frto(struct sock *sk, u32 prior_snd_una) tp->frto_counter = (tp->frto_counter + 1) % 3; } -/* - * TCP Westwood+ - */ - -/* - * @init_westwood - * This function initializes fields used in TCP Westwood+. We can't - * get no information about RTTmin at this time so we simply set it to - * TCP_WESTWOOD_INIT_RTT. This value was chosen to be too conservative - * since in this way we're sure it will be updated in a consistent - * way as soon as possible. It will reasonably happen within the first - * RTT period of the connection lifetime. - */ - -static void init_westwood(struct sock *sk) -{ - struct tcp_sock *tp = tcp_sk(sk); - - tp->westwood.bw_ns_est = 0; - tp->westwood.bw_est = 0; - tp->westwood.accounted = 0; - tp->westwood.cumul_ack = 0; - tp->westwood.rtt_win_sx = tcp_time_stamp; - tp->westwood.rtt = TCP_WESTWOOD_INIT_RTT; - tp->westwood.rtt_min = TCP_WESTWOOD_INIT_RTT; - tp->westwood.snd_una = tp->snd_una; -} - -/* - * @westwood_do_filter - * Low-pass filter. Implemented using constant coeffients. - */ - -static inline __u32 westwood_do_filter(__u32 a, __u32 b) -{ - return (((7 * a) + b) >> 3); -} - -static void westwood_filter(struct sock *sk, __u32 delta) -{ - struct tcp_sock *tp = tcp_sk(sk); - - tp->westwood.bw_ns_est = - westwood_do_filter(tp->westwood.bw_ns_est, - tp->westwood.bk / delta); - tp->westwood.bw_est = - westwood_do_filter(tp->westwood.bw_est, - tp->westwood.bw_ns_est); -} - -/* - * @westwood_update_rttmin - * It is used to update RTTmin. In this case we MUST NOT use - * WESTWOOD_RTT_MIN minimum bound since we could be on a LAN! - */ - -static inline __u32 westwood_update_rttmin(const struct sock *sk) -{ - const struct tcp_sock *tp = tcp_sk(sk); - __u32 rttmin = tp->westwood.rtt_min; - - if (tp->westwood.rtt != 0 && - (tp->westwood.rtt < tp->westwood.rtt_min || !rttmin)) - rttmin = tp->westwood.rtt; - - return rttmin; -} - -/* - * @westwood_acked - * Evaluate increases for dk. - */ - -static inline __u32 westwood_acked(const struct sock *sk) -{ - const struct tcp_sock *tp = tcp_sk(sk); - - return tp->snd_una - tp->westwood.snd_una; -} - -/* - * @westwood_new_window - * It evaluates if we are receiving data inside the same RTT window as - * when we started. - * Return value: - * It returns 0 if we are still evaluating samples in the same RTT - * window, 1 if the sample has to be considered in the next window. - */ - -static int westwood_new_window(const struct sock *sk) -{ - const struct tcp_sock *tp = tcp_sk(sk); - __u32 left_bound; - __u32 rtt; - int ret = 0; - - left_bound = tp->westwood.rtt_win_sx; - rtt = max(tp->westwood.rtt, (u32) TCP_WESTWOOD_RTT_MIN); - - /* - * A RTT-window has passed. Be careful since if RTT is less than - * 50ms we don't filter but we continue 'building the sample'. - * This minimum limit was choosen since an estimation on small - * time intervals is better to avoid... - * Obvioulsy on a LAN we reasonably will always have - * right_bound = left_bound + WESTWOOD_RTT_MIN - */ - - if ((left_bound + rtt) < tcp_time_stamp) - ret = 1; - - return ret; -} - -/* - * @westwood_update_window - * It updates RTT evaluation window if it is the right moment to do - * it. If so it calls filter for evaluating bandwidth. - */ - -static void __westwood_update_window(struct sock *sk, __u32 now) -{ - struct tcp_sock *tp = tcp_sk(sk); - __u32 delta = now - tp->westwood.rtt_win_sx; - - if (delta) { - if (tp->westwood.rtt) - westwood_filter(sk, delta); - - tp->westwood.bk = 0; - tp->westwood.rtt_win_sx = tcp_time_stamp; - } -} - - -static void westwood_update_window(struct sock *sk, __u32 now) -{ - if (westwood_new_window(sk)) - __westwood_update_window(sk, now); -} - -/* - * @__tcp_westwood_fast_bw - * It is called when we are in fast path. In particular it is called when - * header prediction is successfull. In such case infact update is - * straight forward and doesn't need any particular care. - */ - -static void __tcp_westwood_fast_bw(struct sock *sk, struct sk_buff *skb) -{ - struct tcp_sock *tp = tcp_sk(sk); - - westwood_update_window(sk, tcp_time_stamp); - - tp->westwood.bk += westwood_acked(sk); - tp->westwood.snd_una = tp->snd_una; - tp->westwood.rtt_min = westwood_update_rttmin(sk); -} - -static inline void tcp_westwood_fast_bw(struct sock *sk, struct sk_buff *skb) -{ - if (tcp_is_westwood(tcp_sk(sk))) - __tcp_westwood_fast_bw(sk, skb); -} - - -/* - * @westwood_dupack_update - * It updates accounted and cumul_ack when receiving a dupack. - */ - -static void westwood_dupack_update(struct sock *sk) -{ - struct tcp_sock *tp = tcp_sk(sk); - - tp->westwood.accounted += tp->mss_cache_std; - tp->westwood.cumul_ack = tp->mss_cache_std; -} - -static inline int westwood_may_change_cumul(struct tcp_sock *tp) -{ - return (tp->westwood.cumul_ack > tp->mss_cache_std); -} - -static inline void westwood_partial_update(struct tcp_sock *tp) -{ - tp->westwood.accounted -= tp->westwood.cumul_ack; - tp->westwood.cumul_ack = tp->mss_cache_std; -} - -static inline void westwood_complete_update(struct tcp_sock *tp) -{ - tp->westwood.cumul_ack -= tp->westwood.accounted; - tp->westwood.accounted = 0; -} - -/* - * @westwood_acked_count - * This function evaluates cumul_ack for evaluating dk in case of - * delayed or partial acks. - */ - -static inline __u32 westwood_acked_count(struct sock *sk) -{ - struct tcp_sock *tp = tcp_sk(sk); - - tp->westwood.cumul_ack = westwood_acked(sk); - - /* If cumul_ack is 0 this is a dupack since it's not moving - * tp->snd_una. - */ - if (!(tp->westwood.cumul_ack)) - westwood_dupack_update(sk); - - if (westwood_may_change_cumul(tp)) { - /* Partial or delayed ack */ - if (tp->westwood.accounted >= tp->westwood.cumul_ack) - westwood_partial_update(tp); - else - westwood_complete_update(tp); - } - - tp->westwood.snd_una = tp->snd_una; - - return tp->westwood.cumul_ack; -} - - -/* - * @__tcp_westwood_slow_bw - * It is called when something is going wrong..even if there could - * be no problems! Infact a simple delayed packet may trigger a - * dupack. But we need to be careful in such case. - */ - -static void __tcp_westwood_slow_bw(struct sock *sk, struct sk_buff *skb) -{ - struct tcp_sock *tp = tcp_sk(sk); - - westwood_update_window(sk, tcp_time_stamp); - - tp->westwood.bk += westwood_acked_count(sk); - tp->westwood.rtt_min = westwood_update_rttmin(sk); -} - -static inline void tcp_westwood_slow_bw(struct sock *sk, struct sk_buff *skb) -{ - if (tcp_is_westwood(tcp_sk(sk))) - __tcp_westwood_slow_bw(sk, skb); -} - /* This routine deals with incoming acks, but not outgoing ones. */ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) { @@ -2884,6 +2255,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) u32 ack = TCP_SKB_CB(skb)->ack_seq; u32 prior_in_flight; s32 seq_rtt; + s32 seq_usrtt = 0; int prior_packets; /* If the ack is newer than sent or older than previous acks @@ -2902,9 +2274,10 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) */ tcp_update_wl(tp, ack, ack_seq); tp->snd_una = ack; - tcp_westwood_fast_bw(sk, skb); flag |= FLAG_WIN_UPDATE; + tcp_ca_event(tp, CA_EVENT_FAST_ACK); + NET_INC_STATS_BH(LINUX_MIB_TCPHPACKS); } else { if (ack_seq != TCP_SKB_CB(skb)->end_seq) @@ -2920,7 +2293,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) if (TCP_ECN_rcv_ecn_echo(tp, skb->h.th)) flag |= FLAG_ECE; - tcp_westwood_slow_bw(sk,skb); + tcp_ca_event(tp, CA_EVENT_SLOW_ACK); } /* We passed data and got it acked, remove any soft error @@ -2935,22 +2308,20 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) prior_in_flight = tcp_packets_in_flight(tp); /* See if we can take anything off of the retransmit queue. */ - flag |= tcp_clean_rtx_queue(sk, &seq_rtt); + flag |= tcp_clean_rtx_queue(sk, &seq_rtt, + tp->ca_ops->rtt_sample ? &seq_usrtt : NULL); if (tp->frto_counter) tcp_process_frto(sk, prior_snd_una); if (tcp_ack_is_dubious(tp, flag)) { /* Advanve CWND, if state allows this. */ - if ((flag & FLAG_DATA_ACKED) && - (tcp_vegas_enabled(tp) || prior_in_flight >= tp->snd_cwnd) && - tcp_may_raise_cwnd(tp, flag)) - tcp_cong_avoid(tp, ack, seq_rtt); + if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(tp, flag)) + tcp_cong_avoid(tp, ack, seq_rtt, prior_in_flight, 0); tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag); } else { - if ((flag & FLAG_DATA_ACKED) && - (tcp_vegas_enabled(tp) || prior_in_flight >= tp->snd_cwnd)) - tcp_cong_avoid(tp, ack, seq_rtt); + if ((flag & FLAG_DATA_ACKED)) + tcp_cong_avoid(tp, ack, seq_rtt, prior_in_flight, 1); } if ((flag & FLAG_FORWARD_PROGRESS) || !(flag&FLAG_NOT_DUP)) @@ -4552,6 +3923,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, tcp_init_metrics(sk); + tcp_init_congestion_control(tp); + /* Prevent spurious tcp_cwnd_restart() on first data * packet. */ @@ -4708,9 +4081,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, if(tp->af_specific->conn_request(sk, skb) < 0) return 1; - init_westwood(sk); - init_bictcp(tp); - /* Now we have several options: In theory there is * nothing else in the frame. KA9Q has an option to * send data with the syn, BSD accepts data with the @@ -4732,9 +4102,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, goto discard; case TCP_SYN_SENT: - init_westwood(sk); - init_bictcp(tp); - queued = tcp_rcv_synsent_state_process(sk, skb, th, len); if (queued >= 0) return queued; @@ -4816,7 +4183,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, */ if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && !tp->srtt) - tcp_ack_saw_tstamp(tp, 0); + tcp_ack_saw_tstamp(tp, 0, 0); if (tp->rx_opt.tstamp_ok) tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; @@ -4828,6 +4195,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, tcp_init_metrics(sk); + tcp_init_congestion_control(tp); + /* Prevent spurious tcp_cwnd_restart() on * first data packet. */ diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 2d41d5d6ad1..9122814c13a 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2048,6 +2048,7 @@ static int tcp_v4_init_sock(struct sock *sk) tp->mss_cache_std = tp->mss_cache = 536; tp->reordering = sysctl_tcp_reordering; + tp->ca_ops = &tcp_reno; sk->sk_state = TCP_CLOSE; @@ -2070,6 +2071,8 @@ int tcp_v4_destroy_sock(struct sock *sk) tcp_clear_xmit_timers(sk); + tcp_cleanup_congestion_control(tp); + /* Cleanup up the write buffer. */ sk_stream_writequeue_purge(sk); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index b3943e7562f..f42a284164b 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -774,6 +774,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, newtp->frto_counter = 0; newtp->frto_highmark = 0; + newtp->ca_ops = &tcp_reno; + tcp_set_ca_state(newtp, TCP_CA_Open); tcp_init_xmit_timers(newsk); skb_queue_head_init(&newtp->out_of_order_queue); @@ -842,8 +844,6 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, if (newtp->ecn_flags&TCP_ECN_OK) sock_set_flag(newsk, SOCK_NO_LARGESEND); - tcp_ca_init(newtp); - TCP_INC_STATS_BH(TCP_MIB_PASSIVEOPENS); } return newsk; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index f17c6577e33..0e17c244875 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -111,8 +111,7 @@ static void tcp_cwnd_restart(struct tcp_sock *tp, struct dst_entry *dst) u32 restart_cwnd = tcp_init_cwnd(tp, dst); u32 cwnd = tp->snd_cwnd; - if (tcp_is_vegas(tp)) - tcp_vegas_enable(tp); + tcp_ca_event(tp, CA_EVENT_CWND_RESTART); tp->snd_ssthresh = tcp_current_ssthresh(tp); restart_cwnd = min(restart_cwnd, cwnd); @@ -280,6 +279,10 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) #define SYSCTL_FLAG_WSCALE 0x2 #define SYSCTL_FLAG_SACK 0x4 + /* If congestion control is doing timestamping */ + if (tp->ca_ops->rtt_sample) + do_gettimeofday(&skb->stamp); + sysctl_flags = 0; if (tcb->flags & TCPCB_FLAG_SYN) { tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS; @@ -304,17 +307,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK)); } - /* - * If the connection is idle and we are restarting, - * then we don't want to do any Vegas calculations - * until we get fresh RTT samples. So when we - * restart, we reset our Vegas state to a clean - * slate. After we get acks for this flight of - * packets, _then_ we can make Vegas calculations - * again. - */ - if (tcp_is_vegas(tp) && tcp_packets_in_flight(tp) == 0) - tcp_vegas_enable(tp); + if (tcp_packets_in_flight(tp) == 0) + tcp_ca_event(tp, CA_EVENT_TX_START); th = (struct tcphdr *) skb_push(skb, tcp_header_size); skb->h.th = th; @@ -521,6 +515,7 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len) * skbs, which it never sent before. --ANK */ TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when; + buff->stamp = skb->stamp; if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) { tp->lost_out -= tcp_skb_pcount(skb); @@ -1449,7 +1444,6 @@ static inline void tcp_connect_init(struct sock *sk) tp->window_clamp = dst_metric(dst, RTAX_WINDOW); tp->advmss = dst_metric(dst, RTAX_ADVMSS); tcp_initialize_rcv_mss(sk); - tcp_ca_init(tp); tcp_select_initial_window(tcp_full_space(sk), tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0), @@ -1503,7 +1497,6 @@ int tcp_connect(struct sock *sk) TCP_SKB_CB(buff)->end_seq = tp->write_seq; tp->snd_nxt = tp->write_seq; tp->pushed_seq = tp->write_seq; - tcp_ca_init(tp); /* Send it off. */ TCP_SKB_CB(buff)->when = tcp_time_stamp; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 2414937f2a8..fce56039b0e 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -2025,7 +2025,7 @@ static int tcp_v6_init_sock(struct sock *sk) sk->sk_state = TCP_CLOSE; tp->af_specific = &ipv6_specific; - + tp->ca_ops = &tcp_reno; sk->sk_write_space = sk_stream_write_space; sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); -- cgit v1.2.3 From 7c99c909fa69a183c1b80bd64fb9f0d11459aff3 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 23 Jun 2005 12:20:36 -0700 Subject: [TCP]: Change tcp_diag to use the existing __RTA_PUT() macro. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/tcp_diag.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index 867acc0f79d..a4e512036d8 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -43,13 +43,7 @@ struct tcpdiag_entry static struct sock *tcpnl; #define TCPDIAG_PUT(skb, attrtype, attrlen) \ -({ int rtalen = RTA_LENGTH(attrlen); \ - struct rtattr *rta; \ - if (skb_tailroom(skb) < RTA_ALIGN(rtalen)) goto nlmsg_failure; \ - rta = (void*)__skb_put(skb, RTA_ALIGN(rtalen)); \ - rta->rta_type = attrtype; \ - rta->rta_len = rtalen; \ - RTA_DATA(rta); }) + RTA_DATA(__RTA_PUT(skb, attrtype, attrlen)) static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, int ext, u32 pid, u32 seq, u16 nlmsg_flags) @@ -167,6 +161,7 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, nlh->nlmsg_len = skb->tail - b; return skb->len; +rtattr_failure: nlmsg_failure: skb_trim(skb, b - skb->data); return -1; -- cgit v1.2.3 From 056ede6cface66b400cd3b8e60ed077cc5b85c18 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 23 Jun 2005 12:21:28 -0700 Subject: [TCP]: Report congestion control algorithm in tcp_diag. Enhancement to the tcp_diag interface used by the iproute2 ss command to report the tcp congestion control being used by a socket. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/linux/tcp_diag.h | 4 ++-- net/ipv4/tcp_diag.c | 5 +++++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/include/linux/tcp_diag.h b/include/linux/tcp_diag.h index ceee962e1d1..7a599674394 100644 --- a/include/linux/tcp_diag.h +++ b/include/linux/tcp_diag.h @@ -99,9 +99,10 @@ enum TCPDIAG_MEMINFO, TCPDIAG_INFO, TCPDIAG_VEGASINFO, + TCPDIAG_CONG, }; -#define TCPDIAG_MAX TCPDIAG_VEGASINFO +#define TCPDIAG_MAX TCPDIAG_CONG /* TCPDIAG_MEM */ @@ -123,5 +124,4 @@ struct tcpvegas_info { __u32 tcpv_minrtt; }; - #endif /* _TCP_DIAG_H_ */ diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index a4e512036d8..f66945cb158 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -65,6 +65,11 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, if (ext & (1<<(TCPDIAG_INFO-1))) info = TCPDIAG_PUT(skb, TCPDIAG_INFO, sizeof(*info)); + if (ext & (1<<(TCPDIAG_CONG-1))) { + size_t len = strlen(tp->ca_ops->name); + strcpy(TCPDIAG_PUT(skb, TCPDIAG_CONG, len+1), + tp->ca_ops->name); + } } r->tcpdiag_family = sk->sk_family; r->tcpdiag_state = sk->sk_state; -- cgit v1.2.3 From 9d7bcfc6b8586ee5a52043f061e0411965e71b88 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 23 Jun 2005 12:22:36 -0700 Subject: [TCP]: Update sysctl and congestion control documentation. Update the documentation to remove the old sysctl values and include the new congestion control infrastructure. Includes changes to tcp.txt by Ian McDonald. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 56 +++------------------------ Documentation/networking/tcp.txt | 69 +++++++++++++++++++++++++++++++++- 2 files changed, 73 insertions(+), 52 deletions(-) diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index a2c893a7475..ab65714d95f 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -304,57 +304,6 @@ tcp_low_latency - BOOLEAN changed would be a Beowulf compute cluster. Default: 0 -tcp_westwood - BOOLEAN - Enable TCP Westwood+ congestion control algorithm. - TCP Westwood+ is a sender-side only modification of the TCP Reno - protocol stack that optimizes the performance of TCP congestion - control. It is based on end-to-end bandwidth estimation to set - congestion window and slow start threshold after a congestion - episode. Using this estimation, TCP Westwood+ adaptively sets a - slow start threshold and a congestion window which takes into - account the bandwidth used at the time congestion is experienced. - TCP Westwood+ significantly increases fairness wrt TCP Reno in - wired networks and throughput over wireless links. - Default: 0 - -tcp_vegas_cong_avoid - BOOLEAN - Enable TCP Vegas congestion avoidance algorithm. - TCP Vegas is a sender-side only change to TCP that anticipates - the onset of congestion by estimating the bandwidth. TCP Vegas - adjusts the sending rate by modifying the congestion - window. TCP Vegas should provide less packet loss, but it is - not as aggressive as TCP Reno. - Default:0 - -tcp_bic - BOOLEAN - Enable BIC TCP congestion control algorithm. - BIC-TCP is a sender-side only change that ensures a linear RTT - fairness under large windows while offering both scalability and - bounded TCP-friendliness. The protocol combines two schemes - called additive increase and binary search increase. When the - congestion window is large, additive increase with a large - increment ensures linear RTT fairness as well as good - scalability. Under small congestion windows, binary search - increase provides TCP friendliness. - Default: 0 - -tcp_bic_low_window - INTEGER - Sets the threshold window (in packets) where BIC TCP starts to - adjust the congestion window. Below this threshold BIC TCP behaves - the same as the default TCP Reno. - Default: 14 - -tcp_bic_fast_convergence - BOOLEAN - Forces BIC TCP to more quickly respond to changes in congestion - window. Allows two flows sharing the same connection to converge - more rapidly. - Default: 1 - -tcp_default_win_scale - INTEGER - Sets the minimum window scale TCP will negotiate for on all - conections. - Default: 7 - tcp_tso_win_divisor - INTEGER This allows control over what percentage of the congestion window can be consumed by a single TSO frame. @@ -368,6 +317,11 @@ tcp_frto - BOOLEAN where packet loss is typically due to random radio interference rather than intermediate router congestion. +tcp_congestion_control - STRING + Set the congestion control algorithm to be used for new + connections. The algorithm "reno" is always available, but + additional choices may be available based on kernel configuration. + somaxconn - INTEGER Limit of socket listen() backlog, known in userspace as SOMAXCONN. Defaults to 128. See also tcp_max_syn_backlog for additional tuning diff --git a/Documentation/networking/tcp.txt b/Documentation/networking/tcp.txt index 71749007091..0fa30042557 100644 --- a/Documentation/networking/tcp.txt +++ b/Documentation/networking/tcp.txt @@ -1,5 +1,72 @@ -How the new TCP output machine [nyi] works. +TCP protocol +============ + +Last updated: 21 June 2005 + +Contents +======== + +- Congestion control +- How the new TCP output machine [nyi] works + +Congestion control +================== + +The following variables are used in the tcp_sock for congestion control: +snd_cwnd The size of the congestion window +snd_ssthresh Slow start threshold. We are in slow start if + snd_cwnd is less than this. +snd_cwnd_cnt A counter used to slow down the rate of increase + once we exceed slow start threshold. +snd_cwnd_clamp This is the maximum size that snd_cwnd can grow to. +snd_cwnd_stamp Timestamp for when congestion window last validated. +snd_cwnd_used Used as a highwater mark for how much of the + congestion window is in use. It is used to adjust + snd_cwnd down when the link is limited by the + application rather than the network. + +As of 2.6.13, Linux supports pluggable congestion control algorithms. +A congestion control mechanism can be registered through functions in +tcp_cong.c. The functions used by the congestion control mechanism are +registered via passing a tcp_congestion_ops struct to +tcp_register_congestion_control. As a minimum name, ssthresh, +cong_avoid, min_cwnd must be valid. +Private data for a congestion control mechanism is stored in tp->ca_priv. +tcp_ca(tp) returns a pointer to this space. This is preallocated space - it +is important to check the size of your private data will fit this space, or +alternatively space could be allocated elsewhere and a pointer to it could +be stored here. + +There are three kinds of congestion control algorithms currently: The +simplest ones are derived from TCP reno (highspeed, scalable) and just +provide an alternative the congestion window calculation. More complex +ones like BIC try to look at other events to provide better +heuristics. There are also round trip time based algorithms like +Vegas and Westwood+. + +Good TCP congestion control is a complex problem because the algorithm +needs to maintain fairness and performance. Please review current +research and RFC's before developing new modules. + +The method that is used to determine which congestion control mechanism is +determined by the setting of the sysctl net.ipv4.tcp_congestion_control. +The default congestion control will be the last one registered (LIFO); +so if you built everything as modules. the default will be reno. If you +build with the default's from Kconfig, then BIC will be builtin (not a module) +and it will end up the default. + +If you really want a particular default value then you will need +to set it with the sysctl. If you use a sysctl, the module will be autoloaded +if needed and you will get the expected protocol. If you ask for an +unknown congestion method, then the sysctl attempt will fail. + +If you remove a tcp congestion control module, then you will get the next +available one. Since reno can not be built as a module, and can not be +deleted, it will always be available. + +How the new TCP output machine [nyi] works. +=========================================== Data is kept on a single queue. The skb->users flag tells us if the frame is one that has been queued already. To add a frame we throw it on the end. Ack -- cgit v1.2.3 From 83803034f4233d810c4adc52008921da060c55d1 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 23 Jun 2005 12:23:25 -0700 Subject: [TCP]: Add TCP BIC congestion control module. TCP BIC congestion control reworked to use the new congestion control infrastructure. This version is more up to date than the BIC code in 2.6.12; it incorporates enhancements from BICTCP 1.1, to handle low latency links. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/Kconfig | 21 ++++ net/ipv4/Makefile | 1 + net/ipv4/tcp_bic.c | 331 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 353 insertions(+) create mode 100644 net/ipv4/tcp_bic.c diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 567b03b1c34..712ebacacb6 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -433,5 +433,26 @@ config IP_TCPDIAG config IP_TCPDIAG_IPV6 def_bool (IP_TCPDIAG=y && IPV6=y) || (IP_TCPDIAG=m && IPV6) +# TCP Reno is builtin (required as fallback) +menu "TCP congestion control" + depends on INET + +config TCP_CONG_BIC + tristate "Binary Increase Congestion (BIC) control" + depends on INET + default y + ---help--- + BIC-TCP is a sender-side only change that ensures a linear RTT + fairness under large windows while offering both scalability and + bounded TCP-friendliness. The protocol combines two schemes + called additive increase and binary search increase. When the + congestion window is large, additive increase with a large + increment ensures linear RTT fairness as well as good + scalability. Under small congestion windows, binary search + increase provides TCP friendliness. + See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/ + +endmenu + source "net/ipv4/ipvs/Kconfig" diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 89c0b4cb470..1d1cac5ac06 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -31,6 +31,7 @@ obj-$(CONFIG_NETFILTER) += netfilter/ obj-$(CONFIG_IP_VS) += ipvs/ obj-$(CONFIG_IP_TCPDIAG) += tcp_diag.o obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o +obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ xfrm4_output.o diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c new file mode 100644 index 00000000000..ec38d45d664 --- /dev/null +++ b/net/ipv4/tcp_bic.c @@ -0,0 +1,331 @@ +/* + * Binary Increase Congestion control for TCP + * + * This is from the implementation of BICTCP in + * Lison-Xu, Kahaled Harfoush, and Injong Rhee. + * "Binary Increase Congestion Control for Fast, Long Distance + * Networks" in InfoComm 2004 + * Available from: + * http://www.csc.ncsu.edu/faculty/rhee/export/bitcp.pdf + * + * Unless BIC is enabled and congestion window is large + * this behaves the same as the original Reno. + */ + +#include +#include +#include +#include + + +#define BICTCP_BETA_SCALE 1024 /* Scale factor beta calculation + * max_cwnd = snd_cwnd * beta + */ +#define BICTCP_B 4 /* + * In binary search, + * go to point (max+min)/N + */ + +static int fast_convergence = 1; +static int max_increment = 32; +static int low_window = 14; +static int beta = 819; /* = 819/1024 (BICTCP_BETA_SCALE) */ +static int low_utilization_threshold = 153; +static int low_utilization_period = 2; +static int initial_ssthresh = 100; +static int smooth_part = 20; + +module_param(fast_convergence, int, 0644); +MODULE_PARM_DESC(fast_convergence, "turn on/off fast convergence"); +module_param(max_increment, int, 0644); +MODULE_PARM_DESC(max_increment, "Limit on increment allowed during binary search"); +module_param(low_window, int, 0644); +MODULE_PARM_DESC(low_window, "lower bound on congestion window (for TCP friendliness)"); +module_param(beta, int, 0644); +MODULE_PARM_DESC(beta, "beta for multiplicative increase"); +module_param(low_utilization_threshold, int, 0644); +MODULE_PARM_DESC(low_utilization_threshold, "percent (scaled by 1024) for low utilization mode"); +module_param(low_utilization_period, int, 0644); +MODULE_PARM_DESC(low_utilization_period, "if average delay exceeds then goto to low utilization mode (seconds)"); +module_param(initial_ssthresh, int, 0644); +MODULE_PARM_DESC(initial_ssthresh, "initial value of slow start threshold"); +module_param(smooth_part, int, 0644); +MODULE_PARM_DESC(smooth_part, "log(B/(B*Smin))/log(B/(B-1))+B, # of RTT from Wmax-B to Wmax"); + + +/* BIC TCP Parameters */ +struct bictcp { + u32 cnt; /* increase cwnd by 1 after ACKs */ + u32 last_max_cwnd; /* last maximum snd_cwnd */ + u32 loss_cwnd; /* congestion window at last loss */ + u32 last_cwnd; /* the last snd_cwnd */ + u32 last_time; /* time when updated last_cwnd */ + u32 delay_min; /* min delay */ + u32 delay_max; /* max delay */ + u32 last_delay; + u8 low_utilization;/* 0: high; 1: low */ + u32 low_utilization_start; /* starting time of low utilization detection*/ + u32 epoch_start; /* beginning of an epoch */ +#define ACK_RATIO_SHIFT 4 + u32 delayed_ack; /* estimate the ratio of Packets/ACKs << 4 */ +}; + +static inline void bictcp_reset(struct bictcp *ca) +{ + ca->cnt = 0; + ca->last_max_cwnd = 0; + ca->loss_cwnd = 0; + ca->last_cwnd = 0; + ca->last_time = 0; + ca->delay_min = 0; + ca->delay_max = 0; + ca->last_delay = 0; + ca->low_utilization = 0; + ca->low_utilization_start = 0; + ca->epoch_start = 0; + ca->delayed_ack = 2 << ACK_RATIO_SHIFT; +} + +static void bictcp_init(struct tcp_sock *tp) +{ + bictcp_reset(tcp_ca(tp)); + if (initial_ssthresh) + tp->snd_ssthresh = initial_ssthresh; +} + +/* + * Compute congestion window to use. + */ +static inline void bictcp_update(struct bictcp *ca, u32 cwnd) +{ + if (ca->last_cwnd == cwnd && + (s32)(tcp_time_stamp - ca->last_time) <= HZ / 32) + return; + + ca->last_cwnd = cwnd; + ca->last_time = tcp_time_stamp; + + if (ca->epoch_start == 0) /* record the beginning of an epoch */ + ca->epoch_start = tcp_time_stamp; + + /* start off normal */ + if (cwnd <= low_window) { + ca->cnt = cwnd; + return; + } + + /* binary increase */ + if (cwnd < ca->last_max_cwnd) { + __u32 dist = (ca->last_max_cwnd - cwnd) + / BICTCP_B; + + if (dist > max_increment) + /* linear increase */ + ca->cnt = cwnd / max_increment; + else if (dist <= 1U) + /* binary search increase */ + ca->cnt = (cwnd * smooth_part) / BICTCP_B; + else + /* binary search increase */ + ca->cnt = cwnd / dist; + } else { + /* slow start AMD linear increase */ + if (cwnd < ca->last_max_cwnd + BICTCP_B) + /* slow start */ + ca->cnt = (cwnd * smooth_part) / BICTCP_B; + else if (cwnd < ca->last_max_cwnd + max_increment*(BICTCP_B-1)) + /* slow start */ + ca->cnt = (cwnd * (BICTCP_B-1)) + / cwnd-ca->last_max_cwnd; + else + /* linear increase */ + ca->cnt = cwnd / max_increment; + } + + /* if in slow start or link utilization is very low */ + if ( ca->loss_cwnd == 0 || + (cwnd > ca->loss_cwnd && ca->low_utilization)) { + if (ca->cnt > 20) /* increase cwnd 5% per RTT */ + ca->cnt = 20; + } + + ca->cnt = (ca->cnt << ACK_RATIO_SHIFT) / ca->delayed_ack; + if (ca->cnt == 0) /* cannot be zero */ + ca->cnt = 1; +} + + +/* Detect low utilization in congestion avoidance */ +static inline void bictcp_low_utilization(struct tcp_sock *tp, int flag) +{ + struct bictcp *ca = tcp_ca(tp); + u32 dist, delay; + + /* No time stamp */ + if (!(tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) || + /* Discard delay samples right after fast recovery */ + tcp_time_stamp < ca->epoch_start + HZ || + /* this delay samples may not be accurate */ + flag == 0) { + ca->last_delay = 0; + goto notlow; + } + + delay = ca->last_delay<<3; /* use the same scale as tp->srtt*/ + ca->last_delay = tcp_time_stamp - tp->rx_opt.rcv_tsecr; + if (delay == 0) /* no previous delay sample */ + goto notlow; + + /* first time call or link delay decreases */ + if (ca->delay_min == 0 || ca->delay_min > delay) { + ca->delay_min = ca->delay_max = delay; + goto notlow; + } + + if (ca->delay_max < delay) + ca->delay_max = delay; + + /* utilization is low, if avg delay < dist*threshold + for checking_period time */ + dist = ca->delay_max - ca->delay_min; + if (dist <= ca->delay_min>>6 || + tp->srtt - ca->delay_min >= (dist*low_utilization_threshold)>>10) + goto notlow; + + if (ca->low_utilization_start == 0) { + ca->low_utilization = 0; + ca->low_utilization_start = tcp_time_stamp; + } else if ((s32)(tcp_time_stamp - ca->low_utilization_start) + > low_utilization_period*HZ) { + ca->low_utilization = 1; + } + + return; + + notlow: + ca->low_utilization = 0; + ca->low_utilization_start = 0; + +} + +static void bictcp_cong_avoid(struct tcp_sock *tp, u32 ack, + u32 seq_rtt, u32 in_flight, int data_acked) +{ + struct bictcp *ca = tcp_ca(tp); + + bictcp_low_utilization(tp, data_acked); + + if (in_flight < tp->snd_cwnd) + return; + + if (tp->snd_cwnd <= tp->snd_ssthresh) { + /* In "safe" area, increase. */ + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + } else { + bictcp_update(ca, tp->snd_cwnd); + + /* In dangerous area, increase slowly. + * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd + */ + if (tp->snd_cwnd_cnt >= ca->cnt) { + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + tp->snd_cwnd_cnt = 0; + } else + tp->snd_cwnd_cnt++; + } + +} + +/* + * behave like Reno until low_window is reached, + * then increase congestion window slowly + */ +static u32 bictcp_recalc_ssthresh(struct tcp_sock *tp) +{ + struct bictcp *ca = tcp_ca(tp); + + ca->epoch_start = 0; /* end of epoch */ + + /* in case of wrong delay_max*/ + if (ca->delay_min > 0 && ca->delay_max > ca->delay_min) + ca->delay_max = ca->delay_min + + ((ca->delay_max - ca->delay_min)* 90) / 100; + + /* Wmax and fast convergence */ + if (tp->snd_cwnd < ca->last_max_cwnd && fast_convergence) + ca->last_max_cwnd = (tp->snd_cwnd * (BICTCP_BETA_SCALE + beta)) + / (2 * BICTCP_BETA_SCALE); + else + ca->last_max_cwnd = tp->snd_cwnd; + + ca->loss_cwnd = tp->snd_cwnd; + + + if (tp->snd_cwnd <= low_window) + return max(tp->snd_cwnd >> 1U, 2U); + else + return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U); +} + +static u32 bictcp_undo_cwnd(struct tcp_sock *tp) +{ + struct bictcp *ca = tcp_ca(tp); + + return max(tp->snd_cwnd, ca->last_max_cwnd); +} + +static u32 bictcp_min_cwnd(struct tcp_sock *tp) +{ + return tp->snd_ssthresh; +} + +static void bictcp_state(struct tcp_sock *tp, u8 new_state) +{ + if (new_state == TCP_CA_Loss) + bictcp_reset(tcp_ca(tp)); +} + +/* Track delayed acknowledgement ratio using sliding window + * ratio = (15*ratio + sample) / 16 + */ +static void bictcp_acked(struct tcp_sock *tp, u32 cnt) +{ + if (cnt > 0 && tp->ca_state == TCP_CA_Open) { + struct bictcp *ca = tcp_ca(tp); + cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT; + ca->delayed_ack += cnt; + } +} + + +static struct tcp_congestion_ops bictcp = { + .init = bictcp_init, + .ssthresh = bictcp_recalc_ssthresh, + .cong_avoid = bictcp_cong_avoid, + .set_state = bictcp_state, + .undo_cwnd = bictcp_undo_cwnd, + .min_cwnd = bictcp_min_cwnd, + .pkts_acked = bictcp_acked, + .owner = THIS_MODULE, + .name = "bic", +}; + +static int __init bictcp_register(void) +{ + BUG_ON(sizeof(struct bictcp) > TCP_CA_PRIV_SIZE); + return tcp_register_congestion_control(&bictcp); +} + +static void __exit bictcp_unregister(void) +{ + tcp_unregister_congestion_control(&bictcp); +} + +module_init(bictcp_register); +module_exit(bictcp_unregister); + +MODULE_AUTHOR("Stephen Hemminger"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("BIC TCP"); -- cgit v1.2.3 From 8727076289ec55298a05cabddf02b374d13c1624 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 23 Jun 2005 12:24:09 -0700 Subject: [TCP]: Add TCP Westwood congestion control module. This is the existing 2.6.12 Westwood code moved from tcp_input to the new congestion framework. A lot of the inline functions have been eliminated to try and make it clearer. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/Kconfig | 15 +++ net/ipv4/Makefile | 1 + net/ipv4/tcp_westwood.c | 259 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 275 insertions(+) create mode 100644 net/ipv4/tcp_westwood.c diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 712ebacacb6..adbe855d931 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -452,6 +452,21 @@ config TCP_CONG_BIC increase provides TCP friendliness. See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/ +config TCP_CONG_WESTWOOD + tristate "TCP Westwood+" + depends on INET + default m + ---help--- + TCP Westwood+ is a sender-side only modification of the TCP Reno + protocol stack that optimizes the performance of TCP congestion + control. It is based on end-to-end bandwidth estimation to set + congestion window and slow start threshold after a congestion + episode. Using this estimation, TCP Westwood+ adaptively sets a + slow start threshold and a congestion window which takes into + account the bandwidth used at the time congestion is experienced. + TCP Westwood+ significantly increases fairness wrt TCP Reno in + wired networks and throughput over wireless links. + endmenu source "net/ipv4/ipvs/Kconfig" diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 1d1cac5ac06..dedfbe62a10 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -32,6 +32,7 @@ obj-$(CONFIG_IP_VS) += ipvs/ obj-$(CONFIG_IP_TCPDIAG) += tcp_diag.o obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o +obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ xfrm4_output.o diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c new file mode 100644 index 00000000000..ef827242c94 --- /dev/null +++ b/net/ipv4/tcp_westwood.c @@ -0,0 +1,259 @@ +/* + * TCP Westwood+ + * + * Angelo Dell'Aera: TCP Westwood+ support + */ + +#include +#include +#include +#include +#include +#include + +/* TCP Westwood structure */ +struct westwood { + u32 bw_ns_est; /* first bandwidth estimation..not too smoothed 8) */ + u32 bw_est; /* bandwidth estimate */ + u32 rtt_win_sx; /* here starts a new evaluation... */ + u32 bk; + u32 snd_una; /* used for evaluating the number of acked bytes */ + u32 cumul_ack; + u32 accounted; + u32 rtt; + u32 rtt_min; /* minimum observed RTT */ +}; + + +/* TCP Westwood functions and constants */ +#define TCP_WESTWOOD_RTT_MIN (HZ/20) /* 50ms */ +#define TCP_WESTWOOD_INIT_RTT (20*HZ) /* maybe too conservative?! */ + +/* + * @tcp_westwood_create + * This function initializes fields used in TCP Westwood+, + * it is called after the initial SYN, so the sequence numbers + * are correct but new passive connections we have no + * information about RTTmin at this time so we simply set it to + * TCP_WESTWOOD_INIT_RTT. This value was chosen to be too conservative + * since in this way we're sure it will be updated in a consistent + * way as soon as possible. It will reasonably happen within the first + * RTT period of the connection lifetime. + */ +static void tcp_westwood_init(struct tcp_sock *tp) +{ + struct westwood *w = tcp_ca(tp); + + w->bk = 0; + w->bw_ns_est = 0; + w->bw_est = 0; + w->accounted = 0; + w->cumul_ack = 0; + w->rtt_min = w->rtt = TCP_WESTWOOD_INIT_RTT; + w->rtt_win_sx = tcp_time_stamp; + w->snd_una = tp->snd_una; +} + +/* + * @westwood_do_filter + * Low-pass filter. Implemented using constant coefficients. + */ +static inline u32 westwood_do_filter(u32 a, u32 b) +{ + return (((7 * a) + b) >> 3); +} + +static inline void westwood_filter(struct westwood *w, u32 delta) +{ + w->bw_ns_est = westwood_do_filter(w->bw_ns_est, w->bk / delta); + w->bw_est = westwood_do_filter(w->bw_est, w->bw_ns_est); +} + +/* + * @westwood_pkts_acked + * Called after processing group of packets. + * but all westwood needs is the last sample of srtt. + */ +static void tcp_westwood_pkts_acked(struct tcp_sock *tp, u32 cnt) +{ + struct westwood *w = tcp_ca(tp); + if (cnt > 0) + w->rtt = tp->srtt >> 3; +} + +/* + * @westwood_update_window + * It updates RTT evaluation window if it is the right moment to do + * it. If so it calls filter for evaluating bandwidth. + */ +static void westwood_update_window(struct tcp_sock *tp) +{ + struct westwood *w = tcp_ca(tp); + s32 delta = tcp_time_stamp - w->rtt_win_sx; + + /* + * See if a RTT-window has passed. + * Be careful since if RTT is less than + * 50ms we don't filter but we continue 'building the sample'. + * This minimum limit was chosen since an estimation on small + * time intervals is better to avoid... + * Obviously on a LAN we reasonably will always have + * right_bound = left_bound + WESTWOOD_RTT_MIN + */ + if (w->rtt && delta > max_t(u32, w->rtt, TCP_WESTWOOD_RTT_MIN)) { + westwood_filter(w, delta); + + w->bk = 0; + w->rtt_win_sx = tcp_time_stamp; + } +} + +/* + * @westwood_fast_bw + * It is called when we are in fast path. In particular it is called when + * header prediction is successful. In such case in fact update is + * straight forward and doesn't need any particular care. + */ +static inline void westwood_fast_bw(struct tcp_sock *tp) +{ + struct westwood *w = tcp_ca(tp); + + westwood_update_window(tp); + + w->bk += tp->snd_una - w->snd_una; + w->snd_una = tp->snd_una; + w->rtt_min = min(w->rtt, w->rtt_min); +} + +/* + * @westwood_acked_count + * This function evaluates cumul_ack for evaluating bk in case of + * delayed or partial acks. + */ +static inline u32 westwood_acked_count(struct tcp_sock *tp) +{ + struct westwood *w = tcp_ca(tp); + + w->cumul_ack = tp->snd_una - w->snd_una; + + /* If cumul_ack is 0 this is a dupack since it's not moving + * tp->snd_una. + */ + if (!w->cumul_ack) { + w->accounted += tp->mss_cache; + w->cumul_ack = tp->mss_cache; + } + + if (w->cumul_ack > tp->mss_cache) { + /* Partial or delayed ack */ + if (w->accounted >= w->cumul_ack) { + w->accounted -= w->cumul_ack; + w->cumul_ack = tp->mss_cache; + } else { + w->cumul_ack -= w->accounted; + w->accounted = 0; + } + } + + w->snd_una = tp->snd_una; + + return w->cumul_ack; +} + +static inline u32 westwood_bw_rttmin(const struct tcp_sock *tp) +{ + struct westwood *w = tcp_ca(tp); + return max_t(u32, (w->bw_est * w->rtt_min) / tp->mss_cache, 2); +} + +/* + * TCP Westwood + * Here limit is evaluated as Bw estimation*RTTmin (for obtaining it + * in packets we use mss_cache). Rttmin is guaranteed to be >= 2 + * so avoids ever returning 0. + */ +static u32 tcp_westwood_cwnd_min(struct tcp_sock *tp) +{ + return westwood_bw_rttmin(tp); +} + +static void tcp_westwood_event(struct tcp_sock *tp, enum tcp_ca_event event) +{ + struct westwood *w = tcp_ca(tp); + + switch(event) { + case CA_EVENT_FAST_ACK: + westwood_fast_bw(tp); + break; + + case CA_EVENT_COMPLETE_CWR: + tp->snd_cwnd = tp->snd_ssthresh = westwood_bw_rttmin(tp); + break; + + case CA_EVENT_FRTO: + tp->snd_ssthresh = westwood_bw_rttmin(tp); + break; + + case CA_EVENT_SLOW_ACK: + westwood_update_window(tp); + w->bk += westwood_acked_count(tp); + w->rtt_min = min(w->rtt, w->rtt_min); + break; + + default: + /* don't care */ + break; + } +} + + +/* Extract info for Tcp socket info provided via netlink. */ +static void tcp_westwood_info(struct tcp_sock *tp, u32 ext, + struct sk_buff *skb) +{ + const struct westwood *ca = tcp_ca(tp); + if (ext & (1<<(TCPDIAG_VEGASINFO-1))) { + struct rtattr *rta; + struct tcpvegas_info *info; + + rta = __RTA_PUT(skb, TCPDIAG_VEGASINFO, sizeof(*info)); + info = RTA_DATA(rta); + info->tcpv_enabled = 1; + info->tcpv_rttcnt = 0; + info->tcpv_rtt = jiffies_to_usecs(ca->rtt); + info->tcpv_minrtt = jiffies_to_usecs(ca->rtt_min); + rtattr_failure: ; + } +} + + +static struct tcp_congestion_ops tcp_westwood = { + .init = tcp_westwood_init, + .ssthresh = tcp_reno_ssthresh, + .cong_avoid = tcp_reno_cong_avoid, + .min_cwnd = tcp_westwood_cwnd_min, + .cwnd_event = tcp_westwood_event, + .get_info = tcp_westwood_info, + .pkts_acked = tcp_westwood_pkts_acked, + + .owner = THIS_MODULE, + .name = "westwood" +}; + +static int __init tcp_westwood_register(void) +{ + BUG_ON(sizeof(struct westwood) > TCP_CA_PRIV_SIZE); + return tcp_register_congestion_control(&tcp_westwood); +} + +static void __exit tcp_westwood_unregister(void) +{ + tcp_unregister_congestion_control(&tcp_westwood); +} + +module_init(tcp_westwood_register); +module_exit(tcp_westwood_unregister); + +MODULE_AUTHOR("Stephen Hemminger, Angelo Dell'Aera"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("TCP Westwood+"); -- cgit v1.2.3 From a628d29b56d3f420bf3ff1d7543a9caf3ce3b994 Mon Sep 17 00:00:00 2001 From: John Heffner Date: Thu, 23 Jun 2005 12:24:58 -0700 Subject: [TCP]: Add High Speed TCP congestion control module. Sally Floyd's high speed TCP congestion control. This is useful for comparison and research. Signed-off-by: John Heffner Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/Kconfig | 11 +++ net/ipv4/Makefile | 1 + net/ipv4/tcp_highspeed.c | 181 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 193 insertions(+) create mode 100644 net/ipv4/tcp_highspeed.c diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index adbe855d931..910e25b6575 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -467,6 +467,17 @@ config TCP_CONG_WESTWOOD TCP Westwood+ significantly increases fairness wrt TCP Reno in wired networks and throughput over wireless links. +config TCP_CONG_HSTCP + tristate "High Speed TCP" + depends on INET && EXPERIMENTAL + default n + ---help--- + Sally Floyd's High Speed TCP (RFC 3649) congestion control. + A modification to TCP's congestion control mechanism for use + with large congestion windows. A table indicates how much to + increase the congestion window by when an ACK is received. + For more detail see http://www.icir.org/floyd/hstcp.html + endmenu source "net/ipv4/ipvs/Kconfig" diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index dedfbe62a10..ddf5d7785bf 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -33,6 +33,7 @@ obj-$(CONFIG_IP_TCPDIAG) += tcp_diag.o obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o +obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ xfrm4_output.o diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c new file mode 100644 index 00000000000..36c51f8136b --- /dev/null +++ b/net/ipv4/tcp_highspeed.c @@ -0,0 +1,181 @@ +/* + * Sally Floyd's High Speed TCP (RFC 3649) congestion control + * + * See http://www.icir.org/floyd/hstcp.html + * + * John Heffner + */ + +#include +#include +#include + + +/* From AIMD tables from RFC 3649 appendix B, + * with fixed-point MD scaled <<8. + */ +static const struct hstcp_aimd_val { + unsigned int cwnd; + unsigned int md; +} hstcp_aimd_vals[] = { + { 38, 128, /* 0.50 */ }, + { 118, 112, /* 0.44 */ }, + { 221, 104, /* 0.41 */ }, + { 347, 98, /* 0.38 */ }, + { 495, 93, /* 0.37 */ }, + { 663, 89, /* 0.35 */ }, + { 851, 86, /* 0.34 */ }, + { 1058, 83, /* 0.33 */ }, + { 1284, 81, /* 0.32 */ }, + { 1529, 78, /* 0.31 */ }, + { 1793, 76, /* 0.30 */ }, + { 2076, 74, /* 0.29 */ }, + { 2378, 72, /* 0.28 */ }, + { 2699, 71, /* 0.28 */ }, + { 3039, 69, /* 0.27 */ }, + { 3399, 68, /* 0.27 */ }, + { 3778, 66, /* 0.26 */ }, + { 4177, 65, /* 0.26 */ }, + { 4596, 64, /* 0.25 */ }, + { 5036, 62, /* 0.25 */ }, + { 5497, 61, /* 0.24 */ }, + { 5979, 60, /* 0.24 */ }, + { 6483, 59, /* 0.23 */ }, + { 7009, 58, /* 0.23 */ }, + { 7558, 57, /* 0.22 */ }, + { 8130, 56, /* 0.22 */ }, + { 8726, 55, /* 0.22 */ }, + { 9346, 54, /* 0.21 */ }, + { 9991, 53, /* 0.21 */ }, + { 10661, 52, /* 0.21 */ }, + { 11358, 52, /* 0.20 */ }, + { 12082, 51, /* 0.20 */ }, + { 12834, 50, /* 0.20 */ }, + { 13614, 49, /* 0.19 */ }, + { 14424, 48, /* 0.19 */ }, + { 15265, 48, /* 0.19 */ }, + { 16137, 47, /* 0.19 */ }, + { 17042, 46, /* 0.18 */ }, + { 17981, 45, /* 0.18 */ }, + { 18955, 45, /* 0.18 */ }, + { 19965, 44, /* 0.17 */ }, + { 21013, 43, /* 0.17 */ }, + { 22101, 43, /* 0.17 */ }, + { 23230, 42, /* 0.17 */ }, + { 24402, 41, /* 0.16 */ }, + { 25618, 41, /* 0.16 */ }, + { 26881, 40, /* 0.16 */ }, + { 28193, 39, /* 0.16 */ }, + { 29557, 39, /* 0.15 */ }, + { 30975, 38, /* 0.15 */ }, + { 32450, 38, /* 0.15 */ }, + { 33986, 37, /* 0.15 */ }, + { 35586, 36, /* 0.14 */ }, + { 37253, 36, /* 0.14 */ }, + { 38992, 35, /* 0.14 */ }, + { 40808, 35, /* 0.14 */ }, + { 42707, 34, /* 0.13 */ }, + { 44694, 33, /* 0.13 */ }, + { 46776, 33, /* 0.13 */ }, + { 48961, 32, /* 0.13 */ }, + { 51258, 32, /* 0.13 */ }, + { 53677, 31, /* 0.12 */ }, + { 56230, 30, /* 0.12 */ }, + { 58932, 30, /* 0.12 */ }, + { 61799, 29, /* 0.12 */ }, + { 64851, 28, /* 0.11 */ }, + { 68113, 28, /* 0.11 */ }, + { 71617, 27, /* 0.11 */ }, + { 75401, 26, /* 0.10 */ }, + { 79517, 26, /* 0.10 */ }, + { 84035, 25, /* 0.10 */ }, + { 89053, 24, /* 0.10 */ }, +}; + +#define HSTCP_AIMD_MAX ARRAY_SIZE(hstcp_aimd_vals) + +struct hstcp { + u32 ai; +}; + +static void hstcp_init(struct tcp_sock *tp) +{ + struct hstcp *ca = tcp_ca(tp); + + ca->ai = 0; + + /* Ensure the MD arithmetic works. This is somewhat pedantic, + * since I don't think we will see a cwnd this large. :) */ + tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128); +} + +static void hstcp_cong_avoid(struct tcp_sock *tp, u32 adk, u32 rtt, + u32 in_flight, int good) +{ + struct hstcp *ca = tcp_ca(tp); + + if (in_flight < tp->snd_cwnd) + return; + + if (tp->snd_cwnd <= tp->snd_ssthresh) { + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + } else { + /* Update AIMD parameters */ + if (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd) { + while (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd && + ca->ai < HSTCP_AIMD_MAX) + ca->ai++; + } else if (tp->snd_cwnd < hstcp_aimd_vals[ca->ai].cwnd) { + while (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd && + ca->ai > 0) + ca->ai--; + } + + /* Do additive increase */ + if (tp->snd_cwnd < tp->snd_cwnd_clamp) { + tp->snd_cwnd_cnt += ca->ai; + if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { + tp->snd_cwnd++; + tp->snd_cwnd_cnt -= tp->snd_cwnd; + } + } + } +} + +static u32 hstcp_ssthresh(struct tcp_sock *tp) +{ + struct hstcp *ca = tcp_ca(tp); + + /* Do multiplicative decrease */ + return max(tp->snd_cwnd - ((tp->snd_cwnd * hstcp_aimd_vals[ca->ai].md) >> 8), 2U); +} + + +static struct tcp_congestion_ops tcp_highspeed = { + .init = hstcp_init, + .ssthresh = hstcp_ssthresh, + .cong_avoid = hstcp_cong_avoid, + .min_cwnd = tcp_reno_min_cwnd, + + .owner = THIS_MODULE, + .name = "highspeed" +}; + +static int __init hstcp_register(void) +{ + BUG_ON(sizeof(struct hstcp) > TCP_CA_PRIV_SIZE); + return tcp_register_congestion_control(&tcp_highspeed); +} + +static void __exit hstcp_unregister(void) +{ + tcp_unregister_congestion_control(&tcp_highspeed); +} + +module_init(hstcp_register); +module_exit(hstcp_unregister); + +MODULE_AUTHOR("John Heffner"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("High Speed TCP"); -- cgit v1.2.3 From 835b3f0c0d7e1f716c45ec576662eac7a68b8548 Mon Sep 17 00:00:00 2001 From: Daniele Lacamera <(root at danielinux.net)net> Date: Thu, 23 Jun 2005 12:26:34 -0700 Subject: [TCP]: Add TCP Hybla congestion control module. TCP Hybla congestion avoidance. - "In heterogeneous networks, TCP connections that incorporate a terrestrial or satellite radio link are greatly disadvantaged with respect to entirely wired connections, because of their longer round trip times (RTTs). To cope with this problem, a new TCP proposal, the TCP Hybla, is presented and discussed in the paper[1]. It stems from an analytical evaluation of the congestion window dynamics in the TCP standard versions (Tahoe, Reno, NewReno), which suggests the necessary modifications to remove the performance dependence on RTT.[...]"[1] [1]: Carlo Caini, Rosario Firrincieli, "TCP Hybla: a TCP enhancement for heterogeneous networks", International Journal of Satellite Communications and Networking Volume 22, Issue 5 , Pages 547 - 566. September 2004. Signed-off-by: Daniele Lacamera (root at danielinux.net)net Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/Kconfig | 10 +++ net/ipv4/Makefile | 1 + net/ipv4/tcp_hybla.c | 187 +++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 198 insertions(+) create mode 100644 net/ipv4/tcp_hybla.c diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 910e25b6575..516ffe84281 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -478,6 +478,16 @@ config TCP_CONG_HSTCP increase the congestion window by when an ACK is received. For more detail see http://www.icir.org/floyd/hstcp.html +config TCP_CONG_HYBLA + tristate "TCP-Hybla congestion control algorithm" + depends on INET && EXPERIMENTAL + default n + ---help--- + TCP-Hybla is a sender-side only change that eliminates penalization of + long-RTT, large-bandwidth connections, like when satellite legs are + involved, expecially when sharing a common bottleneck with normal + terrestrial connections. + endmenu source "net/ipv4/ipvs/Kconfig" diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index ddf5d7785bf..ede7a5d617f 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -34,6 +34,7 @@ obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o +obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ xfrm4_output.o diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c new file mode 100644 index 00000000000..13a66342c30 --- /dev/null +++ b/net/ipv4/tcp_hybla.c @@ -0,0 +1,187 @@ +/* + * TCP HYBLA + * + * TCP-HYBLA Congestion control algorithm, based on: + * C.Caini, R.Firrincieli, "TCP-Hybla: A TCP Enhancement + * for Heterogeneous Networks", + * International Journal on satellite Communications, + * September 2004 + * Daniele Lacamera + * root at danielinux.net + */ + +#include +#include +#include + +/* Tcp Hybla structure. */ +struct hybla { + u8 hybla_en; + u32 snd_cwnd_cents; /* Keeps increment values when it is <1, <<7 */ + u32 rho; /* Rho parameter, integer part */ + u32 rho2; /* Rho * Rho, integer part */ + u32 rho_3ls; /* Rho parameter, <<3 */ + u32 rho2_7ls; /* Rho^2, <<7 */ + u32 minrtt; /* Minimum smoothed round trip time value seen */ +}; + +/* Hybla reference round trip time (default= 1/40 sec = 25 ms), + expressed in jiffies */ +static int rtt0 = 25; +module_param(rtt0, int, 0644); +MODULE_PARM_DESC(rtt0, "reference rout trip time (ms)"); + + +/* This is called to refresh values for hybla parameters */ +static inline void hybla_recalc_param (struct tcp_sock *tp) +{ + struct hybla *ca = tcp_ca(tp); + + ca->rho_3ls = max_t(u32, tp->srtt / msecs_to_jiffies(rtt0), 8); + ca->rho = ca->rho_3ls >> 3; + ca->rho2_7ls = (ca->rho_3ls * ca->rho_3ls) << 1; + ca->rho2 = ca->rho2_7ls >>7; +} + +static void hybla_init(struct tcp_sock *tp) +{ + struct hybla *ca = tcp_ca(tp); + + ca->rho = 0; + ca->rho2 = 0; + ca->rho_3ls = 0; + ca->rho2_7ls = 0; + ca->snd_cwnd_cents = 0; + ca->hybla_en = 1; + tp->snd_cwnd = 2; + tp->snd_cwnd_clamp = 65535; + + /* 1st Rho measurement based on initial srtt */ + hybla_recalc_param(tp); + + /* set minimum rtt as this is the 1st ever seen */ + ca->minrtt = tp->srtt; + tp->snd_cwnd = ca->rho; +} + +static void hybla_state(struct tcp_sock *tp, u8 ca_state) +{ + struct hybla *ca = tcp_ca(tp); + + ca->hybla_en = (ca_state == TCP_CA_Open); +} + +static inline u32 hybla_fraction(u32 odds) +{ + static const u32 fractions[] = { + 128, 139, 152, 165, 181, 197, 215, 234, + }; + + return (odds < ARRAY_SIZE(fractions)) ? fractions[odds] : 128; +} + +/* TCP Hybla main routine. + * This is the algorithm behavior: + * o Recalc Hybla parameters if min_rtt has changed + * o Give cwnd a new value based on the model proposed + * o remember increments <1 + */ +static void hybla_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, + u32 in_flight, int flag) +{ + struct hybla *ca = tcp_ca(tp); + u32 increment, odd, rho_fractions; + int is_slowstart = 0; + + /* Recalculate rho only if this srtt is the lowest */ + if (tp->srtt < ca->minrtt){ + hybla_recalc_param(tp); + ca->minrtt = tp->srtt; + } + + if (!ca->hybla_en) + return tcp_reno_cong_avoid(tp, ack, rtt, in_flight, flag); + + if (in_flight < tp->snd_cwnd) + return; + + if (ca->rho == 0) + hybla_recalc_param(tp); + + rho_fractions = ca->rho_3ls - (ca->rho << 3); + + if (tp->snd_cwnd < tp->snd_ssthresh) { + /* + * slow start + * INC = 2^RHO - 1 + * This is done by splitting the rho parameter + * into 2 parts: an integer part and a fraction part. + * Inrement<<7 is estimated by doing: + * [2^(int+fract)]<<7 + * that is equal to: + * (2^int) * [(2^fract) <<7] + * 2^int is straightly computed as 1<rho) * hybla_fraction(rho_fractions)) + - 128; + } else { + /* + * congestion avoidance + * INC = RHO^2 / W + * as long as increment is estimated as (rho<<7)/window + * it already is <<7 and we can easily count its fractions. + */ + increment = ca->rho2_7ls / tp->snd_cwnd; + if (increment < 128) + tp->snd_cwnd_cnt++; + } + + odd = increment % 128; + tp->snd_cwnd += increment >> 7; + ca->snd_cwnd_cents += odd; + + /* check when fractions goes >=128 and increase cwnd by 1. */ + while(ca->snd_cwnd_cents >= 128) { + tp->snd_cwnd++; + ca->snd_cwnd_cents -= 128; + tp->snd_cwnd_cnt = 0; + } + + /* clamp down slowstart cwnd to ssthresh value. */ + if (is_slowstart) + tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); + + tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp); +} + +static struct tcp_congestion_ops tcp_hybla = { + .init = hybla_init, + .ssthresh = tcp_reno_ssthresh, + .min_cwnd = tcp_reno_min_cwnd, + .cong_avoid = hybla_cong_avoid, + .set_state = hybla_state, + + .owner = THIS_MODULE, + .name = "hybla" +}; + +static int __init hybla_register(void) +{ + BUG_ON(sizeof(struct hybla) > TCP_CA_PRIV_SIZE); + return tcp_register_congestion_control(&tcp_hybla); +} + +static void __exit hybla_unregister(void) +{ + tcp_unregister_congestion_control(&tcp_hybla); +} + +module_init(hybla_register); +module_exit(hybla_unregister); + +MODULE_AUTHOR("Daniele Lacamera"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("TCP Hybla"); -- cgit v1.2.3 From b87d8561d8667d221b728ccdcb18eb95b16a687b Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 23 Jun 2005 12:27:19 -0700 Subject: [TCP]: Add TCP Vegas congestion control module. TCP Vegas code modified for the new TCP infrastructure. Vegas now uses microsecond resolution timestamps for better estimation of performance over higher speed links. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/Kconfig | 11 ++ net/ipv4/Makefile | 1 + net/ipv4/tcp_vegas.c | 411 +++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 423 insertions(+) create mode 100644 net/ipv4/tcp_vegas.c diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 516ffe84281..6c105b60cc0 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -488,6 +488,17 @@ config TCP_CONG_HYBLA involved, expecially when sharing a common bottleneck with normal terrestrial connections. +config TCP_CONG_VEGAS + tristate "TCP Vegas" + depends on INET && EXPERIMENTAL + default n + ---help--- + TCP Vegas is a sender-side only change to TCP that anticipates + the onset of congestion by estimating the bandwidth. TCP Vegas + adjusts the sending rate by modifying the congestion + window. TCP Vegas should provide less packet loss, but it is + not as aggressive as TCP Reno. + endmenu source "net/ipv4/ipvs/Kconfig" diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index ede7a5d617f..a801a97bd01 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -35,6 +35,7 @@ obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o +obj-$(CONFIG_TCP_CONG_VEGAS) += tcp_vegas.o obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ xfrm4_output.o diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c new file mode 100644 index 00000000000..9bd443db519 --- /dev/null +++ b/net/ipv4/tcp_vegas.c @@ -0,0 +1,411 @@ +/* + * TCP Vegas congestion control + * + * This is based on the congestion detection/avoidance scheme described in + * Lawrence S. Brakmo and Larry L. Peterson. + * "TCP Vegas: End to end congestion avoidance on a global internet." + * IEEE Journal on Selected Areas in Communication, 13(8):1465--1480, + * October 1995. Available from: + * ftp://ftp.cs.arizona.edu/xkernel/Papers/jsac.ps + * + * See http://www.cs.arizona.edu/xkernel/ for their implementation. + * The main aspects that distinguish this implementation from the + * Arizona Vegas implementation are: + * o We do not change the loss detection or recovery mechanisms of + * Linux in any way. Linux already recovers from losses quite well, + * using fine-grained timers, NewReno, and FACK. + * o To avoid the performance penalty imposed by increasing cwnd + * only every-other RTT during slow start, we increase during + * every RTT during slow start, just like Reno. + * o Largely to allow continuous cwnd growth during slow start, + * we use the rate at which ACKs come back as the "actual" + * rate, rather than the rate at which data is sent. + * o To speed convergence to the right rate, we set the cwnd + * to achieve the right ("actual") rate when we exit slow start. + * o To filter out the noise caused by delayed ACKs, we use the + * minimum RTT sample observed during the last RTT to calculate + * the actual rate. + * o When the sender re-starts from idle, it waits until it has + * received ACKs for an entire flight of new data before making + * a cwnd adjustment decision. The original Vegas implementation + * assumed senders never went idle. + */ + +#include +#include +#include +#include +#include + +#include + +/* Default values of the Vegas variables, in fixed-point representation + * with V_PARAM_SHIFT bits to the right of the binary point. + */ +#define V_PARAM_SHIFT 1 +static int alpha = 1<doing_vegas_now = 1; + + /* Set the beginning of the next send window. */ + vegas->beg_snd_nxt = tp->snd_nxt; + + vegas->cntRTT = 0; + vegas->minRTT = 0x7fffffff; +} + +/* Stop taking Vegas samples for now. */ +static inline void vegas_disable(struct tcp_sock *tp) +{ + struct vegas *vegas = tcp_ca(tp); + + vegas->doing_vegas_now = 0; +} + +static void tcp_vegas_init(struct tcp_sock *tp) +{ + struct vegas *vegas = tcp_ca(tp); + + vegas->baseRTT = 0x7fffffff; + vegas_enable(tp); +} + +/* Do RTT sampling needed for Vegas. + * Basically we: + * o min-filter RTT samples from within an RTT to get the current + * propagation delay + queuing delay (we are min-filtering to try to + * avoid the effects of delayed ACKs) + * o min-filter RTT samples from a much longer window (forever for now) + * to find the propagation delay (baseRTT) + */ +static void tcp_vegas_rtt_calc(struct tcp_sock *tp, u32 usrtt) +{ + struct vegas *vegas = tcp_ca(tp); + u32 vrtt = usrtt + 1; /* Never allow zero rtt or baseRTT */ + + /* Filter to find propagation delay: */ + if (vrtt < vegas->baseRTT) + vegas->baseRTT = vrtt; + + /* Find the min RTT during the last RTT to find + * the current prop. delay + queuing delay: + */ + vegas->minRTT = min(vegas->minRTT, vrtt); + vegas->cntRTT++; +} + +static void tcp_vegas_state(struct tcp_sock *tp, u8 ca_state) +{ + + if (ca_state == TCP_CA_Open) + vegas_enable(tp); + else + vegas_disable(tp); +} + +/* + * If the connection is idle and we are restarting, + * then we don't want to do any Vegas calculations + * until we get fresh RTT samples. So when we + * restart, we reset our Vegas state to a clean + * slate. After we get acks for this flight of + * packets, _then_ we can make Vegas calculations + * again. + */ +static void tcp_vegas_cwnd_event(struct tcp_sock *tp, enum tcp_ca_event event) +{ + if (event == CA_EVENT_CWND_RESTART || + event == CA_EVENT_TX_START) + tcp_vegas_init(tp); +} + +static void tcp_vegas_cong_avoid(struct tcp_sock *tp, u32 ack, + u32 seq_rtt, u32 in_flight, int flag) +{ + struct vegas *vegas = tcp_ca(tp); + + if (!vegas->doing_vegas_now) + return tcp_reno_cong_avoid(tp, ack, seq_rtt, in_flight, flag); + + /* The key players are v_beg_snd_una and v_beg_snd_nxt. + * + * These are so named because they represent the approximate values + * of snd_una and snd_nxt at the beginning of the current RTT. More + * precisely, they represent the amount of data sent during the RTT. + * At the end of the RTT, when we receive an ACK for v_beg_snd_nxt, + * we will calculate that (v_beg_snd_nxt - v_beg_snd_una) outstanding + * bytes of data have been ACKed during the course of the RTT, giving + * an "actual" rate of: + * + * (v_beg_snd_nxt - v_beg_snd_una) / (rtt duration) + * + * Unfortunately, v_beg_snd_una is not exactly equal to snd_una, + * because delayed ACKs can cover more than one segment, so they + * don't line up nicely with the boundaries of RTTs. + * + * Another unfortunate fact of life is that delayed ACKs delay the + * advance of the left edge of our send window, so that the number + * of bytes we send in an RTT is often less than our cwnd will allow. + * So we keep track of our cwnd separately, in v_beg_snd_cwnd. + */ + + if (after(ack, vegas->beg_snd_nxt)) { + /* Do the Vegas once-per-RTT cwnd adjustment. */ + u32 old_wnd, old_snd_cwnd; + + + /* Here old_wnd is essentially the window of data that was + * sent during the previous RTT, and has all + * been acknowledged in the course of the RTT that ended + * with the ACK we just received. Likewise, old_snd_cwnd + * is the cwnd during the previous RTT. + */ + old_wnd = (vegas->beg_snd_nxt - vegas->beg_snd_una) / + tp->mss_cache; + old_snd_cwnd = vegas->beg_snd_cwnd; + + /* Save the extent of the current window so we can use this + * at the end of the next RTT. + */ + vegas->beg_snd_una = vegas->beg_snd_nxt; + vegas->beg_snd_nxt = tp->snd_nxt; + vegas->beg_snd_cwnd = tp->snd_cwnd; + + /* Take into account the current RTT sample too, to + * decrease the impact of delayed acks. This double counts + * this sample since we count it for the next window as well, + * but that's not too awful, since we're taking the min, + * rather than averaging. + */ + tcp_vegas_rtt_calc(tp, seq_rtt*1000); + + /* We do the Vegas calculations only if we got enough RTT + * samples that we can be reasonably sure that we got + * at least one RTT sample that wasn't from a delayed ACK. + * If we only had 2 samples total, + * then that means we're getting only 1 ACK per RTT, which + * means they're almost certainly delayed ACKs. + * If we have 3 samples, we should be OK. + */ + + if (vegas->cntRTT <= 2) { + /* We don't have enough RTT samples to do the Vegas + * calculation, so we'll behave like Reno. + */ + if (tp->snd_cwnd > tp->snd_ssthresh) + tp->snd_cwnd++; + } else { + u32 rtt, target_cwnd, diff; + + /* We have enough RTT samples, so, using the Vegas + * algorithm, we determine if we should increase or + * decrease cwnd, and by how much. + */ + + /* Pluck out the RTT we are using for the Vegas + * calculations. This is the min RTT seen during the + * last RTT. Taking the min filters out the effects + * of delayed ACKs, at the cost of noticing congestion + * a bit later. + */ + rtt = vegas->minRTT; + + /* Calculate the cwnd we should have, if we weren't + * going too fast. + * + * This is: + * (actual rate in segments) * baseRTT + * We keep it as a fixed point number with + * V_PARAM_SHIFT bits to the right of the binary point. + */ + target_cwnd = ((old_wnd * vegas->baseRTT) + << V_PARAM_SHIFT) / rtt; + + /* Calculate the difference between the window we had, + * and the window we would like to have. This quantity + * is the "Diff" from the Arizona Vegas papers. + * + * Again, this is a fixed point number with + * V_PARAM_SHIFT bits to the right of the binary + * point. + */ + diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd; + + if (tp->snd_cwnd < tp->snd_ssthresh) { + /* Slow start. */ + if (diff > gamma) { + /* Going too fast. Time to slow down + * and switch to congestion avoidance. + */ + tp->snd_ssthresh = 2; + + /* Set cwnd to match the actual rate + * exactly: + * cwnd = (actual rate) * baseRTT + * Then we add 1 because the integer + * truncation robs us of full link + * utilization. + */ + tp->snd_cwnd = min(tp->snd_cwnd, + (target_cwnd >> + V_PARAM_SHIFT)+1); + + } + } else { + /* Congestion avoidance. */ + u32 next_snd_cwnd; + + /* Figure out where we would like cwnd + * to be. + */ + if (diff > beta) { + /* The old window was too fast, so + * we slow down. + */ + next_snd_cwnd = old_snd_cwnd - 1; + } else if (diff < alpha) { + /* We don't have enough extra packets + * in the network, so speed up. + */ + next_snd_cwnd = old_snd_cwnd + 1; + } else { + /* Sending just as fast as we + * should be. + */ + next_snd_cwnd = old_snd_cwnd; + } + + /* Adjust cwnd upward or downward, toward the + * desired value. + */ + if (next_snd_cwnd > tp->snd_cwnd) + tp->snd_cwnd++; + else if (next_snd_cwnd < tp->snd_cwnd) + tp->snd_cwnd--; + } + } + + /* Wipe the slate clean for the next RTT. */ + vegas->cntRTT = 0; + vegas->minRTT = 0x7fffffff; + } + + /* The following code is executed for every ack we receive, + * except for conditions checked in should_advance_cwnd() + * before the call to tcp_cong_avoid(). Mainly this means that + * we only execute this code if the ack actually acked some + * data. + */ + + /* If we are in slow start, increase our cwnd in response to this ACK. + * (If we are not in slow start then we are in congestion avoidance, + * and adjust our congestion window only once per RTT. See the code + * above.) + */ + if (tp->snd_cwnd <= tp->snd_ssthresh) + tp->snd_cwnd++; + + /* to keep cwnd from growing without bound */ + tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp); + + /* Make sure that we are never so timid as to reduce our cwnd below + * 2 MSS. + * + * Going below 2 MSS would risk huge delayed ACKs from our receiver. + */ + tp->snd_cwnd = max(tp->snd_cwnd, 2U); +} + +/* Extract info for Tcp socket info provided via netlink. */ +static void tcp_vegas_get_info(struct tcp_sock *tp, u32 ext, + struct sk_buff *skb) +{ + const struct vegas *ca = tcp_ca(tp); + if (ext & (1<<(TCPDIAG_VEGASINFO-1))) { + struct tcpvegas_info *info; + + info = RTA_DATA(__RTA_PUT(skb, TCPDIAG_VEGASINFO, + sizeof(*info))); + + info->tcpv_enabled = ca->doing_vegas_now; + info->tcpv_rttcnt = ca->cntRTT; + info->tcpv_rtt = ca->baseRTT; + info->tcpv_minrtt = ca->minRTT; + rtattr_failure: ; + } +} + +static struct tcp_congestion_ops tcp_vegas = { + .init = tcp_vegas_init, + .ssthresh = tcp_reno_ssthresh, + .cong_avoid = tcp_vegas_cong_avoid, + .min_cwnd = tcp_reno_min_cwnd, + .rtt_sample = tcp_vegas_rtt_calc, + .set_state = tcp_vegas_state, + .cwnd_event = tcp_vegas_cwnd_event, + .get_info = tcp_vegas_get_info, + + .owner = THIS_MODULE, + .name = "vegas", +}; + +static int __init tcp_vegas_register(void) +{ + BUG_ON(sizeof(struct vegas) > TCP_CA_PRIV_SIZE); + tcp_register_congestion_control(&tcp_vegas); + return 0; +} + +static void __exit tcp_vegas_unregister(void) +{ + tcp_unregister_congestion_control(&tcp_vegas); +} + +module_init(tcp_vegas_register); +module_exit(tcp_vegas_unregister); + +MODULE_AUTHOR("Stephen Hemminger"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("TCP Vegas"); -- cgit v1.2.3 From a7868ea68d29eb2c037952aeb3b549cf05749a18 Mon Sep 17 00:00:00 2001 From: Baruch Even Date: Thu, 23 Jun 2005 12:28:11 -0700 Subject: [TCP]: Add H-TCP congestion control module. H-TCP is a congestion control algorithm developed at the Hamilton Institute, by Douglas Leith and Robert Shorten. It is extending the standard Reno algorithm with mode switching is thus a relatively simple modification. H-TCP is defined in a layered manner as it is still a research platform. The basic form includes the modification of beta according to the ratio of maxRTT to min RTT and the alpha=2*factor*(1-beta) relation, where factor is dependant on the time since last congestion. The other layers improve convergence by adding appropriate factors to alpha. The following patch implements the H-TCP algorithm in it's basic form. Signed-Off-By: Baruch Even Signed-off-by: David S. Miller --- net/ipv4/Kconfig | 13 +++ net/ipv4/Makefile | 1 + net/ipv4/tcp_htcp.c | 289 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 303 insertions(+) create mode 100644 net/ipv4/tcp_htcp.c diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 6c105b60cc0..73a25b52bf7 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -467,6 +467,18 @@ config TCP_CONG_WESTWOOD TCP Westwood+ significantly increases fairness wrt TCP Reno in wired networks and throughput over wireless links. +config TCP_CONG_HTCP + tristate "H-TCP" + depends on INET + default m + ---help--- + H-TCP is a send-side only modifications of the TCP Reno + protocol stack that optimizes the performance of TCP + congestion control for high speed network links. It uses a + modeswitch to change the alpha and beta parameters of TCP Reno + based on network conditions and in a way so as to be fair with + other Reno and H-TCP flows. + config TCP_CONG_HSTCP tristate "High Speed TCP" depends on INET && EXPERIMENTAL @@ -499,6 +511,7 @@ config TCP_CONG_VEGAS window. TCP Vegas should provide less packet loss, but it is not as aggressive as TCP Reno. + endmenu source "net/ipv4/ipvs/Kconfig" diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index a801a97bd01..e96ed17deb9 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -35,6 +35,7 @@ obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o +obj-$(CONFIG_TCP_CONG_HTCP) += tcp_htcp.o obj-$(CONFIG_TCP_CONG_VEGAS) += tcp_vegas.o obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c new file mode 100644 index 00000000000..40168275acf --- /dev/null +++ b/net/ipv4/tcp_htcp.c @@ -0,0 +1,289 @@ +/* + * H-TCP congestion control. The algorithm is detailed in: + * R.N.Shorten, D.J.Leith: + * "H-TCP: TCP for high-speed and long-distance networks" + * Proc. PFLDnet, Argonne, 2004. + * http://www.hamilton.ie/net/htcp3.pdf + */ + +#include +#include +#include +#include + +#define ALPHA_BASE (1<<7) /* 1.0 with shift << 7 */ +#define BETA_MIN (1<<6) /* 0.5 with shift << 7 */ +#define BETA_MAX 102 /* 0.8 with shift << 7 */ + +static int use_rtt_scaling = 1; +module_param(use_rtt_scaling, int, 0644); +MODULE_PARM_DESC(use_rtt_scaling, "turn on/off RTT scaling"); + +static int use_bandwidth_switch = 1; +module_param(use_bandwidth_switch, int, 0644); +MODULE_PARM_DESC(use_bandwidth_switch, "turn on/off bandwidth switcher"); + +struct htcp { + u16 alpha; /* Fixed point arith, << 7 */ + u8 beta; /* Fixed point arith, << 7 */ + u8 modeswitch; /* Delay modeswitch until we had at least one congestion event */ + u8 ccount; /* Number of RTTs since last congestion event */ + u8 undo_ccount; + u16 packetcount; + u32 minRTT; + u32 maxRTT; + u32 snd_cwnd_cnt2; + + u32 undo_maxRTT; + u32 undo_old_maxB; + + /* Bandwidth estimation */ + u32 minB; + u32 maxB; + u32 old_maxB; + u32 Bi; + u32 lasttime; +}; + +static inline void htcp_reset(struct htcp *ca) +{ + ca->undo_ccount = ca->ccount; + ca->undo_maxRTT = ca->maxRTT; + ca->undo_old_maxB = ca->old_maxB; + + ca->ccount = 0; + ca->snd_cwnd_cnt2 = 0; +} + +static u32 htcp_cwnd_undo(struct tcp_sock *tp) +{ + struct htcp *ca = tcp_ca(tp); + ca->ccount = ca->undo_ccount; + ca->maxRTT = ca->undo_maxRTT; + ca->old_maxB = ca->undo_old_maxB; + return max(tp->snd_cwnd, (tp->snd_ssthresh<<7)/ca->beta); +} + +static inline void measure_rtt(struct tcp_sock *tp) +{ + struct htcp *ca = tcp_ca(tp); + u32 srtt = tp->srtt>>3; + + /* keep track of minimum RTT seen so far, minRTT is zero at first */ + if (ca->minRTT > srtt || !ca->minRTT) + ca->minRTT = srtt; + + /* max RTT */ + if (tp->ca_state == TCP_CA_Open && tp->snd_ssthresh < 0xFFFF && ca->ccount > 3) { + if (ca->maxRTT < ca->minRTT) + ca->maxRTT = ca->minRTT; + if (ca->maxRTT < srtt && srtt <= ca->maxRTT+HZ/50) + ca->maxRTT = srtt; + } +} + +static void measure_achieved_throughput(struct tcp_sock *tp, u32 pkts_acked) +{ + struct htcp *ca = tcp_ca(tp); + u32 now = tcp_time_stamp; + + /* achieved throughput calculations */ + if (tp->ca_state != TCP_CA_Open && tp->ca_state != TCP_CA_Disorder) { + ca->packetcount = 0; + ca->lasttime = now; + return; + } + + ca->packetcount += pkts_acked; + + if (ca->packetcount >= tp->snd_cwnd - (ca->alpha>>7? : 1) + && now - ca->lasttime >= ca->minRTT + && ca->minRTT > 0) { + __u32 cur_Bi = ca->packetcount*HZ/(now - ca->lasttime); + if (ca->ccount <= 3) { + /* just after backoff */ + ca->minB = ca->maxB = ca->Bi = cur_Bi; + } else { + ca->Bi = (3*ca->Bi + cur_Bi)/4; + if (ca->Bi > ca->maxB) + ca->maxB = ca->Bi; + if (ca->minB > ca->maxB) + ca->minB = ca->maxB; + } + ca->packetcount = 0; + ca->lasttime = now; + } +} + +static inline void htcp_beta_update(struct htcp *ca, u32 minRTT, u32 maxRTT) +{ + if (use_bandwidth_switch) { + u32 maxB = ca->maxB; + u32 old_maxB = ca->old_maxB; + ca->old_maxB = ca->maxB; + + if (!between(5*maxB, 4*old_maxB, 6*old_maxB)) { + ca->beta = BETA_MIN; + ca->modeswitch = 0; + return; + } + } + + if (ca->modeswitch && minRTT > max(HZ/100, 1) && maxRTT) { + ca->beta = (minRTT<<7)/maxRTT; + if (ca->beta < BETA_MIN) + ca->beta = BETA_MIN; + else if (ca->beta > BETA_MAX) + ca->beta = BETA_MAX; + } else { + ca->beta = BETA_MIN; + ca->modeswitch = 1; + } +} + +static inline void htcp_alpha_update(struct htcp *ca) +{ + u32 minRTT = ca->minRTT; + u32 factor = 1; + u32 diff = ca->ccount * minRTT; /* time since last backoff */ + + if (diff > HZ) { + diff -= HZ; + factor = 1+ ( 10*diff + ((diff/2)*(diff/2)/HZ) )/HZ; + } + + if (use_rtt_scaling && minRTT) { + u32 scale = (HZ<<3)/(10*minRTT); + scale = min(max(scale, 1U<<2), 10U<<3); /* clamping ratio to interval [0.5,10]<<3 */ + factor = (factor<<3)/scale; + if (!factor) + factor = 1; + } + + ca->alpha = 2*factor*((1<<7)-ca->beta); + if (!ca->alpha) + ca->alpha = ALPHA_BASE; +} + +/* After we have the rtt data to calculate beta, we'd still prefer to wait one + * rtt before we adjust our beta to ensure we are working from a consistent + * data. + * + * This function should be called when we hit a congestion event since only at + * that point do we really have a real sense of maxRTT (the queues en route + * were getting just too full now). + */ +static void htcp_param_update(struct tcp_sock *tp) +{ + struct htcp *ca = tcp_ca(tp); + u32 minRTT = ca->minRTT; + u32 maxRTT = ca->maxRTT; + + htcp_beta_update(ca, minRTT, maxRTT); + htcp_alpha_update(ca); + + /* add slowly fading memory for maxRTT to accommodate routing changes etc */ + if (minRTT > 0 && maxRTT > minRTT) + ca->maxRTT = minRTT + ((maxRTT-minRTT)*95)/100; +} + +static u32 htcp_recalc_ssthresh(struct tcp_sock *tp) +{ + struct htcp *ca = tcp_ca(tp); + htcp_param_update(tp); + return max((tp->snd_cwnd * ca->beta) >> 7, 2U); +} + +static void htcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, + u32 in_flight, int data_acked) +{ + struct htcp *ca = tcp_ca(tp); + + if (in_flight < tp->snd_cwnd) + return; + + if (tp->snd_cwnd <= tp->snd_ssthresh) { + /* In "safe" area, increase. */ + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + } else { + measure_rtt(tp); + + /* keep track of number of round-trip times since last backoff event */ + if (ca->snd_cwnd_cnt2++ > tp->snd_cwnd) { + ca->ccount++; + ca->snd_cwnd_cnt2 = 0; + htcp_alpha_update(ca); + } + + /* In dangerous area, increase slowly. + * In theory this is tp->snd_cwnd += alpha / tp->snd_cwnd + */ + if ((tp->snd_cwnd_cnt++ * ca->alpha)>>7 >= tp->snd_cwnd) { + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + tp->snd_cwnd_cnt = 0; + ca->ccount++; + } + } +} + +/* Lower bound on congestion window. */ +static u32 htcp_min_cwnd(struct tcp_sock *tp) +{ + return tp->snd_ssthresh; +} + + +static void htcp_init(struct tcp_sock *tp) +{ + struct htcp *ca = tcp_ca(tp); + + memset(ca, 0, sizeof(struct htcp)); + ca->alpha = ALPHA_BASE; + ca->beta = BETA_MIN; +} + +static void htcp_state(struct tcp_sock *tp, u8 new_state) +{ + switch (new_state) { + case TCP_CA_CWR: + case TCP_CA_Recovery: + case TCP_CA_Loss: + htcp_reset(tcp_ca(tp)); + break; + } +} + +static struct tcp_congestion_ops htcp = { + .init = htcp_init, + .ssthresh = htcp_recalc_ssthresh, + .min_cwnd = htcp_min_cwnd, + .cong_avoid = htcp_cong_avoid, + .set_state = htcp_state, + .undo_cwnd = htcp_cwnd_undo, + .pkts_acked = measure_achieved_throughput, + .owner = THIS_MODULE, + .name = "htcp", +}; + +static int __init htcp_register(void) +{ + BUG_ON(sizeof(struct htcp) > TCP_CA_PRIV_SIZE); + BUILD_BUG_ON(BETA_MIN >= BETA_MAX); + if (!use_bandwidth_switch) + htcp.pkts_acked = NULL; + return tcp_register_congestion_control(&htcp); +} + +static void __exit htcp_unregister(void) +{ + tcp_unregister_congestion_control(&htcp); +} + +module_init(htcp_register); +module_exit(htcp_unregister); + +MODULE_AUTHOR("Baruch Even"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("H-TCP"); -- cgit v1.2.3 From 0e57976b6376f7fda6bef8b7dee2a3c8819ec9e9 Mon Sep 17 00:00:00 2001 From: John Heffner Date: Thu, 23 Jun 2005 12:29:07 -0700 Subject: [TCP]: Add Scalable TCP congestion control module. This patch implements Tom Kelly's Scalable TCP congestion control algorithm for the modular framework. The algorithm has some nice scaling properties, and has been used a fair bit in research, though is known to have significant fairness issues, so it's not really suitable for general purpose use. Signed-off-by: John Heffner Signed-off-by: David S. Miller --- net/ipv4/Kconfig | 9 +++++++ net/ipv4/Makefile | 1 + net/ipv4/tcp_scalable.c | 68 +++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 78 insertions(+) create mode 100644 net/ipv4/tcp_scalable.c diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 73a25b52bf7..690e88ba248 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -511,6 +511,15 @@ config TCP_CONG_VEGAS window. TCP Vegas should provide less packet loss, but it is not as aggressive as TCP Reno. +config TCP_CONG_SCALABLE + tristate "Scalable TCP" + depends on INET && EXPERIMENTAL + default n + ---help--- + Scalable TCP is a sender-side only change to TCP which uses a + MIMD congestion control algorithm which has some nice scaling + properties, though is known to have fairness issues. + See http://www-lce.eng.cam.ac.uk/~ctk21/scalable/ endmenu diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index e96ed17deb9..5718cdb3a61 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -37,6 +37,7 @@ obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o obj-$(CONFIG_TCP_CONG_HTCP) += tcp_htcp.o obj-$(CONFIG_TCP_CONG_VEGAS) += tcp_vegas.o +obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ xfrm4_output.o diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c new file mode 100644 index 00000000000..70e108e15c7 --- /dev/null +++ b/net/ipv4/tcp_scalable.c @@ -0,0 +1,68 @@ +/* Tom Kelly's Scalable TCP + * + * See htt://www-lce.eng.cam.ac.uk/~ctk21/scalable/ + * + * John Heffner + */ + +#include +#include +#include + +/* These factors derived from the recommended values in the aer: + * .01 and and 7/8. We use 50 instead of 100 to account for + * delayed ack. + */ +#define TCP_SCALABLE_AI_CNT 50U +#define TCP_SCALABLE_MD_SCALE 3 + +static void tcp_scalable_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, + u32 in_flight, int flag) +{ + if (in_flight < tp->snd_cwnd) + return; + + if (tp->snd_cwnd <= tp->snd_ssthresh) { + tp->snd_cwnd++; + } else { + tp->snd_cwnd_cnt++; + if (tp->snd_cwnd_cnt > min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)){ + tp->snd_cwnd++; + tp->snd_cwnd_cnt = 0; + } + } + tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp); + tp->snd_cwnd_stamp = tcp_time_stamp; +} + +static u32 tcp_scalable_ssthresh(struct tcp_sock *tp) +{ + return max(tp->snd_cwnd - (tp->snd_cwnd>>TCP_SCALABLE_MD_SCALE), 2U); +} + + +static struct tcp_congestion_ops tcp_scalable = { + .ssthresh = tcp_scalable_ssthresh, + .cong_avoid = tcp_scalable_cong_avoid, + .min_cwnd = tcp_reno_min_cwnd, + + .owner = THIS_MODULE, + .name = "scalable", +}; + +static int __init tcp_scalable_register(void) +{ + return tcp_register_congestion_control(&tcp_scalable); +} + +static void __exit tcp_scalable_unregister(void) +{ + tcp_unregister_congestion_control(&tcp_scalable); +} + +module_init(tcp_scalable_register); +module_exit(tcp_scalable_unregister); + +MODULE_AUTHOR("John Heffner"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Scalable TCP"); -- cgit v1.2.3