diff options
-rw-r--r-- | Documentation/networking/ip-sysctl.txt | 56 | ||||
-rw-r--r-- | Documentation/networking/tcp.txt | 69 | ||||
-rw-r--r-- | include/linux/sysctl.h | 9 | ||||
-rw-r--r-- | include/linux/tcp.h | 49 | ||||
-rw-r--r-- | include/linux/tcp_diag.h | 4 | ||||
-rw-r--r-- | include/net/tcp.h | 237 | ||||
-rw-r--r-- | net/ipv4/Kconfig | 90 | ||||
-rw-r--r-- | net/ipv4/Makefile | 10 | ||||
-rw-r--r-- | net/ipv4/sysctl_net_ipv4.c | 114 | ||||
-rw-r--r-- | net/ipv4/tcp.c | 2 | ||||
-rw-r--r-- | net/ipv4/tcp_bic.c | 331 | ||||
-rw-r--r-- | net/ipv4/tcp_cong.c | 195 | ||||
-rw-r--r-- | net/ipv4/tcp_diag.c | 34 | ||||
-rw-r--r-- | net/ipv4/tcp_highspeed.c | 181 | ||||
-rw-r--r-- | net/ipv4/tcp_htcp.c | 289 | ||||
-rw-r--r-- | net/ipv4/tcp_hybla.c | 187 | ||||
-rw-r--r-- | net/ipv4/tcp_input.c | 737 | ||||
-rw-r--r-- | net/ipv4/tcp_ipv4.c | 3 | ||||
-rw-r--r-- | net/ipv4/tcp_minisocks.c | 4 | ||||
-rw-r--r-- | net/ipv4/tcp_output.c | 23 | ||||
-rw-r--r-- | net/ipv4/tcp_scalable.c | 68 | ||||
-rw-r--r-- | net/ipv4/tcp_vegas.c | 411 | ||||
-rw-r--r-- | net/ipv4/tcp_westwood.c | 259 | ||||
-rw-r--r-- | net/ipv6/tcp_ipv6.c | 2 |
24 files changed, 2304 insertions, 1060 deletions
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index a2c893a7475..ab65714d95f 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -304,57 +304,6 @@ tcp_low_latency - BOOLEAN changed would be a Beowulf compute cluster. Default: 0 -tcp_westwood - BOOLEAN - Enable TCP Westwood+ congestion control algorithm. - TCP Westwood+ is a sender-side only modification of the TCP Reno - protocol stack that optimizes the performance of TCP congestion - control. It is based on end-to-end bandwidth estimation to set - congestion window and slow start threshold after a congestion - episode. Using this estimation, TCP Westwood+ adaptively sets a - slow start threshold and a congestion window which takes into - account the bandwidth used at the time congestion is experienced. - TCP Westwood+ significantly increases fairness wrt TCP Reno in - wired networks and throughput over wireless links. - Default: 0 - -tcp_vegas_cong_avoid - BOOLEAN - Enable TCP Vegas congestion avoidance algorithm. - TCP Vegas is a sender-side only change to TCP that anticipates - the onset of congestion by estimating the bandwidth. TCP Vegas - adjusts the sending rate by modifying the congestion - window. TCP Vegas should provide less packet loss, but it is - not as aggressive as TCP Reno. - Default:0 - -tcp_bic - BOOLEAN - Enable BIC TCP congestion control algorithm. - BIC-TCP is a sender-side only change that ensures a linear RTT - fairness under large windows while offering both scalability and - bounded TCP-friendliness. The protocol combines two schemes - called additive increase and binary search increase. When the - congestion window is large, additive increase with a large - increment ensures linear RTT fairness as well as good - scalability. Under small congestion windows, binary search - increase provides TCP friendliness. - Default: 0 - -tcp_bic_low_window - INTEGER - Sets the threshold window (in packets) where BIC TCP starts to - adjust the congestion window. Below this threshold BIC TCP behaves - the same as the default TCP Reno. - Default: 14 - -tcp_bic_fast_convergence - BOOLEAN - Forces BIC TCP to more quickly respond to changes in congestion - window. Allows two flows sharing the same connection to converge - more rapidly. - Default: 1 - -tcp_default_win_scale - INTEGER - Sets the minimum window scale TCP will negotiate for on all - conections. - Default: 7 - tcp_tso_win_divisor - INTEGER This allows control over what percentage of the congestion window can be consumed by a single TSO frame. @@ -368,6 +317,11 @@ tcp_frto - BOOLEAN where packet loss is typically due to random radio interference rather than intermediate router congestion. +tcp_congestion_control - STRING + Set the congestion control algorithm to be used for new + connections. The algorithm "reno" is always available, but + additional choices may be available based on kernel configuration. + somaxconn - INTEGER Limit of socket listen() backlog, known in userspace as SOMAXCONN. Defaults to 128. See also tcp_max_syn_backlog for additional tuning diff --git a/Documentation/networking/tcp.txt b/Documentation/networking/tcp.txt index 71749007091..0fa30042557 100644 --- a/Documentation/networking/tcp.txt +++ b/Documentation/networking/tcp.txt @@ -1,5 +1,72 @@ -How the new TCP output machine [nyi] works. +TCP protocol +============ + +Last updated: 21 June 2005 + +Contents +======== + +- Congestion control +- How the new TCP output machine [nyi] works + +Congestion control +================== + +The following variables are used in the tcp_sock for congestion control: +snd_cwnd The size of the congestion window +snd_ssthresh Slow start threshold. We are in slow start if + snd_cwnd is less than this. +snd_cwnd_cnt A counter used to slow down the rate of increase + once we exceed slow start threshold. +snd_cwnd_clamp This is the maximum size that snd_cwnd can grow to. +snd_cwnd_stamp Timestamp for when congestion window last validated. +snd_cwnd_used Used as a highwater mark for how much of the + congestion window is in use. It is used to adjust + snd_cwnd down when the link is limited by the + application rather than the network. + +As of 2.6.13, Linux supports pluggable congestion control algorithms. +A congestion control mechanism can be registered through functions in +tcp_cong.c. The functions used by the congestion control mechanism are +registered via passing a tcp_congestion_ops struct to +tcp_register_congestion_control. As a minimum name, ssthresh, +cong_avoid, min_cwnd must be valid. +Private data for a congestion control mechanism is stored in tp->ca_priv. +tcp_ca(tp) returns a pointer to this space. This is preallocated space - it +is important to check the size of your private data will fit this space, or +alternatively space could be allocated elsewhere and a pointer to it could +be stored here. + +There are three kinds of congestion control algorithms currently: The +simplest ones are derived from TCP reno (highspeed, scalable) and just +provide an alternative the congestion window calculation. More complex +ones like BIC try to look at other events to provide better +heuristics. There are also round trip time based algorithms like +Vegas and Westwood+. + +Good TCP congestion control is a complex problem because the algorithm +needs to maintain fairness and performance. Please review current +research and RFC's before developing new modules. + +The method that is used to determine which congestion control mechanism is +determined by the setting of the sysctl net.ipv4.tcp_congestion_control. +The default congestion control will be the last one registered (LIFO); +so if you built everything as modules. the default will be reno. If you +build with the default's from Kconfig, then BIC will be builtin (not a module) +and it will end up the default. + +If you really want a particular default value then you will need +to set it with the sysctl. If you use a sysctl, the module will be autoloaded +if needed and you will get the expected protocol. If you ask for an +unknown congestion method, then the sysctl attempt will fail. + +If you remove a tcp congestion control module, then you will get the next +available one. Since reno can not be built as a module, and can not be +deleted, it will always be available. + +How the new TCP output machine [nyi] works. +=========================================== Data is kept on a single queue. The skb->users flag tells us if the frame is one that has been queued already. To add a frame we throw it on the end. Ack diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 614e939c78a..72965bfe6cf 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -333,21 +333,14 @@ enum NET_TCP_FRTO=92, NET_TCP_LOW_LATENCY=93, NET_IPV4_IPFRAG_SECRET_INTERVAL=94, - NET_TCP_WESTWOOD=95, NET_IPV4_IGMP_MAX_MSF=96, NET_TCP_NO_METRICS_SAVE=97, - NET_TCP_VEGAS=98, - NET_TCP_VEGAS_ALPHA=99, - NET_TCP_VEGAS_BETA=100, - NET_TCP_VEGAS_GAMMA=101, - NET_TCP_BIC=102, - NET_TCP_BIC_FAST_CONVERGENCE=103, - NET_TCP_BIC_LOW_WINDOW=104, NET_TCP_DEFAULT_WIN_SCALE=105, NET_TCP_MODERATE_RCVBUF=106, NET_TCP_TSO_WIN_DIVISOR=107, NET_TCP_BIC_BETA=108, NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR=109, + NET_TCP_CONG_CONTROL=110, }; enum { diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 97a7c9e03df..3ea75dd6640 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -203,13 +203,6 @@ struct tcp_sack_block { __u32 end_seq; }; -enum tcp_congestion_algo { - TCP_RENO=0, - TCP_VEGAS, - TCP_WESTWOOD, - TCP_BIC, -}; - struct tcp_options_received { /* PAWS/RTTM data */ long ts_recent_stamp;/* Time we stored ts_recent (for aging) */ @@ -305,7 +298,7 @@ struct tcp_sock { __u8 reordering; /* Packet reordering metric. */ __u8 frto_counter; /* Number of new acks after RTO */ - __u8 adv_cong; /* Using Vegas, Westwood, or BIC */ + __u8 unused; __u8 defer_accept; /* User waits for some data after accept() */ /* RTT measurement */ @@ -401,37 +394,10 @@ struct tcp_sock { __u32 time; } rcvq_space; -/* TCP Westwood structure */ - struct { - __u32 bw_ns_est; /* first bandwidth estimation..not too smoothed 8) */ - __u32 bw_est; /* bandwidth estimate */ - __u32 rtt_win_sx; /* here starts a new evaluation... */ - __u32 bk; - __u32 snd_una; /* used for evaluating the number of acked bytes */ - __u32 cumul_ack; - __u32 accounted; - __u32 rtt; - __u32 rtt_min; /* minimum observed RTT */ - } westwood; - -/* Vegas variables */ - struct { - __u32 beg_snd_nxt; /* right edge during last RTT */ - __u32 beg_snd_una; /* left edge during last RTT */ - __u32 beg_snd_cwnd; /* saves the size of the cwnd */ - __u8 doing_vegas_now;/* if true, do vegas for this RTT */ - __u16 cntRTT; /* # of RTTs measured within last RTT */ - __u32 minRTT; /* min of RTTs measured within last RTT (in usec) */ - __u32 baseRTT; /* the min of all Vegas RTT measurements seen (in usec) */ - } vegas; - - /* BI TCP Parameters */ - struct { - __u32 cnt; /* increase cwnd by 1 after this number of ACKs */ - __u32 last_max_cwnd; /* last maximium snd_cwnd */ - __u32 last_cwnd; /* the last snd_cwnd */ - __u32 last_stamp; /* time when updated last_cwnd */ - } bictcp; + /* Pluggable TCP congestion control hook */ + struct tcp_congestion_ops *ca_ops; + u32 ca_priv[16]; +#define TCP_CA_PRIV_SIZE (16*sizeof(u32)) }; static inline struct tcp_sock *tcp_sk(const struct sock *sk) @@ -439,6 +405,11 @@ static inline struct tcp_sock *tcp_sk(const struct sock *sk) return (struct tcp_sock *)sk; } +static inline void *tcp_ca(const struct tcp_sock *tp) +{ + return (void *) tp->ca_priv; +} + #endif #endif /* _LINUX_TCP_H */ diff --git a/include/linux/tcp_diag.h b/include/linux/tcp_diag.h index ceee962e1d1..7a599674394 100644 --- a/include/linux/tcp_diag.h +++ b/include/linux/tcp_diag.h @@ -99,9 +99,10 @@ enum TCPDIAG_MEMINFO, TCPDIAG_INFO, TCPDIAG_VEGASINFO, + TCPDIAG_CONG, }; -#define TCPDIAG_MAX TCPDIAG_VEGASINFO +#define TCPDIAG_MAX TCPDIAG_CONG /* TCPDIAG_MEM */ @@ -123,5 +124,4 @@ struct tcpvegas_info { __u32 tcpv_minrtt; }; - #endif /* _TCP_DIAG_H_ */ diff --git a/include/net/tcp.h b/include/net/tcp.h index f730935b824..e427cf35915 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -505,25 +505,6 @@ static __inline__ int tcp_sk_listen_hashfn(struct sock *sk) #else # define TCP_TW_RECYCLE_TICK (12+2-TCP_TW_RECYCLE_SLOTS_LOG) #endif - -#define BICTCP_BETA_SCALE 1024 /* Scale factor beta calculation - * max_cwnd = snd_cwnd * beta - */ -#define BICTCP_MAX_INCREMENT 32 /* - * Limit on the amount of - * increment allowed during - * binary search. - */ -#define BICTCP_FUNC_OF_MIN_INCR 11 /* - * log(B/Smin)/log(B/(B-1))+1, - * Smin:min increment - * B:log factor - */ -#define BICTCP_B 4 /* - * In binary search, - * go to point (max+min)/N - */ - /* * TCP option */ @@ -596,16 +577,7 @@ extern int sysctl_tcp_adv_win_scale; extern int sysctl_tcp_tw_reuse; extern int sysctl_tcp_frto; extern int sysctl_tcp_low_latency; -extern int sysctl_tcp_westwood; -extern int sysctl_tcp_vegas_cong_avoid; -extern int sysctl_tcp_vegas_alpha; -extern int sysctl_tcp_vegas_beta; -extern int sysctl_tcp_vegas_gamma; extern int sysctl_tcp_nometrics_save; -extern int sysctl_tcp_bic; -extern int sysctl_tcp_bic_fast_convergence; -extern int sysctl_tcp_bic_low_window; -extern int sysctl_tcp_bic_beta; extern int sysctl_tcp_moderate_rcvbuf; extern int sysctl_tcp_tso_win_divisor; @@ -1136,6 +1108,80 @@ static inline void tcp_packets_out_dec(struct tcp_sock *tp, tp->packets_out -= tcp_skb_pcount(skb); } +/* Events passed to congestion control interface */ +enum tcp_ca_event { + CA_EVENT_TX_START, /* first transmit when no packets in flight */ + CA_EVENT_CWND_RESTART, /* congestion window restart */ + CA_EVENT_COMPLETE_CWR, /* end of congestion recovery */ + CA_EVENT_FRTO, /* fast recovery timeout */ + CA_EVENT_LOSS, /* loss timeout */ + CA_EVENT_FAST_ACK, /* in sequence ack */ + CA_EVENT_SLOW_ACK, /* other ack */ +}; + +/* + * Interface for adding new TCP congestion control handlers + */ +#define TCP_CA_NAME_MAX 16 +struct tcp_congestion_ops { + struct list_head list; + + /* initialize private data (optional) */ + void (*init)(struct tcp_sock *tp); + /* cleanup private data (optional) */ + void (*release)(struct tcp_sock *tp); + + /* return slow start threshold (required) */ + u32 (*ssthresh)(struct tcp_sock *tp); + /* lower bound for congestion window (optional) */ + u32 (*min_cwnd)(struct tcp_sock *tp); + /* do new cwnd calculation (required) */ + void (*cong_avoid)(struct tcp_sock *tp, u32 ack, + u32 rtt, u32 in_flight, int good_ack); + /* round trip time sample per acked packet (optional) */ + void (*rtt_sample)(struct tcp_sock *tp, u32 usrtt); + /* call before changing ca_state (optional) */ + void (*set_state)(struct tcp_sock *tp, u8 new_state); + /* call when cwnd event occurs (optional) */ + void (*cwnd_event)(struct tcp_sock *tp, enum tcp_ca_event ev); + /* new value of cwnd after loss (optional) */ + u32 (*undo_cwnd)(struct tcp_sock *tp); + /* hook for packet ack accounting (optional) */ + void (*pkts_acked)(struct tcp_sock *tp, u32 num_acked); + /* get info for tcp_diag (optional) */ + void (*get_info)(struct tcp_sock *tp, u32 ext, struct sk_buff *skb); + + char name[TCP_CA_NAME_MAX]; + struct module *owner; +}; + +extern int tcp_register_congestion_control(struct tcp_congestion_ops *type); +extern void tcp_unregister_congestion_control(struct tcp_congestion_ops *type); + +extern void tcp_init_congestion_control(struct tcp_sock *tp); +extern void tcp_cleanup_congestion_control(struct tcp_sock *tp); +extern int tcp_set_default_congestion_control(const char *name); +extern void tcp_get_default_congestion_control(char *name); + +extern struct tcp_congestion_ops tcp_reno; +extern u32 tcp_reno_ssthresh(struct tcp_sock *tp); +extern void tcp_reno_cong_avoid(struct tcp_sock *tp, u32 ack, + u32 rtt, u32 in_flight, int flag); +extern u32 tcp_reno_min_cwnd(struct tcp_sock *tp); + +static inline void tcp_set_ca_state(struct tcp_sock *tp, u8 ca_state) +{ + if (tp->ca_ops->set_state) + tp->ca_ops->set_state(tp, ca_state); + tp->ca_state = ca_state; +} + +static inline void tcp_ca_event(struct tcp_sock *tp, enum tcp_ca_event event) +{ + if (tp->ca_ops->cwnd_event) + tp->ca_ops->cwnd_event(tp, event); +} + /* This determines how many packets are "in the network" to the best * of our knowledge. In many cases it is conservative, but where * detailed information is available from the receiver (via SACK @@ -1155,91 +1201,6 @@ static __inline__ unsigned int tcp_packets_in_flight(const struct tcp_sock *tp) return (tp->packets_out - tp->left_out + tp->retrans_out); } -/* - * Which congestion algorithim is in use on the connection. - */ -#define tcp_is_vegas(__tp) ((__tp)->adv_cong == TCP_VEGAS) -#define tcp_is_westwood(__tp) ((__tp)->adv_cong == TCP_WESTWOOD) -#define tcp_is_bic(__tp) ((__tp)->adv_cong == TCP_BIC) - -/* Recalculate snd_ssthresh, we want to set it to: - * - * Reno: - * one half the current congestion window, but no - * less than two segments - * - * BIC: - * behave like Reno until low_window is reached, - * then increase congestion window slowly - */ -static inline __u32 tcp_recalc_ssthresh(struct tcp_sock *tp) -{ - if (tcp_is_bic(tp)) { - if (sysctl_tcp_bic_fast_convergence && - tp->snd_cwnd < tp->bictcp.last_max_cwnd) - tp->bictcp.last_max_cwnd = (tp->snd_cwnd * - (BICTCP_BETA_SCALE - + sysctl_tcp_bic_beta)) - / (2 * BICTCP_BETA_SCALE); - else - tp->bictcp.last_max_cwnd = tp->snd_cwnd; - - if (tp->snd_cwnd > sysctl_tcp_bic_low_window) - return max((tp->snd_cwnd * sysctl_tcp_bic_beta) - / BICTCP_BETA_SCALE, 2U); - } - - return max(tp->snd_cwnd >> 1U, 2U); -} - -/* Stop taking Vegas samples for now. */ -#define tcp_vegas_disable(__tp) ((__tp)->vegas.doing_vegas_now = 0) - -static inline void tcp_vegas_enable(struct tcp_sock *tp) -{ - /* There are several situations when we must "re-start" Vegas: - * - * o when a connection is established - * o after an RTO - * o after fast recovery - * o when we send a packet and there is no outstanding - * unacknowledged data (restarting an idle connection) - * - * In these circumstances we cannot do a Vegas calculation at the - * end of the first RTT, because any calculation we do is using - * stale info -- both the saved cwnd and congestion feedback are - * stale. - * - * Instead we must wait until the completion of an RTT during - * which we actually receive ACKs. - */ - - /* Begin taking Vegas samples next time we send something. */ - tp->vegas.doing_vegas_now = 1; - - /* Set the beginning of the next send window. */ - tp->vegas.beg_snd_nxt = tp->snd_nxt; - - tp->vegas.cntRTT = 0; - tp->vegas.minRTT = 0x7fffffff; -} - -/* Should we be taking Vegas samples right now? */ -#define tcp_vegas_enabled(__tp) ((__tp)->vegas.doing_vegas_now) - -extern void tcp_ca_init(struct tcp_sock *tp); - -static inline void tcp_set_ca_state(struct tcp_sock *tp, u8 ca_state) -{ - if (tcp_is_vegas(tp)) { - if (ca_state == TCP_CA_Open) - tcp_vegas_enable(tp); - else - tcp_vegas_disable(tp); - } - tp->ca_state = ca_state; -} - /* If cwnd > ssthresh, we may raise ssthresh to be half-way to cwnd. * The exception is rate halving phase, when cwnd is decreasing towards * ssthresh. @@ -1288,7 +1249,7 @@ static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp) static inline void __tcp_enter_cwr(struct tcp_sock *tp) { tp->undo_marker = 0; - tp->snd_ssthresh = tcp_recalc_ssthresh(tp); + tp->snd_ssthresh = tp->ca_ops->ssthresh(tp); tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp) + 1U); tp->snd_cwnd_cnt = 0; @@ -1876,52 +1837,4 @@ struct tcp_iter_state { extern int tcp_proc_register(struct tcp_seq_afinfo *afinfo); extern void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo); -/* TCP Westwood functions and constants */ - -#define TCP_WESTWOOD_INIT_RTT (20*HZ) /* maybe too conservative?! */ -#define TCP_WESTWOOD_RTT_MIN (HZ/20) /* 50ms */ - -static inline void tcp_westwood_update_rtt(struct tcp_sock *tp, __u32 rtt_seq) -{ - if (tcp_is_westwood(tp)) - tp->westwood.rtt = rtt_seq; -} - -static inline __u32 __tcp_westwood_bw_rttmin(const struct tcp_sock *tp) -{ - return max((tp->westwood.bw_est) * (tp->westwood.rtt_min) / - (__u32) (tp->mss_cache_std), - 2U); -} - -static inline __u32 tcp_westwood_bw_rttmin(const struct tcp_sock *tp) -{ - return tcp_is_westwood(tp) ? __tcp_westwood_bw_rttmin(tp) : 0; -} - -static inline int tcp_westwood_ssthresh(struct tcp_sock *tp) -{ - __u32 ssthresh = 0; - - if (tcp_is_westwood(tp)) { - ssthresh = __tcp_westwood_bw_rttmin(tp); - if (ssthresh) - tp->snd_ssthresh = ssthresh; - } - - return (ssthresh != 0); -} - -static inline int tcp_westwood_cwnd(struct tcp_sock *tp) -{ - __u32 cwnd = 0; - - if (tcp_is_westwood(tp)) { - cwnd = __tcp_westwood_bw_rttmin(tp); - if (cwnd) - tp->snd_cwnd = cwnd; - } - - return (cwnd != 0); -} #endif /* _TCP_H */ diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 567b03b1c34..690e88ba248 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -433,5 +433,95 @@ config IP_TCPDIAG config IP_TCPDIAG_IPV6 def_bool (IP_TCPDIAG=y && IPV6=y) || (IP_TCPDIAG=m && IPV6) +# TCP Reno is builtin (required as fallback) +menu "TCP congestion control" + depends on INET + +config TCP_CONG_BIC + tristate "Binary Increase Congestion (BIC) control" + depends on INET + default y + ---help--- + BIC-TCP is a sender-side only change that ensures a linear RTT + fairness under large windows while offering both scalability and + bounded TCP-friendliness. The protocol combines two schemes + called additive increase and binary search increase. When the + congestion window is large, additive increase with a large + increment ensures linear RTT fairness as well as good + scalability. Under small congestion windows, binary search + increase provides TCP friendliness. + See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/ + +config TCP_CONG_WESTWOOD + tristate "TCP Westwood+" + depends on INET + default m + ---help--- + TCP Westwood+ is a sender-side only modification of the TCP Reno + protocol stack that optimizes the performance of TCP congestion + control. It is based on end-to-end bandwidth estimation to set + congestion window and slow start threshold after a congestion + episode. Using this estimation, TCP Westwood+ adaptively sets a + slow start threshold and a congestion window which takes into + account the bandwidth used at the time congestion is experienced. + TCP Westwood+ significantly increases fairness wrt TCP Reno in + wired networks and throughput over wireless links. + +config TCP_CONG_HTCP + tristate "H-TCP" + depends on INET + default m + ---help--- + H-TCP is a send-side only modifications of the TCP Reno + protocol stack that optimizes the performance of TCP + congestion control for high speed network links. It uses a + modeswitch to change the alpha and beta parameters of TCP Reno + based on network conditions and in a way so as to be fair with + other Reno and H-TCP flows. + +config TCP_CONG_HSTCP + tristate "High Speed TCP" + depends on INET && EXPERIMENTAL + default n + ---help--- + Sally Floyd's High Speed TCP (RFC 3649) congestion control. + A modification to TCP's congestion control mechanism for use + with large congestion windows. A table indicates how much to + increase the congestion window by when an ACK is received. + For more detail see http://www.icir.org/floyd/hstcp.html + +config TCP_CONG_HYBLA + tristate "TCP-Hybla congestion control algorithm" + depends on INET && EXPERIMENTAL + default n + ---help--- + TCP-Hybla is a sender-side only change that eliminates penalization of + long-RTT, large-bandwidth connections, like when satellite legs are + involved, expecially when sharing a common bottleneck with normal + terrestrial connections. + +config TCP_CONG_VEGAS + tristate "TCP Vegas" + depends on INET && EXPERIMENTAL + default n + ---help--- + TCP Vegas is a sender-side only change to TCP that anticipates + the onset of congestion by estimating the bandwidth. TCP Vegas + adjusts the sending rate by modifying the congestion + window. TCP Vegas should provide less packet loss, but it is + not as aggressive as TCP Reno. + +config TCP_CONG_SCALABLE + tristate "Scalable TCP" + depends on INET && EXPERIMENTAL + default n + ---help--- + Scalable TCP is a sender-side only change to TCP which uses a + MIMD congestion control algorithm which has some nice scaling + properties, though is known to have fairness issues. + See http://www-lce.eng.cam.ac.uk/~ctk21/scalable/ + +endmenu + source "net/ipv4/ipvs/Kconfig" diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 65d57d8e1ad..5718cdb3a61 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -5,7 +5,8 @@ obj-y := utils.o route.o inetpeer.o protocol.o \ ip_input.o ip_fragment.o ip_forward.o ip_options.o \ ip_output.o ip_sockglue.o \ - tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o tcp_minisocks.o \ + tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \ + tcp_minisocks.o tcp_cong.o \ datagram.o raw.o udp.o arp.o icmp.o devinet.o af_inet.o igmp.o \ sysctl_net_ipv4.o fib_frontend.o fib_semantics.o @@ -30,6 +31,13 @@ obj-$(CONFIG_NETFILTER) += netfilter/ obj-$(CONFIG_IP_VS) += ipvs/ obj-$(CONFIG_IP_TCPDIAG) += tcp_diag.o obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o +obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o +obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o +obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o +obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o +obj-$(CONFIG_TCP_CONG_HTCP) += tcp_htcp.o +obj-$(CONFIG_TCP_CONG_VEGAS) += tcp_vegas.o +obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ xfrm4_output.o diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 23068bddbf0..e3289453241 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -118,6 +118,45 @@ static int ipv4_sysctl_forward_strategy(ctl_table *table, return 1; } +static int proc_tcp_congestion_control(ctl_table *ctl, int write, struct file * filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + char val[TCP_CA_NAME_MAX]; + ctl_table tbl = { + .data = val, + .maxlen = TCP_CA_NAME_MAX, + }; + int ret; + + tcp_get_default_congestion_control(val); + + ret = proc_dostring(&tbl, write, filp, buffer, lenp, ppos); + if (write && ret == 0) + ret = tcp_set_default_congestion_control(val); + return ret; +} + +int sysctl_tcp_congestion_control(ctl_table *table, int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen, + void **context) +{ + char val[TCP_CA_NAME_MAX]; + ctl_table tbl = { + .data = val, + .maxlen = TCP_CA_NAME_MAX, + }; + int ret; + + tcp_get_default_congestion_control(val); + ret = sysctl_string(&tbl, name, nlen, oldval, oldlenp, newval, newlen, + context); + if (ret == 0 && newval && newlen) + ret = tcp_set_default_congestion_control(val); + return ret; +} + + ctl_table ipv4_table[] = { { .ctl_name = NET_IPV4_TCP_TIMESTAMPS, @@ -612,70 +651,6 @@ ctl_table ipv4_table[] = { .proc_handler = &proc_dointvec, }, { - .ctl_name = NET_TCP_WESTWOOD, - .procname = "tcp_westwood", - .data = &sysctl_tcp_westwood, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = NET_TCP_VEGAS, - .procname = "tcp_vegas_cong_avoid", - .data = &sysctl_tcp_vegas_cong_avoid, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = NET_TCP_VEGAS_ALPHA, - .procname = "tcp_vegas_alpha", - .data = &sysctl_tcp_vegas_alpha, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = NET_TCP_VEGAS_BETA, - .procname = "tcp_vegas_beta", - .data = &sysctl_tcp_vegas_beta, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = NET_TCP_VEGAS_GAMMA, - .procname = "tcp_vegas_gamma", - .data = &sysctl_tcp_vegas_gamma, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = NET_TCP_BIC, - .procname = "tcp_bic", - .data = &sysctl_tcp_bic, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = NET_TCP_BIC_FAST_CONVERGENCE, - .procname = "tcp_bic_fast_convergence", - .data = &sysctl_tcp_bic_fast_convergence, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = NET_TCP_BIC_LOW_WINDOW, - .procname = "tcp_bic_low_window", - .data = &sysctl_tcp_bic_low_window, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { .ctl_name = NET_TCP_MODERATE_RCVBUF, .procname = "tcp_moderate_rcvbuf", .data = &sysctl_tcp_moderate_rcvbuf, @@ -692,13 +667,14 @@ ctl_table ipv4_table[] = { .proc_handler = &proc_dointvec, }, { - .ctl_name = NET_TCP_BIC_BETA, - .procname = "tcp_bic_beta", - .data = &sysctl_tcp_bic_beta, - .maxlen = sizeof(int), + .ctl_name = NET_TCP_CONG_CONTROL, + .procname = "tcp_congestion_control", .mode = 0644, - .proc_handler = &proc_dointvec, + .maxlen = TCP_CA_NAME_MAX, + .proc_handler = &proc_tcp_congestion_control, + .strategy = &sysctl_tcp_congestion_control, }, + { .ctl_name = 0 } }; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 674bbd8cfd3..f3dbc8dc126 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2333,6 +2333,8 @@ void __init tcp_init(void) printk(KERN_INFO "TCP: Hash tables configured " "(established %d bind %d)\n", tcp_ehash_size << 1, tcp_bhash_size); + + tcp_register_congestion_control(&tcp_reno); } EXPORT_SYMBOL(tcp_accept); diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c new file mode 100644 index 00000000000..ec38d45d664 --- /dev/null +++ b/net/ipv4/tcp_bic.c @@ -0,0 +1,331 @@ +/* + * Binary Increase Congestion control for TCP + * + * This is from the implementation of BICTCP in + * Lison-Xu, Kahaled Harfoush, and Injong Rhee. + * "Binary Increase Congestion Control for Fast, Long Distance + * Networks" in InfoComm 2004 + * Available from: + * http://www.csc.ncsu.edu/faculty/rhee/export/bitcp.pdf + * + * Unless BIC is enabled and congestion window is large + * this behaves the same as the original Reno. + */ + +#include <linux/config.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <net/tcp.h> + + +#define BICTCP_BETA_SCALE 1024 /* Scale factor beta calculation + * max_cwnd = snd_cwnd * beta + */ +#define BICTCP_B 4 /* + * In binary search, + * go to point (max+min)/N + */ + +static int fast_convergence = 1; +static int max_increment = 32; +static int low_window = 14; +static int beta = 819; /* = 819/1024 (BICTCP_BETA_SCALE) */ +static int low_utilization_threshold = 153; +static int low_utilization_period = 2; +static int initial_ssthresh = 100; +static int smooth_part = 20; + +module_param(fast_convergence, int, 0644); +MODULE_PARM_DESC(fast_convergence, "turn on/off fast convergence"); +module_param(max_increment, int, 0644); +MODULE_PARM_DESC(max_increment, "Limit on increment allowed during binary search"); +module_param(low_window, int, 0644); +MODULE_PARM_DESC(low_window, "lower bound on congestion window (for TCP friendliness)"); +module_param(beta, int, 0644); +MODULE_PARM_DESC(beta, "beta for multiplicative increase"); +module_param(low_utilization_threshold, int, 0644); +MODULE_PARM_DESC(low_utilization_threshold, "percent (scaled by 1024) for low utilization mode"); +module_param(low_utilization_period, int, 0644); +MODULE_PARM_DESC(low_utilization_period, "if average delay exceeds then goto to low utilization mode (seconds)"); +module_param(initial_ssthresh, int, 0644); +MODULE_PARM_DESC(initial_ssthresh, "initial value of slow start threshold"); +module_param(smooth_part, int, 0644); +MODULE_PARM_DESC(smooth_part, "log(B/(B*Smin))/log(B/(B-1))+B, # of RTT from Wmax-B to Wmax"); + + +/* BIC TCP Parameters */ +struct bictcp { + u32 cnt; /* increase cwnd by 1 after ACKs */ + u32 last_max_cwnd; /* last maximum snd_cwnd */ + u32 loss_cwnd; /* congestion window at last loss */ + u32 last_cwnd; /* the last snd_cwnd */ + u32 last_time; /* time when updated last_cwnd */ + u32 delay_min; /* min delay */ + u32 delay_max; /* max delay */ + u32 last_delay; + u8 low_utilization;/* 0: high; 1: low */ + u32 low_utilization_start; /* starting time of low utilization detection*/ + u32 epoch_start; /* beginning of an epoch */ +#define ACK_RATIO_SHIFT 4 + u32 delayed_ack; /* estimate the ratio of Packets/ACKs << 4 */ +}; + +static inline void bictcp_reset(struct bictcp *ca) +{ + ca->cnt = 0; + ca->last_max_cwnd = 0; + ca->loss_cwnd = 0; + ca->last_cwnd = 0; + ca->last_time = 0; + ca->delay_min = 0; + ca->delay_max = 0; + ca->last_delay = 0; + ca->low_utilization = 0; + ca->low_utilization_start = 0; + ca->epoch_start = 0; + ca->delayed_ack = 2 << ACK_RATIO_SHIFT; +} + +static void bictcp_init(struct tcp_sock *tp) +{ + bictcp_reset(tcp_ca(tp)); + if (initial_ssthresh) + tp->snd_ssthresh = initial_ssthresh; +} + +/* + * Compute congestion window to use. + */ +static inline void bictcp_update(struct bictcp *ca, u32 cwnd) +{ + if (ca->last_cwnd == cwnd && + (s32)(tcp_time_stamp - ca->last_time) <= HZ / 32) + return; + + ca->last_cwnd = cwnd; + ca->last_time = tcp_time_stamp; + + if (ca->epoch_start == 0) /* record the beginning of an epoch */ + ca->epoch_start = tcp_time_stamp; + + /* start off normal */ + if (cwnd <= low_window) { + ca->cnt = cwnd; + return; + } + + /* binary increase */ + if (cwnd < ca->last_max_cwnd) { + __u32 dist = (ca->last_max_cwnd - cwnd) + / BICTCP_B; + + if (dist > max_increment) + /* linear increase */ + ca->cnt = cwnd / max_increment; + else if (dist <= 1U) + /* binary search increase */ + ca->cnt = (cwnd * smooth_part) / BICTCP_B; + else + /* binary search increase */ + ca->cnt = cwnd / dist; + } else { + /* slow start AMD linear increase */ + if (cwnd < ca->last_max_cwnd + BICTCP_B) + /* slow start */ + ca->cnt = (cwnd * smooth_part) / BICTCP_B; + else if (cwnd < ca->last_max_cwnd + max_increment*(BICTCP_B-1)) + /* slow start */ + ca->cnt = (cwnd * (BICTCP_B-1)) + / cwnd-ca->last_max_cwnd; + else + /* linear increase */ + ca->cnt = cwnd / max_increment; + } + + /* if in slow start or link utilization is very low */ + if ( ca->loss_cwnd == 0 || + (cwnd > ca->loss_cwnd && ca->low_utilization)) { + if (ca->cnt > 20) /* increase cwnd 5% per RTT */ + ca->cnt = 20; + } + + ca->cnt = (ca->cnt << ACK_RATIO_SHIFT) / ca->delayed_ack; + if (ca->cnt == 0) /* cannot be zero */ + ca->cnt = 1; +} + + +/* Detect low utilization in congestion avoidance */ +static inline void bictcp_low_utilization(struct tcp_sock *tp, int flag) +{ + struct bictcp *ca = tcp_ca(tp); + u32 dist, delay; + + /* No time stamp */ + if (!(tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) || + /* Discard delay samples right after fast recovery */ + tcp_time_stamp < ca->epoch_start + HZ || + /* this delay samples may not be accurate */ + flag == 0) { + ca->last_delay = 0; + goto notlow; + } + + delay = ca->last_delay<<3; /* use the same scale as tp->srtt*/ + ca->last_delay = tcp_time_stamp - tp->rx_opt.rcv_tsecr; + if (delay == 0) /* no previous delay sample */ + goto notlow; + + /* first time call or link delay decreases */ + if (ca->delay_min == 0 || ca->delay_min > delay) { + ca->delay_min = ca->delay_max = delay; + goto notlow; + } + + if (ca->delay_max < delay) + ca->delay_max = delay; + + /* utilization is low, if avg delay < dist*threshold + for checking_period time */ + dist = ca->delay_max - ca->delay_min; + if (dist <= ca->delay_min>>6 || + tp->srtt - ca->delay_min >= (dist*low_utilization_threshold)>>10) + goto notlow; + + if (ca->low_utilization_start == 0) { + ca->low_utilization = 0; + ca->low_utilization_start = tcp_time_stamp; + } else if ((s32)(tcp_time_stamp - ca->low_utilization_start) + > low_utilization_period*HZ) { + ca->low_utilization = 1; + } + + return; + + notlow: + ca->low_utilization = 0; + ca->low_utilization_start = 0; + +} + +static void bictcp_cong_avoid(struct tcp_sock *tp, u32 ack, + u32 seq_rtt, u32 in_flight, int data_acked) +{ + struct bictcp *ca = tcp_ca(tp); + + bictcp_low_utilization(tp, data_acked); + + if (in_flight < tp->snd_cwnd) + return; + + if (tp->snd_cwnd <= tp->snd_ssthresh) { + /* In "safe" area, increase. */ + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + } else { + bictcp_update(ca, tp->snd_cwnd); + + /* In dangerous area, increase slowly. + * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd + */ + if (tp->snd_cwnd_cnt >= ca->cnt) { + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + tp->snd_cwnd_cnt = 0; + } else + tp->snd_cwnd_cnt++; + } + +} + +/* + * behave like Reno until low_window is reached, + * then increase congestion window slowly + */ +static u32 bictcp_recalc_ssthresh(struct tcp_sock *tp) +{ + struct bictcp *ca = tcp_ca(tp); + + ca->epoch_start = 0; /* end of epoch */ + + /* in case of wrong delay_max*/ + if (ca->delay_min > 0 && ca->delay_max > ca->delay_min) + ca->delay_max = ca->delay_min + + ((ca->delay_max - ca->delay_min)* 90) / 100; + + /* Wmax and fast convergence */ + if (tp->snd_cwnd < ca->last_max_cwnd && fast_convergence) + ca->last_max_cwnd = (tp->snd_cwnd * (BICTCP_BETA_SCALE + beta)) + / (2 * BICTCP_BETA_SCALE); + else + ca->last_max_cwnd = tp->snd_cwnd; + + ca->loss_cwnd = tp->snd_cwnd; + + + if (tp->snd_cwnd <= low_window) + return max(tp->snd_cwnd >> 1U, 2U); + else + return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U); +} + +static u32 bictcp_undo_cwnd(struct tcp_sock *tp) +{ + struct bictcp *ca = tcp_ca(tp); + + return max(tp->snd_cwnd, ca->last_max_cwnd); +} + +static u32 bictcp_min_cwnd(struct tcp_sock *tp) +{ + return tp->snd_ssthresh; +} + +static void bictcp_state(struct tcp_sock *tp, u8 new_state) +{ + if (new_state == TCP_CA_Loss) + bictcp_reset(tcp_ca(tp)); +} + +/* Track delayed acknowledgement ratio using sliding window + * ratio = (15*ratio + sample) / 16 + */ +static void bictcp_acked(struct tcp_sock *tp, u32 cnt) +{ + if (cnt > 0 && tp->ca_state == TCP_CA_Open) { + struct bictcp *ca = tcp_ca(tp); + cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT; + ca->delayed_ack += cnt; + } +} + + +static struct tcp_congestion_ops bictcp = { + .init = bictcp_init, + .ssthresh = bictcp_recalc_ssthresh, + .cong_avoid = bictcp_cong_avoid, + .set_state = bictcp_state, + .undo_cwnd = bictcp_undo_cwnd, + .min_cwnd = bictcp_min_cwnd, + .pkts_acked = bictcp_acked, + .owner = THIS_MODULE, + .name = "bic", +}; + +static int __init bictcp_register(void) +{ + BUG_ON(sizeof(struct bictcp) > TCP_CA_PRIV_SIZE); + return tcp_register_congestion_control(&bictcp); +} + +static void __exit bictcp_unregister(void) +{ + tcp_unregister_congestion_control(&bictcp); +} + +module_init(bictcp_register); +module_exit(bictcp_unregister); + +MODULE_AUTHOR("Stephen Hemminger"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("BIC TCP"); diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c new file mode 100644 index 00000000000..665394a63ae --- /dev/null +++ b/net/ipv4/tcp_cong.c @@ -0,0 +1,195 @@ +/* + * Plugable TCP congestion control support and newReno + * congestion control. + * Based on ideas from I/O scheduler suport and Web100. + * + * Copyright (C) 2005 Stephen Hemminger <shemminger@osdl.org> + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <linux/mm.h> +#include <linux/types.h> +#include <linux/list.h> +#include <net/tcp.h> + +static DEFINE_SPINLOCK(tcp_cong_list_lock); +static LIST_HEAD(tcp_cong_list); + +/* Simple linear search, don't expect many entries! */ +static struct tcp_congestion_ops *tcp_ca_find(const char *name) +{ + struct tcp_congestion_ops *e; + + list_for_each_entry(e, &tcp_cong_list, list) { + if (strcmp(e->name, name) == 0) + return e; + } + + return NULL; +} + +/* + * Attach new congestion control algorthim to the list + * of available options. + */ +int tcp_register_congestion_control(struct tcp_congestion_ops *ca) +{ + int ret = 0; + + /* all algorithms must implement ssthresh and cong_avoid ops */ + if (!ca->ssthresh || !ca->cong_avoid || !ca->min_cwnd) { + printk(KERN_ERR "TCP %s does not implement required ops\n", + ca->name); + return -EINVAL; + } + + spin_lock(&tcp_cong_list_lock); + if (tcp_ca_find(ca->name)) { + printk(KERN_NOTICE "TCP %s already registered\n", ca->name); + ret = -EEXIST; + } else { + list_add_rcu(&ca->list, &tcp_cong_list); + printk(KERN_INFO "TCP %s registered\n", ca->name); + } + spin_unlock(&tcp_cong_list_lock); + + return ret; +} +EXPORT_SYMBOL_GPL(tcp_register_congestion_control); + +/* + * Remove congestion control algorithm, called from + * the module's remove function. Module ref counts are used + * to ensure that this can't be done till all sockets using + * that method are closed. + */ +void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca) +{ + spin_lock(&tcp_cong_list_lock); + list_del_rcu(&ca->list); + spin_unlock(&tcp_cong_list_lock); +} +EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control); + +/* Assign choice of congestion control. */ +void tcp_init_congestion_control(struct tcp_sock *tp) +{ + struct tcp_congestion_ops *ca; + + rcu_read_lock(); + list_for_each_entry_rcu(ca, &tcp_cong_list, list) { + if (try_module_get(ca->owner)) { + tp->ca_ops = ca; + break; + } + + } + rcu_read_unlock(); + + if (tp->ca_ops->init) + tp->ca_ops->init(tp); +} + +/* Manage refcounts on socket close. */ +void tcp_cleanup_congestion_control(struct tcp_sock *tp) +{ + if (tp->ca_ops->release) + tp->ca_ops->release(tp); + module_put(tp->ca_ops->owner); +} + +/* Used by sysctl to change default congestion control */ +int tcp_set_default_congestion_control(const char *name) +{ + struct tcp_congestion_ops *ca; + int ret = -ENOENT; + + spin_lock(&tcp_cong_list_lock); + ca = tcp_ca_find(name); +#ifdef CONFIG_KMOD + if (!ca) { + spin_unlock(&tcp_cong_list_lock); + + request_module("tcp_%s", name); + spin_lock(&tcp_cong_list_lock); + ca = tcp_ca_find(name); + } +#endif + + if (ca) { + list_move(&ca->list, &tcp_cong_list); + ret = 0; + } + spin_unlock(&tcp_cong_list_lock); + + return ret; +} + +/* Get current default congestion control */ +void tcp_get_default_congestion_control(char *name) +{ + struct tcp_congestion_ops *ca; + /* We will always have reno... */ + BUG_ON(list_empty(&tcp_cong_list)); + + rcu_read_lock(); + ca = list_entry(tcp_cong_list.next, struct tcp_congestion_ops, list); + strncpy(name, ca->name, TCP_CA_NAME_MAX); + rcu_read_unlock(); +} + +/* + * TCP Reno congestion control + * This is special case used for fallback as well. + */ +/* This is Jacobson's slow start and congestion avoidance. + * SIGCOMM '88, p. 328. + */ +void tcp_reno_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, u32 in_flight, + int flag) +{ + if (in_flight < tp->snd_cwnd) + return; + + if (tp->snd_cwnd <= tp->snd_ssthresh) { + /* In "safe" area, increase. */ + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + } else { + /* In dangerous area, increase slowly. + * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd + */ + if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + tp->snd_cwnd_cnt = 0; + } else + tp->snd_cwnd_cnt++; + } +} +EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid); + +/* Slow start threshold is half the congestion window (min 2) */ +u32 tcp_reno_ssthresh(struct tcp_sock *tp) +{ + return max(tp->snd_cwnd >> 1U, 2U); +} +EXPORT_SYMBOL_GPL(tcp_reno_ssthresh); + +/* Lower bound on congestion window. */ +u32 tcp_reno_min_cwnd(struct tcp_sock *tp) +{ + return tp->snd_ssthresh/2; +} +EXPORT_SYMBOL_GPL(tcp_reno_min_cwnd); + +struct tcp_congestion_ops tcp_reno = { + .name = "reno", + .owner = THIS_MODULE, + .ssthresh = tcp_reno_ssthresh, + .cong_avoid = tcp_reno_cong_avoid, + .min_cwnd = tcp_reno_min_cwnd, +}; + +EXPORT_SYMBOL_GPL(tcp_reno); diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index 634befc0792..f66945cb158 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -42,15 +42,8 @@ struct tcpdiag_entry static struct sock *tcpnl; - #define TCPDIAG_PUT(skb, attrtype, attrlen) \ -({ int rtalen = RTA_LENGTH(attrlen); \ - struct rtattr *rta; \ - if (skb_tailroom(skb) < RTA_ALIGN(rtalen)) goto nlmsg_failure; \ - rta = (void*)__skb_put(skb, RTA_ALIGN(rtalen)); \ - rta->rta_type = attrtype; \ - rta->rta_len = rtalen; \ - RTA_DATA(rta); }) + RTA_DATA(__RTA_PUT(skb, attrtype, attrlen)) static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, int ext, u32 pid, u32 seq, u16 nlmsg_flags) @@ -61,7 +54,6 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, struct nlmsghdr *nlh; struct tcp_info *info = NULL; struct tcpdiag_meminfo *minfo = NULL; - struct tcpvegas_info *vinfo = NULL; unsigned char *b = skb->tail; nlh = NLMSG_PUT(skb, pid, seq, TCPDIAG_GETSOCK, sizeof(*r)); @@ -73,9 +65,11 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, if (ext & (1<<(TCPDIAG_INFO-1))) info = TCPDIAG_PUT(skb, TCPDIAG_INFO, sizeof(*info)); - if ((tcp_is_westwood(tp) || tcp_is_vegas(tp)) - && (ext & (1<<(TCPDIAG_VEGASINFO-1)))) - vinfo = TCPDIAG_PUT(skb, TCPDIAG_VEGASINFO, sizeof(*vinfo)); + if (ext & (1<<(TCPDIAG_CONG-1))) { + size_t len = strlen(tp->ca_ops->name); + strcpy(TCPDIAG_PUT(skb, TCPDIAG_CONG, len+1), + tp->ca_ops->name); + } } r->tcpdiag_family = sk->sk_family; r->tcpdiag_state = sk->sk_state; @@ -166,23 +160,13 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, if (info) tcp_get_info(sk, info); - if (vinfo) { - if (tcp_is_vegas(tp)) { - vinfo->tcpv_enabled = tp->vegas.doing_vegas_now; - vinfo->tcpv_rttcnt = tp->vegas.cntRTT; - vinfo->tcpv_rtt = jiffies_to_usecs(tp->vegas.baseRTT); - vinfo->tcpv_minrtt = jiffies_to_usecs(tp->vegas.minRTT); - } else { - vinfo->tcpv_enabled = 0; - vinfo->tcpv_rttcnt = 0; - vinfo->tcpv_rtt = jiffies_to_usecs(tp->westwood.rtt); - vinfo->tcpv_minrtt = jiffies_to_usecs(tp->westwood.rtt_min); - } - } + if (sk->sk_state < TCP_TIME_WAIT && tp->ca_ops->get_info) + tp->ca_ops->get_info(tp, ext, skb); nlh->nlmsg_len = skb->tail - b; return skb->len; +rtattr_failure: nlmsg_failure: skb_trim(skb, b - skb->data); return -1; diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c new file mode 100644 index 00000000000..36c51f8136b --- /dev/null +++ b/net/ipv4/tcp_highspeed.c @@ -0,0 +1,181 @@ +/* + * Sally Floyd's High Speed TCP (RFC 3649) congestion control + * + * See http://www.icir.org/floyd/hstcp.html + * + * John Heffner <jheffner@psc.edu> + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <net/tcp.h> + + +/* From AIMD tables from RFC 3649 appendix B, + * with fixed-point MD scaled <<8. + */ +static const struct hstcp_aimd_val { + unsigned int cwnd; + unsigned int md; +} hstcp_aimd_vals[] = { + { 38, 128, /* 0.50 */ }, + { 118, 112, /* 0.44 */ }, + { 221, 104, /* 0.41 */ }, + { 347, 98, /* 0.38 */ }, + { 495, 93, /* 0.37 */ }, + { 663, 89, /* 0.35 */ }, + { 851, 86, /* 0.34 */ }, + { 1058, 83, /* 0.33 */ }, + { 1284, 81, /* 0.32 */ }, + { 1529, 78, /* 0.31 */ }, + { 1793, 76, /* 0.30 */ }, + { 2076, 74, /* 0.29 */ }, + { 2378, 72, /* 0.28 */ }, + { 2699, 71, /* 0.28 */ }, + { 3039, 69, /* 0.27 */ }, + { 3399, 68, /* 0.27 */ }, + { 3778, 66, /* 0.26 */ }, + { 4177, 65, /* 0.26 */ }, + { 4596, 64, /* 0.25 */ }, + { 5036, 62, /* 0.25 */ }, + { 5497, 61, /* 0.24 */ }, + { 5979, 60, /* 0.24 */ }, + { 6483, 59, /* 0.23 */ }, + { 7009, 58, /* 0.23 */ }, + { 7558, 57, /* 0.22 */ }, + { 8130, 56, /* 0.22 */ }, + { 8726, 55, /* 0.22 */ }, + { 9346, 54, /* 0.21 */ }, + { 9991, 53, /* 0.21 */ }, + { 10661, 52, /* 0.21 */ }, + { 11358, 52, /* 0.20 */ }, + { 12082, 51, /* 0.20 */ }, + { 12834, 50, /* 0.20 */ }, + { 13614, 49, /* 0.19 */ }, + { 14424, 48, /* 0.19 */ }, + { 15265, 48, /* 0.19 */ }, + { 16137, 47, /* 0.19 */ }, + { 17042, 46, /* 0.18 */ }, + { 17981, 45, /* 0.18 */ }, + { 18955, 45, /* 0.18 */ }, + { 19965, 44, /* 0.17 */ }, + { 21013, 43, /* 0.17 */ }, + { 22101, 43, /* 0.17 */ }, + { 23230, 42, /* 0.17 */ }, + { 24402, 41, /* 0.16 */ }, + { 25618, 41, /* 0.16 */ }, + { 26881, 40, /* 0.16 */ }, + { 28193, 39, /* 0.16 */ }, + { 29557, 39, /* 0.15 */ }, + { 30975, 38, /* 0.15 */ }, + { 32450, 38, /* 0.15 */ }, + { 33986, 37, /* 0.15 */ }, + { 35586, 36, /* 0.14 */ }, + { 37253, 36, /* 0.14 */ }, + { 38992, 35, /* 0.14 */ }, + { 40808, 35, /* 0.14 */ }, + { 42707, 34, /* 0.13 */ }, + { 44694, 33, /* 0.13 */ }, + { 46776, 33, /* 0.13 */ }, + { 48961, 32, /* 0.13 */ }, + { 51258, 32, /* 0.13 */ }, + { 53677, 31, /* 0.12 */ }, + { 56230, 30, /* 0.12 */ }, + { 58932, 30, /* 0.12 */ }, + { 61799, 29, /* 0.12 */ }, + { 64851, 28, /* 0.11 */ }, + { 68113, 28, /* 0.11 */ }, + { 71617, 27, /* 0.11 */ }, + { 75401, 26, /* 0.10 */ }, + { 79517, 26, /* 0.10 */ }, + { 84035, 25, /* 0.10 */ }, + { 89053, 24, /* 0.10 */ }, +}; + +#define HSTCP_AIMD_MAX ARRAY_SIZE(hstcp_aimd_vals) + +struct hstcp { + u32 ai; +}; + +static void hstcp_init(struct tcp_sock *tp) +{ + struct hstcp *ca = tcp_ca(tp); + + ca->ai = 0; + + /* Ensure the MD arithmetic works. This is somewhat pedantic, + * since I don't think we will see a cwnd this large. :) */ + tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128); +} + +static void hstcp_cong_avoid(struct tcp_sock *tp, u32 adk, u32 rtt, + u32 in_flight, int good) +{ + struct hstcp *ca = tcp_ca(tp); + + if (in_flight < tp->snd_cwnd) + return; + + if (tp->snd_cwnd <= tp->snd_ssthresh) { + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + } else { + /* Update AIMD parameters */ + if (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd) { + while (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd && + ca->ai < HSTCP_AIMD_MAX) + ca->ai++; + } else if (tp->snd_cwnd < hstcp_aimd_vals[ca->ai].cwnd) { + while (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd && + ca->ai > 0) + ca->ai--; + } + + /* Do additive increase */ + if (tp->snd_cwnd < tp->snd_cwnd_clamp) { + tp->snd_cwnd_cnt += ca->ai; + if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { + tp->snd_cwnd++; + tp->snd_cwnd_cnt -= tp->snd_cwnd; + } + } + } +} + +static u32 hstcp_ssthresh(struct tcp_sock *tp) +{ + struct hstcp *ca = tcp_ca(tp); + + /* Do multiplicative decrease */ + return max(tp->snd_cwnd - ((tp->snd_cwnd * hstcp_aimd_vals[ca->ai].md) >> 8), 2U); +} + + +static struct tcp_congestion_ops tcp_highspeed = { + .init = hstcp_init, + .ssthresh = hstcp_ssthresh, + .cong_avoid = hstcp_cong_avoid, + .min_cwnd = tcp_reno_min_cwnd, + + .owner = THIS_MODULE, + .name = "highspeed" +}; + +static int __init hstcp_register(void) +{ + BUG_ON(sizeof(struct hstcp) > TCP_CA_PRIV_SIZE); + return tcp_register_congestion_control(&tcp_highspeed); +} + +static void __exit hstcp_unregister(void) +{ + tcp_unregister_congestion_control(&tcp_highspeed); +} + +module_init(hstcp_register); +module_exit(hstcp_unregister); + +MODULE_AUTHOR("John Heffner"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("High Speed TCP"); diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c new file mode 100644 index 00000000000..40168275acf --- /dev/null +++ b/net/ipv4/tcp_htcp.c @@ -0,0 +1,289 @@ +/* + * H-TCP congestion control. The algorithm is detailed in: + * R.N.Shorten, D.J.Leith: + * "H-TCP: TCP for high-speed and long-distance networks" + * Proc. PFLDnet, Argonne, 2004. + * http://www.hamilton.ie/net/htcp3.pdf + */ + +#include <linux/config.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <net/tcp.h> + +#define ALPHA_BASE (1<<7) /* 1.0 with shift << 7 */ +#define BETA_MIN (1<<6) /* 0.5 with shift << 7 */ +#define BETA_MAX 102 /* 0.8 with shift << 7 */ + +static int use_rtt_scaling = 1; +module_param(use_rtt_scaling, int, 0644); +MODULE_PARM_DESC(use_rtt_scaling, "turn on/off RTT scaling"); + +static int use_bandwidth_switch = 1; +module_param(use_bandwidth_switch, int, 0644); +MODULE_PARM_DESC(use_bandwidth_switch, "turn on/off bandwidth switcher"); + +struct htcp { + u16 alpha; /* Fixed point arith, << 7 */ + u8 beta; /* Fixed point arith, << 7 */ + u8 modeswitch; /* Delay modeswitch until we had at least one congestion event */ + u8 ccount; /* Number of RTTs since last congestion event */ + u8 undo_ccount; + u16 packetcount; + u32 minRTT; + u32 maxRTT; + u32 snd_cwnd_cnt2; + + u32 undo_maxRTT; + u32 undo_old_maxB; + + /* Bandwidth estimation */ + u32 minB; + u32 maxB; + u32 old_maxB; + u32 Bi; + u32 lasttime; +}; + +static inline void htcp_reset(struct htcp *ca) +{ + ca->undo_ccount = ca->ccount; + ca->undo_maxRTT = ca->maxRTT; + ca->undo_old_maxB = ca->old_maxB; + + ca->ccount = 0; + ca->snd_cwnd_cnt2 = 0; +} + +static u32 htcp_cwnd_undo(struct tcp_sock *tp) +{ + struct htcp *ca = tcp_ca(tp); + ca->ccount = ca->undo_ccount; + ca->maxRTT = ca->undo_maxRTT; + ca->old_maxB = ca->undo_old_maxB; + return max(tp->snd_cwnd, (tp->snd_ssthresh<<7)/ca->beta); +} + +static inline void measure_rtt(struct tcp_sock *tp) +{ + struct htcp *ca = tcp_ca(tp); + u32 srtt = tp->srtt>>3; + + /* keep track of minimum RTT seen so far, minRTT is zero at first */ + if (ca->minRTT > srtt || !ca->minRTT) + ca->minRTT = srtt; + + /* max RTT */ + if (tp->ca_state == TCP_CA_Open && tp->snd_ssthresh < 0xFFFF && ca->ccount > 3) { + if (ca->maxRTT < ca->minRTT) + ca->maxRTT = ca->minRTT; + if (ca->maxRTT < srtt && srtt <= ca->maxRTT+HZ/50) + ca->maxRTT = srtt; + } +} + +static void measure_achieved_throughput(struct tcp_sock *tp, u32 pkts_acked) +{ + struct htcp *ca = tcp_ca(tp); + u32 now = tcp_time_stamp; + + /* achieved throughput calculations */ + if (tp->ca_state != TCP_CA_Open && tp->ca_state != TCP_CA_Disorder) { + ca->packetcount = 0; + ca->lasttime = now; + return; + } + + ca->packetcount += pkts_acked; + + if (ca->packetcount >= tp->snd_cwnd - (ca->alpha>>7? : 1) + && now - ca->lasttime >= ca->minRTT + && ca->minRTT > 0) { + __u32 cur_Bi = ca->packetcount*HZ/(now - ca->lasttime); + if (ca->ccount <= 3) { + /* just after backoff */ + ca->minB = ca->maxB = ca->Bi = cur_Bi; + } else { + ca->Bi = (3*ca->Bi + cur_Bi)/4; + if (ca->Bi > ca->maxB) + ca->maxB = ca->Bi; + if (ca->minB > ca->maxB) + ca->minB = ca->maxB; + } + ca->packetcount = 0; + ca->lasttime = now; + } +} + +static inline void htcp_beta_update(struct htcp *ca, u32 minRTT, u32 maxRTT) +{ + if (use_bandwidth_switch) { + u32 maxB = ca->maxB; + u32 old_maxB = ca->old_maxB; + ca->old_maxB = ca->maxB; + + if (!between(5*maxB, 4*old_maxB, 6*old_maxB)) { + ca->beta = BETA_MIN; + ca->modeswitch = 0; + return; + } + } + + if (ca->modeswitch && minRTT > max(HZ/100, 1) && maxRTT) { + ca->beta = (minRTT<<7)/maxRTT; + if (ca->beta < BETA_MIN) + ca->beta = BETA_MIN; + else if (ca->beta > BETA_MAX) + ca->beta = BETA_MAX; + } else { + ca->beta = BETA_MIN; + ca->modeswitch = 1; + } +} + +static inline void htcp_alpha_update(struct htcp *ca) +{ + u32 minRTT = ca->minRTT; + u32 factor = 1; + u32 diff = ca->ccount * minRTT; /* time since last backoff */ + + if (diff > HZ) { + diff -= HZ; + factor = 1+ ( 10*diff + ((diff/2)*(diff/2)/HZ) )/HZ; + } + + if (use_rtt_scaling && minRTT) { + u32 scale = (HZ<<3)/(10*minRTT); + scale = min(max(scale, 1U<<2), 10U<<3); /* clamping ratio to interval [0.5,10]<<3 */ + factor = (factor<<3)/scale; + if (!factor) + factor = 1; + } + + ca->alpha = 2*factor*((1<<7)-ca->beta); + if (!ca->alpha) + ca->alpha = ALPHA_BASE; +} + +/* After we have the rtt data to calculate beta, we'd still prefer to wait one + * rtt before we adjust our beta to ensure we are working from a consistent + * data. + * + * This function should be called when we hit a congestion event since only at + * that point do we really have a real sense of maxRTT (the queues en route + * were getting just too full now). + */ +static void htcp_param_update(struct tcp_sock *tp) +{ + struct htcp *ca = tcp_ca(tp); + u32 minRTT = ca->minRTT; + u32 maxRTT = ca->maxRTT; + + htcp_beta_update(ca, minRTT, maxRTT); + htcp_alpha_update(ca); + + /* add slowly fading memory for maxRTT to accommodate routing changes etc */ + if (minRTT > 0 && maxRTT > minRTT) + ca->maxRTT = minRTT + ((maxRTT-minRTT)*95)/100; +} + +static u32 htcp_recalc_ssthresh(struct tcp_sock *tp) +{ + struct htcp *ca = tcp_ca(tp); + htcp_param_update(tp); + return max((tp->snd_cwnd * ca->beta) >> 7, 2U); +} + +static void htcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, + u32 in_flight, int data_acked) +{ + struct htcp *ca = tcp_ca(tp); + + if (in_flight < tp->snd_cwnd) + return; + + if (tp->snd_cwnd <= tp->snd_ssthresh) { + /* In "safe" area, increase. */ + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + } else { + measure_rtt(tp); + + /* keep track of number of round-trip times since last backoff event */ + if (ca->snd_cwnd_cnt2++ > tp->snd_cwnd) { + ca->ccount++; + ca->snd_cwnd_cnt2 = 0; + htcp_alpha_update(ca); + } + + /* In dangerous area, increase slowly. + * In theory this is tp->snd_cwnd += alpha / tp->snd_cwnd + */ + if ((tp->snd_cwnd_cnt++ * ca->alpha)>>7 >= tp->snd_cwnd) { + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + tp->snd_cwnd_cnt = 0; + ca->ccount++; + } + } +} + +/* Lower bound on congestion window. */ +static u32 htcp_min_cwnd(struct tcp_sock *tp) +{ + return tp->snd_ssthresh; +} + + +static void htcp_init(struct tcp_sock *tp) +{ + struct htcp *ca = tcp_ca(tp); + + memset(ca, 0, sizeof(struct htcp)); + ca->alpha = ALPHA_BASE; + ca->beta = BETA_MIN; +} + +static void htcp_state(struct tcp_sock *tp, u8 new_state) +{ + switch (new_state) { + case TCP_CA_CWR: + case TCP_CA_Recovery: + case TCP_CA_Loss: + htcp_reset(tcp_ca(tp)); + break; + } +} + +static struct tcp_congestion_ops htcp = { + .init = htcp_init, + .ssthresh = htcp_recalc_ssthresh, + .min_cwnd = htcp_min_cwnd, + .cong_avoid = htcp_cong_avoid, + .set_state = htcp_state, + .undo_cwnd = htcp_cwnd_undo, + .pkts_acked = measure_achieved_throughput, + .owner = THIS_MODULE, + .name = "htcp", +}; + +static int __init htcp_register(void) +{ + BUG_ON(sizeof(struct htcp) > TCP_CA_PRIV_SIZE); + BUILD_BUG_ON(BETA_MIN >= BETA_MAX); + if (!use_bandwidth_switch) + htcp.pkts_acked = NULL; + return tcp_register_congestion_control(&htcp); +} + +static void __exit htcp_unregister(void) +{ + tcp_unregister_congestion_control(&htcp); +} + +module_init(htcp_register); +module_exit(htcp_unregister); + +MODULE_AUTHOR("Baruch Even"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("H-TCP"); diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c new file mode 100644 index 00000000000..13a66342c30 --- /dev/null +++ b/net/ipv4/tcp_hybla.c @@ -0,0 +1,187 @@ +/* + * TCP HYBLA + * + * TCP-HYBLA Congestion control algorithm, based on: + * C.Caini, R.Firrincieli, "TCP-Hybla: A TCP Enhancement + * for Heterogeneous Networks", + * International Journal on satellite Communications, + * September 2004 + * Daniele Lacamera + * root at danielinux.net + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <net/tcp.h> + +/* Tcp Hybla structure. */ +struct hybla { + u8 hybla_en; + u32 snd_cwnd_cents; /* Keeps increment values when it is <1, <<7 */ + u32 rho; /* Rho parameter, integer part */ + u32 rho2; /* Rho * Rho, integer part */ + u32 rho_3ls; /* Rho parameter, <<3 */ + u32 rho2_7ls; /* Rho^2, <<7 */ + u32 minrtt; /* Minimum smoothed round trip time value seen */ +}; + +/* Hybla reference round trip time (default= 1/40 sec = 25 ms), + expressed in jiffies */ +static int rtt0 = 25; +module_param(rtt0, int, 0644); +MODULE_PARM_DESC(rtt0, "reference rout trip time (ms)"); + + +/* This is called to refresh values for hybla parameters */ +static inline void hybla_recalc_param (struct tcp_sock *tp) +{ + struct hybla *ca = tcp_ca(tp); + + ca->rho_3ls = max_t(u32, tp->srtt / msecs_to_jiffies(rtt0), 8); + ca->rho = ca->rho_3ls >> 3; + ca->rho2_7ls = (ca->rho_3ls * ca->rho_3ls) << 1; + ca->rho2 = ca->rho2_7ls >>7; +} + +static void hybla_init(struct tcp_sock *tp) +{ + struct hybla *ca = tcp_ca(tp); + + ca->rho = 0; + ca->rho2 = 0; + ca->rho_3ls = 0; + ca->rho2_7ls = 0; + ca->snd_cwnd_cents = 0; + ca->hybla_en = 1; + tp->snd_cwnd = 2; + tp->snd_cwnd_clamp = 65535; + + /* 1st Rho measurement based on initial srtt */ + hybla_recalc_param(tp); + + /* set minimum rtt as this is the 1st ever seen */ + ca->minrtt = tp->srtt; + tp->snd_cwnd = ca->rho; +} + +static void hybla_state(struct tcp_sock *tp, u8 ca_state) +{ + struct hybla *ca = tcp_ca(tp); + + ca->hybla_en = (ca_state == TCP_CA_Open); +} + +static inline u32 hybla_fraction(u32 odds) +{ + static const u32 fractions[] = { + 128, 139, 152, 165, 181, 197, 215, 234, + }; + + return (odds < ARRAY_SIZE(fractions)) ? fractions[odds] : 128; +} + +/* TCP Hybla main routine. + * This is the algorithm behavior: + * o Recalc Hybla parameters if min_rtt has changed + * o Give cwnd a new value based on the model proposed + * o remember increments <1 + */ +static void hybla_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, + u32 in_flight, int flag) +{ + struct hybla *ca = tcp_ca(tp); + u32 increment, odd, rho_fractions; + int is_slowstart = 0; + + /* Recalculate rho only if this srtt is the lowest */ + if (tp->srtt < ca->minrtt){ + hybla_recalc_param(tp); + ca->minrtt = tp->srtt; + } + + if (!ca->hybla_en) + return tcp_reno_cong_avoid(tp, ack, rtt, in_flight, flag); + + if (in_flight < tp->snd_cwnd) + return; + + if (ca->rho == 0) + hybla_recalc_param(tp); + + rho_fractions = ca->rho_3ls - (ca->rho << 3); + + if (tp->snd_cwnd < tp->snd_ssthresh) { + /* + * slow start + * INC = 2^RHO - 1 + * This is done by splitting the rho parameter + * into 2 parts: an integer part and a fraction part. + * Inrement<<7 is estimated by doing: + * [2^(int+fract)]<<7 + * that is equal to: + * (2^int) * [(2^fract) <<7] + * 2^int is straightly computed as 1<<int, + * while we will use hybla_slowstart_fraction_increment() to + * calculate 2^fract in a <<7 value. + */ + is_slowstart = 1; + increment = ((1 << ca->rho) * hybla_fraction(rho_fractions)) + - 128; + } else { + /* + * congestion avoidance + * INC = RHO^2 / W + * as long as increment is estimated as (rho<<7)/window + * it already is <<7 and we can easily count its fractions. + */ + increment = ca->rho2_7ls / tp->snd_cwnd; + if (increment < 128) + tp->snd_cwnd_cnt++; + } + + odd = increment % 128; + tp->snd_cwnd += increment >> 7; + ca->snd_cwnd_cents += odd; + + /* check when fractions goes >=128 and increase cwnd by 1. */ + while(ca->snd_cwnd_cents >= 128) { + tp->snd_cwnd++; + ca->snd_cwnd_cents -= 128; + tp->snd_cwnd_cnt = 0; + } + + /* clamp down slowstart cwnd to ssthresh value. */ + if (is_slowstart) + tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); + + tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp); +} + +static struct tcp_congestion_ops tcp_hybla = { + .init = hybla_init, + .ssthresh = tcp_reno_ssthresh, + .min_cwnd = tcp_reno_min_cwnd, + .cong_avoid = hybla_cong_avoid, + .set_state = hybla_state, + + .owner = THIS_MODULE, + .name = "hybla" +}; + +static int __init hybla_register(void) +{ + BUG_ON(sizeof(struct hybla) > TCP_CA_PRIV_SIZE); + return tcp_register_congestion_control(&tcp_hybla); +} + +static void __exit hybla_unregister(void) +{ + tcp_unregister_congestion_control(&tcp_hybla); +} + +module_init(hybla_register); +module_exit(hybla_unregister); + +MODULE_AUTHOR("Daniele Lacamera"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("TCP Hybla"); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 5bad504630a..7bbbbc33eb4 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -61,7 +61,6 @@ * Panu Kuhlberg: Experimental audit of TCP (re)transmission * engine. Lots of bugs are found. * Pasi Sarolahti: F-RTO for dealing with spurious RTOs - * Angelo Dell'Aera: TCP Westwood+ support */ #include <linux/config.h> @@ -88,23 +87,9 @@ int sysctl_tcp_rfc1337; int sysctl_tcp_max_orphans = NR_FILE; int sysctl_tcp_frto; int sysctl_tcp_nometrics_save; -int sysctl_tcp_westwood; -int sysctl_tcp_vegas_cong_avoid; int sysctl_tcp_moderate_rcvbuf = 1; -/* Default values of the Vegas variables, in fixed-point representation - * with V_PARAM_SHIFT bits to the right of the binary point. - */ -#define V_PARAM_SHIFT 1 -int sysctl_tcp_vegas_alpha = 1<<V_PARAM_SHIFT; -int sysctl_tcp_vegas_beta = 3<<V_PARAM_SHIFT; -int sysctl_tcp_vegas_gamma = 1<<V_PARAM_SHIFT; -int sysctl_tcp_bic = 1; -int sysctl_tcp_bic_fast_convergence = 1; -int sysctl_tcp_bic_low_window = 14; -int sysctl_tcp_bic_beta = 819; /* = 819/1024 (BICTCP_BETA_SCALE) */ - #define FLAG_DATA 0x01 /* Incoming frame contained data. */ #define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ #define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. */ @@ -333,15 +318,6 @@ static void tcp_init_buffer_space(struct sock *sk) tp->snd_cwnd_stamp = tcp_time_stamp; } -static void init_bictcp(struct tcp_sock *tp) -{ - tp->bictcp.cnt = 0; - - tp->bictcp.last_max_cwnd = 0; - tp->bictcp.last_cwnd = 0; - tp->bictcp.last_stamp = 0; -} - /* 5. Recalculate window clamp after socket hit its memory bounds. */ static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp) { @@ -558,45 +534,6 @@ static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_ tcp_grow_window(sk, tp, skb); } -/* When starting a new connection, pin down the current choice of - * congestion algorithm. - */ -void tcp_ca_init(struct tcp_sock *tp) -{ - if (sysctl_tcp_westwood) - tp->adv_cong = TCP_WESTWOOD; - else if (sysctl_tcp_bic) - tp->adv_cong = TCP_BIC; - else if (sysctl_tcp_vegas_cong_avoid) { - tp->adv_cong = TCP_VEGAS; - tp->vegas.baseRTT = 0x7fffffff; - tcp_vegas_enable(tp); - } -} - -/* Do RTT sampling needed for Vegas. - * Basically we: - * o min-filter RTT samples from within an RTT to get the current - * propagation delay + queuing delay (we are min-filtering to try to - * avoid the effects of delayed ACKs) - * o min-filter RTT samples from a much longer window (forever for now) - * to find the propagation delay (baseRTT) - */ -static inline void vegas_rtt_calc(struct tcp_sock *tp, __u32 rtt) -{ - __u32 vrtt = rtt + 1; /* Never allow zero rtt or baseRTT */ - - /* Filter to find propagation delay: */ - if (vrtt < tp->vegas.baseRTT) - tp->vegas.baseRTT = vrtt; - - /* Find the min RTT during the last RTT to find - * the current prop. delay + queuing delay: - */ - tp->vegas.minRTT = min(tp->vegas.minRTT, vrtt); - tp->vegas.cntRTT++; -} - /* Called to compute a smoothed rtt estimate. The data fed to this * routine either comes from timestamps, or from segments that were * known _not_ to have been retransmitted [see Karn/Partridge @@ -606,13 +543,10 @@ static inline void vegas_rtt_calc(struct tcp_sock *tp, __u32 rtt) * To save cycles in the RFC 1323 implementation it was better to break * it up into three procedures. -- erics */ -static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt) +static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt, u32 *usrtt) { long m = mrtt; /* RTT */ - if (tcp_vegas_enabled(tp)) - vegas_rtt_calc(tp, mrtt); - /* The following amusing code comes from Jacobson's * article in SIGCOMM '88. Note that rtt and mdev * are scaled versions of rtt and mean deviation. @@ -670,7 +604,8 @@ static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt) tp->rtt_seq = tp->snd_nxt; } - tcp_westwood_update_rtt(tp, tp->srtt >> 3); + if (tp->ca_ops->rtt_sample) + tp->ca_ops->rtt_sample(tp, *usrtt); } /* Calculate rto without backoff. This is the second half of Van Jacobson's @@ -1185,8 +1120,8 @@ void tcp_enter_frto(struct sock *sk) tp->snd_una == tp->high_seq || (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) { tp->prior_ssthresh = tcp_current_ssthresh(tp); - if (!tcp_westwood_ssthresh(tp)) - tp->snd_ssthresh = tcp_recalc_ssthresh(tp); + tp->snd_ssthresh = tp->ca_ops->ssthresh(tp); + tcp_ca_event(tp, CA_EVENT_FRTO); } /* Have to clear retransmission markers here to keep the bookkeeping @@ -1252,8 +1187,6 @@ static void tcp_enter_frto_loss(struct sock *sk) tcp_set_ca_state(tp, TCP_CA_Loss); tp->high_seq = tp->frto_highmark; TCP_ECN_queue_cwr(tp); - - init_bictcp(tp); } void tcp_clear_retrans(struct tcp_sock *tp) @@ -1283,7 +1216,8 @@ void tcp_enter_loss(struct sock *sk, int how) if (tp->ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq || (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) { tp->prior_ssthresh = tcp_current_ssthresh(tp); - tp->snd_ssthresh = tcp_recalc_ssthresh(tp); + tp->snd_ssthresh = tp->ca_ops->ssthresh(tp); + tcp_ca_event(tp, CA_EVENT_LOSS); } tp->snd_cwnd = 1; tp->snd_cwnd_cnt = 0; @@ -1596,28 +1530,14 @@ static inline void tcp_moderate_cwnd(struct tcp_sock *tp) } /* Decrease cwnd each second ack. */ - static void tcp_cwnd_down(struct tcp_sock *tp) { int decr = tp->snd_cwnd_cnt + 1; - __u32 limit; - - /* - * TCP Westwood - * Here limit is evaluated as BWestimation*RTTmin (for obtaining it - * in packets we use mss_cache). If sysctl_tcp_westwood is off - * tcp_westwood_bw_rttmin() returns 0. In such case snd_ssthresh is - * still used as usual. It prevents other strange cases in which - * BWE*RTTmin could assume value 0. It should not happen but... - */ - - if (!(limit = tcp_westwood_bw_rttmin(tp))) - limit = tp->snd_ssthresh/2; tp->snd_cwnd_cnt = decr&1; decr >>= 1; - if (decr && tp->snd_cwnd > limit) + if (decr && tp->snd_cwnd > tp->ca_ops->min_cwnd(tp)) tp->snd_cwnd -= decr; tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1); @@ -1654,8 +1574,8 @@ static void DBGUNDO(struct sock *sk, struct tcp_sock *tp, const char *msg) static void tcp_undo_cwr(struct tcp_sock *tp, int undo) { if (tp->prior_ssthresh) { - if (tcp_is_bic(tp)) - tp->snd_cwnd = max(tp->snd_cwnd, tp->bictcp.last_max_cwnd); + if (tp->ca_ops->undo_cwnd) + tp->snd_cwnd = tp->ca_ops->undo_cwnd(tp); else tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh<<1); @@ -1767,11 +1687,9 @@ static int tcp_try_undo_loss(struct sock *sk, struct tcp_sock *tp) static inline void tcp_complete_cwr(struct tcp_sock *tp) { - if (tcp_westwood_cwnd(tp)) - tp->snd_ssthresh = tp->snd_cwnd; - else - tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); + tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); tp->snd_cwnd_stamp = tcp_time_stamp; + tcp_ca_event(tp, CA_EVENT_COMPLETE_CWR); } static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag) @@ -1946,7 +1864,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, if (tp->ca_state < TCP_CA_CWR) { if (!(flag&FLAG_ECE)) tp->prior_ssthresh = tcp_current_ssthresh(tp); - tp->snd_ssthresh = tcp_recalc_ssthresh(tp); + tp->snd_ssthresh = tp->ca_ops->ssthresh(tp); TCP_ECN_queue_cwr(tp); } @@ -1963,7 +1881,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, /* Read draft-ietf-tcplw-high-performance before mucking * with this code. (Superceeds RFC1323) */ -static void tcp_ack_saw_tstamp(struct tcp_sock *tp, int flag) +static void tcp_ack_saw_tstamp(struct tcp_sock *tp, u32 *usrtt, int flag) { __u32 seq_rtt; @@ -1983,13 +1901,13 @@ static void tcp_ack_saw_tstamp(struct tcp_sock *tp, int flag) * in window is lost... Voila. --ANK (010210) */ seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr; - tcp_rtt_estimator(tp, seq_rtt); + tcp_rtt_estimator(tp, seq_rtt, usrtt); tcp_set_rto(tp); tp->backoff = 0; tcp_bound_rto(tp); } -static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, int flag) +static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, u32 *usrtt, int flag) { /* We don't have a timestamp. Can only use * packets that are not retransmitted to determine @@ -2003,338 +1921,29 @@ static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, int flag) if (flag & FLAG_RETRANS_DATA_ACKED) return; - tcp_rtt_estimator(tp, seq_rtt); + tcp_rtt_estimator(tp, seq_rtt, usrtt); tcp_set_rto(tp); tp->backoff = 0; tcp_bound_rto(tp); } static inline void tcp_ack_update_rtt(struct tcp_sock *tp, - int flag, s32 seq_rtt) + int flag, s32 seq_rtt, u32 *usrtt) { /* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */ if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) - tcp_ack_saw_tstamp(tp, flag); + tcp_ack_saw_tstamp(tp, usrtt, flag); else if (seq_rtt >= 0) - tcp_ack_no_tstamp(tp, seq_rtt, flag); + tcp_ack_no_tstamp(tp, seq_rtt, usrtt, flag); } -/* - * Compute congestion window to use. - * - * This is from the implementation of BICTCP in - * Lison-Xu, Kahaled Harfoush, and Injog Rhee. - * "Binary Increase Congestion Control for Fast, Long Distance - * Networks" in InfoComm 2004 - * Available from: - * http://www.csc.ncsu.edu/faculty/rhee/export/bitcp.pdf - * - * Unless BIC is enabled and congestion window is large - * this behaves the same as the original Reno. - */ -static inline __u32 bictcp_cwnd(struct tcp_sock *tp) -{ - /* orignal Reno behaviour */ - if (!tcp_is_bic(tp)) - return tp->snd_cwnd; - - if (tp->bictcp.last_cwnd == tp->snd_cwnd && - (s32)(tcp_time_stamp - tp->bictcp.last_stamp) <= (HZ>>5)) - return tp->bictcp.cnt; - - tp->bictcp.last_cwnd = tp->snd_cwnd; - tp->bictcp.last_stamp = tcp_time_stamp; - - /* start off normal */ - if (tp->snd_cwnd <= sysctl_tcp_bic_low_window) - tp->bictcp.cnt = tp->snd_cwnd; - - /* binary increase */ - else if (tp->snd_cwnd < tp->bictcp.last_max_cwnd) { - __u32 dist = (tp->bictcp.last_max_cwnd - tp->snd_cwnd) - / BICTCP_B; - - if (dist > BICTCP_MAX_INCREMENT) - /* linear increase */ - tp->bictcp.cnt = tp->snd_cwnd / BICTCP_MAX_INCREMENT; - else if (dist <= 1U) - /* binary search increase */ - tp->bictcp.cnt = tp->snd_cwnd * BICTCP_FUNC_OF_MIN_INCR - / BICTCP_B; - else - /* binary search increase */ - tp->bictcp.cnt = tp->snd_cwnd / dist; - } else { - /* slow start amd linear increase */ - if (tp->snd_cwnd < tp->bictcp.last_max_cwnd + BICTCP_B) - /* slow start */ - tp->bictcp.cnt = tp->snd_cwnd * BICTCP_FUNC_OF_MIN_INCR - / BICTCP_B; - else if (tp->snd_cwnd < tp->bictcp.last_max_cwnd - + BICTCP_MAX_INCREMENT*(BICTCP_B-1)) - /* slow start */ - tp->bictcp.cnt = tp->snd_cwnd * (BICTCP_B-1) - / (tp->snd_cwnd-tp->bictcp.last_max_cwnd); - else - /* linear increase */ - tp->bictcp.cnt = tp->snd_cwnd / BICTCP_MAX_INCREMENT; - } - return tp->bictcp.cnt; -} - -/* This is Jacobson's slow start and congestion avoidance. - * SIGCOMM '88, p. 328. - */ -static inline void reno_cong_avoid(struct tcp_sock *tp) +static inline void tcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, + u32 in_flight, int good) { - if (tp->snd_cwnd <= tp->snd_ssthresh) { - /* In "safe" area, increase. */ - if (tp->snd_cwnd < tp->snd_cwnd_clamp) - tp->snd_cwnd++; - } else { - /* In dangerous area, increase slowly. - * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd - */ - if (tp->snd_cwnd_cnt >= bictcp_cwnd(tp)) { - if (tp->snd_cwnd < tp->snd_cwnd_clamp) - tp->snd_cwnd++; - tp->snd_cwnd_cnt=0; - } else - tp->snd_cwnd_cnt++; - } + tp->ca_ops->cong_avoid(tp, ack, rtt, in_flight, good); tp->snd_cwnd_stamp = tcp_time_stamp; } -/* This is based on the congestion detection/avoidance scheme described in - * Lawrence S. Brakmo and Larry L. Peterson. - * "TCP Vegas: End to end congestion avoidance on a global internet." - * IEEE Journal on Selected Areas in Communication, 13(8):1465--1480, - * October 1995. Available from: - * ftp://ftp.cs.arizona.edu/xkernel/Papers/jsac.ps - * - * See http://www.cs.arizona.edu/xkernel/ for their implementation. - * The main aspects that distinguish this implementation from the - * Arizona Vegas implementation are: - * o We do not change the loss detection or recovery mechanisms of - * Linux in any way. Linux already recovers from losses quite well, - * using fine-grained timers, NewReno, and FACK. - * o To avoid the performance penalty imposed by increasing cwnd - * only every-other RTT during slow start, we increase during - * every RTT during slow start, just like Reno. - * o Largely to allow continuous cwnd growth during slow start, - * we use the rate at which ACKs come back as the "actual" - * rate, rather than the rate at which data is sent. - * o To speed convergence to the right rate, we set the cwnd - * to achieve the right ("actual") rate when we exit slow start. - * o To filter out the noise caused by delayed ACKs, we use the - * minimum RTT sample observed during the last RTT to calculate - * the actual rate. - * o When the sender re-starts from idle, it waits until it has - * received ACKs for an entire flight of new data before making - * a cwnd adjustment decision. The original Vegas implementation - * assumed senders never went idle. - */ -static void vegas_cong_avoid(struct tcp_sock *tp, u32 ack, u32 seq_rtt) -{ - /* The key players are v_beg_snd_una and v_beg_snd_nxt. - * - * These are so named because they represent the approximate values - * of snd_una and snd_nxt at the beginning of the current RTT. More - * precisely, they represent the amount of data sent during the RTT. - * At the end of the RTT, when we receive an ACK for v_beg_snd_nxt, - * we will calculate that (v_beg_snd_nxt - v_beg_snd_una) outstanding - * bytes of data have been ACKed during the course of the RTT, giving - * an "actual" rate of: - * - * (v_beg_snd_nxt - v_beg_snd_una) / (rtt duration) - * - * Unfortunately, v_beg_snd_una is not exactly equal to snd_una, - * because delayed ACKs can cover more than one segment, so they - * don't line up nicely with the boundaries of RTTs. - * - * Another unfortunate fact of life is that delayed ACKs delay the - * advance of the left edge of our send window, so that the number - * of bytes we send in an RTT is often less than our cwnd will allow. - * So we keep track of our cwnd separately, in v_beg_snd_cwnd. - */ - - if (after(ack, tp->vegas.beg_snd_nxt)) { - /* Do the Vegas once-per-RTT cwnd adjustment. */ - u32 old_wnd, old_snd_cwnd; - - - /* Here old_wnd is essentially the window of data that was - * sent during the previous RTT, and has all - * been acknowledged in the course of the RTT that ended - * with the ACK we just received. Likewise, old_snd_cwnd - * is the cwnd during the previous RTT. - */ - old_wnd = (tp->vegas.beg_snd_nxt - tp->vegas.beg_snd_una) / - tp->mss_cache_std; - old_snd_cwnd = tp->vegas.beg_snd_cwnd; - - /* Save the extent of the current window so we can use this - * at the end of the next RTT. - */ - tp->vegas.beg_snd_una = tp->vegas.beg_snd_nxt; - tp->vegas.beg_snd_nxt = tp->snd_nxt; - tp->vegas.beg_snd_cwnd = tp->snd_cwnd; - - /* Take into account the current RTT sample too, to - * decrease the impact of delayed acks. This double counts - * this sample since we count it for the next window as well, - * but that's not too awful, since we're taking the min, - * rather than averaging. - */ - vegas_rtt_calc(tp, seq_rtt); - - /* We do the Vegas calculations only if we got enough RTT - * samples that we can be reasonably sure that we got - * at least one RTT sample that wasn't from a delayed ACK. - * If we only had 2 samples total, - * then that means we're getting only 1 ACK per RTT, which - * means they're almost certainly delayed ACKs. - * If we have 3 samples, we should be OK. - */ - - if (tp->vegas.cntRTT <= 2) { - /* We don't have enough RTT samples to do the Vegas - * calculation, so we'll behave like Reno. - */ - if (tp->snd_cwnd > tp->snd_ssthresh) - tp->snd_cwnd++; - } else { - u32 rtt, target_cwnd, diff; - - /* We have enough RTT samples, so, using the Vegas - * algorithm, we determine if we should increase or - * decrease cwnd, and by how much. - */ - - /* Pluck out the RTT we are using for the Vegas - * calculations. This is the min RTT seen during the - * last RTT. Taking the min filters out the effects - * of delayed ACKs, at the cost of noticing congestion - * a bit later. - */ - rtt = tp->vegas.minRTT; - - /* Calculate the cwnd we should have, if we weren't - * going too fast. - * - * This is: - * (actual rate in segments) * baseRTT - * We keep it as a fixed point number with - * V_PARAM_SHIFT bits to the right of the binary point. - */ - target_cwnd = ((old_wnd * tp->vegas.baseRTT) - << V_PARAM_SHIFT) / rtt; - - /* Calculate the difference between the window we had, - * and the window we would like to have. This quantity - * is the "Diff" from the Arizona Vegas papers. - * - * Again, this is a fixed point number with - * V_PARAM_SHIFT bits to the right of the binary - * point. - */ - diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd; - - if (tp->snd_cwnd < tp->snd_ssthresh) { - /* Slow start. */ - if (diff > sysctl_tcp_vegas_gamma) { - /* Going too fast. Time to slow down - * and switch to congestion avoidance. - */ - tp->snd_ssthresh = 2; - - /* Set cwnd to match the actual rate - * exactly: - * cwnd = (actual rate) * baseRTT - * Then we add 1 because the integer - * truncation robs us of full link - * utilization. - */ - tp->snd_cwnd = min(tp->snd_cwnd, - (target_cwnd >> - V_PARAM_SHIFT)+1); - - } - } else { - /* Congestion avoidance. */ - u32 next_snd_cwnd; - - /* Figure out where we would like cwnd - * to be. - */ - if (diff > sysctl_tcp_vegas_beta) { - /* The old window was too fast, so - * we slow down. - */ - next_snd_cwnd = old_snd_cwnd - 1; - } else if (diff < sysctl_tcp_vegas_alpha) { - /* We don't have enough extra packets - * in the network, so speed up. - */ - next_snd_cwnd = old_snd_cwnd + 1; - } else { - /* Sending just as fast as we - * should be. - */ - next_snd_cwnd = old_snd_cwnd; - } - - /* Adjust cwnd upward or downward, toward the - * desired value. - */ - if (next_snd_cwnd > tp->snd_cwnd) - tp->snd_cwnd++; - else if (next_snd_cwnd < tp->snd_cwnd) - tp->snd_cwnd--; - } - } - - /* Wipe the slate clean for the next RTT. */ - tp->vegas.cntRTT = 0; - tp->vegas.minRTT = 0x7fffffff; - } - - /* The following code is executed for every ack we receive, - * except for conditions checked in should_advance_cwnd() - * before the call to tcp_cong_avoid(). Mainly this means that - * we only execute this code if the ack actually acked some - * data. - */ - - /* If we are in slow start, increase our cwnd in response to this ACK. - * (If we are not in slow start then we are in congestion avoidance, - * and adjust our congestion window only once per RTT. See the code - * above.) - */ - if (tp->snd_cwnd <= tp->snd_ssthresh) - tp->snd_cwnd++; - - /* to keep cwnd from growing without bound */ - tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp); - - /* Make sure that we are never so timid as to reduce our cwnd below - * 2 MSS. - * - * Going below 2 MSS would risk huge delayed ACKs from our receiver. - */ - tp->snd_cwnd = max(tp->snd_cwnd, 2U); - - tp->snd_cwnd_stamp = tcp_time_stamp; -} - -static inline void tcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 seq_rtt) -{ - if (tcp_vegas_enabled(tp)) - vegas_cong_avoid(tp, ack, seq_rtt); - else - reno_cong_avoid(tp); -} - /* Restart timer after forward progress on connection. * RFC2988 recommends to restart timer to now+rto. */ @@ -2415,13 +2024,18 @@ static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb, /* Remove acknowledged frames from the retransmission queue. */ -static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) +static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; __u32 now = tcp_time_stamp; int acked = 0; __s32 seq_rtt = -1; + struct timeval usnow; + u32 pkts_acked = 0; + + if (seq_usrtt) + do_gettimeofday(&usnow); while ((skb = skb_peek(&sk->sk_write_queue)) && skb != sk->sk_send_head) { @@ -2448,6 +2062,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) */ if (!(scb->flags & TCPCB_FLAG_SYN)) { acked |= FLAG_DATA_ACKED; + ++pkts_acked; } else { acked |= FLAG_SYN_ACKED; tp->retrans_stamp = 0; @@ -2461,6 +2076,10 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) seq_rtt = -1; } else if (seq_rtt < 0) seq_rtt = now - scb->when; + if (seq_usrtt) + *seq_usrtt = (usnow.tv_sec - skb->stamp.tv_sec) * 1000000 + + (usnow.tv_usec - skb->stamp.tv_usec); + if (sacked & TCPCB_SACKED_ACKED) tp->sacked_out -= tcp_skb_pcount(skb); if (sacked & TCPCB_LOST) @@ -2479,8 +2098,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) } if (acked&FLAG_ACKED) { - tcp_ack_update_rtt(tp, acked, seq_rtt); + tcp_ack_update_rtt(tp, acked, seq_rtt, seq_usrtt); tcp_ack_packets_out(sk, tp); + + if (tp->ca_ops->pkts_acked) + tp->ca_ops->pkts_acked(tp, pkts_acked); } #if FASTRETRANS_DEBUG > 0 @@ -2624,257 +2246,6 @@ static void tcp_process_frto(struct sock *sk, u32 prior_snd_una) tp->frto_counter = (tp->frto_counter + 1) % 3; } -/* - * TCP Westwood+ - */ - -/* - * @init_westwood - * This function initializes fields used in TCP Westwood+. We can't - * get no information about RTTmin at this time so we simply set it to - * TCP_WESTWOOD_INIT_RTT. This value was chosen to be too conservative - * since in this way we're sure it will be updated in a consistent - * way as soon as possible. It will reasonably happen within the first - * RTT period of the connection lifetime. - */ - -static void init_westwood(struct sock *sk) -{ - struct tcp_sock *tp = tcp_sk(sk); - - tp->westwood.bw_ns_est = 0; - tp->westwood.bw_est = 0; - tp->westwood.accounted = 0; - tp->westwood.cumul_ack = 0; - tp->westwood.rtt_win_sx = tcp_time_stamp; - tp->westwood.rtt = TCP_WESTWOOD_INIT_RTT; - tp->westwood.rtt_min = TCP_WESTWOOD_INIT_RTT; - tp->westwood.snd_una = tp->snd_una; -} - -/* - * @westwood_do_filter - * Low-pass filter. Implemented using constant coeffients. - */ - -static inline __u32 westwood_do_filter(__u32 a, __u32 b) -{ - return (((7 * a) + b) >> 3); -} - -static void westwood_filter(struct sock *sk, __u32 delta) -{ - struct tcp_sock *tp = tcp_sk(sk); - - tp->westwood.bw_ns_est = - westwood_do_filter(tp->westwood.bw_ns_est, - tp->westwood.bk / delta); - tp->westwood.bw_est = - westwood_do_filter(tp->westwood.bw_est, - tp->westwood.bw_ns_est); -} - -/* - * @westwood_update_rttmin - * It is used to update RTTmin. In this case we MUST NOT use - * WESTWOOD_RTT_MIN minimum bound since we could be on a LAN! - */ - -static inline __u32 westwood_update_rttmin(const struct sock *sk) -{ - const struct tcp_sock *tp = tcp_sk(sk); - __u32 rttmin = tp->westwood.rtt_min; - - if (tp->westwood.rtt != 0 && - (tp->westwood.rtt < tp->westwood.rtt_min || !rttmin)) - rttmin = tp->westwood.rtt; - - return rttmin; -} - -/* - * @westwood_acked - * Evaluate increases for dk. - */ - -static inline __u32 westwood_acked(const struct sock *sk) -{ - const struct tcp_sock *tp = tcp_sk(sk); - - return tp->snd_una - tp->westwood.snd_una; -} - -/* - * @westwood_new_window - * It evaluates if we are receiving data inside the same RTT window as - * when we started. - * Return value: - * It returns 0 if we are still evaluating samples in the same RTT - * window, 1 if the sample has to be considered in the next window. - */ - -static int westwood_new_window(const struct sock *sk) -{ - const struct tcp_sock *tp = tcp_sk(sk); - __u32 left_bound; - __u32 rtt; - int ret = 0; - - left_bound = tp->westwood.rtt_win_sx; - rtt = max(tp->westwood.rtt, (u32) TCP_WESTWOOD_RTT_MIN); - - /* - * A RTT-window has passed. Be careful since if RTT is less than - * 50ms we don't filter but we continue 'building the sample'. - * This minimum limit was choosen since an estimation on small - * time intervals is better to avoid... - * Obvioulsy on a LAN we reasonably will always have - * right_bound = left_bound + WESTWOOD_RTT_MIN - */ - - if ((left_bound + rtt) < tcp_time_stamp) - ret = 1; - - return ret; -} - -/* - * @westwood_update_window - * It updates RTT evaluation window if it is the right moment to do - * it. If so it calls filter for evaluating bandwidth. - */ - -static void __westwood_update_window(struct sock *sk, __u32 now) -{ - struct tcp_sock *tp = tcp_sk(sk); - __u32 delta = now - tp->westwood.rtt_win_sx; - - if (delta) { - if (tp->westwood.rtt) - westwood_filter(sk, delta); - - tp->westwood.bk = 0; - tp->westwood.rtt_win_sx = tcp_time_stamp; - } -} - - -static void westwood_update_window(struct sock *sk, __u32 now) -{ - if (westwood_new_window(sk)) - __westwood_update_window(sk, now); -} - -/* - * @__tcp_westwood_fast_bw - * It is called when we are in fast path. In particular it is called when - * header prediction is successfull. In such case infact update is - * straight forward and doesn't need any particular care. - */ - -static void __tcp_westwood_fast_bw(struct sock *sk, struct sk_buff *skb) -{ - struct tcp_sock *tp = tcp_sk(sk); - - westwood_update_window(sk, tcp_time_stamp); - - tp->westwood.bk += westwood_acked(sk); - tp->westwood.snd_una = tp->snd_una; - tp->westwood.rtt_min = westwood_update_rttmin(sk); -} - -static inline void tcp_westwood_fast_bw(struct sock *sk, struct sk_buff *skb) -{ - if (tcp_is_westwood(tcp_sk(sk))) - __tcp_westwood_fast_bw(sk, skb); -} - - -/* - * @westwood_dupack_update - * It updates accounted and cumul_ack when receiving a dupack. - */ - -static void westwood_dupack_update(struct sock *sk) -{ - struct tcp_sock *tp = tcp_sk(sk); - - tp->westwood.accounted += tp->mss_cache_std; - tp->westwood.cumul_ack = tp->mss_cache_std; -} - -static inline int westwood_may_change_cumul(struct tcp_sock *tp) -{ - return (tp->westwood.cumul_ack > tp->mss_cache_std); -} - -static inline void westwood_partial_update(struct tcp_sock *tp) -{ - tp->westwood.accounted -= tp->westwood.cumul_ack; - tp->westwood.cumul_ack = tp->mss_cache_std; -} - -static inline void westwood_complete_update(struct tcp_sock *tp) -{ - tp->westwood.cumul_ack -= tp->westwood.accounted; - tp->westwood.accounted = 0; -} - -/* - * @westwood_acked_count - * This function evaluates cumul_ack for evaluating dk in case of - * delayed or partial acks. - */ - -static inline __u32 westwood_acked_count(struct sock *sk) -{ - struct tcp_sock *tp = tcp_sk(sk); - - tp->westwood.cumul_ack = westwood_acked(sk); - - /* If cumul_ack is 0 this is a dupack since it's not moving - * tp->snd_una. - */ - if (!(tp->westwood.cumul_ack)) - westwood_dupack_update(sk); - - if (westwood_may_change_cumul(tp)) { - /* Partial or delayed ack */ - if (tp->westwood.accounted >= tp->westwood.cumul_ack) - westwood_partial_update(tp); - else - westwood_complete_update(tp); - } - - tp->westwood.snd_una = tp->snd_una; - - return tp->westwood.cumul_ack; -} - - -/* - * @__tcp_westwood_slow_bw - * It is called when something is going wrong..even if there could - * be no problems! Infact a simple delayed packet may trigger a - * dupack. But we need to be careful in such case. - */ - -static void __tcp_westwood_slow_bw(struct sock *sk, struct sk_buff *skb) -{ - struct tcp_sock *tp = tcp_sk(sk); - - westwood_update_window(sk, tcp_time_stamp); - - tp->westwood.bk += westwood_acked_count(sk); - tp->westwood.rtt_min = westwood_update_rttmin(sk); -} - -static inline void tcp_westwood_slow_bw(struct sock *sk, struct sk_buff *skb) -{ - if (tcp_is_westwood(tcp_sk(sk))) - __tcp_westwood_slow_bw(sk, skb); -} - /* This routine deals with incoming acks, but not outgoing ones. */ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) { @@ -2884,6 +2255,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) u32 ack = TCP_SKB_CB(skb)->ack_seq; u32 prior_in_flight; s32 seq_rtt; + s32 seq_usrtt = 0; int prior_packets; /* If the ack is newer than sent or older than previous acks @@ -2902,9 +2274,10 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) */ tcp_update_wl(tp, ack, ack_seq); tp->snd_una = ack; - tcp_westwood_fast_bw(sk, skb); flag |= FLAG_WIN_UPDATE; + tcp_ca_event(tp, CA_EVENT_FAST_ACK); + NET_INC_STATS_BH(LINUX_MIB_TCPHPACKS); } else { if (ack_seq != TCP_SKB_CB(skb)->end_seq) @@ -2920,7 +2293,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) if (TCP_ECN_rcv_ecn_echo(tp, skb->h.th)) flag |= FLAG_ECE; - tcp_westwood_slow_bw(sk,skb); + tcp_ca_event(tp, CA_EVENT_SLOW_ACK); } /* We passed data and got it acked, remove any soft error @@ -2935,22 +2308,20 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) prior_in_flight = tcp_packets_in_flight(tp); /* See if we can take anything off of the retransmit queue. */ - flag |= tcp_clean_rtx_queue(sk, &seq_rtt); + flag |= tcp_clean_rtx_queue(sk, &seq_rtt, + tp->ca_ops->rtt_sample ? &seq_usrtt : NULL); if (tp->frto_counter) tcp_process_frto(sk, prior_snd_una); if (tcp_ack_is_dubious(tp, flag)) { /* Advanve CWND, if state allows this. */ - if ((flag & FLAG_DATA_ACKED) && - (tcp_vegas_enabled(tp) || prior_in_flight >= tp->snd_cwnd) && - tcp_may_raise_cwnd(tp, flag)) - tcp_cong_avoid(tp, ack, seq_rtt); + if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(tp, flag)) + tcp_cong_avoid(tp, ack, seq_rtt, prior_in_flight, 0); tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag); } else { - if ((flag & FLAG_DATA_ACKED) && - (tcp_vegas_enabled(tp) || prior_in_flight >= tp->snd_cwnd)) - tcp_cong_avoid(tp, ack, seq_rtt); + if ((flag & FLAG_DATA_ACKED)) + tcp_cong_avoid(tp, ack, seq_rtt, prior_in_flight, 1); } if ((flag & FLAG_FORWARD_PROGRESS) || !(flag&FLAG_NOT_DUP)) @@ -4552,6 +3923,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, tcp_init_metrics(sk); + tcp_init_congestion_control(tp); + /* Prevent spurious tcp_cwnd_restart() on first data * packet. */ @@ -4708,9 +4081,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, if(tp->af_specific->conn_request(sk, skb) < 0) return 1; - init_westwood(sk); - init_bictcp(tp); - /* Now we have several options: In theory there is * nothing else in the frame. KA9Q has an option to * send data with the syn, BSD accepts data with the @@ -4732,9 +4102,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, goto discard; case TCP_SYN_SENT: - init_westwood(sk); - init_bictcp(tp); - queued = tcp_rcv_synsent_state_process(sk, skb, th, len); if (queued >= 0) return queued; @@ -4816,7 +4183,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, */ if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && !tp->srtt) - tcp_ack_saw_tstamp(tp, 0); + tcp_ack_saw_tstamp(tp, 0, 0); if (tp->rx_opt.tstamp_ok) tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; @@ -4828,6 +4195,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, tcp_init_metrics(sk); + tcp_init_congestion_control(tp); + /* Prevent spurious tcp_cwnd_restart() on * first data packet. */ diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 2d41d5d6ad1..9122814c13a 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2048,6 +2048,7 @@ static int tcp_v4_init_sock(struct sock *sk) tp->mss_cache_std = tp->mss_cache = 536; tp->reordering = sysctl_tcp_reordering; + tp->ca_ops = &tcp_reno; sk->sk_state = TCP_CLOSE; @@ -2070,6 +2071,8 @@ int tcp_v4_destroy_sock(struct sock *sk) tcp_clear_xmit_timers(sk); + tcp_cleanup_congestion_control(tp); + /* Cleanup up the write buffer. */ sk_stream_writequeue_purge(sk); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index b3943e7562f..f42a284164b 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -774,6 +774,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, newtp->frto_counter = 0; newtp->frto_highmark = 0; + newtp->ca_ops = &tcp_reno; + tcp_set_ca_state(newtp, TCP_CA_Open); tcp_init_xmit_timers(newsk); skb_queue_head_init(&newtp->out_of_order_queue); @@ -842,8 +844,6 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, if (newtp->ecn_flags&TCP_ECN_OK) sock_set_flag(newsk, SOCK_NO_LARGESEND); - tcp_ca_init(newtp); - TCP_INC_STATS_BH(TCP_MIB_PASSIVEOPENS); } return newsk; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index f17c6577e33..0e17c244875 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -111,8 +111,7 @@ static void tcp_cwnd_restart(struct tcp_sock *tp, struct dst_entry *dst) u32 restart_cwnd = tcp_init_cwnd(tp, dst); u32 cwnd = tp->snd_cwnd; - if (tcp_is_vegas(tp)) - tcp_vegas_enable(tp); + tcp_ca_event(tp, CA_EVENT_CWND_RESTART); tp->snd_ssthresh = tcp_current_ssthresh(tp); restart_cwnd = min(restart_cwnd, cwnd); @@ -280,6 +279,10 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) #define SYSCTL_FLAG_WSCALE 0x2 #define SYSCTL_FLAG_SACK 0x4 + /* If congestion control is doing timestamping */ + if (tp->ca_ops->rtt_sample) + do_gettimeofday(&skb->stamp); + sysctl_flags = 0; if (tcb->flags & TCPCB_FLAG_SYN) { tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS; @@ -304,17 +307,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK)); } - /* - * If the connection is idle and we are restarting, - * then we don't want to do any Vegas calculations - * until we get fresh RTT samples. So when we - * restart, we reset our Vegas state to a clean - * slate. After we get acks for this flight of - * packets, _then_ we can make Vegas calculations - * again. - */ - if (tcp_is_vegas(tp) && tcp_packets_in_flight(tp) == 0) - tcp_vegas_enable(tp); + if (tcp_packets_in_flight(tp) == 0) + tcp_ca_event(tp, CA_EVENT_TX_START); th = (struct tcphdr *) skb_push(skb, tcp_header_size); skb->h.th = th; @@ -521,6 +515,7 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len) * skbs, which it never sent before. --ANK */ TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when; + buff->stamp = skb->stamp; if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) { tp->lost_out -= tcp_skb_pcount(skb); @@ -1449,7 +1444,6 @@ static inline void tcp_connect_init(struct sock *sk) tp->window_clamp = dst_metric(dst, RTAX_WINDOW); tp->advmss = dst_metric(dst, RTAX_ADVMSS); tcp_initialize_rcv_mss(sk); - tcp_ca_init(tp); tcp_select_initial_window(tcp_full_space(sk), tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0), @@ -1503,7 +1497,6 @@ int tcp_connect(struct sock *sk) TCP_SKB_CB(buff)->end_seq = tp->write_seq; tp->snd_nxt = tp->write_seq; tp->pushed_seq = tp->write_seq; - tcp_ca_init(tp); /* Send it off. */ TCP_SKB_CB(buff)->when = tcp_time_stamp; diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c new file mode 100644 index 00000000000..70e108e15c7 --- /dev/null +++ b/net/ipv4/tcp_scalable.c @@ -0,0 +1,68 @@ +/* Tom Kelly's Scalable TCP + * + * See htt://www-lce.eng.cam.ac.uk/~ctk21/scalable/ + * + * John Heffner <jheffner@sc.edu> + */ + +#include <linux/config.h> +#include <linux/module.h> +#include <net/tcp.h> + +/* These factors derived from the recommended values in the aer: + * .01 and and 7/8. We use 50 instead of 100 to account for + * delayed ack. + */ +#define TCP_SCALABLE_AI_CNT 50U +#define TCP_SCALABLE_MD_SCALE 3 + +static void tcp_scalable_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, + u32 in_flight, int flag) +{ + if (in_flight < tp->snd_cwnd) + return; + + if (tp->snd_cwnd <= tp->snd_ssthresh) { + tp->snd_cwnd++; + } else { + tp->snd_cwnd_cnt++; + if (tp->snd_cwnd_cnt > min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)){ + tp->snd_cwnd++; + tp->snd_cwnd_cnt = 0; + } + } + tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp); + tp->snd_cwnd_stamp = tcp_time_stamp; +} + +static u32 tcp_scalable_ssthresh(struct tcp_sock *tp) +{ + return max(tp->snd_cwnd - (tp->snd_cwnd>>TCP_SCALABLE_MD_SCALE), 2U); +} + + +static struct tcp_congestion_ops tcp_scalable = { + .ssthresh = tcp_scalable_ssthresh, + .cong_avoid = tcp_scalable_cong_avoid, + .min_cwnd = tcp_reno_min_cwnd, + + .owner = THIS_MODULE, + .name = "scalable", +}; + +static int __init tcp_scalable_register(void) +{ + return tcp_register_congestion_control(&tcp_scalable); +} + +static void __exit tcp_scalable_unregister(void) +{ + tcp_unregister_congestion_control(&tcp_scalable); +} + +module_init(tcp_scalable_register); +module_exit(tcp_scalable_unregister); + +MODULE_AUTHOR("John Heffner"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Scalable TCP"); diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c new file mode 100644 index 00000000000..9bd443db519 --- /dev/null +++ b/net/ipv4/tcp_vegas.c @@ -0,0 +1,411 @@ +/* + * TCP Vegas congestion control + * + * This is based on the congestion detection/avoidance scheme described in + * Lawrence S. Brakmo and Larry L. Peterson. + * "TCP Vegas: End to end congestion avoidance on a global internet." + * IEEE Journal on Selected Areas in Communication, 13(8):1465--1480, + * October 1995. Available from: + * ftp://ftp.cs.arizona.edu/xkernel/Papers/jsac.ps + * + * See http://www.cs.arizona.edu/xkernel/ for their implementation. + * The main aspects that distinguish this implementation from the + * Arizona Vegas implementation are: + * o We do not change the loss detection or recovery mechanisms of + * Linux in any way. Linux already recovers from losses quite well, + * using fine-grained timers, NewReno, and FACK. + * o To avoid the performance penalty imposed by increasing cwnd + * only every-other RTT during slow start, we increase during + * every RTT during slow start, just like Reno. + * o Largely to allow continuous cwnd growth during slow start, + * we use the rate at which ACKs come back as the "actual" + * rate, rather than the rate at which data is sent. + * o To speed convergence to the right rate, we set the cwnd + * to achieve the right ("actual") rate when we exit slow start. + * o To filter out the noise caused by delayed ACKs, we use the + * minimum RTT sample observed during the last RTT to calculate + * the actual rate. + * o When the sender re-starts from idle, it waits until it has + * received ACKs for an entire flight of new data before making + * a cwnd adjustment decision. The original Vegas implementation + * assumed senders never went idle. + */ + +#include <linux/config.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/tcp_diag.h> + +#include <net/tcp.h> + +/* Default values of the Vegas variables, in fixed-point representation + * with V_PARAM_SHIFT bits to the right of the binary point. + */ +#define V_PARAM_SHIFT 1 +static int alpha = 1<<V_PARAM_SHIFT; +static int beta = 3<<V_PARAM_SHIFT; +static int gamma = 1<<V_PARAM_SHIFT; + +module_param(alpha, int, 0644); +MODULE_PARM_DESC(alpha, "lower bound of packets in network (scale by 2)"); +module_param(beta, int, 0644); +MODULE_PARM_DESC(beta, "upper bound of packets in network (scale by 2)"); +module_param(gamma, int, 0644); +MODULE_PARM_DESC(gamma, "limit on increase (scale by 2)"); + + +/* Vegas variables */ +struct vegas { + u32 beg_snd_nxt; /* right edge during last RTT */ + u32 beg_snd_una; /* left edge during last RTT */ + u32 beg_snd_cwnd; /* saves the size of the cwnd */ + u8 doing_vegas_now;/* if true, do vegas for this RTT */ + u16 cntRTT; /* # of RTTs measured within last RTT */ + u32 minRTT; /* min of RTTs measured within last RTT (in usec) */ + u32 baseRTT; /* the min of all Vegas RTT measurements seen (in usec) */ +}; + +/* There are several situations when we must "re-start" Vegas: + * + * o when a connection is established + * o after an RTO + * o after fast recovery + * o when we send a packet and there is no outstanding + * unacknowledged data (restarting an idle connection) + * + * In these circumstances we cannot do a Vegas calculation at the + * end of the first RTT, because any calculation we do is using + * stale info -- both the saved cwnd and congestion feedback are + * stale. + * + * Instead we must wait until the completion of an RTT during + * which we actually receive ACKs. + */ +static inline void vegas_enable(struct tcp_sock *tp) +{ + struct vegas *vegas = tcp_ca(tp); + + /* Begin taking Vegas samples next time we send something. */ + vegas->doing_vegas_now = 1; + + /* Set the beginning of the next send window. */ + vegas->beg_snd_nxt = tp->snd_nxt; + + vegas->cntRTT = 0; + vegas->minRTT = 0x7fffffff; +} + +/* Stop taking Vegas samples for now. */ +static inline void vegas_disable(struct tcp_sock *tp) +{ + struct vegas *vegas = tcp_ca(tp); + + vegas->doing_vegas_now = 0; +} + +static void tcp_vegas_init(struct tcp_sock *tp) +{ + struct vegas *vegas = tcp_ca(tp); + + vegas->baseRTT = 0x7fffffff; + vegas_enable(tp); +} + +/* Do RTT sampling needed for Vegas. + * Basically we: + * o min-filter RTT samples from within an RTT to get the current + * propagation delay + queuing delay (we are min-filtering to try to + * avoid the effects of delayed ACKs) + * o min-filter RTT samples from a much longer window (forever for now) + * to find the propagation delay (baseRTT) + */ +static void tcp_vegas_rtt_calc(struct tcp_sock *tp, u32 usrtt) +{ + struct vegas *vegas = tcp_ca(tp); + u32 vrtt = usrtt + 1; /* Never allow zero rtt or baseRTT */ + + /* Filter to find propagation delay: */ + if (vrtt < vegas->baseRTT) + vegas->baseRTT = vrtt; + + /* Find the min RTT during the last RTT to find + * the current prop. delay + queuing delay: + */ + vegas->minRTT = min(vegas->minRTT, vrtt); + vegas->cntRTT++; +} + +static void tcp_vegas_state(struct tcp_sock *tp, u8 ca_state) +{ + + if (ca_state == TCP_CA_Open) + vegas_enable(tp); + else + vegas_disable(tp); +} + +/* + * If the connection is idle and we are restarting, + * then we don't want to do any Vegas calculations + * until we get fresh RTT samples. So when we + * restart, we reset our Vegas state to a clean + * slate. After we get acks for this flight of + * packets, _then_ we can make Vegas calculations + * again. + */ +static void tcp_vegas_cwnd_event(struct tcp_sock *tp, enum tcp_ca_event event) +{ + if (event == CA_EVENT_CWND_RESTART || + event == CA_EVENT_TX_START) + tcp_vegas_init(tp); +} + +static void tcp_vegas_cong_avoid(struct tcp_sock *tp, u32 ack, + u32 seq_rtt, u32 in_flight, int flag) +{ + struct vegas *vegas = tcp_ca(tp); + + if (!vegas->doing_vegas_now) + return tcp_reno_cong_avoid(tp, ack, seq_rtt, in_flight, flag); + + /* The key players are v_beg_snd_una and v_beg_snd_nxt. + * + * These are so named because they represent the approximate values + * of snd_una and snd_nxt at the beginning of the current RTT. More + * precisely, they represent the amount of data sent during the RTT. + * At the end of the RTT, when we receive an ACK for v_beg_snd_nxt, + * we will calculate that (v_beg_snd_nxt - v_beg_snd_una) outstanding + * bytes of data have been ACKed during the course of the RTT, giving + * an "actual" rate of: + * + * (v_beg_snd_nxt - v_beg_snd_una) / (rtt duration) + * + * Unfortunately, v_beg_snd_una is not exactly equal to snd_una, + * because delayed ACKs can cover more than one segment, so they + * don't line up nicely with the boundaries of RTTs. + * + * Another unfortunate fact of life is that delayed ACKs delay the + * advance of the left edge of our send window, so that the number + * of bytes we send in an RTT is often less than our cwnd will allow. + * So we keep track of our cwnd separately, in v_beg_snd_cwnd. + */ + + if (after(ack, vegas->beg_snd_nxt)) { + /* Do the Vegas once-per-RTT cwnd adjustment. */ + u32 old_wnd, old_snd_cwnd; + + + /* Here old_wnd is essentially the window of data that was + * sent during the previous RTT, and has all + * been acknowledged in the course of the RTT that ended + * with the ACK we just received. Likewise, old_snd_cwnd + * is the cwnd during the previous RTT. + */ + old_wnd = (vegas->beg_snd_nxt - vegas->beg_snd_una) / + tp->mss_cache; + old_snd_cwnd = vegas->beg_snd_cwnd; + + /* Save the extent of the current window so we can use this + * at the end of the next RTT. + */ + vegas->beg_snd_una = vegas->beg_snd_nxt; + vegas->beg_snd_nxt = tp->snd_nxt; + vegas->beg_snd_cwnd = tp->snd_cwnd; + + /* Take into account the current RTT sample too, to + * decrease the impact of delayed acks. This double counts + * this sample since we count it for the next window as well, + * but that's not too awful, since we're taking the min, + * rather than averaging. + */ + tcp_vegas_rtt_calc(tp, seq_rtt*1000); + + /* We do the Vegas calculations only if we got enough RTT + * samples that we can be reasonably sure that we got + * at least one RTT sample that wasn't from a delayed ACK. + * If we only had 2 samples total, + * then that means we're getting only 1 ACK per RTT, which + * means they're almost certainly delayed ACKs. + * If we have 3 samples, we should be OK. + */ + + if (vegas->cntRTT <= 2) { + /* We don't have enough RTT samples to do the Vegas + * calculation, so we'll behave like Reno. + */ + if (tp->snd_cwnd > tp->snd_ssthresh) + tp->snd_cwnd++; + } else { + u32 rtt, target_cwnd, diff; + + /* We have enough RTT samples, so, using the Vegas + * algorithm, we determine if we should increase or + * decrease cwnd, and by how much. + */ + + /* Pluck out the RTT we are using for the Vegas + * calculations. This is the min RTT seen during the + * last RTT. Taking the min filters out the effects + * of delayed ACKs, at the cost of noticing congestion + * a bit later. + */ + rtt = vegas->minRTT; + + /* Calculate the cwnd we should have, if we weren't + * going too fast. + * + * This is: + * (actual rate in segments) * baseRTT + * We keep it as a fixed point number with + * V_PARAM_SHIFT bits to the right of the binary point. + */ + target_cwnd = ((old_wnd * vegas->baseRTT) + << V_PARAM_SHIFT) / rtt; + + /* Calculate the difference between the window we had, + * and the window we would like to have. This quantity + * is the "Diff" from the Arizona Vegas papers. + * + * Again, this is a fixed point number with + * V_PARAM_SHIFT bits to the right of the binary + * point. + */ + diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd; + + if (tp->snd_cwnd < tp->snd_ssthresh) { + /* Slow start. */ + if (diff > gamma) { + /* Going too fast. Time to slow down + * and switch to congestion avoidance. + */ + tp->snd_ssthresh = 2; + + /* Set cwnd to match the actual rate + * exactly: + * cwnd = (actual rate) * baseRTT + * Then we add 1 because the integer + * truncation robs us of full link + * utilization. + */ + tp->snd_cwnd = min(tp->snd_cwnd, + (target_cwnd >> + V_PARAM_SHIFT)+1); + + } + } else { + /* Congestion avoidance. */ + u32 next_snd_cwnd; + + /* Figure out where we would like cwnd + * to be. + */ + if (diff > beta) { + /* The old window was too fast, so + * we slow down. + */ + next_snd_cwnd = old_snd_cwnd - 1; + } else if (diff < alpha) { + /* We don't have enough extra packets + * in the network, so speed up. + */ + next_snd_cwnd = old_snd_cwnd + 1; + } else { + /* Sending just as fast as we + * should be. + */ + next_snd_cwnd = old_snd_cwnd; + } + + /* Adjust cwnd upward or downward, toward the + * desired value. + */ + if (next_snd_cwnd > tp->snd_cwnd) + tp->snd_cwnd++; + else if (next_snd_cwnd < tp->snd_cwnd) + tp->snd_cwnd--; + } + } + + /* Wipe the slate clean for the next RTT. */ + vegas->cntRTT = 0; + vegas->minRTT = 0x7fffffff; + } + + /* The following code is executed for every ack we receive, + * except for conditions checked in should_advance_cwnd() + * before the call to tcp_cong_avoid(). Mainly this means that + * we only execute this code if the ack actually acked some + * data. + */ + + /* If we are in slow start, increase our cwnd in response to this ACK. + * (If we are not in slow start then we are in congestion avoidance, + * and adjust our congestion window only once per RTT. See the code + * above.) + */ + if (tp->snd_cwnd <= tp->snd_ssthresh) + tp->snd_cwnd++; + + /* to keep cwnd from growing without bound */ + tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp); + + /* Make sure that we are never so timid as to reduce our cwnd below + * 2 MSS. + * + * Going below 2 MSS would risk huge delayed ACKs from our receiver. + */ + tp->snd_cwnd = max(tp->snd_cwnd, 2U); +} + +/* Extract info for Tcp socket info provided via netlink. */ +static void tcp_vegas_get_info(struct tcp_sock *tp, u32 ext, + struct sk_buff *skb) +{ + const struct vegas *ca = tcp_ca(tp); + if (ext & (1<<(TCPDIAG_VEGASINFO-1))) { + struct tcpvegas_info *info; + + info = RTA_DATA(__RTA_PUT(skb, TCPDIAG_VEGASINFO, + sizeof(*info))); + + info->tcpv_enabled = ca->doing_vegas_now; + info->tcpv_rttcnt = ca->cntRTT; + info->tcpv_rtt = ca->baseRTT; + info->tcpv_minrtt = ca->minRTT; + rtattr_failure: ; + } +} + +static struct tcp_congestion_ops tcp_vegas = { + .init = tcp_vegas_init, + .ssthresh = tcp_reno_ssthresh, + .cong_avoid = tcp_vegas_cong_avoid, + .min_cwnd = tcp_reno_min_cwnd, + .rtt_sample = tcp_vegas_rtt_calc, + .set_state = tcp_vegas_state, + .cwnd_event = tcp_vegas_cwnd_event, + .get_info = tcp_vegas_get_info, + + .owner = THIS_MODULE, + .name = "vegas", +}; + +static int __init tcp_vegas_register(void) +{ + BUG_ON(sizeof(struct vegas) > TCP_CA_PRIV_SIZE); + tcp_register_congestion_control(&tcp_vegas); + return 0; +} + +static void __exit tcp_vegas_unregister(void) +{ + tcp_unregister_congestion_control(&tcp_vegas); +} + +module_init(tcp_vegas_register); +module_exit(tcp_vegas_unregister); + +MODULE_AUTHOR("Stephen Hemminger"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("TCP Vegas"); diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c new file mode 100644 index 00000000000..ef827242c94 --- /dev/null +++ b/net/ipv4/tcp_westwood.c @@ -0,0 +1,259 @@ +/* + * TCP Westwood+ + * + * Angelo Dell'Aera: TCP Westwood+ support + */ + +#include <linux/config.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/skbuff.h> +#include <linux/tcp_diag.h> +#include <net/tcp.h> + +/* TCP Westwood structure */ +struct westwood { + u32 bw_ns_est; /* first bandwidth estimation..not too smoothed 8) */ + u32 bw_est; /* bandwidth estimate */ + u32 rtt_win_sx; /* here starts a new evaluation... */ + u32 bk; + u32 snd_una; /* used for evaluating the number of acked bytes */ + u32 cumul_ack; + u32 accounted; + u32 rtt; + u32 rtt_min; /* minimum observed RTT */ +}; + + +/* TCP Westwood functions and constants */ +#define TCP_WESTWOOD_RTT_MIN (HZ/20) /* 50ms */ +#define TCP_WESTWOOD_INIT_RTT (20*HZ) /* maybe too conservative?! */ + +/* + * @tcp_westwood_create + * This function initializes fields used in TCP Westwood+, + * it is called after the initial SYN, so the sequence numbers + * are correct but new passive connections we have no + * information about RTTmin at this time so we simply set it to + * TCP_WESTWOOD_INIT_RTT. This value was chosen to be too conservative + * since in this way we're sure it will be updated in a consistent + * way as soon as possible. It will reasonably happen within the first + * RTT period of the connection lifetime. + */ +static void tcp_westwood_init(struct tcp_sock *tp) +{ + struct westwood *w = tcp_ca(tp); + + w->bk = 0; + w->bw_ns_est = 0; + w->bw_est = 0; + w->accounted = 0; + w->cumul_ack = 0; + w->rtt_min = w->rtt = TCP_WESTWOOD_INIT_RTT; + w->rtt_win_sx = tcp_time_stamp; + w->snd_una = tp->snd_una; +} + +/* + * @westwood_do_filter + * Low-pass filter. Implemented using constant coefficients. + */ +static inline u32 westwood_do_filter(u32 a, u32 b) +{ + return (((7 * a) + b) >> 3); +} + +static inline void westwood_filter(struct westwood *w, u32 delta) +{ + w->bw_ns_est = westwood_do_filter(w->bw_ns_est, w->bk / delta); + w->bw_est = westwood_do_filter(w->bw_est, w->bw_ns_est); +} + +/* + * @westwood_pkts_acked + * Called after processing group of packets. + * but all westwood needs is the last sample of srtt. + */ +static void tcp_westwood_pkts_acked(struct tcp_sock *tp, u32 cnt) +{ + struct westwood *w = tcp_ca(tp); + if (cnt > 0) + w->rtt = tp->srtt >> 3; +} + +/* + * @westwood_update_window + * It updates RTT evaluation window if it is the right moment to do + * it. If so it calls filter for evaluating bandwidth. + */ +static void westwood_update_window(struct tcp_sock *tp) +{ + struct westwood *w = tcp_ca(tp); + s32 delta = tcp_time_stamp - w->rtt_win_sx; + + /* + * See if a RTT-window has passed. + * Be careful since if RTT is less than + * 50ms we don't filter but we continue 'building the sample'. + * This minimum limit was chosen since an estimation on small + * time intervals is better to avoid... + * Obviously on a LAN we reasonably will always have + * right_bound = left_bound + WESTWOOD_RTT_MIN + */ + if (w->rtt && delta > max_t(u32, w->rtt, TCP_WESTWOOD_RTT_MIN)) { + westwood_filter(w, delta); + + w->bk = 0; + w->rtt_win_sx = tcp_time_stamp; + } +} + +/* + * @westwood_fast_bw + * It is called when we are in fast path. In particular it is called when + * header prediction is successful. In such case in fact update is + * straight forward and doesn't need any particular care. + */ +static inline void westwood_fast_bw(struct tcp_sock *tp) +{ + struct westwood *w = tcp_ca(tp); + + westwood_update_window(tp); + + w->bk += tp->snd_una - w->snd_una; + w->snd_una = tp->snd_una; + w->rtt_min = min(w->rtt, w->rtt_min); +} + +/* + * @westwood_acked_count + * This function evaluates cumul_ack for evaluating bk in case of + * delayed or partial acks. + */ +static inline u32 westwood_acked_count(struct tcp_sock *tp) +{ + struct westwood *w = tcp_ca(tp); + + w->cumul_ack = tp->snd_una - w->snd_una; + + /* If cumul_ack is 0 this is a dupack since it's not moving + * tp->snd_una. + */ + if (!w->cumul_ack) { + w->accounted += tp->mss_cache; + w->cumul_ack = tp->mss_cache; + } + + if (w->cumul_ack > tp->mss_cache) { + /* Partial or delayed ack */ + if (w->accounted >= w->cumul_ack) { + w->accounted -= w->cumul_ack; + w->cumul_ack = tp->mss_cache; + } else { + w->cumul_ack -= w->accounted; + w->accounted = 0; + } + } + + w->snd_una = tp->snd_una; + + return w->cumul_ack; +} + +static inline u32 westwood_bw_rttmin(const struct tcp_sock *tp) +{ + struct westwood *w = tcp_ca(tp); + return max_t(u32, (w->bw_est * w->rtt_min) / tp->mss_cache, 2); +} + +/* + * TCP Westwood + * Here limit is evaluated as Bw estimation*RTTmin (for obtaining it + * in packets we use mss_cache). Rttmin is guaranteed to be >= 2 + * so avoids ever returning 0. + */ +static u32 tcp_westwood_cwnd_min(struct tcp_sock *tp) +{ + return westwood_bw_rttmin(tp); +} + +static void tcp_westwood_event(struct tcp_sock *tp, enum tcp_ca_event event) +{ + struct westwood *w = tcp_ca(tp); + + switch(event) { + case CA_EVENT_FAST_ACK: + westwood_fast_bw(tp); + break; + + case CA_EVENT_COMPLETE_CWR: + tp->snd_cwnd = tp->snd_ssthresh = westwood_bw_rttmin(tp); + break; + + case CA_EVENT_FRTO: + tp->snd_ssthresh = westwood_bw_rttmin(tp); + break; + + case CA_EVENT_SLOW_ACK: + westwood_update_window(tp); + w->bk += westwood_acked_count(tp); + w->rtt_min = min(w->rtt, w->rtt_min); + break; + + default: + /* don't care */ + break; + } +} + + +/* Extract info for Tcp socket info provided via netlink. */ +static void tcp_westwood_info(struct tcp_sock *tp, u32 ext, + struct sk_buff *skb) +{ + const struct westwood *ca = tcp_ca(tp); + if (ext & (1<<(TCPDIAG_VEGASINFO-1))) { + struct rtattr *rta; + struct tcpvegas_info *info; + + rta = __RTA_PUT(skb, TCPDIAG_VEGASINFO, sizeof(*info)); + info = RTA_DATA(rta); + info->tcpv_enabled = 1; + info->tcpv_rttcnt = 0; + info->tcpv_rtt = jiffies_to_usecs(ca->rtt); + info->tcpv_minrtt = jiffies_to_usecs(ca->rtt_min); + rtattr_failure: ; + } +} + + +static struct tcp_congestion_ops tcp_westwood = { + .init = tcp_westwood_init, + .ssthresh = tcp_reno_ssthresh, + .cong_avoid = tcp_reno_cong_avoid, + .min_cwnd = tcp_westwood_cwnd_min, + .cwnd_event = tcp_westwood_event, + .get_info = tcp_westwood_info, + .pkts_acked = tcp_westwood_pkts_acked, + + .owner = THIS_MODULE, + .name = "westwood" +}; + +static int __init tcp_westwood_register(void) +{ + BUG_ON(sizeof(struct westwood) > TCP_CA_PRIV_SIZE); + return tcp_register_congestion_control(&tcp_westwood); +} + +static void __exit tcp_westwood_unregister(void) +{ + tcp_unregister_congestion_control(&tcp_westwood); +} + +module_init(tcp_westwood_register); +module_exit(tcp_westwood_unregister); + +MODULE_AUTHOR("Stephen Hemminger, Angelo Dell'Aera"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("TCP Westwood+"); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 2414937f2a8..fce56039b0e 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -2025,7 +2025,7 @@ static int tcp_v6_init_sock(struct sock *sk) sk->sk_state = TCP_CLOSE; tp->af_specific = &ipv6_specific; - + tp->ca_ops = &tcp_reno; sk->sk_write_space = sk_stream_write_space; sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); |