diff options
Diffstat (limited to 'net/dccp')
-rw-r--r-- | net/dccp/Kconfig | 4 | ||||
-rw-r--r-- | net/dccp/Makefile | 15 | ||||
-rw-r--r-- | net/dccp/ackvec.c | 9 | ||||
-rw-r--r-- | net/dccp/ackvec.h | 54 | ||||
-rw-r--r-- | net/dccp/ccid.c | 214 | ||||
-rw-r--r-- | net/dccp/ccid.h | 40 | ||||
-rw-r--r-- | net/dccp/ccids/Kconfig | 79 | ||||
-rw-r--r-- | net/dccp/ccids/Makefile | 9 | ||||
-rw-r--r-- | net/dccp/ccids/ccid2.c | 28 | ||||
-rw-r--r-- | net/dccp/ccids/ccid3.c | 23 | ||||
-rw-r--r-- | net/dccp/ccids/lib/Makefile | 3 | ||||
-rw-r--r-- | net/dccp/ccids/lib/loss_interval.c | 3 | ||||
-rw-r--r-- | net/dccp/ccids/lib/packet_history.c | 9 | ||||
-rw-r--r-- | net/dccp/ccids/lib/tfrc.c | 19 | ||||
-rw-r--r-- | net/dccp/ccids/lib/tfrc.h | 11 | ||||
-rw-r--r-- | net/dccp/ccids/lib/tfrc_equation.c | 4 | ||||
-rw-r--r-- | net/dccp/dccp.h | 19 | ||||
-rw-r--r-- | net/dccp/diag.c | 11 | ||||
-rw-r--r-- | net/dccp/feat.c | 1462 | ||||
-rw-r--r-- | net/dccp/feat.h | 130 | ||||
-rw-r--r-- | net/dccp/input.c | 46 | ||||
-rw-r--r-- | net/dccp/ipv4.c | 13 | ||||
-rw-r--r-- | net/dccp/ipv6.c | 15 | ||||
-rw-r--r-- | net/dccp/minisocks.c | 54 | ||||
-rw-r--r-- | net/dccp/options.c | 229 | ||||
-rw-r--r-- | net/dccp/output.c | 19 | ||||
-rw-r--r-- | net/dccp/probe.c | 19 | ||||
-rw-r--r-- | net/dccp/proto.c | 232 | ||||
-rw-r--r-- | net/dccp/sysctl.c | 21 | ||||
-rw-r--r-- | net/dccp/timer.c | 12 |
30 files changed, 1641 insertions, 1165 deletions
diff --git a/net/dccp/Kconfig b/net/dccp/Kconfig index 7aa2a7acc7e..ad6dffd9070 100644 --- a/net/dccp/Kconfig +++ b/net/dccp/Kconfig @@ -1,7 +1,6 @@ menuconfig IP_DCCP tristate "The DCCP Protocol (EXPERIMENTAL)" depends on INET && EXPERIMENTAL - select IP_DCCP_CCID2 ---help--- Datagram Congestion Control Protocol (RFC 4340) @@ -25,9 +24,6 @@ config INET_DCCP_DIAG def_tristate y if (IP_DCCP = y && INET_DIAG = y) def_tristate m -config IP_DCCP_ACKVEC - bool - source "net/dccp/ccids/Kconfig" menu "DCCP Kernel Hacking" diff --git a/net/dccp/Makefile b/net/dccp/Makefile index f4f8793aaff..2991efcc8de 100644 --- a/net/dccp/Makefile +++ b/net/dccp/Makefile @@ -2,14 +2,23 @@ obj-$(CONFIG_IP_DCCP) += dccp.o dccp_ipv4.o dccp-y := ccid.o feat.o input.o minisocks.o options.o output.o proto.o timer.o +# +# CCID algorithms to be used by dccp.ko +# +# CCID-2 is default (RFC 4340, p. 77) and has Ack Vectors as dependency +dccp-y += ccids/ccid2.o ackvec.o +dccp-$(CONFIG_IP_DCCP_CCID3) += ccids/ccid3.o +dccp-$(CONFIG_IP_DCCP_TFRC_LIB) += ccids/lib/tfrc.o \ + ccids/lib/tfrc_equation.o \ + ccids/lib/packet_history.o \ + ccids/lib/loss_interval.o + dccp_ipv4-y := ipv4.o # build dccp_ipv6 as module whenever either IPv6 or DCCP is a module obj-$(subst y,$(CONFIG_IP_DCCP),$(CONFIG_IPV6)) += dccp_ipv6.o dccp_ipv6-y := ipv6.o -dccp-$(CONFIG_IP_DCCP_ACKVEC) += ackvec.o - obj-$(CONFIG_INET_DCCP_DIAG) += dccp_diag.o obj-$(CONFIG_NET_DCCPPROBE) += dccp_probe.o @@ -17,5 +26,3 @@ dccp-$(CONFIG_SYSCTL) += sysctl.o dccp_diag-y := diag.o dccp_probe-y := probe.o - -obj-y += ccids/ diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c index 1e8be246ad1..01e4d39fa23 100644 --- a/net/dccp/ackvec.c +++ b/net/dccp/ackvec.c @@ -12,7 +12,6 @@ #include "ackvec.h" #include "dccp.h" -#include <linux/dccp.h> #include <linux/init.h> #include <linux/errno.h> #include <linux/kernel.h> @@ -68,7 +67,7 @@ int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb) struct dccp_sock *dp = dccp_sk(sk); struct dccp_ackvec *av = dp->dccps_hc_rx_ackvec; /* Figure out how many options do we need to represent the ackvec */ - const u16 nr_opts = DIV_ROUND_UP(av->av_vec_len, DCCP_MAX_ACKVEC_OPT_LEN); + const u8 nr_opts = DIV_ROUND_UP(av->av_vec_len, DCCP_SINGLE_OPT_MAXLEN); u16 len = av->av_vec_len + 2 * nr_opts, i; u32 elapsed_time; const unsigned char *tail, *from; @@ -100,8 +99,8 @@ int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb) for (i = 0; i < nr_opts; ++i) { int copylen = len; - if (len > DCCP_MAX_ACKVEC_OPT_LEN) - copylen = DCCP_MAX_ACKVEC_OPT_LEN; + if (len > DCCP_SINGLE_OPT_MAXLEN) + copylen = DCCP_SINGLE_OPT_MAXLEN; *to++ = DCCPO_ACK_VECTOR_0; *to++ = copylen + 2; @@ -432,7 +431,7 @@ found: int dccp_ackvec_parse(struct sock *sk, const struct sk_buff *skb, u64 *ackno, const u8 opt, const u8 *value, const u8 len) { - if (len > DCCP_MAX_ACKVEC_OPT_LEN) + if (len > DCCP_SINGLE_OPT_MAXLEN) return -1; /* dccp_ackvector_print(DCCP_SKB_CB(skb)->dccpd_ack_seq, value, len); */ diff --git a/net/dccp/ackvec.h b/net/dccp/ackvec.h index bcb64fb4ace..45f95e55f87 100644 --- a/net/dccp/ackvec.h +++ b/net/dccp/ackvec.h @@ -11,15 +11,14 @@ * published by the Free Software Foundation. */ +#include <linux/dccp.h> #include <linux/compiler.h> #include <linux/ktime.h> #include <linux/list.h> #include <linux/types.h> -/* Read about the ECN nonce to see why it is 253 */ -#define DCCP_MAX_ACKVEC_OPT_LEN 253 /* We can spread an ack vector across multiple options */ -#define DCCP_MAX_ACKVEC_LEN (DCCP_MAX_ACKVEC_OPT_LEN * 2) +#define DCCP_MAX_ACKVEC_LEN (DCCP_SINGLE_OPT_MAXLEN * 2) #define DCCP_ACKVEC_STATE_RECEIVED 0 #define DCCP_ACKVEC_STATE_ECN_MARKED (1 << 6) @@ -85,7 +84,6 @@ struct dccp_ackvec_record { struct sock; struct sk_buff; -#ifdef CONFIG_IP_DCCP_ACKVEC extern int dccp_ackvec_init(void); extern void dccp_ackvec_exit(void); @@ -107,52 +105,4 @@ static inline int dccp_ackvec_pending(const struct dccp_ackvec *av) { return av->av_vec_len; } -#else /* CONFIG_IP_DCCP_ACKVEC */ -static inline int dccp_ackvec_init(void) -{ - return 0; -} - -static inline void dccp_ackvec_exit(void) -{ -} - -static inline struct dccp_ackvec *dccp_ackvec_alloc(const gfp_t priority) -{ - return NULL; -} - -static inline void dccp_ackvec_free(struct dccp_ackvec *av) -{ -} - -static inline int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk, - const u64 ackno, const u8 state) -{ - return -1; -} - -static inline void dccp_ackvec_check_rcv_ackno(struct dccp_ackvec *av, - struct sock *sk, const u64 ackno) -{ -} - -static inline int dccp_ackvec_parse(struct sock *sk, const struct sk_buff *skb, - const u64 *ackno, const u8 opt, - const u8 *value, const u8 len) -{ - return -1; -} - -static inline int dccp_insert_option_ackvec(const struct sock *sk, - const struct sk_buff *skb) -{ - return -1; -} - -static inline int dccp_ackvec_pending(const struct dccp_ackvec *av) -{ - return 0; -} -#endif /* CONFIG_IP_DCCP_ACKVEC */ #endif /* _ACKVEC_H */ diff --git a/net/dccp/ccid.c b/net/dccp/ccid.c index 8fe931a3d7a..f3e9ba1cfd0 100644 --- a/net/dccp/ccid.c +++ b/net/dccp/ccid.c @@ -12,49 +12,70 @@ */ #include "ccid.h" +#include "ccids/lib/tfrc.h" -static struct ccid_operations *ccids[CCID_MAX]; -#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT) -static atomic_t ccids_lockct = ATOMIC_INIT(0); -static DEFINE_SPINLOCK(ccids_lock); +static struct ccid_operations *ccids[] = { + &ccid2_ops, +#ifdef CONFIG_IP_DCCP_CCID3 + &ccid3_ops, +#endif +}; -/* - * The strategy is: modifications ccids vector are short, do not sleep and - * veeery rare, but read access should be free of any exclusive locks. - */ -static void ccids_write_lock(void) +static struct ccid_operations *ccid_by_number(const u8 id) { - spin_lock(&ccids_lock); - while (atomic_read(&ccids_lockct) != 0) { - spin_unlock(&ccids_lock); - yield(); - spin_lock(&ccids_lock); - } + int i; + + for (i = 0; i < ARRAY_SIZE(ccids); i++) + if (ccids[i]->ccid_id == id) + return ccids[i]; + return NULL; } -static inline void ccids_write_unlock(void) +/* check that up to @array_len members in @ccid_array are supported */ +bool ccid_support_check(u8 const *ccid_array, u8 array_len) { - spin_unlock(&ccids_lock); + while (array_len > 0) + if (ccid_by_number(ccid_array[--array_len]) == NULL) + return false; + return true; } -static inline void ccids_read_lock(void) +/** + * ccid_get_builtin_ccids - Populate a list of built-in CCIDs + * @ccid_array: pointer to copy into + * @array_len: value to return length into + * This function allocates memory - caller must see that it is freed after use. + */ +int ccid_get_builtin_ccids(u8 **ccid_array, u8 *array_len) { - atomic_inc(&ccids_lockct); - smp_mb__after_atomic_inc(); - spin_unlock_wait(&ccids_lock); + *ccid_array = kmalloc(ARRAY_SIZE(ccids), gfp_any()); + if (*ccid_array == NULL) + return -ENOBUFS; + + for (*array_len = 0; *array_len < ARRAY_SIZE(ccids); *array_len += 1) + (*ccid_array)[*array_len] = ccids[*array_len]->ccid_id; + return 0; } -static inline void ccids_read_unlock(void) +int ccid_getsockopt_builtin_ccids(struct sock *sk, int len, + char __user *optval, int __user *optlen) { - atomic_dec(&ccids_lockct); -} + u8 *ccid_array, array_len; + int err = 0; -#else -#define ccids_write_lock() do { } while(0) -#define ccids_write_unlock() do { } while(0) -#define ccids_read_lock() do { } while(0) -#define ccids_read_unlock() do { } while(0) -#endif + if (len < ARRAY_SIZE(ccids)) + return -EINVAL; + + if (ccid_get_builtin_ccids(&ccid_array, &array_len)) + return -ENOBUFS; + + if (put_user(array_len, optlen) || + copy_to_user(optval, ccid_array, array_len)) + err = -EFAULT; + + kfree(ccid_array); + return err; +} static struct kmem_cache *ccid_kmem_cache_create(int obj_size, const char *fmt,...) { @@ -86,7 +107,7 @@ static void ccid_kmem_cache_destroy(struct kmem_cache *slab) } } -int ccid_register(struct ccid_operations *ccid_ops) +static int ccid_activate(struct ccid_operations *ccid_ops) { int err = -ENOBUFS; @@ -104,79 +125,40 @@ int ccid_register(struct ccid_operations *ccid_ops) if (ccid_ops->ccid_hc_tx_slab == NULL) goto out_free_rx_slab; - ccids_write_lock(); - err = -EEXIST; - if (ccids[ccid_ops->ccid_id] == NULL) { - ccids[ccid_ops->ccid_id] = ccid_ops; - err = 0; - } - ccids_write_unlock(); - if (err != 0) - goto out_free_tx_slab; - - pr_info("CCID: Registered CCID %d (%s)\n", + pr_info("CCID: Activated CCID %d (%s)\n", ccid_ops->ccid_id, ccid_ops->ccid_name); + err = 0; out: return err; -out_free_tx_slab: - ccid_kmem_cache_destroy(ccid_ops->ccid_hc_tx_slab); - ccid_ops->ccid_hc_tx_slab = NULL; - goto out; out_free_rx_slab: ccid_kmem_cache_destroy(ccid_ops->ccid_hc_rx_slab); ccid_ops->ccid_hc_rx_slab = NULL; goto out; } -EXPORT_SYMBOL_GPL(ccid_register); - -int ccid_unregister(struct ccid_operations *ccid_ops) +static void ccid_deactivate(struct ccid_operations *ccid_ops) { - ccids_write_lock(); - ccids[ccid_ops->ccid_id] = NULL; - ccids_write_unlock(); - ccid_kmem_cache_destroy(ccid_ops->ccid_hc_tx_slab); ccid_ops->ccid_hc_tx_slab = NULL; ccid_kmem_cache_destroy(ccid_ops->ccid_hc_rx_slab); ccid_ops->ccid_hc_rx_slab = NULL; - pr_info("CCID: Unregistered CCID %d (%s)\n", + pr_info("CCID: Deactivated CCID %d (%s)\n", ccid_ops->ccid_id, ccid_ops->ccid_name); - return 0; } -EXPORT_SYMBOL_GPL(ccid_unregister); - -struct ccid *ccid_new(unsigned char id, struct sock *sk, int rx, gfp_t gfp) +struct ccid *ccid_new(const u8 id, struct sock *sk, bool rx) { - struct ccid_operations *ccid_ops; + struct ccid_operations *ccid_ops = ccid_by_number(id); struct ccid *ccid = NULL; - ccids_read_lock(); -#ifdef CONFIG_MODULES - if (ccids[id] == NULL) { - /* We only try to load if in process context */ - ccids_read_unlock(); - if (gfp & GFP_ATOMIC) - goto out; - request_module("net-dccp-ccid-%d", id); - ccids_read_lock(); - } -#endif - ccid_ops = ccids[id]; if (ccid_ops == NULL) - goto out_unlock; - - if (!try_module_get(ccid_ops->ccid_owner)) - goto out_unlock; - - ccids_read_unlock(); + goto out; ccid = kmem_cache_alloc(rx ? ccid_ops->ccid_hc_rx_slab : - ccid_ops->ccid_hc_tx_slab, gfp); + ccid_ops->ccid_hc_tx_slab, gfp_any()); if (ccid == NULL) - goto out_module_put; + goto out; ccid->ccid_ops = ccid_ops; if (rx) { memset(ccid + 1, 0, ccid_ops->ccid_hc_rx_obj_size); @@ -191,67 +173,57 @@ struct ccid *ccid_new(unsigned char id, struct sock *sk, int rx, gfp_t gfp) } out: return ccid; -out_unlock: - ccids_read_unlock(); - goto out; out_free_ccid: kmem_cache_free(rx ? ccid_ops->ccid_hc_rx_slab : ccid_ops->ccid_hc_tx_slab, ccid); ccid = NULL; -out_module_put: - module_put(ccid_ops->ccid_owner); goto out; } -EXPORT_SYMBOL_GPL(ccid_new); - -struct ccid *ccid_hc_rx_new(unsigned char id, struct sock *sk, gfp_t gfp) +void ccid_hc_rx_delete(struct ccid *ccid, struct sock *sk) { - return ccid_new(id, sk, 1, gfp); + if (ccid != NULL) { + if (ccid->ccid_ops->ccid_hc_rx_exit != NULL) + ccid->ccid_ops->ccid_hc_rx_exit(sk); + kmem_cache_free(ccid->ccid_ops->ccid_hc_rx_slab, ccid); + } } -EXPORT_SYMBOL_GPL(ccid_hc_rx_new); - -struct ccid *ccid_hc_tx_new(unsigned char id,struct sock *sk, gfp_t gfp) +void ccid_hc_tx_delete(struct ccid *ccid, struct sock *sk) { - return ccid_new(id, sk, 0, gfp); + if (ccid != NULL) { + if (ccid->ccid_ops->ccid_hc_tx_exit != NULL) + ccid->ccid_ops->ccid_hc_tx_exit(sk); + kmem_cache_free(ccid->ccid_ops->ccid_hc_tx_slab, ccid); + } } -EXPORT_SYMBOL_GPL(ccid_hc_tx_new); - -static void ccid_delete(struct ccid *ccid, struct sock *sk, int rx) +int __init ccid_initialize_builtins(void) { - struct ccid_operations *ccid_ops; + int i, err = tfrc_lib_init(); - if (ccid == NULL) - return; + if (err) + return err; - ccid_ops = ccid->ccid_ops; - if (rx) { - if (ccid_ops->ccid_hc_rx_exit != NULL) - ccid_ops->ccid_hc_rx_exit(sk); - kmem_cache_free(ccid_ops->ccid_hc_rx_slab, ccid); - } else { - if (ccid_ops->ccid_hc_tx_exit != NULL) - ccid_ops->ccid_hc_tx_exit(sk); - kmem_cache_free(ccid_ops->ccid_hc_tx_slab, ccid); + for (i = 0; i < ARRAY_SIZE(ccids); i++) { + err = ccid_activate(ccids[i]); + if (err) + goto unwind_registrations; } - ccids_read_lock(); - if (ccids[ccid_ops->ccid_id] != NULL) - module_put(ccid_ops->ccid_owner); - ccids_read_unlock(); -} + return 0; -void ccid_hc_rx_delete(struct ccid *ccid, struct sock *sk) -{ - ccid_delete(ccid, sk, 1); +unwind_registrations: + while(--i >= 0) + ccid_deactivate(ccids[i]); + tfrc_lib_exit(); + return err; } -EXPORT_SYMBOL_GPL(ccid_hc_rx_delete); - -void ccid_hc_tx_delete(struct ccid *ccid, struct sock *sk) +void ccid_cleanup_builtins(void) { - ccid_delete(ccid, sk, 0); -} + int i; -EXPORT_SYMBOL_GPL(ccid_hc_tx_delete); + for (i = 0; i < ARRAY_SIZE(ccids); i++) + ccid_deactivate(ccids[i]); + tfrc_lib_exit(); +} diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h index fdeae7b5731..facedd20b53 100644 --- a/net/dccp/ccid.h +++ b/net/dccp/ccid.h @@ -29,7 +29,6 @@ struct tcp_info; * @ccid_id: numerical CCID ID (up to %CCID_MAX, cf. table 5 in RFC 4340, 10.) * @ccid_ccmps: the CCMPS including network/transport headers (0 when disabled) * @ccid_name: alphabetical identifier string for @ccid_id - * @ccid_owner: module which implements/owns this CCID * @ccid_hc_{r,t}x_slab: memory pool for the receiver/sender half-connection * @ccid_hc_{r,t}x_obj_size: size of the receiver/sender half-connection socket * @@ -48,7 +47,6 @@ struct ccid_operations { unsigned char ccid_id; __u32 ccid_ccmps; const char *ccid_name; - struct module *ccid_owner; struct kmem_cache *ccid_hc_rx_slab, *ccid_hc_tx_slab; __u32 ccid_hc_rx_obj_size, @@ -90,8 +88,13 @@ struct ccid_operations { int __user *optlen); }; -extern int ccid_register(struct ccid_operations *ccid_ops); -extern int ccid_unregister(struct ccid_operations *ccid_ops); +extern struct ccid_operations ccid2_ops; +#ifdef CONFIG_IP_DCCP_CCID3 +extern struct ccid_operations ccid3_ops; +#endif + +extern int ccid_initialize_builtins(void); +extern void ccid_cleanup_builtins(void); struct ccid { struct ccid_operations *ccid_ops; @@ -103,13 +106,30 @@ static inline void *ccid_priv(const struct ccid *ccid) return (void *)ccid->ccid_priv; } -extern struct ccid *ccid_new(unsigned char id, struct sock *sk, int rx, - gfp_t gfp); +extern bool ccid_support_check(u8 const *ccid_array, u8 array_len); +extern int ccid_get_builtin_ccids(u8 **ccid_array, u8 *array_len); +extern int ccid_getsockopt_builtin_ccids(struct sock *sk, int len, + char __user *, int __user *); + +extern struct ccid *ccid_new(const u8 id, struct sock *sk, bool rx); -extern struct ccid *ccid_hc_rx_new(unsigned char id, struct sock *sk, - gfp_t gfp); -extern struct ccid *ccid_hc_tx_new(unsigned char id, struct sock *sk, - gfp_t gfp); +static inline int ccid_get_current_rx_ccid(struct dccp_sock *dp) +{ + struct ccid *ccid = dp->dccps_hc_rx_ccid; + + if (ccid == NULL || ccid->ccid_ops == NULL) + return -1; + return ccid->ccid_ops->ccid_id; +} + +static inline int ccid_get_current_tx_ccid(struct dccp_sock *dp) +{ + struct ccid *ccid = dp->dccps_hc_tx_ccid; + + if (ccid == NULL || ccid->ccid_ops == NULL) + return -1; + return ccid->ccid_ops->ccid_id; +} extern void ccid_hc_rx_delete(struct ccid *ccid, struct sock *sk); extern void ccid_hc_tx_delete(struct ccid *ccid, struct sock *sk); diff --git a/net/dccp/ccids/Kconfig b/net/dccp/ccids/Kconfig index 12275943eab..b28bf962edc 100644 --- a/net/dccp/ccids/Kconfig +++ b/net/dccp/ccids/Kconfig @@ -1,80 +1,51 @@ menu "DCCP CCIDs Configuration (EXPERIMENTAL)" depends on EXPERIMENTAL -config IP_DCCP_CCID2 - tristate "CCID2 (TCP-Like) (EXPERIMENTAL)" - def_tristate IP_DCCP - select IP_DCCP_ACKVEC - ---help--- - CCID 2, TCP-like Congestion Control, denotes Additive Increase, - Multiplicative Decrease (AIMD) congestion control with behavior - modelled directly on TCP, including congestion window, slow start, - timeouts, and so forth [RFC 2581]. CCID 2 achieves maximum - bandwidth over the long term, consistent with the use of end-to-end - congestion control, but halves its congestion window in response to - each congestion event. This leads to the abrupt rate changes - typical of TCP. Applications should use CCID 2 if they prefer - maximum bandwidth utilization to steadiness of rate. This is often - the case for applications that are not playing their data directly - to the user. For example, a hypothetical application that - transferred files over DCCP, using application-level retransmissions - for lost packets, would prefer CCID 2 to CCID 3. On-line games may - also prefer CCID 2. See RFC 4341 for further details. - - CCID2 is the default CCID used by DCCP. - config IP_DCCP_CCID2_DEBUG - bool "CCID2 debugging messages" - depends on IP_DCCP_CCID2 - ---help--- - Enable CCID2-specific debugging messages. + bool "CCID-2 debugging messages" + ---help--- + Enable CCID-2 specific debugging messages. - When compiling CCID2 as a module, this debugging output can - additionally be toggled by setting the ccid2_debug module - parameter to 0 or 1. + The debugging output can additionally be toggled by setting the + ccid2_debug parameter to 0 or 1. - If in doubt, say N. + If in doubt, say N. config IP_DCCP_CCID3 - tristate "CCID3 (TCP-Friendly) (EXPERIMENTAL)" - def_tristate IP_DCCP - select IP_DCCP_TFRC_LIB + bool "CCID-3 (TCP-Friendly) (EXPERIMENTAL)" + def_bool y if (IP_DCCP = y || IP_DCCP = m) ---help--- - CCID 3 denotes TCP-Friendly Rate Control (TFRC), an equation-based + CCID-3 denotes TCP-Friendly Rate Control (TFRC), an equation-based rate-controlled congestion control mechanism. TFRC is designed to be reasonably fair when competing for bandwidth with TCP-like flows, where a flow is "reasonably fair" if its sending rate is generally within a factor of two of the sending rate of a TCP flow under the same conditions. However, TFRC has a much lower variation of - throughput over time compared with TCP, which makes CCID 3 more - suitable than CCID 2 for applications such streaming media where a + throughput over time compared with TCP, which makes CCID-3 more + suitable than CCID-2 for applications such streaming media where a relatively smooth sending rate is of importance. - CCID 3 is further described in RFC 4342, + CCID-3 is further described in RFC 4342, http://www.ietf.org/rfc/rfc4342.txt The TFRC congestion control algorithms were initially described in - RFC 3448. + RFC 5448. This text was extracted from RFC 4340 (sec. 10.2), http://www.ietf.org/rfc/rfc4340.txt - - To compile this CCID as a module, choose M here: the module will be - called dccp_ccid3. - If in doubt, say M. + If in doubt, say N. config IP_DCCP_CCID3_DEBUG - bool "CCID3 debugging messages" - depends on IP_DCCP_CCID3 - ---help--- - Enable CCID3-specific debugging messages. + bool "CCID-3 debugging messages" + depends on IP_DCCP_CCID3 + ---help--- + Enable CCID-3 specific debugging messages. - When compiling CCID3 as a module, this debugging output can - additionally be toggled by setting the ccid3_debug module - parameter to 0 or 1. + The debugging output can additionally be toggled by setting the + ccid3_debug parameter to 0 or 1. - If in doubt, say N. + If in doubt, say N. config IP_DCCP_CCID3_RTO int "Use higher bound for nofeedback timer" @@ -108,12 +79,8 @@ config IP_DCCP_CCID3_RTO therefore not be performed on WANs. config IP_DCCP_TFRC_LIB - tristate - default n + def_bool y if IP_DCCP_CCID3 config IP_DCCP_TFRC_DEBUG - bool - depends on IP_DCCP_TFRC_LIB - default y if IP_DCCP_CCID3_DEBUG - + def_bool y if IP_DCCP_CCID3_DEBUG endmenu diff --git a/net/dccp/ccids/Makefile b/net/dccp/ccids/Makefile deleted file mode 100644 index 438f20bccff..00000000000 --- a/net/dccp/ccids/Makefile +++ /dev/null @@ -1,9 +0,0 @@ -obj-$(CONFIG_IP_DCCP_CCID3) += dccp_ccid3.o - -dccp_ccid3-y := ccid3.o - -obj-$(CONFIG_IP_DCCP_CCID2) += dccp_ccid2.o - -dccp_ccid2-y := ccid2.o - -obj-y += lib/ diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c index 9a430734530..d235294ace2 100644 --- a/net/dccp/ccids/ccid2.c +++ b/net/dccp/ccids/ccid2.c @@ -25,7 +25,7 @@ /* * This implementation should follow RFC 4341 */ - +#include "../feat.h" #include "../ccid.h" #include "../dccp.h" #include "ccid2.h" @@ -147,8 +147,8 @@ static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val) DCCP_WARN("Limiting Ack Ratio (%u) to %u\n", val, max_ratio); val = max_ratio; } - if (val > 0xFFFF) /* RFC 4340, 11.3 */ - val = 0xFFFF; + if (val > DCCPF_ACK_RATIO_MAX) + val = DCCPF_ACK_RATIO_MAX; if (val == dp->dccps_l_ack_ratio) return; @@ -768,10 +768,9 @@ static void ccid2_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) } } -static struct ccid_operations ccid2 = { +struct ccid_operations ccid2_ops = { .ccid_id = DCCPC_CCID2, .ccid_name = "TCP-like", - .ccid_owner = THIS_MODULE, .ccid_hc_tx_obj_size = sizeof(struct ccid2_hc_tx_sock), .ccid_hc_tx_init = ccid2_hc_tx_init, .ccid_hc_tx_exit = ccid2_hc_tx_exit, @@ -784,22 +783,5 @@ static struct ccid_operations ccid2 = { #ifdef CONFIG_IP_DCCP_CCID2_DEBUG module_param(ccid2_debug, bool, 0644); -MODULE_PARM_DESC(ccid2_debug, "Enable debug messages"); +MODULE_PARM_DESC(ccid2_debug, "Enable CCID-2 debug messages"); #endif - -static __init int ccid2_module_init(void) -{ - return ccid_register(&ccid2); -} -module_init(ccid2_module_init); - -static __exit void ccid2_module_exit(void) -{ - ccid_unregister(&ccid2); -} -module_exit(ccid2_module_exit); - -MODULE_AUTHOR("Andrea Bittau <a.bittau@cs.ucl.ac.uk>"); -MODULE_DESCRIPTION("DCCP TCP-Like (CCID2) CCID"); -MODULE_LICENSE("GPL"); -MODULE_ALIAS("net-dccp-ccid-2"); diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index 3b8bd7ca676..a27b7f4c19c 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -940,10 +940,9 @@ static int ccid3_hc_rx_getsockopt(struct sock *sk, const int optname, int len, return 0; } -static struct ccid_operations ccid3 = { +struct ccid_operations ccid3_ops = { .ccid_id = DCCPC_CCID3, .ccid_name = "TCP-Friendly Rate Control", - .ccid_owner = THIS_MODULE, .ccid_hc_tx_obj_size = sizeof(struct ccid3_hc_tx_sock), .ccid_hc_tx_init = ccid3_hc_tx_init, .ccid_hc_tx_exit = ccid3_hc_tx_exit, @@ -964,23 +963,5 @@ static struct ccid_operations ccid3 = { #ifdef CONFIG_IP_DCCP_CCID3_DEBUG module_param(ccid3_debug, bool, 0644); -MODULE_PARM_DESC(ccid3_debug, "Enable debug messages"); +MODULE_PARM_DESC(ccid3_debug, "Enable CCID-3 debug messages"); #endif - -static __init int ccid3_module_init(void) -{ - return ccid_register(&ccid3); -} -module_init(ccid3_module_init); - -static __exit void ccid3_module_exit(void) -{ - ccid_unregister(&ccid3); -} -module_exit(ccid3_module_exit); - -MODULE_AUTHOR("Ian McDonald <ian.mcdonald@jandi.co.nz>, " - "Arnaldo Carvalho de Melo <acme@ghostprotocols.net>"); -MODULE_DESCRIPTION("DCCP TFRC CCID3 CCID"); -MODULE_LICENSE("GPL"); -MODULE_ALIAS("net-dccp-ccid-3"); diff --git a/net/dccp/ccids/lib/Makefile b/net/dccp/ccids/lib/Makefile deleted file mode 100644 index 68c93e3d89d..00000000000 --- a/net/dccp/ccids/lib/Makefile +++ /dev/null @@ -1,3 +0,0 @@ -obj-$(CONFIG_IP_DCCP_TFRC_LIB) += dccp_tfrc_lib.o - -dccp_tfrc_lib-y := tfrc.o tfrc_equation.o packet_history.o loss_interval.o diff --git a/net/dccp/ccids/lib/loss_interval.c b/net/dccp/ccids/lib/loss_interval.c index 5b3ce0688c5..4d1e4012726 100644 --- a/net/dccp/ccids/lib/loss_interval.c +++ b/net/dccp/ccids/lib/loss_interval.c @@ -60,7 +60,6 @@ void tfrc_lh_cleanup(struct tfrc_loss_hist *lh) lh->ring[LIH_INDEX(lh->counter)] = NULL; } } -EXPORT_SYMBOL_GPL(tfrc_lh_cleanup); static void tfrc_lh_calc_i_mean(struct tfrc_loss_hist *lh) { @@ -121,7 +120,6 @@ u8 tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *skb) return (lh->i_mean < old_i_mean); } -EXPORT_SYMBOL_GPL(tfrc_lh_update_i_mean); /* Determine if `new_loss' does begin a new loss interval [RFC 4342, 10.2] */ static inline u8 tfrc_lh_is_new_loss(struct tfrc_loss_interval *cur, @@ -169,7 +167,6 @@ int tfrc_lh_interval_add(struct tfrc_loss_hist *lh, struct tfrc_rx_hist *rh, } return 1; } -EXPORT_SYMBOL_GPL(tfrc_lh_interval_add); int __init tfrc_li_init(void) { diff --git a/net/dccp/ccids/lib/packet_history.c b/net/dccp/ccids/lib/packet_history.c index 6cc108afdc3..b7785b3581e 100644 --- a/net/dccp/ccids/lib/packet_history.c +++ b/net/dccp/ccids/lib/packet_history.c @@ -94,7 +94,6 @@ int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno) *headp = entry; return 0; } -EXPORT_SYMBOL_GPL(tfrc_tx_hist_add); void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp) { @@ -109,7 +108,6 @@ void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp) *headp = NULL; } -EXPORT_SYMBOL_GPL(tfrc_tx_hist_purge); u32 tfrc_tx_hist_rtt(struct tfrc_tx_hist_entry *head, const u64 seqno, const ktime_t now) @@ -127,7 +125,6 @@ u32 tfrc_tx_hist_rtt(struct tfrc_tx_hist_entry *head, const u64 seqno, return rtt; } -EXPORT_SYMBOL_GPL(tfrc_tx_hist_rtt); /* @@ -172,7 +169,6 @@ void tfrc_rx_hist_add_packet(struct tfrc_rx_hist *h, tfrc_rx_hist_entry_from_skb(entry, skb, ndp); } -EXPORT_SYMBOL_GPL(tfrc_rx_hist_add_packet); /* has the packet contained in skb been seen before? */ int tfrc_rx_hist_duplicate(struct tfrc_rx_hist *h, struct sk_buff *skb) @@ -189,7 +185,6 @@ int tfrc_rx_hist_duplicate(struct tfrc_rx_hist *h, struct sk_buff *skb) return 0; } -EXPORT_SYMBOL_GPL(tfrc_rx_hist_duplicate); static void tfrc_rx_hist_swap(struct tfrc_rx_hist *h, const u8 a, const u8 b) { @@ -390,7 +385,6 @@ int tfrc_rx_handle_loss(struct tfrc_rx_hist *h, } return is_new_loss; } -EXPORT_SYMBOL_GPL(tfrc_rx_handle_loss); int tfrc_rx_hist_alloc(struct tfrc_rx_hist *h) { @@ -412,7 +406,6 @@ out_free: } return -ENOBUFS; } -EXPORT_SYMBOL_GPL(tfrc_rx_hist_alloc); void tfrc_rx_hist_purge(struct tfrc_rx_hist *h) { @@ -424,7 +417,6 @@ void tfrc_rx_hist_purge(struct tfrc_rx_hist *h) h->ring[i] = NULL; } } -EXPORT_SYMBOL_GPL(tfrc_rx_hist_purge); /** * tfrc_rx_hist_rtt_last_s - reference entry to compute RTT samples against @@ -495,4 +487,3 @@ keep_ref_for_next_time: return sample; } -EXPORT_SYMBOL_GPL(tfrc_rx_hist_sample_rtt); diff --git a/net/dccp/ccids/lib/tfrc.c b/net/dccp/ccids/lib/tfrc.c index 185916218e0..60c412ccfee 100644 --- a/net/dccp/ccids/lib/tfrc.c +++ b/net/dccp/ccids/lib/tfrc.c @@ -1,20 +1,18 @@ /* - * TFRC: main module holding the pieces of the TFRC library together + * TFRC library initialisation * * Copyright (c) 2007 The University of Aberdeen, Scotland, UK * Copyright (c) 2007 Arnaldo Carvalho de Melo <acme@redhat.com> */ -#include <linux/module.h> -#include <linux/moduleparam.h> #include "tfrc.h" #ifdef CONFIG_IP_DCCP_TFRC_DEBUG int tfrc_debug; module_param(tfrc_debug, bool, 0644); -MODULE_PARM_DESC(tfrc_debug, "Enable debug messages"); +MODULE_PARM_DESC(tfrc_debug, "Enable TFRC debug messages"); #endif -static int __init tfrc_module_init(void) +int __init tfrc_lib_init(void) { int rc = tfrc_li_init(); @@ -38,18 +36,9 @@ out: return rc; } -static void __exit tfrc_module_exit(void) +void __exit tfrc_lib_exit(void) { tfrc_rx_packet_history_exit(); tfrc_tx_packet_history_exit(); tfrc_li_exit(); } - -module_init(tfrc_module_init); -module_exit(tfrc_module_exit); - -MODULE_AUTHOR("Gerrit Renker <gerrit@erg.abdn.ac.uk>, " - "Ian McDonald <ian.mcdonald@jandi.co.nz>, " - "Arnaldo Carvalho de Melo <acme@redhat.com>"); -MODULE_DESCRIPTION("DCCP TFRC library"); -MODULE_LICENSE("GPL"); diff --git a/net/dccp/ccids/lib/tfrc.h b/net/dccp/ccids/lib/tfrc.h index ed9857527ac..e9720b14327 100644 --- a/net/dccp/ccids/lib/tfrc.h +++ b/net/dccp/ccids/lib/tfrc.h @@ -17,7 +17,8 @@ #include <linux/types.h> #include <linux/math64.h> #include "../../dccp.h" -/* internal includes that this module exports: */ + +/* internal includes that this library exports: */ #include "loss_interval.h" #include "packet_history.h" @@ -66,4 +67,12 @@ extern void tfrc_rx_packet_history_exit(void); extern int tfrc_li_init(void); extern void tfrc_li_exit(void); + +#ifdef CONFIG_IP_DCCP_TFRC_LIB +extern int tfrc_lib_init(void); +extern void tfrc_lib_exit(void); +#else +#define tfrc_lib_init() (0) +#define tfrc_lib_exit() +#endif #endif /* _TFRC_H_ */ diff --git a/net/dccp/ccids/lib/tfrc_equation.c b/net/dccp/ccids/lib/tfrc_equation.c index 2f20a29cffe..c5d3a9e5a5a 100644 --- a/net/dccp/ccids/lib/tfrc_equation.c +++ b/net/dccp/ccids/lib/tfrc_equation.c @@ -659,8 +659,6 @@ u32 tfrc_calc_x(u16 s, u32 R, u32 p) return scaled_div32(result, f); } -EXPORT_SYMBOL_GPL(tfrc_calc_x); - /** * tfrc_calc_x_reverse_lookup - try to find p given f(p) * @@ -693,5 +691,3 @@ u32 tfrc_calc_x_reverse_lookup(u32 fvalue) index = tfrc_binsearch(fvalue, 0); return (index + 1) * 1000000 / TFRC_CALC_X_ARRSIZE; } - -EXPORT_SYMBOL_GPL(tfrc_calc_x_reverse_lookup); diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index b4bc6e095a0..f2230fc168e 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -49,7 +49,7 @@ extern int dccp_debug; extern struct inet_hashinfo dccp_hashinfo; -extern atomic_t dccp_orphan_count; +extern struct percpu_counter dccp_orphan_count; extern void dccp_time_wait(struct sock *sk, int state, int timeo); @@ -98,9 +98,6 @@ extern int sysctl_dccp_retries2; extern int sysctl_dccp_feat_sequence_window; extern int sysctl_dccp_feat_rx_ccid; extern int sysctl_dccp_feat_tx_ccid; -extern int sysctl_dccp_feat_ack_ratio; -extern int sysctl_dccp_feat_send_ack_vector; -extern int sysctl_dccp_feat_send_ndp_count; extern int sysctl_dccp_tx_qlen; extern int sysctl_dccp_sync_ratelimit; @@ -252,7 +249,8 @@ extern const char *dccp_state_name(const int state); extern void dccp_set_state(struct sock *sk, const int state); extern void dccp_done(struct sock *sk); -extern void dccp_reqsk_init(struct request_sock *req, struct sk_buff *skb); +extern int dccp_reqsk_init(struct request_sock *rq, struct dccp_sock const *dp, + struct sk_buff const *skb); extern int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb); @@ -434,13 +432,18 @@ static inline int dccp_ack_pending(const struct sock *sk) { const struct dccp_sock *dp = dccp_sk(sk); return dp->dccps_timestamp_echo != 0 || -#ifdef CONFIG_IP_DCCP_ACKVEC - (dccp_msk(sk)->dccpms_send_ack_vector && + (dp->dccps_hc_rx_ackvec != NULL && dccp_ackvec_pending(dp->dccps_hc_rx_ackvec)) || -#endif inet_csk_ack_scheduled(sk); } +extern int dccp_feat_finalise_settings(struct dccp_sock *dp); +extern int dccp_feat_server_ccid_dependencies(struct dccp_request_sock *dreq); +extern int dccp_feat_insert_opts(struct dccp_sock*, struct dccp_request_sock*, + struct sk_buff *skb); +extern int dccp_feat_activate_values(struct sock *sk, struct list_head *fn); +extern void dccp_feat_list_purge(struct list_head *fn_list); + extern int dccp_insert_options(struct sock *sk, struct sk_buff *skb); extern int dccp_insert_options_rsk(struct dccp_request_sock*, struct sk_buff*); extern int dccp_insert_option_elapsed_time(struct sock *sk, diff --git a/net/dccp/diag.c b/net/dccp/diag.c index d8a3509b26f..b21f261da75 100644 --- a/net/dccp/diag.c +++ b/net/dccp/diag.c @@ -29,11 +29,14 @@ static void dccp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_backoff = icsk->icsk_backoff; info->tcpi_pmtu = icsk->icsk_pmtu_cookie; - if (dccp_msk(sk)->dccpms_send_ack_vector) + if (dp->dccps_hc_rx_ackvec != NULL) info->tcpi_options |= TCPI_OPT_SACK; - ccid_hc_rx_get_info(dp->dccps_hc_rx_ccid, sk, info); - ccid_hc_tx_get_info(dp->dccps_hc_tx_ccid, sk, info); + if (dp->dccps_hc_rx_ccid != NULL) + ccid_hc_rx_get_info(dp->dccps_hc_rx_ccid, sk, info); + + if (dp->dccps_hc_tx_ccid != NULL) + ccid_hc_tx_get_info(dp->dccps_hc_tx_ccid, sk, info); } static void dccp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, @@ -45,7 +48,7 @@ static void dccp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, dccp_get_info(sk, _info); } -static struct inet_diag_handler dccp_diag_handler = { +static const struct inet_diag_handler dccp_diag_handler = { .idiag_hashinfo = &dccp_hashinfo, .idiag_get_info = dccp_diag_get_info, .idiag_type = DCCPDIAG_GETSOCK, diff --git a/net/dccp/feat.c b/net/dccp/feat.c index 933a0ecf8d4..4152308958a 100644 --- a/net/dccp/feat.c +++ b/net/dccp/feat.c @@ -1,11 +1,17 @@ /* * net/dccp/feat.c * - * An implementation of the DCCP protocol - * Andrea Bittau <a.bittau@cs.ucl.ac.uk> + * Feature negotiation for the DCCP protocol (RFC 4340, section 6) + * + * Copyright (c) 2008 Gerrit Renker <gerrit@erg.abdn.ac.uk> + * Rewrote from scratch, some bits from earlier code by + * Copyright (c) 2005 Andrea Bittau <a.bittau@cs.ucl.ac.uk> + * * * ASSUMPTIONS * ----------- + * o Feature negotiation is coordinated with connection setup (as in TCP), wild + * changes of parameters of an established connection are not supported. * o All currently known SP features have 1-byte quantities. If in the future * extensions of RFCs 4340..42 define features with item lengths larger than * one byte, a feature-specific extension of the code will be required. @@ -15,597 +21,1185 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. */ - #include <linux/module.h> - #include "ccid.h" #include "feat.h" -#define DCCP_FEAT_SP_NOAGREE (-123) - -int dccp_feat_change(struct dccp_minisock *dmsk, u8 type, u8 feature, - u8 *val, u8 len, gfp_t gfp) -{ - struct dccp_opt_pend *opt; - - dccp_feat_debug(type, feature, *val); - - if (len > 3) { - DCCP_WARN("invalid length %d\n", len); - return -EINVAL; - } - /* XXX add further sanity checks */ - - /* check if that feature is already being negotiated */ - list_for_each_entry(opt, &dmsk->dccpms_pending, dccpop_node) { - /* ok we found a negotiation for this option already */ - if (opt->dccpop_feat == feature && opt->dccpop_type == type) { - dccp_pr_debug("Replacing old\n"); - /* replace */ - BUG_ON(opt->dccpop_val == NULL); - kfree(opt->dccpop_val); - opt->dccpop_val = val; - opt->dccpop_len = len; - opt->dccpop_conf = 0; - return 0; - } - } - - /* negotiation for a new feature */ - opt = kmalloc(sizeof(*opt), gfp); - if (opt == NULL) - return -ENOMEM; - - opt->dccpop_type = type; - opt->dccpop_feat = feature; - opt->dccpop_len = len; - opt->dccpop_val = val; - opt->dccpop_conf = 0; - opt->dccpop_sc = NULL; - - BUG_ON(opt->dccpop_val == NULL); - - list_add_tail(&opt->dccpop_node, &dmsk->dccpms_pending); - return 0; -} - -EXPORT_SYMBOL_GPL(dccp_feat_change); - -static int dccp_feat_update_ccid(struct sock *sk, u8 type, u8 new_ccid_nr) +/* + * Feature activation handlers. + * + * These all use an u64 argument, to provide enough room for NN/SP features. At + * this stage the negotiated values have been checked to be within their range. + */ +static int dccp_hdlr_ccid(struct sock *sk, u64 ccid, bool rx) { struct dccp_sock *dp = dccp_sk(sk); - struct dccp_minisock *dmsk = dccp_msk(sk); - /* figure out if we are changing our CCID or the peer's */ - const int rx = type == DCCPO_CHANGE_R; - const u8 ccid_nr = rx ? dmsk->dccpms_rx_ccid : dmsk->dccpms_tx_ccid; - struct ccid *new_ccid; + struct ccid *new_ccid = ccid_new(ccid, sk, rx); - /* Check if nothing is being changed. */ - if (ccid_nr == new_ccid_nr) - return 0; - - new_ccid = ccid_new(new_ccid_nr, sk, rx, GFP_ATOMIC); if (new_ccid == NULL) return -ENOMEM; if (rx) { ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk); dp->dccps_hc_rx_ccid = new_ccid; - dmsk->dccpms_rx_ccid = new_ccid_nr; } else { ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk); dp->dccps_hc_tx_ccid = new_ccid; - dmsk->dccpms_tx_ccid = new_ccid_nr; } + return 0; +} +static int dccp_hdlr_seq_win(struct sock *sk, u64 seq_win, bool rx) +{ + if (!rx) + dccp_msk(sk)->dccpms_sequence_window = seq_win; return 0; } -static int dccp_feat_update(struct sock *sk, u8 type, u8 feat, u8 val) +static int dccp_hdlr_ack_ratio(struct sock *sk, u64 ratio, bool rx) { - dccp_feat_debug(type, feat, val); + if (rx) + dccp_sk(sk)->dccps_r_ack_ratio = ratio; + else + dccp_sk(sk)->dccps_l_ack_ratio = ratio; + return 0; +} - switch (feat) { - case DCCPF_CCID: - return dccp_feat_update_ccid(sk, type, val); - default: - dccp_pr_debug("UNIMPLEMENTED: %s(%d, ...)\n", - dccp_feat_typename(type), feat); - break; +static int dccp_hdlr_ackvec(struct sock *sk, u64 enable, bool rx) +{ + struct dccp_sock *dp = dccp_sk(sk); + + if (rx) { + if (enable && dp->dccps_hc_rx_ackvec == NULL) { + dp->dccps_hc_rx_ackvec = dccp_ackvec_alloc(gfp_any()); + if (dp->dccps_hc_rx_ackvec == NULL) + return -ENOMEM; + } else if (!enable) { + dccp_ackvec_free(dp->dccps_hc_rx_ackvec); + dp->dccps_hc_rx_ackvec = NULL; + } } return 0; } -static int dccp_feat_reconcile(struct sock *sk, struct dccp_opt_pend *opt, - u8 *rpref, u8 rlen) +static int dccp_hdlr_ndp(struct sock *sk, u64 enable, bool rx) { - struct dccp_sock *dp = dccp_sk(sk); - u8 *spref, slen, *res = NULL; - int i, j, rc, agree = 1; + if (!rx) + dccp_sk(sk)->dccps_send_ndp_count = (enable > 0); + return 0; +} - BUG_ON(rpref == NULL); +/* + * Minimum Checksum Coverage is located at the RX side (9.2.1). This means that + * `rx' holds when the sending peer informs about his partial coverage via a + * ChangeR() option. In the other case, we are the sender and the receiver + * announces its coverage via ChangeL() options. The policy here is to honour + * such communication by enabling the corresponding partial coverage - but only + * if it has not been set manually before; the warning here means that all + * packets will be dropped. + */ +static int dccp_hdlr_min_cscov(struct sock *sk, u64 cscov, bool rx) +{ + struct dccp_sock *dp = dccp_sk(sk); - /* check if we are the black sheep */ - if (dp->dccps_role == DCCP_ROLE_CLIENT) { - spref = rpref; - slen = rlen; - rpref = opt->dccpop_val; - rlen = opt->dccpop_len; - } else { - spref = opt->dccpop_val; - slen = opt->dccpop_len; + if (rx) + dp->dccps_pcrlen = cscov; + else { + if (dp->dccps_pcslen == 0) + dp->dccps_pcslen = cscov; + else if (cscov > dp->dccps_pcslen) + DCCP_WARN("CsCov %u too small, peer requires >= %u\n", + dp->dccps_pcslen, (u8)cscov); } - /* - * Now we have server preference list in spref and client preference in - * rpref - */ - BUG_ON(spref == NULL); - BUG_ON(rpref == NULL); + return 0; +} - /* FIXME sanity check vals */ +static const struct { + u8 feat_num; /* DCCPF_xxx */ + enum dccp_feat_type rxtx; /* RX or TX */ + enum dccp_feat_type reconciliation; /* SP or NN */ + u8 default_value; /* as in 6.4 */ + int (*activation_hdlr)(struct sock *sk, u64 val, bool rx); +/* + * Lookup table for location and type of features (from RFC 4340/4342) + * +--------------------------+----+-----+----+----+---------+-----------+ + * | Feature | Location | Reconc. | Initial | Section | + * | | RX | TX | SP | NN | Value | Reference | + * +--------------------------+----+-----+----+----+---------+-----------+ + * | DCCPF_CCID | | X | X | | 2 | 10 | + * | DCCPF_SHORT_SEQNOS | | X | X | | 0 | 7.6.1 | + * | DCCPF_SEQUENCE_WINDOW | | X | | X | 100 | 7.5.2 | + * | DCCPF_ECN_INCAPABLE | X | | X | | 0 | 12.1 | + * | DCCPF_ACK_RATIO | | X | | X | 2 | 11.3 | + * | DCCPF_SEND_ACK_VECTOR | X | | X | | 0 | 11.5 | + * | DCCPF_SEND_NDP_COUNT | | X | X | | 0 | 7.7.2 | + * | DCCPF_MIN_CSUM_COVER | X | | X | | 0 | 9.2.1 | + * | DCCPF_DATA_CHECKSUM | X | | X | | 0 | 9.3.1 | + * | DCCPF_SEND_LEV_RATE | X | | X | | 0 | 4342/8.4 | + * +--------------------------+----+-----+----+----+---------+-----------+ + */ +} dccp_feat_table[] = { + { DCCPF_CCID, FEAT_AT_TX, FEAT_SP, 2, dccp_hdlr_ccid }, + { DCCPF_SHORT_SEQNOS, FEAT_AT_TX, FEAT_SP, 0, NULL }, + { DCCPF_SEQUENCE_WINDOW, FEAT_AT_TX, FEAT_NN, 100, dccp_hdlr_seq_win }, + { DCCPF_ECN_INCAPABLE, FEAT_AT_RX, FEAT_SP, 0, NULL }, + { DCCPF_ACK_RATIO, FEAT_AT_TX, FEAT_NN, 2, dccp_hdlr_ack_ratio}, + { DCCPF_SEND_ACK_VECTOR, FEAT_AT_RX, FEAT_SP, 0, dccp_hdlr_ackvec }, + { DCCPF_SEND_NDP_COUNT, FEAT_AT_TX, FEAT_SP, 0, dccp_hdlr_ndp }, + { DCCPF_MIN_CSUM_COVER, FEAT_AT_RX, FEAT_SP, 0, dccp_hdlr_min_cscov}, + { DCCPF_DATA_CHECKSUM, FEAT_AT_RX, FEAT_SP, 0, NULL }, + { DCCPF_SEND_LEV_RATE, FEAT_AT_RX, FEAT_SP, 0, NULL }, +}; +#define DCCP_FEAT_SUPPORTED_MAX ARRAY_SIZE(dccp_feat_table) + +/** + * dccp_feat_index - Hash function to map feature number into array position + * Returns consecutive array index or -1 if the feature is not understood. + */ +static int dccp_feat_index(u8 feat_num) +{ + /* The first 9 entries are occupied by the types from RFC 4340, 6.4 */ + if (feat_num > DCCPF_RESERVED && feat_num <= DCCPF_DATA_CHECKSUM) + return feat_num - 1; - /* Are values in any order? XXX Lame "algorithm" here */ - for (i = 0; i < slen; i++) { - for (j = 0; j < rlen; j++) { - if (spref[i] == rpref[j]) { - res = &spref[i]; - break; - } - } - if (res) - break; + /* + * Other features: add cases for new feature types here after adding + * them to the above table. + */ + switch (feat_num) { + case DCCPF_SEND_LEV_RATE: + return DCCP_FEAT_SUPPORTED_MAX - 1; } + return -1; +} - /* we didn't agree on anything */ - if (res == NULL) { - /* confirm previous value */ - switch (opt->dccpop_feat) { - case DCCPF_CCID: - /* XXX did i get this right? =P */ - if (opt->dccpop_type == DCCPO_CHANGE_L) - res = &dccp_msk(sk)->dccpms_tx_ccid; - else - res = &dccp_msk(sk)->dccpms_rx_ccid; - break; +static u8 dccp_feat_type(u8 feat_num) +{ + int idx = dccp_feat_index(feat_num); - default: - DCCP_BUG("Fell through, feat=%d", opt->dccpop_feat); - /* XXX implement res */ - return -EFAULT; - } + if (idx < 0) + return FEAT_UNKNOWN; + return dccp_feat_table[idx].reconciliation; +} - dccp_pr_debug("Don't agree... reconfirming %d\n", *res); - agree = 0; /* this is used for mandatory options... */ - } +static int dccp_feat_default_value(u8 feat_num) +{ + int idx = dccp_feat_index(feat_num); + /* + * There are no default values for unknown features, so encountering a + * negative index here indicates a serious problem somewhere else. + */ + DCCP_BUG_ON(idx < 0); - /* need to put result and our preference list */ - rlen = 1 + opt->dccpop_len; - rpref = kmalloc(rlen, GFP_ATOMIC); - if (rpref == NULL) - return -ENOMEM; + return idx < 0 ? 0 : dccp_feat_table[idx].default_value; +} + +static int __dccp_feat_activate(struct sock *sk, const int idx, + const bool is_local, dccp_feat_val const *fval) +{ + bool rx; + u64 val; - *rpref = *res; - memcpy(&rpref[1], opt->dccpop_val, opt->dccpop_len); + if (idx < 0 || idx >= DCCP_FEAT_SUPPORTED_MAX) + return -1; + if (dccp_feat_table[idx].activation_hdlr == NULL) + return 0; - /* put it in the "confirm queue" */ - if (opt->dccpop_sc == NULL) { - opt->dccpop_sc = kmalloc(sizeof(*opt->dccpop_sc), GFP_ATOMIC); - if (opt->dccpop_sc == NULL) { - kfree(rpref); - return -ENOMEM; + if (fval == NULL) { + val = dccp_feat_table[idx].default_value; + } else if (dccp_feat_table[idx].reconciliation == FEAT_SP) { + if (fval->sp.vec == NULL) { + /* + * This can happen when an empty Confirm is sent + * for an SP (i.e. known) feature. In this case + * we would be using the default anyway. + */ + DCCP_CRIT("Feature #%d undefined: using default", idx); + val = dccp_feat_table[idx].default_value; + } else { + val = fval->sp.vec[0]; } } else { - /* recycle the confirm slot */ - BUG_ON(opt->dccpop_sc->dccpoc_val == NULL); - kfree(opt->dccpop_sc->dccpoc_val); - dccp_pr_debug("recycling confirm slot\n"); + val = fval->nn; } - memset(opt->dccpop_sc, 0, sizeof(*opt->dccpop_sc)); - opt->dccpop_sc->dccpoc_val = rpref; - opt->dccpop_sc->dccpoc_len = rlen; + /* Location is RX if this is a local-RX or remote-TX feature */ + rx = (is_local == (dccp_feat_table[idx].rxtx == FEAT_AT_RX)); - /* update the option on our side [we are about to send the confirm] */ - rc = dccp_feat_update(sk, opt->dccpop_type, opt->dccpop_feat, *res); - if (rc) { - kfree(opt->dccpop_sc->dccpoc_val); - kfree(opt->dccpop_sc); - opt->dccpop_sc = NULL; - return rc; - } + return dccp_feat_table[idx].activation_hdlr(sk, val, rx); +} - dccp_pr_debug("Will confirm %d\n", *rpref); +/* Test for "Req'd" feature (RFC 4340, 6.4) */ +static inline int dccp_feat_must_be_understood(u8 feat_num) +{ + return feat_num == DCCPF_CCID || feat_num == DCCPF_SHORT_SEQNOS || + feat_num == DCCPF_SEQUENCE_WINDOW; +} - /* say we want to change to X but we just got a confirm X, suppress our - * change - */ - if (!opt->dccpop_conf) { - if (*opt->dccpop_val == *res) - opt->dccpop_conf = 1; - dccp_pr_debug("won't ask for change of same feature\n"); +/* copy constructor, fval must not already contain allocated memory */ +static int dccp_feat_clone_sp_val(dccp_feat_val *fval, u8 const *val, u8 len) +{ + fval->sp.len = len; + if (fval->sp.len > 0) { + fval->sp.vec = kmemdup(val, len, gfp_any()); + if (fval->sp.vec == NULL) { + fval->sp.len = 0; + return -ENOBUFS; + } } + return 0; +} - return agree ? 0 : DCCP_FEAT_SP_NOAGREE; /* used for mandatory opts */ +static void dccp_feat_val_destructor(u8 feat_num, dccp_feat_val *val) +{ + if (unlikely(val == NULL)) + return; + if (dccp_feat_type(feat_num) == FEAT_SP) + kfree(val->sp.vec); + memset(val, 0, sizeof(*val)); } -static int dccp_feat_sp(struct sock *sk, u8 type, u8 feature, u8 *val, u8 len) +static struct dccp_feat_entry * + dccp_feat_clone_entry(struct dccp_feat_entry const *original) { - struct dccp_minisock *dmsk = dccp_msk(sk); - struct dccp_opt_pend *opt; - int rc = 1; - u8 t; + struct dccp_feat_entry *new; + u8 type = dccp_feat_type(original->feat_num); - /* - * We received a CHANGE. We gotta match it against our own preference - * list. If we got a CHANGE_R it means it's a change for us, so we need - * to compare our CHANGE_L list. - */ - if (type == DCCPO_CHANGE_L) - t = DCCPO_CHANGE_R; - else - t = DCCPO_CHANGE_L; + if (type == FEAT_UNKNOWN) + return NULL; - /* find our preference list for this feature */ - list_for_each_entry(opt, &dmsk->dccpms_pending, dccpop_node) { - if (opt->dccpop_type != t || opt->dccpop_feat != feature) - continue; + new = kmemdup(original, sizeof(struct dccp_feat_entry), gfp_any()); + if (new == NULL) + return NULL; + + if (type == FEAT_SP && dccp_feat_clone_sp_val(&new->val, + original->val.sp.vec, + original->val.sp.len)) { + kfree(new); + return NULL; + } + return new; +} - /* find the winner from the two preference lists */ - rc = dccp_feat_reconcile(sk, opt, val, len); - break; +static void dccp_feat_entry_destructor(struct dccp_feat_entry *entry) +{ + if (entry != NULL) { + dccp_feat_val_destructor(entry->feat_num, &entry->val); + kfree(entry); } +} - /* We didn't deal with the change. This can happen if we have no - * preference list for the feature. In fact, it just shouldn't - * happen---if we understand a feature, we should have a preference list - * with at least the default value. - */ - BUG_ON(rc == 1); +/* + * List management functions + * + * Feature negotiation lists rely on and maintain the following invariants: + * - each feat_num in the list is known, i.e. we know its type and default value + * - each feat_num/is_local combination is unique (old entries are overwritten) + * - SP values are always freshly allocated + * - list is sorted in increasing order of feature number (faster lookup) + */ +static struct dccp_feat_entry *dccp_feat_list_lookup(struct list_head *fn_list, + u8 feat_num, bool is_local) +{ + struct dccp_feat_entry *entry; - return rc; + list_for_each_entry(entry, fn_list, node) { + if (entry->feat_num == feat_num && entry->is_local == is_local) + return entry; + else if (entry->feat_num > feat_num) + break; + } + return NULL; } -static int dccp_feat_nn(struct sock *sk, u8 type, u8 feature, u8 *val, u8 len) +/** + * dccp_feat_entry_new - Central list update routine (called by all others) + * @head: list to add to + * @feat: feature number + * @local: whether the local (1) or remote feature with number @feat is meant + * This is the only constructor and serves to ensure the above invariants. + */ +static struct dccp_feat_entry * + dccp_feat_entry_new(struct list_head *head, u8 feat, bool local) { - struct dccp_opt_pend *opt; - struct dccp_minisock *dmsk = dccp_msk(sk); - u8 *copy; - int rc; + struct dccp_feat_entry *entry; + + list_for_each_entry(entry, head, node) + if (entry->feat_num == feat && entry->is_local == local) { + dccp_feat_val_destructor(entry->feat_num, &entry->val); + return entry; + } else if (entry->feat_num > feat) { + head = &entry->node; + break; + } - /* NN features must be Change L (sec. 6.3.2) */ - if (type != DCCPO_CHANGE_L) { - dccp_pr_debug("received %s for NN feature %d\n", - dccp_feat_typename(type), feature); - return -EFAULT; + entry = kmalloc(sizeof(*entry), gfp_any()); + if (entry != NULL) { + entry->feat_num = feat; + entry->is_local = local; + list_add_tail(&entry->node, head); } + return entry; +} - /* XXX sanity check opt val */ +/** + * dccp_feat_push_change - Add/overwrite a Change option in the list + * @fn_list: feature-negotiation list to update + * @feat: one of %dccp_feature_numbers + * @local: whether local (1) or remote (0) @feat_num is meant + * @needs_mandatory: whether to use Mandatory feature negotiation options + * @fval: pointer to NN/SP value to be inserted (will be copied) + */ +static int dccp_feat_push_change(struct list_head *fn_list, u8 feat, u8 local, + u8 mandatory, dccp_feat_val *fval) +{ + struct dccp_feat_entry *new = dccp_feat_entry_new(fn_list, feat, local); - /* copy option so we can confirm it */ - opt = kzalloc(sizeof(*opt), GFP_ATOMIC); - if (opt == NULL) + if (new == NULL) return -ENOMEM; - copy = kmemdup(val, len, GFP_ATOMIC); - if (copy == NULL) { - kfree(opt); - return -ENOMEM; - } + new->feat_num = feat; + new->is_local = local; + new->state = FEAT_INITIALISING; + new->needs_confirm = 0; + new->empty_confirm = 0; + new->val = *fval; + new->needs_mandatory = mandatory; - opt->dccpop_type = DCCPO_CONFIRM_R; /* NN can only confirm R */ - opt->dccpop_feat = feature; - opt->dccpop_val = copy; - opt->dccpop_len = len; + return 0; +} - /* change feature */ - rc = dccp_feat_update(sk, type, feature, *val); - if (rc) { - kfree(opt->dccpop_val); - kfree(opt); - return rc; - } +/** + * dccp_feat_push_confirm - Add a Confirm entry to the FN list + * @fn_list: feature-negotiation list to add to + * @feat: one of %dccp_feature_numbers + * @local: whether local (1) or remote (0) @feat_num is being confirmed + * @fval: pointer to NN/SP value to be inserted or NULL + * Returns 0 on success, a Reset code for further processing otherwise. + */ +static int dccp_feat_push_confirm(struct list_head *fn_list, u8 feat, u8 local, + dccp_feat_val *fval) +{ + struct dccp_feat_entry *new = dccp_feat_entry_new(fn_list, feat, local); - dccp_feat_debug(type, feature, *copy); + if (new == NULL) + return DCCP_RESET_CODE_TOO_BUSY; - list_add_tail(&opt->dccpop_node, &dmsk->dccpms_conf); + new->feat_num = feat; + new->is_local = local; + new->state = FEAT_STABLE; /* transition in 6.6.2 */ + new->needs_confirm = 1; + new->empty_confirm = (fval == NULL); + new->val.nn = 0; /* zeroes the whole structure */ + if (!new->empty_confirm) + new->val = *fval; + new->needs_mandatory = 0; return 0; } -static void dccp_feat_empty_confirm(struct dccp_minisock *dmsk, - u8 type, u8 feature) +static int dccp_push_empty_confirm(struct list_head *fn_list, u8 feat, u8 local) { - /* XXX check if other confirms for that are queued and recycle slot */ - struct dccp_opt_pend *opt = kzalloc(sizeof(*opt), GFP_ATOMIC); + return dccp_feat_push_confirm(fn_list, feat, local, NULL); +} - if (opt == NULL) { - /* XXX what do we do? Ignoring should be fine. It's a change - * after all =P - */ - return; - } +static inline void dccp_feat_list_pop(struct dccp_feat_entry *entry) +{ + list_del(&entry->node); + dccp_feat_entry_destructor(entry); +} - switch (type) { - case DCCPO_CHANGE_L: - opt->dccpop_type = DCCPO_CONFIRM_R; - break; - case DCCPO_CHANGE_R: - opt->dccpop_type = DCCPO_CONFIRM_L; - break; - default: - DCCP_WARN("invalid type %d\n", type); - kfree(opt); - return; +void dccp_feat_list_purge(struct list_head *fn_list) +{ + struct dccp_feat_entry *entry, *next; + + list_for_each_entry_safe(entry, next, fn_list, node) + dccp_feat_entry_destructor(entry); + INIT_LIST_HEAD(fn_list); +} +EXPORT_SYMBOL_GPL(dccp_feat_list_purge); + +/* generate @to as full clone of @from - @to must not contain any nodes */ +int dccp_feat_clone_list(struct list_head const *from, struct list_head *to) +{ + struct dccp_feat_entry *entry, *new; + + INIT_LIST_HEAD(to); + list_for_each_entry(entry, from, node) { + new = dccp_feat_clone_entry(entry); + if (new == NULL) + goto cloning_failed; + list_add_tail(&new->node, to); } - opt->dccpop_feat = feature; - opt->dccpop_val = NULL; - opt->dccpop_len = 0; + return 0; - /* change feature */ - dccp_pr_debug("Empty %s(%d)\n", dccp_feat_typename(type), feature); +cloning_failed: + dccp_feat_list_purge(to); + return -ENOMEM; +} + +/** + * dccp_feat_valid_nn_length - Enforce length constraints on NN options + * Length is between 0 and %DCCP_OPTVAL_MAXLEN. Used for outgoing packets only, + * incoming options are accepted as long as their values are valid. + */ +static u8 dccp_feat_valid_nn_length(u8 feat_num) +{ + if (feat_num == DCCPF_ACK_RATIO) /* RFC 4340, 11.3 and 6.6.8 */ + return 2; + if (feat_num == DCCPF_SEQUENCE_WINDOW) /* RFC 4340, 7.5.2 and 6.5 */ + return 6; + return 0; +} - list_add_tail(&opt->dccpop_node, &dmsk->dccpms_conf); +static u8 dccp_feat_is_valid_nn_val(u8 feat_num, u64 val) +{ + switch (feat_num) { + case DCCPF_ACK_RATIO: + return val <= DCCPF_ACK_RATIO_MAX; + case DCCPF_SEQUENCE_WINDOW: + return val >= DCCPF_SEQ_WMIN && val <= DCCPF_SEQ_WMAX; + } + return 0; /* feature unknown - so we can't tell */ } -static void dccp_feat_flush_confirm(struct sock *sk) +/* check that SP values are within the ranges defined in RFC 4340 */ +static u8 dccp_feat_is_valid_sp_val(u8 feat_num, u8 val) { - struct dccp_minisock *dmsk = dccp_msk(sk); - /* Check if there is anything to confirm in the first place */ - int yes = !list_empty(&dmsk->dccpms_conf); + switch (feat_num) { + case DCCPF_CCID: + return val == DCCPC_CCID2 || val == DCCPC_CCID3; + /* Type-check Boolean feature values: */ + case DCCPF_SHORT_SEQNOS: + case DCCPF_ECN_INCAPABLE: + case DCCPF_SEND_ACK_VECTOR: + case DCCPF_SEND_NDP_COUNT: + case DCCPF_DATA_CHECKSUM: + case DCCPF_SEND_LEV_RATE: + return val < 2; + case DCCPF_MIN_CSUM_COVER: + return val < 16; + } + return 0; /* feature unknown */ +} - if (!yes) { - struct dccp_opt_pend *opt; +static u8 dccp_feat_sp_list_ok(u8 feat_num, u8 const *sp_list, u8 sp_len) +{ + if (sp_list == NULL || sp_len < 1) + return 0; + while (sp_len--) + if (!dccp_feat_is_valid_sp_val(feat_num, *sp_list++)) + return 0; + return 1; +} - list_for_each_entry(opt, &dmsk->dccpms_pending, dccpop_node) { - if (opt->dccpop_conf) { - yes = 1; - break; +/** + * dccp_feat_insert_opts - Generate FN options from current list state + * @skb: next sk_buff to be sent to the peer + * @dp: for client during handshake and general negotiation + * @dreq: used by the server only (all Changes/Confirms in LISTEN/RESPOND) + */ +int dccp_feat_insert_opts(struct dccp_sock *dp, struct dccp_request_sock *dreq, + struct sk_buff *skb) +{ + struct list_head *fn = dreq ? &dreq->dreq_featneg : &dp->dccps_featneg; + struct dccp_feat_entry *pos, *next; + u8 opt, type, len, *ptr, nn_in_nbo[DCCP_OPTVAL_MAXLEN]; + bool rpt; + + /* put entries into @skb in the order they appear in the list */ + list_for_each_entry_safe_reverse(pos, next, fn, node) { + opt = dccp_feat_genopt(pos); + type = dccp_feat_type(pos->feat_num); + rpt = false; + + if (pos->empty_confirm) { + len = 0; + ptr = NULL; + } else { + if (type == FEAT_SP) { + len = pos->val.sp.len; + ptr = pos->val.sp.vec; + rpt = pos->needs_confirm; + } else if (type == FEAT_NN) { + len = dccp_feat_valid_nn_length(pos->feat_num); + ptr = nn_in_nbo; + dccp_encode_value_var(pos->val.nn, ptr, len); + } else { + DCCP_BUG("unknown feature %u", pos->feat_num); + return -1; } } + + if (dccp_insert_fn_opt(skb, opt, pos->feat_num, ptr, len, rpt)) + return -1; + if (pos->needs_mandatory && dccp_insert_option_mandatory(skb)) + return -1; + /* + * Enter CHANGING after transmitting the Change option (6.6.2). + */ + if (pos->state == FEAT_INITIALISING) + pos->state = FEAT_CHANGING; } + return 0; +} - if (!yes) - return; +/** + * __feat_register_nn - Register new NN value on socket + * @fn: feature-negotiation list to register with + * @feat: an NN feature from %dccp_feature_numbers + * @mandatory: use Mandatory option if 1 + * @nn_val: value to register (restricted to 4 bytes) + * Note that NN features are local by definition (RFC 4340, 6.3.2). + */ +static int __feat_register_nn(struct list_head *fn, u8 feat, + u8 mandatory, u64 nn_val) +{ + dccp_feat_val fval = { .nn = nn_val }; + + if (dccp_feat_type(feat) != FEAT_NN || + !dccp_feat_is_valid_nn_val(feat, nn_val)) + return -EINVAL; - /* OK there is something to confirm... */ - /* XXX check if packet is in flight? Send delayed ack?? */ - if (sk->sk_state == DCCP_OPEN) - dccp_send_ack(sk); + /* Don't bother with default values, they will be activated anyway. */ + if (nn_val - (u64)dccp_feat_default_value(feat) == 0) + return 0; + + return dccp_feat_push_change(fn, feat, 1, mandatory, &fval); } -int dccp_feat_change_recv(struct sock *sk, u8 type, u8 feature, u8 *val, u8 len) +/** + * __feat_register_sp - Register new SP value/list on socket + * @fn: feature-negotiation list to register with + * @feat: an SP feature from %dccp_feature_numbers + * @is_local: whether the local (1) or the remote (0) @feat is meant + * @mandatory: use Mandatory option if 1 + * @sp_val: SP value followed by optional preference list + * @sp_len: length of @sp_val in bytes + */ +static int __feat_register_sp(struct list_head *fn, u8 feat, u8 is_local, + u8 mandatory, u8 const *sp_val, u8 sp_len) { - int rc; + dccp_feat_val fval; - dccp_feat_debug(type, feature, *val); + if (dccp_feat_type(feat) != FEAT_SP || + !dccp_feat_sp_list_ok(feat, sp_val, sp_len)) + return -EINVAL; - /* figure out if it's SP or NN feature */ - switch (feature) { - /* deal with SP features */ - case DCCPF_CCID: - rc = dccp_feat_sp(sk, type, feature, val, len); - break; + /* Avoid negotiating alien CCIDs by only advertising supported ones */ + if (feat == DCCPF_CCID && !ccid_support_check(sp_val, sp_len)) + return -EOPNOTSUPP; - /* deal with NN features */ - case DCCPF_ACK_RATIO: - rc = dccp_feat_nn(sk, type, feature, val, len); - break; + if (dccp_feat_clone_sp_val(&fval, sp_val, sp_len)) + return -ENOMEM; - /* XXX implement other features */ - default: - dccp_pr_debug("UNIMPLEMENTED: not handling %s(%d, ...)\n", - dccp_feat_typename(type), feature); - rc = -EFAULT; - break; - } + return dccp_feat_push_change(fn, feat, is_local, mandatory, &fval); +} - /* check if there were problems changing features */ - if (rc) { - /* If we don't agree on SP, we sent a confirm for old value. - * However we propagate rc to caller in case option was - * mandatory +/** + * dccp_feat_register_sp - Register requests to change SP feature values + * @sk: client or listening socket + * @feat: one of %dccp_feature_numbers + * @is_local: whether the local (1) or remote (0) @feat is meant + * @list: array of preferred values, in descending order of preference + * @len: length of @list in bytes + */ +int dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local, + u8 const *list, u8 len) +{ /* any changes must be registered before establishing the connection */ + if (sk->sk_state != DCCP_CLOSED) + return -EISCONN; + if (dccp_feat_type(feat) != FEAT_SP) + return -EINVAL; + return __feat_register_sp(&dccp_sk(sk)->dccps_featneg, feat, is_local, + 0, list, len); +} + +/* Analogous to dccp_feat_register_sp(), but for non-negotiable values */ +int dccp_feat_register_nn(struct sock *sk, u8 feat, u64 val) +{ + /* any changes must be registered before establishing the connection */ + if (sk->sk_state != DCCP_CLOSED) + return -EISCONN; + if (dccp_feat_type(feat) != FEAT_NN) + return -EINVAL; + return __feat_register_nn(&dccp_sk(sk)->dccps_featneg, feat, 0, val); +} + +/* + * Tracking features whose value depend on the choice of CCID + * + * This is designed with an extension in mind so that a list walk could be done + * before activating any features. However, the existing framework was found to + * work satisfactorily up until now, the automatic verification is left open. + * When adding new CCIDs, add a corresponding dependency table here. + */ +static const struct ccid_dependency *dccp_feat_ccid_deps(u8 ccid, bool is_local) +{ + static const struct ccid_dependency ccid2_dependencies[2][2] = { + /* + * CCID2 mandates Ack Vectors (RFC 4341, 4.): as CCID is a TX + * feature and Send Ack Vector is an RX feature, `is_local' + * needs to be reversed. */ - if (rc != DCCP_FEAT_SP_NOAGREE) - dccp_feat_empty_confirm(dccp_msk(sk), type, feature); + { /* Dependencies of the receiver-side (remote) CCID2 */ + { + .dependent_feat = DCCPF_SEND_ACK_VECTOR, + .is_local = true, + .is_mandatory = true, + .val = 1 + }, + { 0, 0, 0, 0 } + }, + { /* Dependencies of the sender-side (local) CCID2 */ + { + .dependent_feat = DCCPF_SEND_ACK_VECTOR, + .is_local = false, + .is_mandatory = true, + .val = 1 + }, + { 0, 0, 0, 0 } + } + }; + static const struct ccid_dependency ccid3_dependencies[2][5] = { + { /* + * Dependencies of the receiver-side CCID3 + */ + { /* locally disable Ack Vectors */ + .dependent_feat = DCCPF_SEND_ACK_VECTOR, + .is_local = true, + .is_mandatory = false, + .val = 0 + }, + { /* see below why Send Loss Event Rate is on */ + .dependent_feat = DCCPF_SEND_LEV_RATE, + .is_local = true, + .is_mandatory = true, + .val = 1 + }, + { /* NDP Count is needed as per RFC 4342, 6.1.1 */ + .dependent_feat = DCCPF_SEND_NDP_COUNT, + .is_local = false, + .is_mandatory = true, + .val = 1 + }, + { 0, 0, 0, 0 }, + }, + { /* + * CCID3 at the TX side: we request that the HC-receiver + * will not send Ack Vectors (they will be ignored, so + * Mandatory is not set); we enable Send Loss Event Rate + * (Mandatory since the implementation does not support + * the Loss Intervals option of RFC 4342, 8.6). + * The last two options are for peer's information only. + */ + { + .dependent_feat = DCCPF_SEND_ACK_VECTOR, + .is_local = false, + .is_mandatory = false, + .val = 0 + }, + { + .dependent_feat = DCCPF_SEND_LEV_RATE, + .is_local = false, + .is_mandatory = true, + .val = 1 + }, + { /* this CCID does not support Ack Ratio */ + .dependent_feat = DCCPF_ACK_RATIO, + .is_local = true, + .is_mandatory = false, + .val = 0 + }, + { /* tell receiver we are sending NDP counts */ + .dependent_feat = DCCPF_SEND_NDP_COUNT, + .is_local = true, + .is_mandatory = false, + .val = 1 + }, + { 0, 0, 0, 0 } + } + }; + switch (ccid) { + case DCCPC_CCID2: + return ccid2_dependencies[is_local]; + case DCCPC_CCID3: + return ccid3_dependencies[is_local]; + default: + return NULL; } +} - /* generate the confirm [if required] */ - dccp_feat_flush_confirm(sk); - +/** + * dccp_feat_propagate_ccid - Resolve dependencies of features on choice of CCID + * @fn: feature-negotiation list to update + * @id: CCID number to track + * @is_local: whether TX CCID (1) or RX CCID (0) is meant + * This function needs to be called after registering all other features. + */ +static int dccp_feat_propagate_ccid(struct list_head *fn, u8 id, bool is_local) +{ + const struct ccid_dependency *table = dccp_feat_ccid_deps(id, is_local); + int i, rc = (table == NULL); + + for (i = 0; rc == 0 && table[i].dependent_feat != DCCPF_RESERVED; i++) + if (dccp_feat_type(table[i].dependent_feat) == FEAT_SP) + rc = __feat_register_sp(fn, table[i].dependent_feat, + table[i].is_local, + table[i].is_mandatory, + &table[i].val, 1); + else + rc = __feat_register_nn(fn, table[i].dependent_feat, + table[i].is_mandatory, + table[i].val); return rc; } -EXPORT_SYMBOL_GPL(dccp_feat_change_recv); +/** + * dccp_feat_finalise_settings - Finalise settings before starting negotiation + * @dp: client or listening socket (settings will be inherited) + * This is called after all registrations (socket initialisation, sysctls, and + * sockopt calls), and before sending the first packet containing Change options + * (ie. client-Request or server-Response), to ensure internal consistency. + */ +int dccp_feat_finalise_settings(struct dccp_sock *dp) +{ + struct list_head *fn = &dp->dccps_featneg; + struct dccp_feat_entry *entry; + int i = 2, ccids[2] = { -1, -1 }; -int dccp_feat_confirm_recv(struct sock *sk, u8 type, u8 feature, - u8 *val, u8 len) + /* + * Propagating CCIDs: + * 1) not useful to propagate CCID settings if this host advertises more + * than one CCID: the choice of CCID may still change - if this is + * the client, or if this is the server and the client sends + * singleton CCID values. + * 2) since is that propagate_ccid changes the list, we defer changing + * the sorted list until after the traversal. + */ + list_for_each_entry(entry, fn, node) + if (entry->feat_num == DCCPF_CCID && entry->val.sp.len == 1) + ccids[entry->is_local] = entry->val.sp.vec[0]; + while (i--) + if (ccids[i] > 0 && dccp_feat_propagate_ccid(fn, ccids[i], i)) + return -1; + return 0; +} + +/** + * dccp_feat_server_ccid_dependencies - Resolve CCID-dependent features + * It is the server which resolves the dependencies once the CCID has been + * fully negotiated. If no CCID has been negotiated, it uses the default CCID. + */ +int dccp_feat_server_ccid_dependencies(struct dccp_request_sock *dreq) { - u8 t; - struct dccp_opt_pend *opt; - struct dccp_minisock *dmsk = dccp_msk(sk); - int found = 0; - int all_confirmed = 1; + struct list_head *fn = &dreq->dreq_featneg; + struct dccp_feat_entry *entry; + u8 is_local, ccid; - dccp_feat_debug(type, feature, *val); + for (is_local = 0; is_local <= 1; is_local++) { + entry = dccp_feat_list_lookup(fn, DCCPF_CCID, is_local); - /* locate our change request */ - switch (type) { - case DCCPO_CONFIRM_L: t = DCCPO_CHANGE_R; break; - case DCCPO_CONFIRM_R: t = DCCPO_CHANGE_L; break; - default: DCCP_WARN("invalid type %d\n", type); - return 1; + if (entry != NULL && !entry->empty_confirm) + ccid = entry->val.sp.vec[0]; + else + ccid = dccp_feat_default_value(DCCPF_CCID); + if (dccp_feat_propagate_ccid(fn, ccid, is_local)) + return -1; } - /* XXX sanity check feature value */ + return 0; +} - list_for_each_entry(opt, &dmsk->dccpms_pending, dccpop_node) { - if (!opt->dccpop_conf && opt->dccpop_type == t && - opt->dccpop_feat == feature) { - found = 1; - dccp_pr_debug("feature %d found\n", opt->dccpop_feat); +/* Select the first entry in @servlist that also occurs in @clilist (6.3.1) */ +static int dccp_feat_preflist_match(u8 *servlist, u8 slen, u8 *clilist, u8 clen) +{ + u8 c, s; - /* XXX do sanity check */ + for (s = 0; s < slen; s++) + for (c = 0; c < clen; c++) + if (servlist[s] == clilist[c]) + return servlist[s]; + return -1; +} - opt->dccpop_conf = 1; +/** + * dccp_feat_prefer - Move preferred entry to the start of array + * Reorder the @array_len elements in @array so that @preferred_value comes + * first. Returns >0 to indicate that @preferred_value does occur in @array. + */ +static u8 dccp_feat_prefer(u8 preferred_value, u8 *array, u8 array_len) +{ + u8 i, does_occur = 0; - /* We got a confirmation---change the option */ - dccp_feat_update(sk, opt->dccpop_type, - opt->dccpop_feat, *val); + if (array != NULL) { + for (i = 0; i < array_len; i++) + if (array[i] == preferred_value) { + array[i] = array[0]; + does_occur++; + } + if (does_occur) + array[0] = preferred_value; + } + return does_occur; +} - /* XXX check the return value of dccp_feat_update */ - break; - } +/** + * dccp_feat_reconcile - Reconcile SP preference lists + * @fval: SP list to reconcile into + * @arr: received SP preference list + * @len: length of @arr in bytes + * @is_server: whether this side is the server (and @fv is the server's list) + * @reorder: whether to reorder the list in @fv after reconciling with @arr + * When successful, > 0 is returned and the reconciled list is in @fval. + * A value of 0 means that negotiation failed (no shared entry). + */ +static int dccp_feat_reconcile(dccp_feat_val *fv, u8 *arr, u8 len, + bool is_server, bool reorder) +{ + int rc; - if (!opt->dccpop_conf) - all_confirmed = 0; + if (!fv->sp.vec || !arr) { + DCCP_CRIT("NULL feature value or array"); + return 0; } - /* fix re-transmit timer */ - /* XXX gotta make sure that no option negotiation occurs during - * connection shutdown. Consider that the CLOSEREQ is sent and timer is - * on. if all options are confirmed it might kill timer which should - * remain alive until close is received. - */ - if (all_confirmed) { - dccp_pr_debug("clear feat negotiation timer %p\n", sk); - inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); - } + if (is_server) + rc = dccp_feat_preflist_match(fv->sp.vec, fv->sp.len, arr, len); + else + rc = dccp_feat_preflist_match(arr, len, fv->sp.vec, fv->sp.len); - if (!found) - dccp_pr_debug("%s(%d, ...) never requested\n", - dccp_feat_typename(type), feature); - return 0; -} + if (!reorder) + return rc; + if (rc < 0) + return 0; -EXPORT_SYMBOL_GPL(dccp_feat_confirm_recv); + /* + * Reorder list: used for activating features and in dccp_insert_fn_opt. + */ + return dccp_feat_prefer(rc, fv->sp.vec, fv->sp.len); +} -void dccp_feat_clean(struct dccp_minisock *dmsk) +/** + * dccp_feat_change_recv - Process incoming ChangeL/R options + * @fn: feature-negotiation list to update + * @is_mandatory: whether the Change was preceded by a Mandatory option + * @opt: %DCCPO_CHANGE_L or %DCCPO_CHANGE_R + * @feat: one of %dccp_feature_numbers + * @val: NN value or SP value/preference list + * @len: length of @val in bytes + * @server: whether this node is the server (1) or the client (0) + */ +static u8 dccp_feat_change_recv(struct list_head *fn, u8 is_mandatory, u8 opt, + u8 feat, u8 *val, u8 len, const bool server) { - struct dccp_opt_pend *opt, *next; + u8 defval, type = dccp_feat_type(feat); + const bool local = (opt == DCCPO_CHANGE_R); + struct dccp_feat_entry *entry; + dccp_feat_val fval; + + if (len == 0 || type == FEAT_UNKNOWN) /* 6.1 and 6.6.8 */ + goto unknown_feature_or_value; - list_for_each_entry_safe(opt, next, &dmsk->dccpms_pending, - dccpop_node) { - BUG_ON(opt->dccpop_val == NULL); - kfree(opt->dccpop_val); + /* + * Negotiation of NN features: Change R is invalid, so there is no + * simultaneous negotiation; hence we do not look up in the list. + */ + if (type == FEAT_NN) { + if (local || len > sizeof(fval.nn)) + goto unknown_feature_or_value; - if (opt->dccpop_sc != NULL) { - BUG_ON(opt->dccpop_sc->dccpoc_val == NULL); - kfree(opt->dccpop_sc->dccpoc_val); - kfree(opt->dccpop_sc); + /* 6.3.2: "The feature remote MUST accept any valid value..." */ + fval.nn = dccp_decode_value_var(val, len); + if (!dccp_feat_is_valid_nn_val(feat, fval.nn)) + goto unknown_feature_or_value; + + return dccp_feat_push_confirm(fn, feat, local, &fval); + } + + /* + * Unidirectional/simultaneous negotiation of SP features (6.3.1) + */ + entry = dccp_feat_list_lookup(fn, feat, local); + if (entry == NULL) { + /* + * No particular preferences have been registered. We deal with + * this situation by assuming that all valid values are equally + * acceptable, and apply the following checks: + * - if the peer's list is a singleton, we accept a valid value; + * - if we are the server, we first try to see if the peer (the + * client) advertises the default value. If yes, we use it, + * otherwise we accept the preferred value; + * - else if we are the client, we use the first list element. + */ + if (dccp_feat_clone_sp_val(&fval, val, 1)) + return DCCP_RESET_CODE_TOO_BUSY; + + if (len > 1 && server) { + defval = dccp_feat_default_value(feat); + if (dccp_feat_preflist_match(&defval, 1, val, len) > -1) + fval.sp.vec[0] = defval; + } else if (!dccp_feat_is_valid_sp_val(feat, fval.sp.vec[0])) { + kfree(fval.sp.vec); + goto unknown_feature_or_value; + } + + /* Treat unsupported CCIDs like invalid values */ + if (feat == DCCPF_CCID && !ccid_support_check(fval.sp.vec, 1)) { + kfree(fval.sp.vec); + goto not_valid_or_not_known; } - kfree(opt); + return dccp_feat_push_confirm(fn, feat, local, &fval); + + } else if (entry->state == FEAT_UNSTABLE) { /* 6.6.2 */ + return 0; } - INIT_LIST_HEAD(&dmsk->dccpms_pending); - list_for_each_entry_safe(opt, next, &dmsk->dccpms_conf, dccpop_node) { - BUG_ON(opt == NULL); - if (opt->dccpop_val != NULL) - kfree(opt->dccpop_val); - kfree(opt); + if (dccp_feat_reconcile(&entry->val, val, len, server, true)) { + entry->empty_confirm = 0; + } else if (is_mandatory) { + return DCCP_RESET_CODE_MANDATORY_ERROR; + } else if (entry->state == FEAT_INITIALISING) { + /* + * Failed simultaneous negotiation (server only): try to `save' + * the connection by checking whether entry contains the default + * value for @feat. If yes, send an empty Confirm to signal that + * the received Change was not understood - which implies using + * the default value. + * If this also fails, we use Reset as the last resort. + */ + WARN_ON(!server); + defval = dccp_feat_default_value(feat); + if (!dccp_feat_reconcile(&entry->val, &defval, 1, server, true)) + return DCCP_RESET_CODE_OPTION_ERROR; + entry->empty_confirm = 1; } - INIT_LIST_HEAD(&dmsk->dccpms_conf); -} + entry->needs_confirm = 1; + entry->needs_mandatory = 0; + entry->state = FEAT_STABLE; + return 0; + +unknown_feature_or_value: + if (!is_mandatory) + return dccp_push_empty_confirm(fn, feat, local); -EXPORT_SYMBOL_GPL(dccp_feat_clean); +not_valid_or_not_known: + return is_mandatory ? DCCP_RESET_CODE_MANDATORY_ERROR + : DCCP_RESET_CODE_OPTION_ERROR; +} -/* this is to be called only when a listening sock creates its child. It is - * assumed by the function---the confirm is not duplicated, but rather it is - * "passed on". +/** + * dccp_feat_confirm_recv - Process received Confirm options + * @fn: feature-negotiation list to update + * @is_mandatory: whether @opt was preceded by a Mandatory option + * @opt: %DCCPO_CONFIRM_L or %DCCPO_CONFIRM_R + * @feat: one of %dccp_feature_numbers + * @val: NN value or SP value/preference list + * @len: length of @val in bytes + * @server: whether this node is server (1) or client (0) */ -int dccp_feat_clone(struct sock *oldsk, struct sock *newsk) +static u8 dccp_feat_confirm_recv(struct list_head *fn, u8 is_mandatory, u8 opt, + u8 feat, u8 *val, u8 len, const bool server) { - struct dccp_minisock *olddmsk = dccp_msk(oldsk); - struct dccp_minisock *newdmsk = dccp_msk(newsk); - struct dccp_opt_pend *opt; - int rc = 0; + u8 *plist, plen, type = dccp_feat_type(feat); + const bool local = (opt == DCCPO_CONFIRM_R); + struct dccp_feat_entry *entry = dccp_feat_list_lookup(fn, feat, local); - INIT_LIST_HEAD(&newdmsk->dccpms_pending); - INIT_LIST_HEAD(&newdmsk->dccpms_conf); + if (entry == NULL) { /* nothing queued: ignore or handle error */ + if (is_mandatory && type == FEAT_UNKNOWN) + return DCCP_RESET_CODE_MANDATORY_ERROR; - list_for_each_entry(opt, &olddmsk->dccpms_pending, dccpop_node) { - struct dccp_opt_pend *newopt; - /* copy the value of the option */ - u8 *val = kmemdup(opt->dccpop_val, opt->dccpop_len, GFP_ATOMIC); + if (!local && type == FEAT_NN) /* 6.3.2 */ + goto confirmation_failed; + return 0; + } - if (val == NULL) - goto out_clean; + if (entry->state != FEAT_CHANGING) /* 6.6.2 */ + return 0; - newopt = kmemdup(opt, sizeof(*newopt), GFP_ATOMIC); - if (newopt == NULL) { - kfree(val); - goto out_clean; - } + if (len == 0) { + if (dccp_feat_must_be_understood(feat)) /* 6.6.7 */ + goto confirmation_failed; + /* + * Empty Confirm during connection setup: this means reverting + * to the `old' value, which in this case is the default. Since + * we handle default values automatically when no other values + * have been set, we revert to the old value by removing this + * entry from the list. + */ + dccp_feat_list_pop(entry); + return 0; + } - /* insert the option */ - newopt->dccpop_val = val; - list_add_tail(&newopt->dccpop_node, &newdmsk->dccpms_pending); + if (type == FEAT_NN) { + if (len > sizeof(entry->val.nn)) + goto confirmation_failed; - /* XXX what happens with backlogs and multiple connections at - * once... - */ - /* the master socket no longer needs to worry about confirms */ - opt->dccpop_sc = NULL; /* it's not a memleak---new socket has it */ + if (entry->val.nn == dccp_decode_value_var(val, len)) + goto confirmation_succeeded; - /* reset state for a new socket */ - opt->dccpop_conf = 0; + DCCP_WARN("Bogus Confirm for non-existing value\n"); + goto confirmation_failed; } - /* XXX not doing anything about the conf queue */ + /* + * Parsing SP Confirms: the first element of @val is the preferred + * SP value which the peer confirms, the remainder depends on @len. + * Note that only the confirmed value need to be a valid SP value. + */ + if (!dccp_feat_is_valid_sp_val(feat, *val)) + goto confirmation_failed; + + if (len == 1) { /* peer didn't supply a preference list */ + plist = val; + plen = len; + } else { /* preferred value + preference list */ + plist = val + 1; + plen = len - 1; + } -out: - return rc; + /* Check whether the peer got the reconciliation right (6.6.8) */ + if (dccp_feat_reconcile(&entry->val, plist, plen, server, 0) != *val) { + DCCP_WARN("Confirm selected the wrong value %u\n", *val); + return DCCP_RESET_CODE_OPTION_ERROR; + } + entry->val.sp.vec[0] = *val; -out_clean: - dccp_feat_clean(newdmsk); - rc = -ENOMEM; - goto out; -} +confirmation_succeeded: + entry->state = FEAT_STABLE; + return 0; -EXPORT_SYMBOL_GPL(dccp_feat_clone); +confirmation_failed: + DCCP_WARN("Confirmation failed\n"); + return is_mandatory ? DCCP_RESET_CODE_MANDATORY_ERROR + : DCCP_RESET_CODE_OPTION_ERROR; +} -static int __dccp_feat_init(struct dccp_minisock *dmsk, u8 type, u8 feat, - u8 *val, u8 len) +/** + * dccp_feat_parse_options - Process Feature-Negotiation Options + * @sk: for general use and used by the client during connection setup + * @dreq: used by the server during connection setup + * @mandatory: whether @opt was preceded by a Mandatory option + * @opt: %DCCPO_CHANGE_L | %DCCPO_CHANGE_R | %DCCPO_CONFIRM_L | %DCCPO_CONFIRM_R + * @feat: one of %dccp_feature_numbers + * @val: value contents of @opt + * @len: length of @val in bytes + * Returns 0 on success, a Reset code for ending the connection otherwise. + */ +int dccp_feat_parse_options(struct sock *sk, struct dccp_request_sock *dreq, + u8 mandatory, u8 opt, u8 feat, u8 *val, u8 len) { - int rc = -ENOMEM; - u8 *copy = kmemdup(val, len, GFP_KERNEL); + struct dccp_sock *dp = dccp_sk(sk); + struct list_head *fn = dreq ? &dreq->dreq_featneg : &dp->dccps_featneg; + bool server = false; - if (copy != NULL) { - rc = dccp_feat_change(dmsk, type, feat, copy, len, GFP_KERNEL); - if (rc) - kfree(copy); + switch (sk->sk_state) { + /* + * Negotiation during connection setup + */ + case DCCP_LISTEN: + server = true; /* fall through */ + case DCCP_REQUESTING: + switch (opt) { + case DCCPO_CHANGE_L: + case DCCPO_CHANGE_R: + return dccp_feat_change_recv(fn, mandatory, opt, feat, + val, len, server); + case DCCPO_CONFIRM_R: + case DCCPO_CONFIRM_L: + return dccp_feat_confirm_recv(fn, mandatory, opt, feat, + val, len, server); + } } - return rc; + return 0; /* ignore FN options in all other states */ } -int dccp_feat_init(struct dccp_minisock *dmsk) +int dccp_feat_init(struct sock *sk) { + struct dccp_sock *dp = dccp_sk(sk); + struct dccp_minisock *dmsk = dccp_msk(sk); int rc; - INIT_LIST_HEAD(&dmsk->dccpms_pending); - INIT_LIST_HEAD(&dmsk->dccpms_conf); - - /* CCID L */ - rc = __dccp_feat_init(dmsk, DCCPO_CHANGE_L, DCCPF_CCID, - &dmsk->dccpms_tx_ccid, 1); - if (rc) - goto out; - - /* CCID R */ - rc = __dccp_feat_init(dmsk, DCCPO_CHANGE_R, DCCPF_CCID, - &dmsk->dccpms_rx_ccid, 1); - if (rc) - goto out; + INIT_LIST_HEAD(&dmsk->dccpms_pending); /* XXX no longer used */ + INIT_LIST_HEAD(&dmsk->dccpms_conf); /* XXX no longer used */ /* Ack ratio */ - rc = __dccp_feat_init(dmsk, DCCPO_CHANGE_L, DCCPF_ACK_RATIO, - &dmsk->dccpms_ack_ratio, 1); -out: + rc = __feat_register_nn(&dp->dccps_featneg, DCCPF_ACK_RATIO, 0, + dp->dccps_l_ack_ratio); return rc; } EXPORT_SYMBOL_GPL(dccp_feat_init); +int dccp_feat_activate_values(struct sock *sk, struct list_head *fn_list) +{ + struct dccp_sock *dp = dccp_sk(sk); + struct dccp_feat_entry *cur, *next; + int idx; + dccp_feat_val *fvals[DCCP_FEAT_SUPPORTED_MAX][2] = { + [0 ... DCCP_FEAT_SUPPORTED_MAX-1] = { NULL, NULL } + }; + + list_for_each_entry(cur, fn_list, node) { + /* + * An empty Confirm means that either an unknown feature type + * or an invalid value was present. In the first case there is + * nothing to activate, in the other the default value is used. + */ + if (cur->empty_confirm) + continue; + + idx = dccp_feat_index(cur->feat_num); + if (idx < 0) { + DCCP_BUG("Unknown feature %u", cur->feat_num); + goto activation_failed; + } + if (cur->state != FEAT_STABLE) { + DCCP_CRIT("Negotiation of %s %u failed in state %u", + cur->is_local ? "local" : "remote", + cur->feat_num, cur->state); + goto activation_failed; + } + fvals[idx][cur->is_local] = &cur->val; + } + + /* + * Activate in decreasing order of index, so that the CCIDs are always + * activated as the last feature. This avoids the case where a CCID + * relies on the initialisation of one or more features that it depends + * on (e.g. Send NDP Count, Send Ack Vector, and Ack Ratio features). + */ + for (idx = DCCP_FEAT_SUPPORTED_MAX; --idx >= 0;) + if (__dccp_feat_activate(sk, idx, 0, fvals[idx][0]) || + __dccp_feat_activate(sk, idx, 1, fvals[idx][1])) { + DCCP_CRIT("Could not activate %d", idx); + goto activation_failed; + } + + /* Clean up Change options which have been confirmed already */ + list_for_each_entry_safe(cur, next, fn_list, node) + if (!cur->needs_confirm) + dccp_feat_list_pop(cur); + + dccp_pr_debug("Activation OK\n"); + return 0; + +activation_failed: + /* + * We clean up everything that may have been allocated, since + * it is difficult to track at which stage negotiation failed. + * This is ok, since all allocation functions below are robust + * against NULL arguments. + */ + ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk); + ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk); + dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL; + dccp_ackvec_free(dp->dccps_hc_rx_ackvec); + dp->dccps_hc_rx_ackvec = NULL; + return -1; +} + #ifdef CONFIG_IP_DCCP_DEBUG const char *dccp_feat_typename(const u8 type) { @@ -620,8 +1214,6 @@ const char *dccp_feat_typename(const u8 type) return NULL; } -EXPORT_SYMBOL_GPL(dccp_feat_typename); - const char *dccp_feat_name(const u8 feat) { static const char *feature_names[] = { @@ -639,11 +1231,11 @@ const char *dccp_feat_name(const u8 feat) if (feat > DCCPF_DATA_CHECKSUM && feat < DCCPF_MIN_CCID_SPECIFIC) return feature_names[DCCPF_RESERVED]; + if (feat == DCCPF_SEND_LEV_RATE) + return "Send Loss Event Rate"; if (feat >= DCCPF_MIN_CCID_SPECIFIC) return "CCID-specific"; return feature_names[feat]; } - -EXPORT_SYMBOL_GPL(dccp_feat_name); #endif /* CONFIG_IP_DCCP_DEBUG */ diff --git a/net/dccp/feat.h b/net/dccp/feat.h index e272222c7ac..9b46e2a7866 100644 --- a/net/dccp/feat.h +++ b/net/dccp/feat.h @@ -3,17 +3,103 @@ /* * net/dccp/feat.h * - * An implementation of the DCCP protocol + * Feature negotiation for the DCCP protocol (RFC 4340, section 6) + * Copyright (c) 2008 Gerrit Renker <gerrit@erg.abdn.ac.uk> * Copyright (c) 2005 Andrea Bittau <a.bittau@cs.ucl.ac.uk> * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. */ - #include <linux/types.h> #include "dccp.h" +/* + * Known limit values + */ +/* Ack Ratio takes 2-byte integer values (11.3) */ +#define DCCPF_ACK_RATIO_MAX 0xFFFF +/* Wmin=32 and Wmax=2^46-1 from 7.5.2 */ +#define DCCPF_SEQ_WMIN 32 +#define DCCPF_SEQ_WMAX 0x3FFFFFFFFFFFull +/* Maximum number of SP values that fit in a single (Confirm) option */ +#define DCCP_FEAT_MAX_SP_VALS (DCCP_SINGLE_OPT_MAXLEN - 2) + +enum dccp_feat_type { + FEAT_AT_RX = 1, /* located at RX side of half-connection */ + FEAT_AT_TX = 2, /* located at TX side of half-connection */ + FEAT_SP = 4, /* server-priority reconciliation (6.3.1) */ + FEAT_NN = 8, /* non-negotiable reconciliation (6.3.2) */ + FEAT_UNKNOWN = 0xFF /* not understood or invalid feature */ +}; + +enum dccp_feat_state { + FEAT_DEFAULT = 0, /* using default values from 6.4 */ + FEAT_INITIALISING, /* feature is being initialised */ + FEAT_CHANGING, /* Change sent but not confirmed yet */ + FEAT_UNSTABLE, /* local modification in state CHANGING */ + FEAT_STABLE /* both ends (think they) agree */ +}; + +/** + * dccp_feat_val - Container for SP or NN feature values + * @nn: single NN value + * @sp.vec: single SP value plus optional preference list + * @sp.len: length of @sp.vec in bytes + */ +typedef union { + u64 nn; + struct { + u8 *vec; + u8 len; + } sp; +} dccp_feat_val; + +/** + * struct feat_entry - Data structure to perform feature negotiation + * @val: feature's current value (SP features may have preference list) + * @state: feature's current state + * @feat_num: one of %dccp_feature_numbers + * @needs_mandatory: whether Mandatory options should be sent + * @needs_confirm: whether to send a Confirm instead of a Change + * @empty_confirm: whether to send an empty Confirm (depends on @needs_confirm) + * @is_local: feature location (1) or feature-remote (0) + * @node: list pointers, entries arranged in FIFO order + */ +struct dccp_feat_entry { + dccp_feat_val val; + enum dccp_feat_state state:8; + u8 feat_num; + + bool needs_mandatory, + needs_confirm, + empty_confirm, + is_local; + + struct list_head node; +}; + +static inline u8 dccp_feat_genopt(struct dccp_feat_entry *entry) +{ + if (entry->needs_confirm) + return entry->is_local ? DCCPO_CONFIRM_L : DCCPO_CONFIRM_R; + return entry->is_local ? DCCPO_CHANGE_L : DCCPO_CHANGE_R; +} + +/** + * struct ccid_dependency - Track changes resulting from choosing a CCID + * @dependent_feat: one of %dccp_feature_numbers + * @is_local: local (1) or remote (0) @dependent_feat + * @is_mandatory: whether presence of @dependent_feat is mission-critical or not + * @val: corresponding default value for @dependent_feat (u8 is sufficient here) + */ +struct ccid_dependency { + u8 dependent_feat; + bool is_local:1, + is_mandatory:1; + u8 val; +}; + #ifdef CONFIG_IP_DCCP_DEBUG extern const char *dccp_feat_typename(const u8 type); extern const char *dccp_feat_name(const u8 feat); @@ -27,14 +113,30 @@ static inline void dccp_feat_debug(const u8 type, const u8 feat, const u8 val) #define dccp_feat_debug(type, feat, val) #endif /* CONFIG_IP_DCCP_DEBUG */ -extern int dccp_feat_change(struct dccp_minisock *dmsk, u8 type, u8 feature, - u8 *val, u8 len, gfp_t gfp); -extern int dccp_feat_change_recv(struct sock *sk, u8 type, u8 feature, - u8 *val, u8 len); -extern int dccp_feat_confirm_recv(struct sock *sk, u8 type, u8 feature, - u8 *val, u8 len); -extern void dccp_feat_clean(struct dccp_minisock *dmsk); -extern int dccp_feat_clone(struct sock *oldsk, struct sock *newsk); -extern int dccp_feat_init(struct dccp_minisock *dmsk); +extern int dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local, + u8 const *list, u8 len); +extern int dccp_feat_register_nn(struct sock *sk, u8 feat, u64 val); +extern int dccp_feat_parse_options(struct sock *, struct dccp_request_sock *, + u8 mand, u8 opt, u8 feat, u8 *val, u8 len); +extern int dccp_feat_clone_list(struct list_head const *, struct list_head *); +extern int dccp_feat_init(struct sock *sk); + +/* + * Encoding variable-length options and their maximum length. + * + * This affects NN options (SP options are all u8) and other variable-length + * options (see table 3 in RFC 4340). The limit is currently given the Sequence + * Window NN value (sec. 7.5.2) and the NDP count (sec. 7.7) option, all other + * options consume less than 6 bytes (timestamps are 4 bytes). + * When updating this constant (e.g. due to new internet drafts / RFCs), make + * sure that you also update all code which refers to it. + */ +#define DCCP_OPTVAL_MAXLEN 6 + +extern void dccp_encode_value_var(const u64 value, u8 *to, const u8 len); +extern u64 dccp_decode_value_var(const u8 *bf, const u8 len); +extern int dccp_insert_option_mandatory(struct sk_buff *skb); +extern int dccp_insert_fn_opt(struct sk_buff *skb, u8 type, u8 feat, + u8 *val, u8 len, bool repeat_first); #endif /* _DCCP_FEAT_H */ diff --git a/net/dccp/input.c b/net/dccp/input.c index 779d0ed9ae9..7648f316310 100644 --- a/net/dccp/input.c +++ b/net/dccp/input.c @@ -163,7 +163,7 @@ static void dccp_event_ack_recv(struct sock *sk, struct sk_buff *skb) { struct dccp_sock *dp = dccp_sk(sk); - if (dccp_msk(sk)->dccpms_send_ack_vector) + if (dp->dccps_hc_rx_ackvec != NULL) dccp_ackvec_check_rcv_ackno(dp->dccps_hc_rx_ackvec, sk, DCCP_SKB_CB(skb)->dccpd_ack_seq); } @@ -375,7 +375,7 @@ int dccp_rcv_established(struct sock *sk, struct sk_buff *skb, if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) dccp_event_ack_recv(sk, skb); - if (dccp_msk(sk)->dccpms_send_ack_vector && + if (dp->dccps_hc_rx_ackvec != NULL && dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk, DCCP_SKB_CB(skb)->dccpd_seq, DCCP_ACKVEC_STATE_RECEIVED)) @@ -421,20 +421,19 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk, goto out_invalid_packet; } + /* + * If option processing (Step 8) failed, return 1 here so that + * dccp_v4_do_rcv() sends a Reset. The Reset code depends on + * the option type and is set in dccp_parse_options(). + */ if (dccp_parse_options(sk, NULL, skb)) - goto out_invalid_packet; + return 1; /* Obtain usec RTT sample from SYN exchange (used by CCID 3) */ if (likely(dp->dccps_options_received.dccpor_timestamp_echo)) dp->dccps_syn_rtt = dccp_sample_rtt(sk, 10 * (tstamp - dp->dccps_options_received.dccpor_timestamp_echo)); - if (dccp_msk(sk)->dccpms_send_ack_vector && - dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk, - DCCP_SKB_CB(skb)->dccpd_seq, - DCCP_ACKVEC_STATE_RECEIVED)) - goto out_invalid_packet; /* FIXME: change error code */ - /* Stop the REQUEST timer */ inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); WARN_ON(sk->sk_send_head == NULL); @@ -475,6 +474,15 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk, */ dccp_set_state(sk, DCCP_PARTOPEN); + /* + * If feature negotiation was successful, activate features now; + * an activation failure means that this host could not activate + * one ore more features (e.g. insufficient memory), which would + * leave at least one feature in an undefined state. + */ + if (dccp_feat_activate_values(sk, &dp->dccps_featneg)) + goto unable_to_proceed; + /* Make sure socket is routed, for correct metrics. */ icsk->icsk_af_ops->rebuild_header(sk); @@ -509,6 +517,16 @@ out_invalid_packet: /* dccp_v4_do_rcv will send a reset */ DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_PACKET_ERROR; return 1; + +unable_to_proceed: + DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_ABORTED; + /* + * We mark this socket as no longer usable, so that the loop in + * dccp_sendmsg() terminates and the application gets notified. + */ + dccp_set_state(sk, DCCP_CLOSED); + sk->sk_err = ECOMM; + return 1; } static int dccp_rcv_respond_partopen_state_process(struct sock *sk, @@ -590,8 +608,6 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, if (inet_csk(sk)->icsk_af_ops->conn_request(sk, skb) < 0) return 1; - - /* FIXME: do congestion control initialization */ goto discard; } if (dh->dccph_type == DCCP_PKT_RESET) @@ -602,7 +618,7 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, return 1; } - if (sk->sk_state != DCCP_REQUESTING) { + if (sk->sk_state != DCCP_REQUESTING && sk->sk_state != DCCP_RESPOND) { if (dccp_check_seqno(sk, skb)) goto discard; @@ -615,7 +631,7 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, if (dcb->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) dccp_event_ack_recv(sk, skb); - if (dccp_msk(sk)->dccpms_send_ack_vector && + if (dp->dccps_hc_rx_ackvec != NULL && dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk, DCCP_SKB_CB(skb)->dccpd_seq, DCCP_ACKVEC_STATE_RECEIVED)) @@ -667,8 +683,6 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, return 1; case DCCP_REQUESTING: - /* FIXME: do congestion control initialization */ - queued = dccp_rcv_request_sent_state_process(sk, skb, dh, len); if (queued >= 0) return queued; @@ -727,5 +741,3 @@ u32 dccp_sample_rtt(struct sock *sk, long delta) return delta; } - -EXPORT_SYMBOL_GPL(dccp_sample_rtt); diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index e3dfddab21c..d1dd95289b8 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -545,6 +545,7 @@ out: static void dccp_v4_reqsk_destructor(struct request_sock *req) { + dccp_feat_list_purge(&dccp_rsk(req)->dreq_featneg); kfree(inet_rsk(req)->opt); } @@ -595,7 +596,8 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb) if (req == NULL) goto drop; - dccp_reqsk_init(req, skb); + if (dccp_reqsk_init(req, dccp_sk(sk), skb)) + goto drop_and_free; dreq = dccp_rsk(req); if (dccp_parse_options(sk, dreq, skb)) @@ -792,12 +794,10 @@ static int dccp_v4_rcv(struct sk_buff *skb) DCCP_SKB_CB(skb)->dccpd_seq = dccp_hdr_seq(dh); DCCP_SKB_CB(skb)->dccpd_type = dh->dccph_type; - dccp_pr_debug("%8.8s " - "src=%u.%u.%u.%u@%-5d " - "dst=%u.%u.%u.%u@%-5d seq=%llu", + dccp_pr_debug("%8.8s src=%pI4@%-5d dst=%pI4@%-5d seq=%llu", dccp_packet_name(dh->dccph_type), - NIPQUAD(iph->saddr), ntohs(dh->dccph_sport), - NIPQUAD(iph->daddr), ntohs(dh->dccph_dport), + &iph->saddr, ntohs(dh->dccph_sport), + &iph->daddr, ntohs(dh->dccph_dport), (unsigned long long) DCCP_SKB_CB(skb)->dccpd_seq); if (dccp_packet_without_ack(skb)) { @@ -938,6 +938,7 @@ static struct proto dccp_v4_prot = { .orphan_count = &dccp_orphan_count, .max_header = MAX_DCCP_HEADER, .obj_size = sizeof(struct dccp_sock), + .slab_flags = SLAB_DESTROY_BY_RCU, .rsk_prot = &dccp_request_sock_ops, .twsk_prot = &dccp_timewait_sock_ops, .h.hashinfo = &dccp_hashinfo, diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index d4ce1224e00..b963f35c65f 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -168,7 +168,7 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, goto out; } - err = xfrm_lookup(&dst, &fl, sk, 0); + err = xfrm_lookup(net, &dst, &fl, sk, 0); if (err < 0) { sk->sk_err_soft = -err; goto out; @@ -279,7 +279,7 @@ static int dccp_v6_send_response(struct sock *sk, struct request_sock *req) if (final_p) ipv6_addr_copy(&fl.fl6_dst, final_p); - err = xfrm_lookup(&dst, &fl, sk, 0); + err = xfrm_lookup(sock_net(sk), &dst, &fl, sk, 0); if (err < 0) goto done; @@ -304,6 +304,7 @@ done: static void dccp_v6_reqsk_destructor(struct request_sock *req) { + dccp_feat_list_purge(&dccp_rsk(req)->dreq_featneg); if (inet6_rsk(req)->pktopts != NULL) kfree_skb(inet6_rsk(req)->pktopts); } @@ -342,7 +343,7 @@ static void dccp_v6_ctl_send_reset(struct sock *sk, struct sk_buff *rxskb) /* sk = NULL, but it is safe for now. RST socket required. */ if (!ip6_dst_lookup(ctl_sk, &skb->dst, &fl)) { - if (xfrm_lookup(&skb->dst, &fl, NULL, 0) >= 0) { + if (xfrm_lookup(net, &skb->dst, &fl, NULL, 0) >= 0) { ip6_xmit(ctl_sk, skb, &fl, NULL, 0); DCCP_INC_STATS_BH(DCCP_MIB_OUTSEGS); DCCP_INC_STATS_BH(DCCP_MIB_OUTRSTS); @@ -426,7 +427,8 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb) if (req == NULL) goto drop; - dccp_reqsk_init(req, skb); + if (dccp_reqsk_init(req, dccp_sk(sk), skb)) + goto drop_and_free; dreq = dccp_rsk(req); if (dccp_parse_options(sk, dreq, skb)) @@ -567,7 +569,7 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk, if (final_p) ipv6_addr_copy(&fl.fl6_dst, final_p); - if ((xfrm_lookup(&dst, &fl, sk, 0)) < 0) + if ((xfrm_lookup(sock_net(sk), &dst, &fl, sk, 0)) < 0) goto out; } @@ -1002,7 +1004,7 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr, if (final_p) ipv6_addr_copy(&fl.fl6_dst, final_p); - err = __xfrm_lookup(&dst, &fl, sk, XFRM_LOOKUP_WAIT); + err = __xfrm_lookup(sock_net(sk), &dst, &fl, sk, XFRM_LOOKUP_WAIT); if (err < 0) { if (err == -EREMOTE) err = ip6_dst_blackhole(sk, &dst, &fl); @@ -1138,6 +1140,7 @@ static struct proto dccp_v6_prot = { .orphan_count = &dccp_orphan_count, .max_header = MAX_DCCP_HEADER, .obj_size = sizeof(struct dccp6_sock), + .slab_flags = SLAB_DESTROY_BY_RCU, .rsk_prot = &dccp6_request_sock_ops, .twsk_prot = &dccp6_timewait_sock_ops, .h.hashinfo = &dccp_hashinfo, diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index e6bf99e3e41..6821ae33dd3 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -45,11 +45,6 @@ EXPORT_SYMBOL_GPL(dccp_death_row); void dccp_minisock_init(struct dccp_minisock *dmsk) { dmsk->dccpms_sequence_window = sysctl_dccp_feat_sequence_window; - dmsk->dccpms_rx_ccid = sysctl_dccp_feat_rx_ccid; - dmsk->dccpms_tx_ccid = sysctl_dccp_feat_tx_ccid; - dmsk->dccpms_ack_ratio = sysctl_dccp_feat_ack_ratio; - dmsk->dccpms_send_ack_vector = sysctl_dccp_feat_send_ack_vector; - dmsk->dccpms_send_ndp_count = sysctl_dccp_feat_send_ndp_count; } void dccp_time_wait(struct sock *sk, int state, int timeo) @@ -112,7 +107,7 @@ struct sock *dccp_create_openreq_child(struct sock *sk, struct sock *newsk = inet_csk_clone(sk, req, GFP_ATOMIC); if (newsk != NULL) { - const struct dccp_request_sock *dreq = dccp_rsk(req); + struct dccp_request_sock *dreq = dccp_rsk(req); struct inet_connection_sock *newicsk = inet_csk(newsk); struct dccp_sock *newdp = dccp_sk(newsk); struct dccp_minisock *newdmsk = dccp_msk(newsk); @@ -125,35 +120,7 @@ struct sock *dccp_create_openreq_child(struct sock *sk, newdp->dccps_timestamp_time = dreq->dreq_timestamp_time; newicsk->icsk_rto = DCCP_TIMEOUT_INIT; - if (dccp_feat_clone(sk, newsk)) - goto out_free; - - if (newdmsk->dccpms_send_ack_vector) { - newdp->dccps_hc_rx_ackvec = - dccp_ackvec_alloc(GFP_ATOMIC); - if (unlikely(newdp->dccps_hc_rx_ackvec == NULL)) - goto out_free; - } - - newdp->dccps_hc_rx_ccid = - ccid_hc_rx_new(newdmsk->dccpms_rx_ccid, - newsk, GFP_ATOMIC); - newdp->dccps_hc_tx_ccid = - ccid_hc_tx_new(newdmsk->dccpms_tx_ccid, - newsk, GFP_ATOMIC); - if (unlikely(newdp->dccps_hc_rx_ccid == NULL || - newdp->dccps_hc_tx_ccid == NULL)) { - dccp_ackvec_free(newdp->dccps_hc_rx_ackvec); - ccid_hc_rx_delete(newdp->dccps_hc_rx_ccid, newsk); - ccid_hc_tx_delete(newdp->dccps_hc_tx_ccid, newsk); -out_free: - /* It is still raw copy of parent, so invalidate - * destructor and make plain sk_free() */ - newsk->sk_destruct = NULL; - sk_free(newsk); - return NULL; - } - + INIT_LIST_HEAD(&newdp->dccps_featneg); /* * Step 3: Process LISTEN state * @@ -184,6 +151,17 @@ out_free: dccp_set_seqno(&newdp->dccps_awl, max48(newdp->dccps_awl, newdp->dccps_iss)); + /* + * Activate features after initialising the sequence numbers, + * since CCID initialisation may depend on GSS, ISR, ISS etc. + */ + if (dccp_feat_activate_values(newsk, &dreq->dreq_featneg)) { + /* It is still raw copy of parent, so invalidate + * destructor and make plain sk_free() */ + newsk->sk_destruct = NULL; + sk_free(newsk); + return NULL; + } dccp_init_xmit_timers(newsk); DCCP_INC_STATS_BH(DCCP_MIB_PASSIVEOPENS); @@ -304,7 +282,8 @@ void dccp_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, EXPORT_SYMBOL_GPL(dccp_reqsk_send_ack); -void dccp_reqsk_init(struct request_sock *req, struct sk_buff *skb) +int dccp_reqsk_init(struct request_sock *req, + struct dccp_sock const *dp, struct sk_buff const *skb) { struct dccp_request_sock *dreq = dccp_rsk(req); @@ -313,6 +292,9 @@ void dccp_reqsk_init(struct request_sock *req, struct sk_buff *skb) inet_rsk(req)->acked = 0; req->rcv_wnd = sysctl_dccp_feat_sequence_window; dreq->dreq_timestamp_echo = 0; + + /* inherit feature negotiation options from listening socket */ + return dccp_feat_clone_list(&dp->dccps_featneg, &dreq->dreq_featneg); } EXPORT_SYMBOL_GPL(dccp_reqsk_init); diff --git a/net/dccp/options.c b/net/dccp/options.c index 0809b63cb05..7b1165c21f5 100644 --- a/net/dccp/options.c +++ b/net/dccp/options.c @@ -26,20 +26,21 @@ int sysctl_dccp_feat_sequence_window = DCCPF_INITIAL_SEQUENCE_WINDOW; int sysctl_dccp_feat_rx_ccid = DCCPF_INITIAL_CCID; int sysctl_dccp_feat_tx_ccid = DCCPF_INITIAL_CCID; -int sysctl_dccp_feat_ack_ratio = DCCPF_INITIAL_ACK_RATIO; -int sysctl_dccp_feat_send_ack_vector = DCCPF_INITIAL_SEND_ACK_VECTOR; -int sysctl_dccp_feat_send_ndp_count = DCCPF_INITIAL_SEND_NDP_COUNT; -static u32 dccp_decode_value_var(const unsigned char *bf, const u8 len) +u64 dccp_decode_value_var(const u8 *bf, const u8 len) { - u32 value = 0; + u64 value = 0; + if (len >= DCCP_OPTVAL_MAXLEN) + value += ((u64)*bf++) << 40; + if (len > 4) + value += ((u64)*bf++) << 32; if (len > 3) - value += *bf++ << 24; + value += ((u64)*bf++) << 24; if (len > 2) - value += *bf++ << 16; + value += ((u64)*bf++) << 16; if (len > 1) - value += *bf++ << 8; + value += ((u64)*bf++) << 8; if (len > 0) value += *bf; @@ -64,7 +65,7 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, (dh->dccph_doff * 4); struct dccp_options_received *opt_recv = &dp->dccps_options_received; unsigned char opt, len; - unsigned char *value; + unsigned char *uninitialized_var(value); u32 elapsed_time; __be32 opt_val; int rc; @@ -131,41 +132,19 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, dccp_pr_debug("%s opt: NDP count=%llu\n", dccp_role(sk), (unsigned long long)opt_recv->dccpor_ndp); break; - case DCCPO_CHANGE_L: - /* fall through */ - case DCCPO_CHANGE_R: - if (pkt_type == DCCP_PKT_DATA) + case DCCPO_CHANGE_L ... DCCPO_CONFIRM_R: + if (pkt_type == DCCP_PKT_DATA) /* RFC 4340, 6 */ break; - if (len < 2) - goto out_invalid_option; - rc = dccp_feat_change_recv(sk, opt, *value, value + 1, - len - 1); - /* - * When there is a change error, change_recv is - * responsible for dealing with it. i.e. reply with an - * empty confirm. - * If the change was mandatory, then we need to die. - */ - if (rc && mandatory) - goto out_invalid_option; - break; - case DCCPO_CONFIRM_L: - /* fall through */ - case DCCPO_CONFIRM_R: - if (pkt_type == DCCP_PKT_DATA) - break; - if (len < 2) /* FIXME this disallows empty confirm */ - goto out_invalid_option; - if (dccp_feat_confirm_recv(sk, opt, *value, - value + 1, len - 1)) - goto out_invalid_option; + rc = dccp_feat_parse_options(sk, dreq, mandatory, opt, + *value, value + 1, len - 1); + if (rc) + goto out_featneg_failed; break; case DCCPO_ACK_VECTOR_0: case DCCPO_ACK_VECTOR_1: if (dccp_packet_without_ack(skb)) /* RFC 4340, 11.4 */ break; - - if (dccp_msk(sk)->dccpms_send_ack_vector && + if (dp->dccps_hc_rx_ackvec != NULL && dccp_ackvec_parse(sk, skb, &ackno, opt, value, len)) goto out_invalid_option; break; @@ -289,8 +268,10 @@ out_nonsensical_length: out_invalid_option: DCCP_INC_STATS_BH(DCCP_MIB_INVALIDOPT); - DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_OPTION_ERROR; - DCCP_WARN("DCCP(%p): invalid option %d, len=%d", sk, opt, len); + rc = DCCP_RESET_CODE_OPTION_ERROR; +out_featneg_failed: + DCCP_WARN("DCCP(%p): Option %d (len=%d) error=%u\n", sk, opt, len, rc); + DCCP_SKB_CB(skb)->dccpd_reset_code = rc; DCCP_SKB_CB(skb)->dccpd_reset_data[0] = opt; DCCP_SKB_CB(skb)->dccpd_reset_data[1] = len > 0 ? value[0] : 0; DCCP_SKB_CB(skb)->dccpd_reset_data[2] = len > 1 ? value[1] : 0; @@ -299,9 +280,12 @@ out_invalid_option: EXPORT_SYMBOL_GPL(dccp_parse_options); -static void dccp_encode_value_var(const u32 value, unsigned char *to, - const unsigned int len) +void dccp_encode_value_var(const u64 value, u8 *to, const u8 len) { + if (len >= DCCP_OPTVAL_MAXLEN) + *to++ = (value & 0xFF0000000000ull) >> 40; + if (len > 4) + *to++ = (value & 0xFF00000000ull) >> 32; if (len > 3) *to++ = (value & 0xFF000000) >> 24; if (len > 2) @@ -461,23 +445,61 @@ static int dccp_insert_option_timestamp_echo(struct dccp_sock *dp, return 0; } -static int dccp_insert_feat_opt(struct sk_buff *skb, u8 type, u8 feat, - u8 *val, u8 len) +/** + * dccp_insert_option_mandatory - Mandatory option (5.8.2) + * Note that since we are using skb_push, this function needs to be called + * _after_ inserting the option it is supposed to influence (stack order). + */ +int dccp_insert_option_mandatory(struct sk_buff *skb) +{ + if (DCCP_SKB_CB(skb)->dccpd_opt_len >= DCCP_MAX_OPT_LEN) + return -1; + + DCCP_SKB_CB(skb)->dccpd_opt_len++; + *skb_push(skb, 1) = DCCPO_MANDATORY; + return 0; +} + +/** + * dccp_insert_fn_opt - Insert single Feature-Negotiation option into @skb + * @type: %DCCPO_CHANGE_L, %DCCPO_CHANGE_R, %DCCPO_CONFIRM_L, %DCCPO_CONFIRM_R + * @feat: one out of %dccp_feature_numbers + * @val: NN value or SP array (preferred element first) to copy + * @len: true length of @val in bytes (excluding first element repetition) + * @repeat_first: whether to copy the first element of @val twice + * The last argument is used to construct Confirm options, where the preferred + * value and the preference list appear separately (RFC 4340, 6.3.1). Preference + * lists are kept such that the preferred entry is always first, so we only need + * to copy twice, and avoid the overhead of cloning into a bigger array. + */ +int dccp_insert_fn_opt(struct sk_buff *skb, u8 type, u8 feat, + u8 *val, u8 len, bool repeat_first) { - u8 *to; + u8 tot_len, *to; - if (DCCP_SKB_CB(skb)->dccpd_opt_len + len + 3 > DCCP_MAX_OPT_LEN) { - DCCP_WARN("packet too small for feature %d option!\n", feat); + /* take the `Feature' field and possible repetition into account */ + if (len > (DCCP_SINGLE_OPT_MAXLEN - 2)) { + DCCP_WARN("length %u for feature %u too large\n", len, feat); return -1; } - DCCP_SKB_CB(skb)->dccpd_opt_len += len + 3; + if (unlikely(val == NULL || len == 0)) + len = repeat_first = 0; + tot_len = 3 + repeat_first + len; + + if (DCCP_SKB_CB(skb)->dccpd_opt_len + tot_len > DCCP_MAX_OPT_LEN) { + DCCP_WARN("packet too small for feature %d option!\n", feat); + return -1; + } + DCCP_SKB_CB(skb)->dccpd_opt_len += tot_len; - to = skb_push(skb, len + 3); + to = skb_push(skb, tot_len); *to++ = type; - *to++ = len + 3; + *to++ = tot_len; *to++ = feat; + if (repeat_first) + *to++ = *val; if (len) memcpy(to, val, len); @@ -487,69 +509,6 @@ static int dccp_insert_feat_opt(struct sk_buff *skb, u8 type, u8 feat, return 0; } -static int dccp_insert_options_feat(struct sock *sk, struct sk_buff *skb) -{ - struct dccp_sock *dp = dccp_sk(sk); - struct dccp_minisock *dmsk = dccp_msk(sk); - struct dccp_opt_pend *opt, *next; - int change = 0; - - /* confirm any options [NN opts] */ - list_for_each_entry_safe(opt, next, &dmsk->dccpms_conf, dccpop_node) { - dccp_insert_feat_opt(skb, opt->dccpop_type, - opt->dccpop_feat, opt->dccpop_val, - opt->dccpop_len); - /* fear empty confirms */ - if (opt->dccpop_val) - kfree(opt->dccpop_val); - kfree(opt); - } - INIT_LIST_HEAD(&dmsk->dccpms_conf); - - /* see which features we need to send */ - list_for_each_entry(opt, &dmsk->dccpms_pending, dccpop_node) { - /* see if we need to send any confirm */ - if (opt->dccpop_sc) { - dccp_insert_feat_opt(skb, opt->dccpop_type + 1, - opt->dccpop_feat, - opt->dccpop_sc->dccpoc_val, - opt->dccpop_sc->dccpoc_len); - - BUG_ON(!opt->dccpop_sc->dccpoc_val); - kfree(opt->dccpop_sc->dccpoc_val); - kfree(opt->dccpop_sc); - opt->dccpop_sc = NULL; - } - - /* any option not confirmed, re-send it */ - if (!opt->dccpop_conf) { - dccp_insert_feat_opt(skb, opt->dccpop_type, - opt->dccpop_feat, opt->dccpop_val, - opt->dccpop_len); - change++; - } - } - - /* Retransmit timer. - * If this is the master listening sock, we don't set a timer on it. It - * should be fine because if the dude doesn't receive our RESPONSE - * [which will contain the CHANGE] he will send another REQUEST which - * will "retrnasmit" the change. - */ - if (change && dp->dccps_role != DCCP_ROLE_LISTEN) { - dccp_pr_debug("reset feat negotiation timer %p\n", sk); - - /* XXX don't reset the timer on re-transmissions. I.e. reset it - * only when sending new stuff i guess. Currently the timer - * never backs off because on re-transmission it just resets it! - */ - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - inet_csk(sk)->icsk_rto, DCCP_RTO_MAX); - } - - return 0; -} - /* The length of all options needs to be a multiple of 4 (5.8) */ static void dccp_insert_option_padding(struct sk_buff *skb) { @@ -565,19 +524,31 @@ static void dccp_insert_option_padding(struct sk_buff *skb) int dccp_insert_options(struct sock *sk, struct sk_buff *skb) { struct dccp_sock *dp = dccp_sk(sk); - struct dccp_minisock *dmsk = dccp_msk(sk); DCCP_SKB_CB(skb)->dccpd_opt_len = 0; - if (dmsk->dccpms_send_ndp_count && - dccp_insert_option_ndp(sk, skb)) + if (dp->dccps_send_ndp_count && dccp_insert_option_ndp(sk, skb)) return -1; - if (!dccp_packet_without_ack(skb)) { - if (dmsk->dccpms_send_ack_vector && - dccp_ackvec_pending(dp->dccps_hc_rx_ackvec) && - dccp_insert_option_ackvec(sk, skb)) + if (DCCP_SKB_CB(skb)->dccpd_type != DCCP_PKT_DATA) { + + /* Feature Negotiation */ + if (dccp_feat_insert_opts(dp, NULL, skb)) return -1; + + if (DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_REQUEST) { + /* + * Obtain RTT sample from Request/Response exchange. + * This is currently used in CCID 3 initialisation. + */ + if (dccp_insert_option_timestamp(sk, skb)) + return -1; + + } else if (dp->dccps_hc_rx_ackvec != NULL && + dccp_ackvec_pending(dp->dccps_hc_rx_ackvec) && + dccp_insert_option_ackvec(sk, skb)) { + return -1; + } } if (dp->dccps_hc_rx_insert_options) { @@ -586,21 +557,6 @@ int dccp_insert_options(struct sock *sk, struct sk_buff *skb) dp->dccps_hc_rx_insert_options = 0; } - /* Feature negotiation */ - /* Data packets can't do feat negotiation */ - if (DCCP_SKB_CB(skb)->dccpd_type != DCCP_PKT_DATA && - DCCP_SKB_CB(skb)->dccpd_type != DCCP_PKT_DATAACK && - dccp_insert_options_feat(sk, skb)) - return -1; - - /* - * Obtain RTT sample from Request/Response exchange. - * This is currently used in CCID 3 initialisation. - */ - if (DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_REQUEST && - dccp_insert_option_timestamp(sk, skb)) - return -1; - if (dp->dccps_timestamp_echo != 0 && dccp_insert_option_timestamp_echo(dp, NULL, skb)) return -1; @@ -613,6 +569,9 @@ int dccp_insert_options_rsk(struct dccp_request_sock *dreq, struct sk_buff *skb) { DCCP_SKB_CB(skb)->dccpd_opt_len = 0; + if (dccp_feat_insert_opts(NULL, dreq, skb)) + return -1; + if (dreq->dreq_timestamp_echo != 0 && dccp_insert_option_timestamp_echo(NULL, dreq, skb)) return -1; diff --git a/net/dccp/output.c b/net/dccp/output.c index 809d803d500..22a618af489 100644 --- a/net/dccp/output.c +++ b/net/dccp/output.c @@ -175,7 +175,7 @@ unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu) * make it a multiple of 4 */ - cur_mps -= ((5 + 6 + 10 + 6 + 6 + 6 + 3) / 4) * 4; + cur_mps -= roundup(5 + 6 + 10 + 6 + 6 + 6, 4); /* And store cached results */ icsk->icsk_pmtu_cookie = pmtu; @@ -339,10 +339,12 @@ struct sk_buff *dccp_make_response(struct sock *sk, struct dst_entry *dst, DCCP_SKB_CB(skb)->dccpd_type = DCCP_PKT_RESPONSE; DCCP_SKB_CB(skb)->dccpd_seq = dreq->dreq_iss; - if (dccp_insert_options_rsk(dreq, skb)) { - kfree_skb(skb); - return NULL; - } + /* Resolve feature dependencies resulting from choice of CCID */ + if (dccp_feat_server_ccid_dependencies(dreq)) + goto response_failed; + + if (dccp_insert_options_rsk(dreq, skb)) + goto response_failed; /* Build and checksum header */ dh = dccp_zeroed_hdr(skb, dccp_header_size); @@ -363,6 +365,9 @@ struct sk_buff *dccp_make_response(struct sock *sk, struct dst_entry *dst, inet_rsk(req)->acked = 1; DCCP_INC_STATS(DCCP_MIB_OUTSEGS); return skb; +response_failed: + kfree_skb(skb); + return NULL; } EXPORT_SYMBOL_GPL(dccp_make_response); @@ -469,6 +474,10 @@ int dccp_connect(struct sock *sk) struct sk_buff *skb; struct inet_connection_sock *icsk = inet_csk(sk); + /* do not connect if feature negotiation setup fails */ + if (dccp_feat_finalise_settings(dccp_sk(sk))) + return -EPROTO; + dccp_connect_init(sk); skb = alloc_skb(sk->sk_prot->max_header, sk->sk_allocation); diff --git a/net/dccp/probe.c b/net/dccp/probe.c index 81368a7f537..37731da4148 100644 --- a/net/dccp/probe.c +++ b/net/dccp/probe.c @@ -74,30 +74,27 @@ static void printl(const char *fmt, ...) static int jdccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t size) { - const struct dccp_minisock *dmsk = dccp_msk(sk); const struct inet_sock *inet = inet_sk(sk); - const struct ccid3_hc_tx_sock *hctx; + struct ccid3_hc_tx_sock *hctx = NULL; - if (dmsk->dccpms_tx_ccid == DCCPC_CCID3) + if (ccid_get_current_tx_ccid(dccp_sk(sk)) == DCCPC_CCID3) hctx = ccid3_hc_tx_sk(sk); - else - hctx = NULL; if (port == 0 || ntohs(inet->dport) == port || ntohs(inet->sport) == port) { if (hctx) - printl("%d.%d.%d.%d:%u %d.%d.%d.%d:%u %d %d %d %d %u " + printl("%pI4:%u %pI4:%u %d %d %d %d %u " "%llu %llu %d\n", - NIPQUAD(inet->saddr), ntohs(inet->sport), - NIPQUAD(inet->daddr), ntohs(inet->dport), size, + &inet->saddr, ntohs(inet->sport), + &inet->daddr, ntohs(inet->dport), size, hctx->ccid3hctx_s, hctx->ccid3hctx_rtt, hctx->ccid3hctx_p, hctx->ccid3hctx_x_calc, hctx->ccid3hctx_x_recv >> 6, hctx->ccid3hctx_x >> 6, hctx->ccid3hctx_t_ipi); else - printl("%d.%d.%d.%d:%u %d.%d.%d.%d:%u %d\n", - NIPQUAD(inet->saddr), ntohs(inet->sport), - NIPQUAD(inet->daddr), ntohs(inet->dport), size); + printl("%pI4:%u %pI4:%u %d\n", + &inet->saddr, ntohs(inet->sport), + &inet->daddr, ntohs(inet->dport), size); } jprobe_return(); diff --git a/net/dccp/proto.c b/net/dccp/proto.c index d0bd3481976..945b4d5d23b 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -40,16 +40,10 @@ DEFINE_SNMP_STAT(struct dccp_mib, dccp_statistics) __read_mostly; EXPORT_SYMBOL_GPL(dccp_statistics); -atomic_t dccp_orphan_count = ATOMIC_INIT(0); - +struct percpu_counter dccp_orphan_count; EXPORT_SYMBOL_GPL(dccp_orphan_count); -struct inet_hashinfo __cacheline_aligned dccp_hashinfo = { - .lhash_lock = RW_LOCK_UNLOCKED, - .lhash_users = ATOMIC_INIT(0), - .lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(dccp_hashinfo.lhash_wait), -}; - +struct inet_hashinfo dccp_hashinfo; EXPORT_SYMBOL_GPL(dccp_hashinfo); /* the maximum queue length for tx in packets. 0 is no limit */ @@ -67,6 +61,9 @@ void dccp_set_state(struct sock *sk, const int state) case DCCP_OPEN: if (oldstate != DCCP_OPEN) DCCP_INC_STATS(DCCP_MIB_CURRESTAB); + /* Client retransmits all Confirm options until entering OPEN */ + if (oldstate == DCCP_PARTOPEN) + dccp_feat_list_purge(&dccp_sk(sk)->dccps_featneg); break; case DCCP_CLOSED: @@ -175,7 +172,6 @@ EXPORT_SYMBOL_GPL(dccp_state_name); int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized) { struct dccp_sock *dp = dccp_sk(sk); - struct dccp_minisock *dmsk = dccp_msk(sk); struct inet_connection_sock *icsk = inet_csk(sk); dccp_minisock_init(&dp->dccps_minisock); @@ -193,45 +189,10 @@ int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized) dccp_init_xmit_timers(sk); - /* - * FIXME: We're hardcoding the CCID, and doing this at this point makes - * the listening (master) sock get CCID control blocks, which is not - * necessary, but for now, to not mess with the test userspace apps, - * lets leave it here, later the real solution is to do this in a - * setsockopt(CCIDs-I-want/accept). -acme - */ - if (likely(ctl_sock_initialized)) { - int rc = dccp_feat_init(dmsk); - - if (rc) - return rc; - - if (dmsk->dccpms_send_ack_vector) { - dp->dccps_hc_rx_ackvec = dccp_ackvec_alloc(GFP_KERNEL); - if (dp->dccps_hc_rx_ackvec == NULL) - return -ENOMEM; - } - dp->dccps_hc_rx_ccid = ccid_hc_rx_new(dmsk->dccpms_rx_ccid, - sk, GFP_KERNEL); - dp->dccps_hc_tx_ccid = ccid_hc_tx_new(dmsk->dccpms_tx_ccid, - sk, GFP_KERNEL); - if (unlikely(dp->dccps_hc_rx_ccid == NULL || - dp->dccps_hc_tx_ccid == NULL)) { - ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk); - ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk); - if (dmsk->dccpms_send_ack_vector) { - dccp_ackvec_free(dp->dccps_hc_rx_ackvec); - dp->dccps_hc_rx_ackvec = NULL; - } - dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL; - return -ENOMEM; - } - } else { - /* control socket doesn't need feat nego */ - INIT_LIST_HEAD(&dmsk->dccpms_pending); - INIT_LIST_HEAD(&dmsk->dccpms_conf); - } - + INIT_LIST_HEAD(&dp->dccps_featneg); + /* control socket doesn't need feat nego */ + if (likely(ctl_sock_initialized)) + return dccp_feat_init(sk); return 0; } @@ -240,7 +201,6 @@ EXPORT_SYMBOL_GPL(dccp_init_sock); void dccp_destroy_sock(struct sock *sk) { struct dccp_sock *dp = dccp_sk(sk); - struct dccp_minisock *dmsk = dccp_msk(sk); /* * DCCP doesn't use sk_write_queue, just sk_send_head @@ -258,7 +218,7 @@ void dccp_destroy_sock(struct sock *sk) kfree(dp->dccps_service_list); dp->dccps_service_list = NULL; - if (dmsk->dccpms_send_ack_vector) { + if (dp->dccps_hc_rx_ackvec != NULL) { dccp_ackvec_free(dp->dccps_hc_rx_ackvec); dp->dccps_hc_rx_ackvec = NULL; } @@ -267,7 +227,7 @@ void dccp_destroy_sock(struct sock *sk) dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL; /* clean up feature negotiation state */ - dccp_feat_clean(dmsk); + dccp_feat_list_purge(&dp->dccps_featneg); } EXPORT_SYMBOL_GPL(dccp_destroy_sock); @@ -277,6 +237,9 @@ static inline int dccp_listen_start(struct sock *sk, int backlog) struct dccp_sock *dp = dccp_sk(sk); dp->dccps_role = DCCP_ROLE_LISTEN; + /* do not start to listen if feature negotiation setup fails */ + if (dccp_feat_finalise_settings(dp)) + return -EPROTO; return inet_csk_listen_start(sk, backlog); } @@ -466,42 +429,70 @@ static int dccp_setsockopt_service(struct sock *sk, const __be32 service, return 0; } -/* byte 1 is feature. the rest is the preference list */ -static int dccp_setsockopt_change(struct sock *sk, int type, - struct dccp_so_feat __user *optval) +static int dccp_setsockopt_cscov(struct sock *sk, int cscov, bool rx) { - struct dccp_so_feat opt; - u8 *val; - int rc; + u8 *list, len; + int i, rc; - if (copy_from_user(&opt, optval, sizeof(opt))) - return -EFAULT; + if (cscov < 0 || cscov > 15) + return -EINVAL; /* - * rfc4340: 6.1. Change Options + * Populate a list of permissible values, in the range cscov...15. This + * is necessary since feature negotiation of single values only works if + * both sides incidentally choose the same value. Since the list starts + * lowest-value first, negotiation will pick the smallest shared value. */ - if (opt.dccpsf_len < 1) + if (cscov == 0) + return 0; + len = 16 - cscov; + + list = kmalloc(len, GFP_KERNEL); + if (list == NULL) + return -ENOBUFS; + + for (i = 0; i < len; i++) + list[i] = cscov++; + + rc = dccp_feat_register_sp(sk, DCCPF_MIN_CSUM_COVER, rx, list, len); + + if (rc == 0) { + if (rx) + dccp_sk(sk)->dccps_pcrlen = cscov; + else + dccp_sk(sk)->dccps_pcslen = cscov; + } + kfree(list); + return rc; +} + +static int dccp_setsockopt_ccid(struct sock *sk, int type, + char __user *optval, int optlen) +{ + u8 *val; + int rc = 0; + + if (optlen < 1 || optlen > DCCP_FEAT_MAX_SP_VALS) return -EINVAL; - val = kmalloc(opt.dccpsf_len, GFP_KERNEL); - if (!val) + val = kmalloc(optlen, GFP_KERNEL); + if (val == NULL) return -ENOMEM; - if (copy_from_user(val, opt.dccpsf_val, opt.dccpsf_len)) { - rc = -EFAULT; - goto out_free_val; + if (copy_from_user(val, optval, optlen)) { + kfree(val); + return -EFAULT; } - rc = dccp_feat_change(dccp_msk(sk), type, opt.dccpsf_feat, - val, opt.dccpsf_len, GFP_KERNEL); - if (rc) - goto out_free_val; + lock_sock(sk); + if (type == DCCP_SOCKOPT_TX_CCID || type == DCCP_SOCKOPT_CCID) + rc = dccp_feat_register_sp(sk, DCCPF_CCID, 1, val, optlen); -out: - return rc; + if (!rc && (type == DCCP_SOCKOPT_RX_CCID || type == DCCP_SOCKOPT_CCID)) + rc = dccp_feat_register_sp(sk, DCCPF_CCID, 0, val, optlen); + release_sock(sk); -out_free_val: kfree(val); - goto out; + return rc; } static int do_dccp_setsockopt(struct sock *sk, int level, int optname, @@ -510,7 +501,21 @@ static int do_dccp_setsockopt(struct sock *sk, int level, int optname, struct dccp_sock *dp = dccp_sk(sk); int val, err = 0; - if (optlen < sizeof(int)) + switch (optname) { + case DCCP_SOCKOPT_PACKET_SIZE: + DCCP_WARN("sockopt(PACKET_SIZE) is deprecated: fix your app\n"); + return 0; + case DCCP_SOCKOPT_CHANGE_L: + case DCCP_SOCKOPT_CHANGE_R: + DCCP_WARN("sockopt(CHANGE_L/R) is deprecated: fix your app\n"); + return 0; + case DCCP_SOCKOPT_CCID: + case DCCP_SOCKOPT_RX_CCID: + case DCCP_SOCKOPT_TX_CCID: + return dccp_setsockopt_ccid(sk, optname, optval, optlen); + } + + if (optlen < (int)sizeof(int)) return -EINVAL; if (get_user(val, (int __user *)optval)) @@ -521,53 +526,24 @@ static int do_dccp_setsockopt(struct sock *sk, int level, int optname, lock_sock(sk); switch (optname) { - case DCCP_SOCKOPT_PACKET_SIZE: - DCCP_WARN("sockopt(PACKET_SIZE) is deprecated: fix your app\n"); - err = 0; - break; - case DCCP_SOCKOPT_CHANGE_L: - if (optlen != sizeof(struct dccp_so_feat)) - err = -EINVAL; - else - err = dccp_setsockopt_change(sk, DCCPO_CHANGE_L, - (struct dccp_so_feat __user *) - optval); - break; - case DCCP_SOCKOPT_CHANGE_R: - if (optlen != sizeof(struct dccp_so_feat)) - err = -EINVAL; - else - err = dccp_setsockopt_change(sk, DCCPO_CHANGE_R, - (struct dccp_so_feat __user *) - optval); - break; case DCCP_SOCKOPT_SERVER_TIMEWAIT: if (dp->dccps_role != DCCP_ROLE_SERVER) err = -EOPNOTSUPP; else dp->dccps_server_timewait = (val != 0); break; - case DCCP_SOCKOPT_SEND_CSCOV: /* sender side, RFC 4340, sec. 9.2 */ - if (val < 0 || val > 15) - err = -EINVAL; - else - dp->dccps_pcslen = val; + case DCCP_SOCKOPT_SEND_CSCOV: + err = dccp_setsockopt_cscov(sk, val, false); break; - case DCCP_SOCKOPT_RECV_CSCOV: /* receiver side, RFC 4340 sec. 9.2.1 */ - if (val < 0 || val > 15) - err = -EINVAL; - else { - dp->dccps_pcrlen = val; - /* FIXME: add feature negotiation, - * ChangeL(MinimumChecksumCoverage, val) */ - } + case DCCP_SOCKOPT_RECV_CSCOV: + err = dccp_setsockopt_cscov(sk, val, true); break; default: err = -ENOPROTOOPT; break; } - release_sock(sk); + return err; } @@ -648,6 +624,18 @@ static int do_dccp_getsockopt(struct sock *sk, int level, int optname, case DCCP_SOCKOPT_GET_CUR_MPS: val = dp->dccps_mss_cache; break; + case DCCP_SOCKOPT_AVAILABLE_CCIDS: + return ccid_getsockopt_builtin_ccids(sk, len, optval, optlen); + case DCCP_SOCKOPT_TX_CCID: + val = ccid_get_current_tx_ccid(dp); + if (val < 0) + return -ENOPROTOOPT; + break; + case DCCP_SOCKOPT_RX_CCID: + val = ccid_get_current_rx_ccid(dp); + if (val < 0) + return -ENOPROTOOPT; + break; case DCCP_SOCKOPT_SERVER_TIMEWAIT: val = dp->dccps_server_timewait; break; @@ -976,7 +964,6 @@ adjudge_to_death: state = sk->sk_state; sock_hold(sk); sock_orphan(sk); - atomic_inc(sk->sk_prot->orphan_count); /* * It is the last release_sock in its life. It will remove backlog. @@ -990,6 +977,8 @@ adjudge_to_death: bh_lock_sock(sk); WARN_ON(sock_owned_by_user(sk)); + percpu_counter_inc(sk->sk_prot->orphan_count); + /* Have we already been destroyed by a softirq or backlog? */ if (state != DCCP_CLOSED && sk->sk_state == DCCP_CLOSED) goto out; @@ -1040,17 +1029,21 @@ static int __init dccp_init(void) { unsigned long goal; int ehash_order, bhash_order, i; - int rc = -ENOBUFS; + int rc; BUILD_BUG_ON(sizeof(struct dccp_skb_cb) > FIELD_SIZEOF(struct sk_buff, cb)); - + rc = percpu_counter_init(&dccp_orphan_count, 0); + if (rc) + goto out; + rc = -ENOBUFS; + inet_hashinfo_init(&dccp_hashinfo); dccp_hashinfo.bind_bucket_cachep = kmem_cache_create("dccp_bind_bucket", sizeof(struct inet_bind_bucket), 0, SLAB_HWCACHE_ALIGN, NULL); if (!dccp_hashinfo.bind_bucket_cachep) - goto out; + goto out_free_percpu; /* * Size and allocate the main established and bind bucket @@ -1084,8 +1077,8 @@ static int __init dccp_init(void) } for (i = 0; i < dccp_hashinfo.ehash_size; i++) { - INIT_HLIST_HEAD(&dccp_hashinfo.ehash[i].chain); - INIT_HLIST_HEAD(&dccp_hashinfo.ehash[i].twchain); + INIT_HLIST_NULLS_HEAD(&dccp_hashinfo.ehash[i].chain, i); + INIT_HLIST_NULLS_HEAD(&dccp_hashinfo.ehash[i].twchain, i); } if (inet_ehash_locks_alloc(&dccp_hashinfo)) @@ -1125,9 +1118,15 @@ static int __init dccp_init(void) if (rc) goto out_ackvec_exit; + rc = ccid_initialize_builtins(); + if (rc) + goto out_sysctl_exit; + dccp_timestamping_init(); out: return rc; +out_sysctl_exit: + dccp_sysctl_exit(); out_ackvec_exit: dccp_ackvec_exit(); out_free_dccp_mib: @@ -1143,11 +1142,14 @@ out_free_dccp_ehash: out_free_bind_bucket_cachep: kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep); dccp_hashinfo.bind_bucket_cachep = NULL; +out_free_percpu: + percpu_counter_destroy(&dccp_orphan_count); goto out; } static void __exit dccp_fini(void) { + ccid_cleanup_builtins(); dccp_mib_exit(); free_pages((unsigned long)dccp_hashinfo.bhash, get_order(dccp_hashinfo.bhash_size * diff --git a/net/dccp/sysctl.c b/net/dccp/sysctl.c index 21295993fdb..018e210875e 100644 --- a/net/dccp/sysctl.c +++ b/net/dccp/sysctl.c @@ -41,27 +41,6 @@ static struct ctl_table dccp_default_table[] = { .proc_handler = proc_dointvec, }, { - .procname = "ack_ratio", - .data = &sysctl_dccp_feat_ack_ratio, - .maxlen = sizeof(sysctl_dccp_feat_ack_ratio), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "send_ackvec", - .data = &sysctl_dccp_feat_send_ack_vector, - .maxlen = sizeof(sysctl_dccp_feat_send_ack_vector), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { - .procname = "send_ndp", - .data = &sysctl_dccp_feat_send_ndp_count, - .maxlen = sizeof(sysctl_dccp_feat_send_ndp_count), - .mode = 0644, - .proc_handler = proc_dointvec, - }, - { .procname = "request_retries", .data = &sysctl_dccp_request_retries, .maxlen = sizeof(sysctl_dccp_request_retries), diff --git a/net/dccp/timer.c b/net/dccp/timer.c index 54b3c7e9e01..162d1e683c3 100644 --- a/net/dccp/timer.c +++ b/net/dccp/timer.c @@ -87,17 +87,6 @@ static void dccp_retransmit_timer(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); - /* retransmit timer is used for feature negotiation throughout - * connection. In this case, no packet is re-transmitted, but rather an - * ack is generated and pending changes are placed into its options. - */ - if (sk->sk_send_head == NULL) { - dccp_pr_debug("feat negotiation retransmit timeout %p\n", sk); - if (sk->sk_state == DCCP_OPEN) - dccp_send_ack(sk); - goto backoff; - } - /* * More than than 4MSL (8 minutes) has passed, a RESET(aborted) was * sent, no need to retransmit, this sock is dead. @@ -126,7 +115,6 @@ static void dccp_retransmit_timer(struct sock *sk) return; } -backoff: icsk->icsk_backoff++; icsk->icsk_rto = min(icsk->icsk_rto << 1, DCCP_RTO_MAX); |