From bf3a46aa9b96f6eb3a49a568f72a2801c3e830c0 Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Tue, 9 Aug 2005 19:22:01 -0700 Subject: [NETFILTER]: convert nfmark and conntrack mark to 32bit As discussed at netconf'05, we convert nfmark and conntrack-mark to be 32bits even on 64bit architectures. Signed-off-by: Harald Welte Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_standalone.c | 2 +- net/ipv4/netfilter/ipt_CLUSTERIP.c | 2 +- net/ipv4/netfilter/ipt_CONNMARK.c | 11 ++++++++--- net/ipv4/netfilter/ipt_MARK.c | 12 ++++++++++++ net/ipv4/netfilter/ipt_connmark.c | 7 +++++++ net/ipv4/netfilter/ipt_mark.c | 7 +++++++ 6 files changed, 36 insertions(+), 5 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c index 61798c46e91..dccd4abab7a 100644 --- a/net/ipv4/netfilter/ip_conntrack_standalone.c +++ b/net/ipv4/netfilter/ip_conntrack_standalone.c @@ -185,7 +185,7 @@ static int ct_seq_show(struct seq_file *s, void *v) return -ENOSPC; #if defined(CONFIG_IP_NF_CONNTRACK_MARK) - if (seq_printf(s, "mark=%lu ", conntrack->mark)) + if (seq_printf(s, "mark=%u ", conntrack->mark)) return -ENOSPC; #endif diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 6706d3a1bc4..2d05cafec22 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -367,7 +367,7 @@ target(struct sk_buff **pskb, #ifdef DEBUG_CLUSTERP DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); #endif - DEBUGP("hash=%u ct_hash=%lu ", hash, ct->mark); + DEBUGP("hash=%u ct_hash=%u ", hash, ct->mark); if (!clusterip_responsible(cipinfo->config, hash)) { DEBUGP("not responsible\n"); return NF_DROP; diff --git a/net/ipv4/netfilter/ipt_CONNMARK.c b/net/ipv4/netfilter/ipt_CONNMARK.c index 30ddd3e18eb..8ed744157b1 100644 --- a/net/ipv4/netfilter/ipt_CONNMARK.c +++ b/net/ipv4/netfilter/ipt_CONNMARK.c @@ -40,9 +40,9 @@ target(struct sk_buff **pskb, void *userinfo) { const struct ipt_connmark_target_info *markinfo = targinfo; - unsigned long diff; - unsigned long nfmark; - unsigned long newmark; + u_int32_t diff; + u_int32_t nfmark; + u_int32_t newmark; enum ip_conntrack_info ctinfo; struct ip_conntrack *ct = ip_conntrack_get((*pskb), &ctinfo); @@ -94,6 +94,11 @@ checkentry(const char *tablename, } } + if (matchinfo->mark > 0xffffffff || matchinfo->mask > 0xffffffff) { + printk(KERN_WARNING "CONNMARK: Only supports 32bit mark\n"); + return 0; + } + return 1; } diff --git a/net/ipv4/netfilter/ipt_MARK.c b/net/ipv4/netfilter/ipt_MARK.c index 33c6f9b63b8..8526398346c 100644 --- a/net/ipv4/netfilter/ipt_MARK.c +++ b/net/ipv4/netfilter/ipt_MARK.c @@ -76,6 +76,8 @@ checkentry_v0(const char *tablename, unsigned int targinfosize, unsigned int hook_mask) { + struct ipt_mark_target_info *markinfo = targinfo; + if (targinfosize != IPT_ALIGN(sizeof(struct ipt_mark_target_info))) { printk(KERN_WARNING "MARK: targinfosize %u != %Zu\n", targinfosize, @@ -88,6 +90,11 @@ checkentry_v0(const char *tablename, return 0; } + if (markinfo->mark > 0xffffffff) { + printk(KERN_WARNING "MARK: Only supports 32bit wide mark\n"); + return 0; + } + return 1; } @@ -120,6 +127,11 @@ checkentry_v1(const char *tablename, return 0; } + if (markinfo->mark > 0xffffffff) { + printk(KERN_WARNING "MARK: Only supports 32bit wide mark\n"); + return 0; + } + return 1; } diff --git a/net/ipv4/netfilter/ipt_connmark.c b/net/ipv4/netfilter/ipt_connmark.c index 2706f96cea5..bf8de47ce00 100644 --- 
a/net/ipv4/netfilter/ipt_connmark.c +++ b/net/ipv4/netfilter/ipt_connmark.c @@ -54,9 +54,16 @@ checkentry(const char *tablename, unsigned int matchsize, unsigned int hook_mask) { + struct ipt_connmark_info *cm = + (struct ipt_connmark_info *)matchinfo; if (matchsize != IPT_ALIGN(sizeof(struct ipt_connmark_info))) return 0; + if (cm->mark > 0xffffffff || cm->mask > 0xffffffff) { + printk(KERN_WARNING "connmark: only support 32bit mark\n"); + return 0; + } + return 1; } diff --git a/net/ipv4/netfilter/ipt_mark.c b/net/ipv4/netfilter/ipt_mark.c index 8955728127b..00bef6cdd3f 100644 --- a/net/ipv4/netfilter/ipt_mark.c +++ b/net/ipv4/netfilter/ipt_mark.c @@ -37,9 +37,16 @@ checkentry(const char *tablename, unsigned int matchsize, unsigned int hook_mask) { + struct ipt_mark_info *minfo = (struct ipt_mark_info *) matchinfo; + if (matchsize != IPT_ALIGN(sizeof(struct ipt_mark_info))) return 0; + if (minfo->mark > 0xffffffff || minfo->mask > 0xffffffff) { + printk(KERN_WARNING "mark: only supports 32bit mark\n"); + return 0; + } + return 1; } -- cgit v1.2.3 From 6869c4d8e066e21623c812c448a05f1ed931c9c6 Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Tue, 9 Aug 2005 19:24:19 -0700 Subject: [NETFILTER]: reduce netfilter sk_buff enlargement As discussed at netconf'05, we're trying to save every bit in sk_buff. The patch below makes sk_buff 8 bytes smaller. I did some basic testing on my notebook and it seems to work. The only real in-tree user of nfcache was IPVS, which only needs a single bit. Unfortunately I couldn't find another free bit in sk_buff to stuff that bit into, so I introduced a separate field for it. Maybe the IPVS guys can resolve that to further save space. Initially I wanted to shrink pkt_type to three bits (only six values such as PACKET_HOST are defined), but unfortunately the bluetooth code overloads pkt_type :( The conntrack-event-api (out-of-tree) uses nfcache, but Rusty just came up with a way to do it without any skb fields, so it's safe to remove it. - remove all never-implemented 'nfcache' code - don't have ipvs code abuse the 'nfcache' field; it currently gets its own compile-conditional skb->ipvs_property field. IPVS maintainers can decide to move this bit elsewhere, but nfcache needs to die. - remove skb->nfcache field to save 4 bytes - move skb->nfctinfo into three unused bits to save further 4 bytes Signed-off-by: Harald Welte Signed-off-by: David S.
Miller --- net/ipv4/ip_output.c | 1 - net/ipv4/ipvs/ip_vs_core.c | 9 +++++---- net/ipv4/ipvs/ip_vs_xmit.c | 2 +- net/ipv4/netfilter/ip_conntrack_core.c | 7 +------ net/ipv4/netfilter/ip_nat_core.c | 1 - net/ipv4/netfilter/ip_nat_standalone.c | 2 -- net/ipv4/netfilter/ip_queue.c | 1 - net/ipv4/netfilter/ip_tables.c | 1 - net/ipv4/netfilter/ipt_CLASSIFY.c | 4 +--- net/ipv4/netfilter/ipt_CONNMARK.c | 4 +--- net/ipv4/netfilter/ipt_DSCP.c | 1 - net/ipv4/netfilter/ipt_ECN.c | 2 -- net/ipv4/netfilter/ipt_MARK.c | 10 ++++------ net/ipv4/netfilter/ipt_REJECT.c | 1 - net/ipv4/netfilter/ipt_TCPMSS.c | 1 - net/ipv4/netfilter/ipt_TOS.c | 1 - 16 files changed, 13 insertions(+), 35 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 80d13103b2b..766564cb420 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -392,7 +392,6 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) #endif #ifdef CONFIG_NETFILTER to->nfmark = from->nfmark; - to->nfcache = from->nfcache; /* Connection association is same as pre-frag packet */ nf_conntrack_put(to->nfct); to->nfct = from->nfct; diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c index 5fb257dd07c..3ac7eeca04a 100644 --- a/net/ipv4/ipvs/ip_vs_core.c +++ b/net/ipv4/ipvs/ip_vs_core.c @@ -22,6 +22,7 @@ * * Changes: * Paul `Rusty' Russell properly handle non-linear skbs + * Harald Welte don't use nfcache * */ @@ -529,7 +530,7 @@ static unsigned int ip_vs_post_routing(unsigned int hooknum, const struct net_device *out, int (*okfn)(struct sk_buff *)) { - if (!((*pskb)->nfcache & NFC_IPVS_PROPERTY)) + if (!((*pskb)->ipvs_property)) return NF_ACCEPT; /* The packet was sent from IPVS, exit this chain */ @@ -701,7 +702,7 @@ static int ip_vs_out_icmp(struct sk_buff **pskb, int *related) /* do the statistics and put it back */ ip_vs_out_stats(cp, skb); - skb->nfcache |= NFC_IPVS_PROPERTY; + skb->ipvs_property = 1; verdict = NF_ACCEPT; out: @@ -739,7 +740,7 @@ ip_vs_out(unsigned int hooknum, struct sk_buff **pskb, EnterFunction(11); - if (skb->nfcache & NFC_IPVS_PROPERTY) + if (skb->ipvs_property) return NF_ACCEPT; iph = skb->nh.iph; @@ -821,7 +822,7 @@ ip_vs_out(unsigned int hooknum, struct sk_buff **pskb, ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp); ip_vs_conn_put(cp); - skb->nfcache |= NFC_IPVS_PROPERTY; + skb->ipvs_property = 1; LeaveFunction(11); return NF_ACCEPT; diff --git a/net/ipv4/ipvs/ip_vs_xmit.c b/net/ipv4/ipvs/ip_vs_xmit.c index a8512a3fd08..3b87482049c 100644 --- a/net/ipv4/ipvs/ip_vs_xmit.c +++ b/net/ipv4/ipvs/ip_vs_xmit.c @@ -127,7 +127,7 @@ ip_vs_dst_reset(struct ip_vs_dest *dest) #define IP_VS_XMIT(skb, rt) \ do { \ - (skb)->nfcache |= NFC_IPVS_PROPERTY; \ + (skb)->ipvs_property = 1; \ (skb)->ip_summed = CHECKSUM_NONE; \ NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, (skb), NULL, \ (rt)->u.dst.dev, dst_output); \ diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c index a7f0c821a9b..04c3414361d 100644 --- a/net/ipv4/netfilter/ip_conntrack_core.c +++ b/net/ipv4/netfilter/ip_conntrack_core.c @@ -625,9 +625,6 @@ unsigned int ip_conntrack_in(unsigned int hooknum, return NF_DROP; } - /* FIXME: Do this right please. --RR */ - (*pskb)->nfcache |= NFC_UNKNOWN; - /* Doesn't cover locally-generated broadcast, so not worth it. */ #if 0 /* Ignore broadcast: no `connection'. 
*/ @@ -943,10 +940,8 @@ ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user) skb = ip_defrag(skb, user); local_bh_enable(); - if (skb) { + if (skb) ip_send_check(skb->nh.iph); - skb->nfcache |= NFC_ALTERED; - } return skb; } diff --git a/net/ipv4/netfilter/ip_nat_core.c b/net/ipv4/netfilter/ip_nat_core.c index 739b6dde1c8..ed4d731880f 100644 --- a/net/ipv4/netfilter/ip_nat_core.c +++ b/net/ipv4/netfilter/ip_nat_core.c @@ -321,7 +321,6 @@ manip_pkt(u_int16_t proto, { struct iphdr *iph; - (*pskb)->nfcache |= NFC_ALTERED; if (!skb_ip_make_writable(pskb, iphdroff + sizeof(*iph))) return 0; diff --git a/net/ipv4/netfilter/ip_nat_standalone.c b/net/ipv4/netfilter/ip_nat_standalone.c index 91d5ea1dbbc..9ecba979033 100644 --- a/net/ipv4/netfilter/ip_nat_standalone.c +++ b/net/ipv4/netfilter/ip_nat_standalone.c @@ -73,8 +73,6 @@ ip_nat_fn(unsigned int hooknum, IP_NF_ASSERT(!((*pskb)->nh.iph->frag_off & htons(IP_MF|IP_OFFSET))); - (*pskb)->nfcache |= NFC_UNKNOWN; - /* If we had a hardware checksum before, it's now invalid */ if ((*pskb)->ip_summed == CHECKSUM_HW) if (skb_checksum_help(*pskb, (out == NULL))) diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c index c6baa817438..bc0af8d8e91 100644 --- a/net/ipv4/netfilter/ip_queue.c +++ b/net/ipv4/netfilter/ip_queue.c @@ -392,7 +392,6 @@ ipq_mangle_ipv4(ipq_verdict_msg_t *v, struct ipq_queue_entry *e) return -ENOMEM; memcpy(e->skb->data, v->payload, v->data_len); e->skb->ip_summed = CHECKSUM_NONE; - e->skb->nfcache |= NFC_ALTERED; /* * Extra routing may needed on local out, as the QUEUE target never diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index c88dfcd38c5..ff8d85d2070 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -312,7 +312,6 @@ ipt_do_table(struct sk_buff **pskb, do { IP_NF_ASSERT(e); IP_NF_ASSERT(back); - (*pskb)->nfcache |= e->nfcache; if (ip_packet_match(ip, indev, outdev, &e->ip, offset)) { struct ipt_entry_target *t; diff --git a/net/ipv4/netfilter/ipt_CLASSIFY.c b/net/ipv4/netfilter/ipt_CLASSIFY.c index 9842e6e2318..dab78d8bd49 100644 --- a/net/ipv4/netfilter/ipt_CLASSIFY.c +++ b/net/ipv4/netfilter/ipt_CLASSIFY.c @@ -32,10 +32,8 @@ target(struct sk_buff **pskb, { const struct ipt_classify_target_info *clinfo = targinfo; - if((*pskb)->priority != clinfo->priority) { + if((*pskb)->priority != clinfo->priority) (*pskb)->priority = clinfo->priority; - (*pskb)->nfcache |= NFC_ALTERED; - } return IPT_CONTINUE; } diff --git a/net/ipv4/netfilter/ipt_CONNMARK.c b/net/ipv4/netfilter/ipt_CONNMARK.c index 8ed744157b1..13463802133 100644 --- a/net/ipv4/netfilter/ipt_CONNMARK.c +++ b/net/ipv4/netfilter/ipt_CONNMARK.c @@ -61,10 +61,8 @@ target(struct sk_buff **pskb, case IPT_CONNMARK_RESTORE: nfmark = (*pskb)->nfmark; diff = (ct->mark ^ nfmark) & markinfo->mask; - if (diff != 0) { + if (diff != 0) (*pskb)->nfmark = nfmark ^ diff; - (*pskb)->nfcache |= NFC_ALTERED; - } break; } } diff --git a/net/ipv4/netfilter/ipt_DSCP.c b/net/ipv4/netfilter/ipt_DSCP.c index 3ea4509099f..975476fef27 100644 --- a/net/ipv4/netfilter/ipt_DSCP.c +++ b/net/ipv4/netfilter/ipt_DSCP.c @@ -51,7 +51,6 @@ target(struct sk_buff **pskb, sizeof(diffs), (*pskb)->nh.iph->check ^ 0xFFFF)); - (*pskb)->nfcache |= NFC_ALTERED; } return IPT_CONTINUE; } diff --git a/net/ipv4/netfilter/ipt_ECN.c b/net/ipv4/netfilter/ipt_ECN.c index 94a0ce1c1c9..f63a9bc0e4d 100644 --- a/net/ipv4/netfilter/ipt_ECN.c +++ b/net/ipv4/netfilter/ipt_ECN.c @@ -43,7 +43,6 @@ set_ect_ip(struct sk_buff **pskb, const struct 
ipt_ECN_info *einfo) sizeof(diffs), (*pskb)->nh.iph->check ^0xFFFF)); - (*pskb)->nfcache |= NFC_ALTERED; } return 1; } @@ -87,7 +86,6 @@ set_ect_tcp(struct sk_buff **pskb, const struct ipt_ECN_info *einfo, int inward) tcph->check = csum_fold(csum_partial((char *)diffs, sizeof(diffs), tcph->check^0xFFFF)); - (*pskb)->nfcache |= NFC_ALTERED; return 1; } diff --git a/net/ipv4/netfilter/ipt_MARK.c b/net/ipv4/netfilter/ipt_MARK.c index 8526398346c..52b4f2c296b 100644 --- a/net/ipv4/netfilter/ipt_MARK.c +++ b/net/ipv4/netfilter/ipt_MARK.c @@ -29,10 +29,9 @@ target_v0(struct sk_buff **pskb, { const struct ipt_mark_target_info *markinfo = targinfo; - if((*pskb)->nfmark != markinfo->mark) { + if((*pskb)->nfmark != markinfo->mark) (*pskb)->nfmark = markinfo->mark; - (*pskb)->nfcache |= NFC_ALTERED; - } + return IPT_CONTINUE; } @@ -61,10 +60,9 @@ target_v1(struct sk_buff **pskb, break; } - if((*pskb)->nfmark != mark) { + if((*pskb)->nfmark != mark) (*pskb)->nfmark = mark; - (*pskb)->nfcache |= NFC_ALTERED; - } + return IPT_CONTINUE; } diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c index 91569644602..f115a84a4ac 100644 --- a/net/ipv4/netfilter/ipt_REJECT.c +++ b/net/ipv4/netfilter/ipt_REJECT.c @@ -156,7 +156,6 @@ static void send_reset(struct sk_buff *oldskb, int hook) /* This packet will not be the same as the other: clear nf fields */ nf_reset(nskb); - nskb->nfcache = 0; nskb->nfmark = 0; #ifdef CONFIG_BRIDGE_NETFILTER nf_bridge_put(nskb->nf_bridge); diff --git a/net/ipv4/netfilter/ipt_TCPMSS.c b/net/ipv4/netfilter/ipt_TCPMSS.c index 7b84a254440..949288319ca 100644 --- a/net/ipv4/netfilter/ipt_TCPMSS.c +++ b/net/ipv4/netfilter/ipt_TCPMSS.c @@ -190,7 +190,6 @@ ipt_tcpmss_target(struct sk_buff **pskb, newmss); retmodified: - (*pskb)->nfcache |= NFC_UNKNOWN | NFC_ALTERED; return IPT_CONTINUE; } diff --git a/net/ipv4/netfilter/ipt_TOS.c b/net/ipv4/netfilter/ipt_TOS.c index 85c70d240f8..49abb7eef0a 100644 --- a/net/ipv4/netfilter/ipt_TOS.c +++ b/net/ipv4/netfilter/ipt_TOS.c @@ -46,7 +46,6 @@ target(struct sk_buff **pskb, sizeof(diffs), (*pskb)->nh.iph->check ^0xFFFF)); - (*pskb)->nfcache |= NFC_ALTERED; } return IPT_CONTINUE; } -- cgit v1.2.3 From 8728b834b226ffcf2c94a58530090e292af2a7bf Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 9 Aug 2005 19:25:21 -0700 Subject: [NET]: Kill skb->list Remove the "list" member of struct sk_buff, as it is entirely redundant. All SKB list removal callers know which list the SKB is on, so storing this in sk_buff does nothing other than taking up some space. Two tricky bits were SCTP, which I took care of, and two ATM drivers which Francois Romieu fixed up. Signed-off-by: David S. 
Miller Signed-off-by: Francois Romieu --- net/ipv4/tcp.c | 2 +- net/ipv4/tcp_input.c | 29 ++++++++++++++++------------- net/ipv4/tcp_output.c | 6 +++--- 3 files changed, 20 insertions(+), 17 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 69b1fcf7007..d2696af46c7 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -975,7 +975,7 @@ do_fault: if (!skb->len) { if (sk->sk_send_head == skb) sk->sk_send_head = NULL; - __skb_unlink(skb, skb->list); + __skb_unlink(skb, &sk->sk_write_queue); sk_stream_free_skb(sk, skb); } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 53a8a5399f1..ffa24025cd0 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2085,7 +2085,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt seq_rtt = now - scb->when; tcp_dec_pcount_approx(&tp->fackets_out, skb); tcp_packets_out_dec(tp, skb); - __skb_unlink(skb, skb->list); + __skb_unlink(skb, &sk->sk_write_queue); sk_stream_free_skb(sk, skb); } @@ -2853,7 +2853,7 @@ static void tcp_ofo_queue(struct sock *sk) if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { SOCK_DEBUG(sk, "ofo packet was already received \n"); - __skb_unlink(skb, skb->list); + __skb_unlink(skb, &tp->out_of_order_queue); __kfree_skb(skb); continue; } @@ -2861,7 +2861,7 @@ static void tcp_ofo_queue(struct sock *sk) tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); - __skb_unlink(skb, skb->list); + __skb_unlink(skb, &tp->out_of_order_queue); __skb_queue_tail(&sk->sk_receive_queue, skb); tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; if(skb->h.th->fin) @@ -3027,7 +3027,7 @@ drop: u32 end_seq = TCP_SKB_CB(skb)->end_seq; if (seq == TCP_SKB_CB(skb1)->end_seq) { - __skb_append(skb1, skb); + __skb_append(skb1, skb, &tp->out_of_order_queue); if (!tp->rx_opt.num_sacks || tp->selective_acks[0].end_seq != seq) @@ -3071,7 +3071,7 @@ drop: tcp_dsack_extend(tp, TCP_SKB_CB(skb1)->seq, end_seq); break; } - __skb_unlink(skb1, skb1->list); + __skb_unlink(skb1, &tp->out_of_order_queue); tcp_dsack_extend(tp, TCP_SKB_CB(skb1)->seq, TCP_SKB_CB(skb1)->end_seq); __kfree_skb(skb1); } @@ -3088,8 +3088,9 @@ add_sack: * simplifies code) */ static void -tcp_collapse(struct sock *sk, struct sk_buff *head, - struct sk_buff *tail, u32 start, u32 end) +tcp_collapse(struct sock *sk, struct sk_buff_head *list, + struct sk_buff *head, struct sk_buff *tail, + u32 start, u32 end) { struct sk_buff *skb; @@ -3099,7 +3100,7 @@ tcp_collapse(struct sock *sk, struct sk_buff *head, /* No new bits? It is possible on ofo queue. */ if (!before(start, TCP_SKB_CB(skb)->end_seq)) { struct sk_buff *next = skb->next; - __skb_unlink(skb, skb->list); + __skb_unlink(skb, list); __kfree_skb(skb); NET_INC_STATS_BH(LINUX_MIB_TCPRCVCOLLAPSED); skb = next; @@ -3145,7 +3146,7 @@ tcp_collapse(struct sock *sk, struct sk_buff *head, nskb->mac.raw = nskb->head + (skb->mac.raw-skb->head); memcpy(nskb->cb, skb->cb, sizeof(skb->cb)); TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start; - __skb_insert(nskb, skb->prev, skb, skb->list); + __skb_insert(nskb, skb->prev, skb, list); sk_stream_set_owner_r(nskb, sk); /* Copy data, releasing collapsed skbs. 
*/ @@ -3164,7 +3165,7 @@ tcp_collapse(struct sock *sk, struct sk_buff *head, } if (!before(start, TCP_SKB_CB(skb)->end_seq)) { struct sk_buff *next = skb->next; - __skb_unlink(skb, skb->list); + __skb_unlink(skb, list); __kfree_skb(skb); NET_INC_STATS_BH(LINUX_MIB_TCPRCVCOLLAPSED); skb = next; @@ -3200,7 +3201,8 @@ static void tcp_collapse_ofo_queue(struct sock *sk) if (skb == (struct sk_buff *)&tp->out_of_order_queue || after(TCP_SKB_CB(skb)->seq, end) || before(TCP_SKB_CB(skb)->end_seq, start)) { - tcp_collapse(sk, head, skb, start, end); + tcp_collapse(sk, &tp->out_of_order_queue, + head, skb, start, end); head = skb; if (skb == (struct sk_buff *)&tp->out_of_order_queue) break; @@ -3237,7 +3239,8 @@ static int tcp_prune_queue(struct sock *sk) tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss); tcp_collapse_ofo_queue(sk); - tcp_collapse(sk, sk->sk_receive_queue.next, + tcp_collapse(sk, &sk->sk_receive_queue, + sk->sk_receive_queue.next, (struct sk_buff*)&sk->sk_receive_queue, tp->copied_seq, tp->rcv_nxt); sk_stream_mem_reclaim(sk); @@ -3462,7 +3465,7 @@ static void tcp_check_urg(struct sock * sk, struct tcphdr * th) struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); tp->copied_seq++; if (skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)) { - __skb_unlink(skb, skb->list); + __skb_unlink(skb, &sk->sk_receive_queue); __kfree_skb(skb); } } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index dd30dd137b7..a4d1eb9a092 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -505,7 +505,7 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned /* Link BUFF into the send queue. */ skb_header_release(buff); - __skb_append(skb, buff); + __skb_append(skb, buff, &sk->sk_write_queue); return 0; } @@ -893,7 +893,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, /* Link BUFF into the send queue. */ skb_header_release(buff); - __skb_append(skb, buff); + __skb_append(skb, buff, &sk->sk_write_queue); return 0; } @@ -1238,7 +1238,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m tcp_skb_pcount(next_skb) != 1); /* Ok. We will be able to collapse the packet. */ - __skb_unlink(next_skb, next_skb->list); + __skb_unlink(next_skb, &sk->sk_write_queue); memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size); -- cgit v1.2.3 From ac3247baf8ecadf168642e3898b0212c29c79715 Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Tue, 9 Aug 2005 19:28:03 -0700 Subject: [NETFILTER]: connection tracking event notifiers This adds a notifier-chain-based event mechanism for ip_conntrack state changes. As opposed to the previous implementations in patch-o-matic, we no longer need a field in the skb to achieve this. Thanks to the valuable input from Patrick McHardy and Rusty on the idea of a per_cpu implementation. Signed-off-by: Harald Welte Signed-off-by: David S. Miller
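To illustrate the consumer side of the new chain: a hypothetical module could register a notifier_block against ip_conntrack_chain via ip_conntrack_register_notifier() (exported at the end of this patch) and receive the IPCT_* event bits together with the affected conntrack, roughly as in the following sketch (my_ct_event and my_ct_nb are illustrative names, not part of the patch):

	#include <linux/kernel.h>
	#include <linux/notifier.h>
	#include <linux/netfilter_ipv4/ip_conntrack.h>

	/* 'events' is a bitmask of IPCT_* values, 'ptr' the affected conntrack */
	static int my_ct_event(struct notifier_block *this, unsigned long events,
			       void *ptr)
	{
		struct ip_conntrack *ct = ptr;

		if (events & IPCT_NEW)
			printk(KERN_DEBUG "ctevent: new conntrack %p\n", ct);
		if (events & IPCT_DESTROY)
			printk(KERN_DEBUG "ctevent: conntrack %p destroyed\n", ct);
		return NOTIFY_DONE;
	}

	static struct notifier_block my_ct_nb = {
		.notifier_call	= my_ct_event,
	};

	/* module init:  ip_conntrack_register_notifier(&my_ct_nb);
	 * module exit:  ip_conntrack_unregister_notifier(&my_ct_nb); */

The ctnetlink subsystem added later in this series is exactly such a consumer; see ctnetlink_conntrack_event() below.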
--- net/ipv4/netfilter/Kconfig | 10 +++ net/ipv4/netfilter/ip_conntrack_core.c | 122 +++++++++++++++++++++++++-- net/ipv4/netfilter/ip_conntrack_ftp.c | 12 ++- net/ipv4/netfilter/ip_conntrack_proto_icmp.c | 1 + net/ipv4/netfilter/ip_conntrack_proto_sctp.c | 2 + net/ipv4/netfilter/ip_conntrack_proto_tcp.c | 4 + net/ipv4/netfilter/ip_conntrack_proto_udp.c | 3 +- net/ipv4/netfilter/ip_conntrack_standalone.c | 10 +++ 8 files changed, 154 insertions(+), 10 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 46d4cb1c06f..ff3393eba92 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -40,6 +40,16 @@ config IP_NF_CONNTRACK_MARK of packets, but this mark value is kept in the conntrack session instead of the individual packets. +config IP_NF_CONNTRACK_EVENTS + bool "Connection tracking events" + depends on IP_NF_CONNTRACK + help + If this option is enabled, the connection tracking code will + provide a notifier chain that can be used by other kernel code + to get notified about changes in the connection tracking state. + + If unsure, say `N'. + config IP_NF_CT_PROTO_SCTP tristate 'SCTP protocol connection tracking support (EXPERIMENTAL)' depends on IP_NF_CONNTRACK && EXPERIMENTAL diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c index 04c3414361d..caf89deae11 100644 --- a/net/ipv4/netfilter/ip_conntrack_core.c +++ b/net/ipv4/netfilter/ip_conntrack_core.c @@ -37,6 +37,7 @@ #include #include #include +#include /* ip_conntrack_lock protects the main hash table, protocol/helper/expected registrations, conntrack timers*/ @@ -49,7 +50,7 @@ #include #include -#define IP_CONNTRACK_VERSION "2.1" +#define IP_CONNTRACK_VERSION "2.2" #if 0 #define DEBUGP printk @@ -76,6 +77,81 @@ unsigned int ip_ct_log_invalid; static LIST_HEAD(unconfirmed); static int ip_conntrack_vmalloc; +#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS +struct notifier_block *ip_conntrack_chain; +struct notifier_block *ip_conntrack_expect_chain; + +DEFINE_PER_CPU(struct ip_conntrack_ecache, ip_conntrack_ecache); + +static inline void __deliver_cached_events(struct ip_conntrack_ecache *ecache) +{ + if (is_confirmed(ecache->ct) && !is_dying(ecache->ct) && ecache->events) + notifier_call_chain(&ip_conntrack_chain, ecache->events, + ecache->ct); + ecache->events = 0; +} + +void __ip_ct_deliver_cached_events(struct ip_conntrack_ecache *ecache) +{ + __deliver_cached_events(ecache); +} +
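The producer side of this per-CPU cache is the ip_conntrack_event_cache() helper that the rest of this patch sprinkles over the code (e.g. ip_conntrack_event_cache(IPCT_STATUS, *pskb)). Its implementation lives in the ip_conntrack header and is not part of this hunk; conceptually it behaves like the following simplified sketch (the real helper's exact signature and locking may differ):

	/* OR the event bit into the current CPU's cache entry; the bits are
	 * later delivered as one batched notifier_call_chain() by
	 * __deliver_cached_events() above. */
	static inline void ip_conntrack_event_cache(unsigned int event,
						    const struct sk_buff *skb)
	{
		struct ip_conntrack_ecache *ecache =
			&__get_cpu_var(ip_conntrack_ecache);

		if (ecache->ct == (struct ip_conntrack *)skb->nfct)
			ecache->events |= event;
	}

This batching keeps the mechanism cheap on the per-packet hot path: the notifier chain is traversed once per packet/conntrack pair rather than once per individual state change.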
+/* Deliver all cached events for a particular conntrack. This is called + * by code prior to async packet handling or freeing the skb */ void +ip_conntrack_deliver_cached_events_for(const struct ip_conntrack *ct) +{ + struct ip_conntrack_ecache *ecache = + &__get_cpu_var(ip_conntrack_ecache); + + if (!ct) + return; + + if (ecache->ct == ct) { + DEBUGP("ecache: delivering event for %p\n", ct); + __deliver_cached_events(ecache); + } else { + if (net_ratelimit()) + printk(KERN_WARNING "ecache: want to deliver for %p, " + "but cache has %p\n", ct, ecache->ct); + } + + /* signal that events have already been delivered */ + ecache->ct = NULL; +} + +/* Deliver cached events if the conntrack for this skb differs from the cached one */ +void ip_conntrack_event_cache_init(const struct sk_buff *skb) +{ + struct ip_conntrack *ct = (struct ip_conntrack *) skb->nfct; + struct ip_conntrack_ecache *ecache = + &__get_cpu_var(ip_conntrack_ecache); + + /* take care of delivering potentially old events */ + if (ecache->ct != ct) { + enum ip_conntrack_info ctinfo; + /* we have to check, since at startup the cache is NULL */ + if (likely(ecache->ct)) { + DEBUGP("ecache: entered for different conntrack: " + "ecache->ct=%p, skb->nfct=%p. delivering " + "events\n", ecache->ct, ct); + __deliver_cached_events(ecache); + ip_conntrack_put(ecache->ct); + } else { + DEBUGP("ecache: entered for conntrack %p, " + "cache was clean before\n", ct); + } + + /* initialize for this conntrack/packet */ + ecache->ct = ip_conntrack_get(skb, &ctinfo); + /* ecache->events cleared by __deliver_cached_events() */ + } else { + DEBUGP("ecache: re-entered for conntrack %p.\n", ct); + } +} + +#endif /* CONFIG_IP_NF_CONNTRACK_EVENTS */ + DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat); void @@ -223,6 +299,8 @@ destroy_conntrack(struct nf_conntrack *nfct) IP_NF_ASSERT(atomic_read(&nfct->use) == 0); IP_NF_ASSERT(!timer_pending(&ct->timeout)); + set_bit(IPS_DYING_BIT, &ct->status); + /* To make sure we don't get any weird locking issues here: * destroy_conntrack() MUST NOT be called with a write lock * to ip_conntrack_lock!!! -HW */ @@ -261,6 +339,7 @@ static void death_by_timeout(unsigned long ul_conntrack) { struct ip_conntrack *ct = (void *)ul_conntrack; + ip_conntrack_event(IPCT_DESTROY, ct); write_lock_bh(&ip_conntrack_lock); /* Inside lock so preempt is disabled on module removal path. * Otherwise we can get spurious warnings. */ @@ -374,6 +453,16 @@ __ip_conntrack_confirm(struct sk_buff **pskb) set_bit(IPS_CONFIRMED_BIT, &ct->status); CONNTRACK_STAT_INC(insert); write_unlock_bh(&ip_conntrack_lock); + if (ct->helper) + ip_conntrack_event_cache(IPCT_HELPER, *pskb); +#ifdef CONFIG_IP_NF_NAT_NEEDED + if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) || + test_bit(IPS_DST_NAT_DONE_BIT, &ct->status)) + ip_conntrack_event_cache(IPCT_NATINFO, *pskb); +#endif + ip_conntrack_event_cache(master_ct(ct) ? + IPCT_RELATED : IPCT_NEW, *pskb); + return NF_ACCEPT; } @@ -607,7 +696,7 @@ unsigned int ip_conntrack_in(unsigned int hooknum, struct ip_conntrack *ct; enum ip_conntrack_info ctinfo; struct ip_conntrack_protocol *proto; - int set_reply; + int set_reply = 0; int ret; /* Previously seen (loopback or untracked)? Ignore. 
*/ @@ -666,6 +755,8 @@ unsigned int ip_conntrack_in(unsigned int hooknum, IP_NF_ASSERT((*pskb)->nfct); + ip_conntrack_event_cache_init(*pskb); + ret = proto->packet(ct, *pskb, ctinfo); if (ret < 0) { /* Invalid: inverse of the return code tells @@ -676,8 +767,8 @@ unsigned int ip_conntrack_in(unsigned int hooknum, return -ret; } - if (set_reply) - set_bit(IPS_SEEN_REPLY_BIT, &ct->status); + if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status)) + ip_conntrack_event_cache(IPCT_STATUS, *pskb); return ret; } @@ -824,6 +915,7 @@ int ip_conntrack_expect_related(struct ip_conntrack_expect *expect) evict_oldest_expect(expect->master); ip_conntrack_expect_insert(expect); + ip_conntrack_expect_event(IPEXP_NEW, expect); ret = 0; out: write_unlock_bh(&ip_conntrack_lock); @@ -861,8 +953,10 @@ int ip_conntrack_helper_register(struct ip_conntrack_helper *me) static inline int unhelp(struct ip_conntrack_tuple_hash *i, const struct ip_conntrack_helper *me) { - if (tuplehash_to_ctrack(i)->helper == me) + if (tuplehash_to_ctrack(i)->helper == me) { + ip_conntrack_event(IPCT_HELPER, tuplehash_to_ctrack(i)); tuplehash_to_ctrack(i)->helper = NULL; + } return 0; } @@ -924,6 +1018,7 @@ void ip_ct_refresh_acct(struct ip_conntrack *ct, if (del_timer(&ct->timeout)) { ct->timeout.expires = jiffies + extra_jiffies; add_timer(&ct->timeout); + ip_conntrack_event_cache(IPCT_REFRESH, skb); } ct_add_counters(ct, ctinfo, skb); write_unlock_bh(&ip_conntrack_lock); @@ -1012,6 +1107,23 @@ ip_ct_iterate_cleanup(int (*iter)(struct ip_conntrack *i, void *), void *data) ip_conntrack_put(ct); } + +#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS + { + /* we need to deliver all cached events in order to drop + * the reference counts */ + int cpu; + for_each_cpu(cpu) { + struct ip_conntrack_ecache *ecache = + &per_cpu(ip_conntrack_ecache, cpu); + if (ecache->ct) { + __ip_ct_deliver_cached_events(ecache); + ip_conntrack_put(ecache->ct); + ecache->ct = NULL; + } + } + } +#endif } /* Fast function for those who don't want to parse /proc (and I don't diff --git a/net/ipv4/netfilter/ip_conntrack_ftp.c b/net/ipv4/netfilter/ip_conntrack_ftp.c index 7a3b773be3f..9658896f899 100644 --- a/net/ipv4/netfilter/ip_conntrack_ftp.c +++ b/net/ipv4/netfilter/ip_conntrack_ftp.c @@ -262,7 +262,8 @@ static int find_nl_seq(u32 seq, const struct ip_ct_ftp_master *info, int dir) } /* We don't update if it's older than what we have. */ -static void update_nl_seq(u32 nl_seq, struct ip_ct_ftp_master *info, int dir) +static void update_nl_seq(u32 nl_seq, struct ip_ct_ftp_master *info, int dir, + struct sk_buff *skb) { unsigned int i, oldest = NUM_SEQ_TO_REMEMBER; @@ -276,10 +277,13 @@ static void update_nl_seq(u32 nl_seq, struct ip_ct_ftp_master *info, int dir) oldest = i; } - if (info->seq_aft_nl_num[dir] < NUM_SEQ_TO_REMEMBER) + if (info->seq_aft_nl_num[dir] < NUM_SEQ_TO_REMEMBER) { info->seq_aft_nl[dir][info->seq_aft_nl_num[dir]++] = nl_seq; - else if (oldest != NUM_SEQ_TO_REMEMBER) + ip_conntrack_event_cache(IPCT_HELPINFO_VOLATILE, skb); + } else if (oldest != NUM_SEQ_TO_REMEMBER) { info->seq_aft_nl[dir][oldest] = nl_seq; + ip_conntrack_event_cache(IPCT_HELPINFO_VOLATILE, skb); + } } static int help(struct sk_buff **pskb, @@ -439,7 +443,7 @@ out_update_nl: /* Now if this ends in \n, update ftp info. Seq may have been * adjusted by NAT code. 
*/ if (ends_in_nl) - update_nl_seq(seq, ct_ftp_info,dir); + update_nl_seq(seq, ct_ftp_info,dir, *pskb); out: spin_unlock_bh(&ip_ftp_lock); return ret; diff --git a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c index 602c74db325..dca1f63d6f5 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c @@ -102,6 +102,7 @@ static int icmp_packet(struct ip_conntrack *ct, ct->timeout.function((unsigned long)ct); } else { atomic_inc(&ct->proto.icmp.count); + ip_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb); ip_ct_refresh_acct(ct, ctinfo, skb, ip_ct_icmp_timeout); } diff --git a/net/ipv4/netfilter/ip_conntrack_proto_sctp.c b/net/ipv4/netfilter/ip_conntrack_proto_sctp.c index 31d75390bf1..3d5f878a07d 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_sctp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_sctp.c @@ -404,6 +404,8 @@ static int sctp_packet(struct ip_conntrack *conntrack, } conntrack->proto.sctp.state = newconntrack; + if (oldsctpstate != newconntrack) + ip_conntrack_event_cache(IPCT_PROTOINFO, skb); write_unlock_bh(&sctp_lock); } diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c index 809dfed766d..a569ad1ee4d 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c @@ -973,6 +973,10 @@ static int tcp_packet(struct ip_conntrack *conntrack, ? ip_ct_tcp_timeout_max_retrans : *tcp_timeouts[new_state]; write_unlock_bh(&tcp_lock); + ip_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb); + if (new_state != old_state) + ip_conntrack_event_cache(IPCT_PROTOINFO, skb); + if (!test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) { /* If only reply is a RST, we can consider ourselves not to have an established connection: this is a fairly common diff --git a/net/ipv4/netfilter/ip_conntrack_proto_udp.c b/net/ipv4/netfilter/ip_conntrack_proto_udp.c index 8c1eaba098d..6066eaf4d82 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_udp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_udp.c @@ -73,7 +73,8 @@ static int udp_packet(struct ip_conntrack *conntrack, ip_ct_refresh_acct(conntrack, ctinfo, skb, ip_ct_udp_timeout_stream); /* Also, more likely to be important, and not a probe */ - set_bit(IPS_ASSURED_BIT, &conntrack->status); + if (!test_and_set_bit(IPS_ASSURED_BIT, &conntrack->status)) + ip_conntrack_event_cache(IPCT_STATUS, skb); } else ip_ct_refresh_acct(conntrack, ctinfo, skb, ip_ct_udp_timeout); diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c index dccd4abab7a..f0880004115 100644 --- a/net/ipv4/netfilter/ip_conntrack_standalone.c +++ b/net/ipv4/netfilter/ip_conntrack_standalone.c @@ -402,6 +402,7 @@ static unsigned int ip_confirm(unsigned int hooknum, const struct net_device *out, int (*okfn)(struct sk_buff *)) { + ip_conntrack_event_cache_init(*pskb); /* We've seen it coming out the other side: confirm it */ return ip_conntrack_confirm(pskb); } @@ -419,6 +420,7 @@ static unsigned int ip_conntrack_help(unsigned int hooknum, ct = ip_conntrack_get(*pskb, &ctinfo); if (ct && ct->helper) { unsigned int ret; + ip_conntrack_event_cache_init(*pskb); ret = ct->helper->help(pskb, ct, ctinfo); if (ret != NF_ACCEPT) return ret; @@ -889,6 +891,7 @@ static int init_or_cleanup(int init) return ret; cleanup: + synchronize_net(); #ifdef CONFIG_SYSCTL unregister_sysctl_table(ip_ct_sysctl_header); cleanup_localinops: @@ -971,6 +974,13 @@ void 
need_ip_conntrack(void) { } +#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS +EXPORT_SYMBOL_GPL(ip_conntrack_chain); +EXPORT_SYMBOL_GPL(ip_conntrack_expect_chain); +EXPORT_SYMBOL_GPL(ip_conntrack_register_notifier); +EXPORT_SYMBOL_GPL(ip_conntrack_unregister_notifier); +EXPORT_PER_CPU_SYMBOL_GPL(ip_conntrack_ecache); +#endif EXPORT_SYMBOL(ip_conntrack_protocol_register); EXPORT_SYMBOL(ip_conntrack_protocol_unregister); EXPORT_SYMBOL(ip_ct_get_tuple); -- cgit v1.2.3 From 080774a243f56ce2195ace96fba3d18548ee48ce Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Tue, 9 Aug 2005 19:32:58 -0700 Subject: [NETFILTER]: Add ctnetlink subsystem Add ctnetlink subsystem for userspace access to the ip_conntrack table. This allows reading and updating of existing entries, as well as creating new ones (and new expects) via nfnetlink. Please note the 'strange' byte order: the nfattr header (tag+length) is in host byte order, while the payload is always guaranteed to be in network byte order. This allows a simple userspace process to encapsulate netlink messages into arch-independent UDP packets by just processing/swapping the headers and not knowing anything about the actual payload. Signed-off-by: Harald Welte Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/Kconfig | 7 + net/ipv4/netfilter/Makefile | 4 + net/ipv4/netfilter/ip_conntrack_core.c | 281 ++++- net/ipv4/netfilter/ip_conntrack_netlink.c | 1588 ++++++++++++++++++++++++++ net/ipv4/netfilter/ip_conntrack_proto_icmp.c | 64 +- net/ipv4/netfilter/ip_conntrack_proto_sctp.c | 7 +- net/ipv4/netfilter/ip_conntrack_proto_tcp.c | 23 + net/ipv4/netfilter/ip_conntrack_proto_udp.c | 5 + net/ipv4/netfilter/ip_conntrack_standalone.c | 38 +- net/ipv4/netfilter/ip_nat_core.c | 99 +- net/ipv4/netfilter/ip_nat_proto_icmp.c | 9 +- net/ipv4/netfilter/ip_nat_proto_tcp.c | 10 +- net/ipv4/netfilter/ip_nat_proto_udp.c | 9 +- net/ipv4/netfilter/ip_nat_proto_unknown.c | 2 +- net/ipv4/netfilter/ip_nat_standalone.c | 2 + 15 files changed, 2066 insertions(+), 82 deletions(-) create mode 100644 net/ipv4/netfilter/ip_conntrack_netlink.c (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index ff3393eba92..e47ba39eb65 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -702,5 +702,12 @@ config IP_NF_ARP_MANGLE Allows altering the ARP packet payload: source and destination hardware and network addresses. 
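The byte-order convention described in the changelog shows up in every dump helper this patch adds: NFA_PUT() emits the nfattr header (type and length) in host byte order, while the payload is converted to network byte order first. For a concrete instance, here is the mark dump helper added further below, shown with explanatory comments for this write-up:

	static inline int
	ctnetlink_dump_mark(struct sk_buff *skb, const struct ip_conntrack *ct)
	{
		u_int32_t mark = htonl(ct->mark);	/* payload: network byte order */

		/* the nfattr header (tag+length) stays in host byte order;
		 * NFA_PUT jumps to nfattr_failure if the skb runs out of room */
		NFA_PUT(skb, CTA_MARK, sizeof(u_int32_t), &mark);
		return 0;

	nfattr_failure:
		return -1;
	}

A userspace process can therefore byte-swap just the headers when relaying such messages and treat every payload as opaque network-order data.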
+config IP_NF_CONNTRACK_NETLINK + tristate 'Connection tracking netlink interface' + depends on IP_NF_CONNTRACK && NETFILTER_NETLINK + help + This option enables support for a netlink-based userspace interface + + endmenu diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 45796d5924d..abf2a7d1a58 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -9,6 +9,10 @@ iptable_nat-objs := ip_nat_standalone.o ip_nat_rule.o ip_nat_core.o ip_nat_helpe # connection tracking obj-$(CONFIG_IP_NF_CONNTRACK) += ip_conntrack.o +# conntrack netlink interface +obj-$(CONFIG_IP_NF_CONNTRACK_NETLINK) += ip_conntrack_netlink.o + + # SCTP protocol connection tracking obj-$(CONFIG_IP_NF_CT_PROTO_SCTP) += ip_conntrack_proto_sctp.o diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c index caf89deae11..d9fddae8d78 100644 --- a/net/ipv4/netfilter/ip_conntrack_core.c +++ b/net/ipv4/netfilter/ip_conntrack_core.c @@ -50,7 +50,7 @@ #include #include -#define IP_CONNTRACK_VERSION "2.2" +#define IP_CONNTRACK_VERSION "2.3" #if 0 #define DEBUGP printk @@ -77,6 +77,8 @@ unsigned int ip_ct_log_invalid; static LIST_HEAD(unconfirmed); static int ip_conntrack_vmalloc; +static unsigned int ip_conntrack_next_id = 1; +static unsigned int ip_conntrack_expect_next_id = 1; #ifdef CONFIG_IP_NF_CONNTRACK_EVENTS struct notifier_block *ip_conntrack_chain; struct notifier_block *ip_conntrack_expect_chain; @@ -154,13 +156,6 @@ void ip_conntrack_event_cache_init(const struct sk_buff *skb) DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat); -void -ip_conntrack_put(struct ip_conntrack *ct) -{ - IP_NF_ASSERT(ct); - nf_conntrack_put(&ct->ct_general); -} - static int ip_conntrack_hash_rnd_initted; static unsigned int ip_conntrack_hash_rnd; @@ -222,6 +217,12 @@ static void unlink_expect(struct ip_conntrack_expect *exp) exp->master->expecting--; } +void __ip_ct_expect_unlink_destroy(struct ip_conntrack_expect *exp) +{ + unlink_expect(exp); + ip_conntrack_expect_put(exp); +} + static void expectation_timed_out(unsigned long ul_expect) { struct ip_conntrack_expect *exp = (void *)ul_expect; @@ -232,6 +233,33 @@ static void expectation_timed_out(unsigned long ul_expect) ip_conntrack_expect_put(exp); } +struct ip_conntrack_expect * +__ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple) +{ + struct ip_conntrack_expect *i; + + list_for_each_entry(i, &ip_conntrack_expect_list, list) { + if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)) { + atomic_inc(&i->use); + return i; + } + } + return NULL; +} + +/* Just find a expectation corresponding to a tuple. */ +struct ip_conntrack_expect * +ip_conntrack_expect_find_get(const struct ip_conntrack_tuple *tuple) +{ + struct ip_conntrack_expect *i; + + read_lock_bh(&ip_conntrack_lock); + i = __ip_conntrack_expect_find(tuple); + read_unlock_bh(&ip_conntrack_lock); + + return i; +} + /* If an expectation for this connection is found, it gets delete from * global list then returned. 
*/ static struct ip_conntrack_expect * @@ -256,7 +284,7 @@ find_expectation(const struct ip_conntrack_tuple *tuple) } /* delete all expectations for this conntrack */ -static void remove_expectations(struct ip_conntrack *ct) +void ip_ct_remove_expectations(struct ip_conntrack *ct) { struct ip_conntrack_expect *i, *tmp; @@ -286,7 +314,7 @@ clean_from_lists(struct ip_conntrack *ct) LIST_DELETE(&ip_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]); /* Destroy all pending expectations */ - remove_expectations(ct); + ip_ct_remove_expectations(ct); } static void @@ -304,7 +332,7 @@ destroy_conntrack(struct nf_conntrack *nfct) /* To make sure we don't get any weird locking issues here: * destroy_conntrack() MUST NOT be called with a write lock * to ip_conntrack_lock!!! -HW */ - proto = ip_ct_find_proto(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum); + proto = __ip_conntrack_proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum); if (proto && proto->destroy) proto->destroy(ct); @@ -316,7 +344,7 @@ destroy_conntrack(struct nf_conntrack *nfct) * except TFTP can create an expectation on the first packet, * before connection is in the list, so we need to clean here, * too. */ - remove_expectations(ct); + ip_ct_remove_expectations(ct); /* We overload first tuple to link into unconfirmed list. */ if (!is_confirmed(ct)) { @@ -331,8 +359,7 @@ destroy_conntrack(struct nf_conntrack *nfct) ip_conntrack_put(ct->master); DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct); - kmem_cache_free(ip_conntrack_cachep, ct); - atomic_dec(&ip_conntrack_count); + ip_conntrack_free(ct); } static void death_by_timeout(unsigned long ul_conntrack) @@ -359,7 +386,7 @@ conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i, && ip_ct_tuple_equal(tuple, &i->tuple); } -static struct ip_conntrack_tuple_hash * +struct ip_conntrack_tuple_hash * __ip_conntrack_find(const struct ip_conntrack_tuple *tuple, const struct ip_conntrack *ignored_conntrack) { @@ -394,6 +421,29 @@ ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple, return h; } +static void __ip_conntrack_hash_insert(struct ip_conntrack *ct, + unsigned int hash, + unsigned int repl_hash) +{ + ct->id = ++ip_conntrack_next_id; + list_prepend(&ip_conntrack_hash[hash], + &ct->tuplehash[IP_CT_DIR_ORIGINAL].list); + list_prepend(&ip_conntrack_hash[repl_hash], + &ct->tuplehash[IP_CT_DIR_REPLY].list); +} + +void ip_conntrack_hash_insert(struct ip_conntrack *ct) +{ + unsigned int hash, repl_hash; + + hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple); + + write_lock_bh(&ip_conntrack_lock); + __ip_conntrack_hash_insert(ct, hash, repl_hash); + write_unlock_bh(&ip_conntrack_lock); +} + /* Confirm a connection given skb; places it in hash table */ int __ip_conntrack_confirm(struct sk_buff **pskb) @@ -440,10 +490,7 @@ __ip_conntrack_confirm(struct sk_buff **pskb) /* Remove from unconfirmed list */ list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list); - list_prepend(&ip_conntrack_hash[hash], - &ct->tuplehash[IP_CT_DIR_ORIGINAL]); - list_prepend(&ip_conntrack_hash[repl_hash], - &ct->tuplehash[IP_CT_DIR_REPLY]); + __ip_conntrack_hash_insert(ct, hash, repl_hash); /* Timer relative to confirmation time, not original setting time, otherwise we'd get timer wrap in weird delay cases. 
*/ @@ -527,34 +574,84 @@ static inline int helper_cmp(const struct ip_conntrack_helper *i, return ip_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask); } -static struct ip_conntrack_helper *ip_ct_find_helper(const struct ip_conntrack_tuple *tuple) +static struct ip_conntrack_helper * +__ip_conntrack_helper_find( const struct ip_conntrack_tuple *tuple) { return LIST_FIND(&helpers, helper_cmp, struct ip_conntrack_helper *, tuple); } -/* Allocate a new conntrack: we return -ENOMEM if classification - failed due to stress. Otherwise it really is unclassifiable. */ -static struct ip_conntrack_tuple_hash * -init_conntrack(const struct ip_conntrack_tuple *tuple, - struct ip_conntrack_protocol *protocol, - struct sk_buff *skb) +struct ip_conntrack_helper * +ip_conntrack_helper_find_get( const struct ip_conntrack_tuple *tuple) +{ + struct ip_conntrack_helper *helper; + + /* need ip_conntrack_lock to assure that helper exists until + * try_module_get() is called */ + read_lock_bh(&ip_conntrack_lock); + + helper = __ip_conntrack_helper_find(tuple); + if (helper) { + /* need to increase module usage count to assure helper will + * not go away while the caller is e.g. busy putting a + * conntrack in the hash that uses the helper */ + if (!try_module_get(helper->me)) + helper = NULL; + } + + read_unlock_bh(&ip_conntrack_lock); + + return helper; +} + +void ip_conntrack_helper_put(struct ip_conntrack_helper *helper) +{ + module_put(helper->me); +} + +struct ip_conntrack_protocol * +__ip_conntrack_proto_find(u_int8_t protocol) +{ + return ip_ct_protos[protocol]; +} + +/* this is guaranteed to always return a valid protocol helper, since + * it falls back to generic_protocol */ +struct ip_conntrack_protocol * +ip_conntrack_proto_find_get(u_int8_t protocol) +{ + struct ip_conntrack_protocol *p; + + preempt_disable(); + p = __ip_conntrack_proto_find(protocol); + if (p) { + if (!try_module_get(p->me)) + p = &ip_conntrack_generic_protocol; + } + preempt_enable(); + + return p; +} + +void ip_conntrack_proto_put(struct ip_conntrack_protocol *p) +{ + module_put(p->me); +} + +struct ip_conntrack *ip_conntrack_alloc(struct ip_conntrack_tuple *orig, + struct ip_conntrack_tuple *repl) { struct ip_conntrack *conntrack; - struct ip_conntrack_tuple repl_tuple; - size_t hash; - struct ip_conntrack_expect *exp; if (!ip_conntrack_hash_rnd_initted) { get_random_bytes(&ip_conntrack_hash_rnd, 4); ip_conntrack_hash_rnd_initted = 1; } - hash = hash_conntrack(tuple); - if (ip_conntrack_max && atomic_read(&ip_conntrack_count) >= ip_conntrack_max) { + unsigned int hash = hash_conntrack(orig); /* Try dropping from this hash chain. 
*/ if (!early_drop(&ip_conntrack_hash[hash])) { if (net_ratelimit()) @@ -565,31 +662,58 @@ init_conntrack(const struct ip_conntrack_tuple *tuple, } } - if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) { - DEBUGP("Can't invert tuple.\n"); - return NULL; - } - conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC); if (!conntrack) { DEBUGP("Can't allocate conntrack.\n"); - return ERR_PTR(-ENOMEM); + return NULL; } memset(conntrack, 0, sizeof(*conntrack)); atomic_set(&conntrack->ct_general.use, 1); conntrack->ct_general.destroy = destroy_conntrack; - conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *tuple; - conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = repl_tuple; - if (!protocol->new(conntrack, skb)) { - kmem_cache_free(ip_conntrack_cachep, conntrack); - return NULL; - } + conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig; + conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl; /* Don't set timer yet: wait for confirmation */ init_timer(&conntrack->timeout); conntrack->timeout.data = (unsigned long)conntrack; conntrack->timeout.function = death_by_timeout; + atomic_inc(&ip_conntrack_count); + + return conntrack; +} + +void +ip_conntrack_free(struct ip_conntrack *conntrack) +{ + atomic_dec(&ip_conntrack_count); + kmem_cache_free(ip_conntrack_cachep, conntrack); +} + +/* Allocate a new conntrack: we return -ENOMEM if classification + * failed due to stress. Otherwise it really is unclassifiable */ +static struct ip_conntrack_tuple_hash * +init_conntrack(struct ip_conntrack_tuple *tuple, + struct ip_conntrack_protocol *protocol, + struct sk_buff *skb) +{ + struct ip_conntrack *conntrack; + struct ip_conntrack_tuple repl_tuple; + struct ip_conntrack_expect *exp; + + if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) { + DEBUGP("Can't invert tuple.\n"); + return NULL; + } + + if (!(conntrack = ip_conntrack_alloc(tuple, &repl_tuple))) + return NULL; + + if (!protocol->new(conntrack, skb)) { + ip_conntrack_free(conntrack); + return NULL; + } + write_lock_bh(&ip_conntrack_lock); exp = find_expectation(tuple); @@ -610,7 +734,7 @@ init_conntrack(const struct ip_conntrack_tuple *tuple, nf_conntrack_get(&conntrack->master->ct_general); CONNTRACK_STAT_INC(expect_new); } else { - conntrack->helper = ip_ct_find_helper(&repl_tuple); + conntrack->helper = __ip_conntrack_helper_find(&repl_tuple); CONNTRACK_STAT_INC(new); } @@ -618,7 +742,6 @@ init_conntrack(const struct ip_conntrack_tuple *tuple, /* Overload tuple linked list to put us in unconfirmed list. */ list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed); - atomic_inc(&ip_conntrack_count); write_unlock_bh(&ip_conntrack_lock); if (exp) { @@ -729,7 +852,7 @@ unsigned int ip_conntrack_in(unsigned int hooknum, } #endif - proto = ip_ct_find_proto((*pskb)->nh.iph->protocol); + proto = __ip_conntrack_proto_find((*pskb)->nh.iph->protocol); /* It may be an special packet, error, unclean... * inverse of the return code tells to the netfilter @@ -777,7 +900,7 @@ int invert_tuplepr(struct ip_conntrack_tuple *inverse, const struct ip_conntrack_tuple *orig) { return ip_ct_invert_tuple(inverse, orig, - ip_ct_find_proto(orig->dst.protonum)); + __ip_conntrack_proto_find(orig->dst.protonum)); } /* Would two expected things clash? 
*/ @@ -857,6 +980,8 @@ static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp) exp->timeout.expires = jiffies + exp->master->helper->timeout * HZ; add_timer(&exp->timeout); + exp->id = ++ip_conntrack_expect_next_id; + atomic_inc(&exp->use); CONNTRACK_STAT_INC(expect_create); } @@ -936,7 +1061,7 @@ void ip_conntrack_alter_reply(struct ip_conntrack *conntrack, conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply; if (!conntrack->master && conntrack->expecting == 0) - conntrack->helper = ip_ct_find_helper(newreply); + conntrack->helper = __ip_conntrack_helper_find(newreply); write_unlock_bh(&ip_conntrack_lock); } @@ -950,6 +1075,19 @@ int ip_conntrack_helper_register(struct ip_conntrack_helper *me) return 0; } +struct ip_conntrack_helper * +__ip_conntrack_helper_find_byname(const char *name) +{ + struct ip_conntrack_helper *h; + + list_for_each_entry(h, &helpers, list) { + if (!strcmp(h->name, name)) + return h; + } + + return NULL; +} + static inline int unhelp(struct ip_conntrack_tuple_hash *i, const struct ip_conntrack_helper *me) { @@ -1025,6 +1163,39 @@ void ip_ct_refresh_acct(struct ip_conntrack *ct, } } +#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ + defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) +/* Generic function for tcp/udp/sctp/dccp and alike. This needs to be + * in ip_conntrack_core, since we don't want the protocols to autoload + * or depend on ctnetlink */ +int ip_ct_port_tuple_to_nfattr(struct sk_buff *skb, + const struct ip_conntrack_tuple *tuple) +{ + NFA_PUT(skb, CTA_PROTO_SRC_PORT, sizeof(u_int16_t), + &tuple->src.u.tcp.port); + NFA_PUT(skb, CTA_PROTO_DST_PORT, sizeof(u_int16_t), + &tuple->dst.u.tcp.port); + return 0; + +nfattr_failure: + return -1; +} + +int ip_ct_port_nfattr_to_tuple(struct nfattr *tb[], + struct ip_conntrack_tuple *t) +{ + if (!tb[CTA_PROTO_SRC_PORT-1] || !tb[CTA_PROTO_DST_PORT-1]) + return -EINVAL; + + t->src.u.tcp.port = + *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_SRC_PORT-1]); + t->dst.u.tcp.port = + *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_DST_PORT-1]); + + return 0; +} +#endif + /* Returns new sk_buff, or NULL */ struct sk_buff * ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user) @@ -1203,16 +1374,13 @@ static void free_conntrack_hash(void) * ip_conntrack_htable_size)); } -/* Mishearing the voices in his head, our hero wonders how he's - supposed to kill the mall. */ -void ip_conntrack_cleanup(void) +void ip_conntrack_flush() { - ip_ct_attach = NULL; /* This makes sure all current packets have passed through netfilter framework. Roll on, two-stage module delete... */ synchronize_net(); - + i_see_dead_people: ip_ct_iterate_cleanup(kill_all, NULL); if (atomic_read(&ip_conntrack_count) != 0) { @@ -1222,7 +1390,14 @@ void ip_conntrack_cleanup(void) /* wait until all references to ip_conntrack_untracked are dropped */ while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1) schedule(); +} +/* Mishearing the voices in his head, our hero wonders how he's + supposed to kill the mall. */ +void ip_conntrack_cleanup(void) +{ + ip_ct_attach = NULL; + ip_conntrack_flush(); kmem_cache_destroy(ip_conntrack_cachep); kmem_cache_destroy(ip_conntrack_expect_cachep); free_conntrack_hash(); diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c new file mode 100644 index 00000000000..f43ec18c916 --- /dev/null +++ b/net/ipv4/netfilter/ip_conntrack_netlink.c @@ -0,0 +1,1588 @@ +/* Connection tracking via netlink socket. 
Allows for user space + * protocol helpers and general trouble making from userspace. + * + * (C) 2001 by Jay Schulist + * (C) 2002-2005 by Harald Welte + * (C) 2003 by Patrick Mchardy + * (C) 2005 by Pablo Neira Ayuso + * + * I've reworked this stuff to use attributes instead of conntrack + * structures. 5.44 am. I need more tea. --pablo 05/07/11. + * + * Initial connection tracking via netlink development funded and + * generally made possible by Network Robots, Inc. (www.networkrobots.com) + * + * Further development of this code funded by Astaro AG (http://www.astaro.com) + * + * This software may be used and distributed according to the terms + * of the GNU General Public License, incorporated herein by reference. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +MODULE_LICENSE("GPL"); + +static char __initdata version[] = "0.90"; + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + + +static inline int +ctnetlink_dump_tuples_proto(struct sk_buff *skb, + const struct ip_conntrack_tuple *tuple) +{ + struct ip_conntrack_protocol *proto; + + NFA_PUT(skb, CTA_PROTO_NUM, sizeof(u_int8_t), &tuple->dst.protonum); + + proto = ip_conntrack_proto_find_get(tuple->dst.protonum); + if (proto && proto->tuple_to_nfattr) + return proto->tuple_to_nfattr(skb, tuple); + + return 0; + +nfattr_failure: + return -1; +} + +static inline int +ctnetlink_dump_tuples(struct sk_buff *skb, + const struct ip_conntrack_tuple *tuple) +{ + struct nfattr *nest_parms; + + nest_parms = NFA_NEST(skb, CTA_TUPLE_IP); + NFA_PUT(skb, CTA_IP_V4_SRC, sizeof(u_int32_t), &tuple->src.ip); + NFA_PUT(skb, CTA_IP_V4_DST, sizeof(u_int32_t), &tuple->dst.ip); + NFA_NEST_END(skb, nest_parms); + + nest_parms = NFA_NEST(skb, CTA_TUPLE_PROTO); + ctnetlink_dump_tuples_proto(skb, tuple); + NFA_NEST_END(skb, nest_parms); + + return 0; + +nfattr_failure: + return -1; +} + +static inline int +ctnetlink_dump_status(struct sk_buff *skb, const struct ip_conntrack *ct) +{ + u_int32_t status = htonl((u_int32_t) ct->status); + NFA_PUT(skb, CTA_STATUS, sizeof(status), &status); + return 0; + +nfattr_failure: + return -1; +} + +static inline int +ctnetlink_dump_timeout(struct sk_buff *skb, const struct ip_conntrack *ct) +{ + long timeout_l = ct->timeout.expires - jiffies; + u_int32_t timeout; + + if (timeout_l < 0) + timeout = 0; + else + timeout = htonl(timeout_l / HZ); + + NFA_PUT(skb, CTA_TIMEOUT, sizeof(timeout), &timeout); + return 0; + +nfattr_failure: + return -1; +} + +static inline int +ctnetlink_dump_protoinfo(struct sk_buff *skb, const struct ip_conntrack *ct) +{ + struct ip_conntrack_protocol *proto = ip_conntrack_proto_find_get(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum); + + struct nfattr *nest_proto; + int ret; + + if (!proto || !proto->to_nfattr) + return 0; + + nest_proto = NFA_NEST(skb, CTA_PROTOINFO); + + ret = proto->to_nfattr(skb, nest_proto, ct); + + ip_conntrack_proto_put(proto); + + NFA_NEST_END(skb, nest_proto); + + return ret; + +nfattr_failure: + return -1; +} + +static inline int +ctnetlink_dump_helpinfo(struct sk_buff *skb, const struct ip_conntrack *ct) +{ + struct nfattr *nest_helper; + + if (!ct->helper) + return 0; + + nest_helper = NFA_NEST(skb, CTA_HELP); + NFA_PUT(skb, CTA_HELP_NAME, CTA_HELP_MAXNAMESIZE, &ct->helper->name); + + if (ct->helper->to_nfattr) + ct->helper->to_nfattr(skb, ct); + + 
NFA_NEST_END(skb, nest_helper); + + return 0; + +nfattr_failure: + return -1; +} + +#ifdef CONFIG_IP_NF_CT_ACCT +static inline int +ctnetlink_dump_counters(struct sk_buff *skb, const struct ip_conntrack *ct, + enum ip_conntrack_dir dir) +{ + enum ctattr_type type = dir ? CTA_COUNTERS_REPLY: CTA_COUNTERS_ORIG; + struct nfattr *nest_count = NFA_NEST(skb, type); + u_int64_t tmp; + + tmp = cpu_to_be64(ct->counters[dir].packets); + NFA_PUT(skb, CTA_COUNTERS_PACKETS, sizeof(u_int64_t), &tmp); + + tmp = cpu_to_be64(ct->counters[dir].bytes); + NFA_PUT(skb, CTA_COUNTERS_BYTES, sizeof(u_int64_t), &tmp); + + NFA_NEST_END(skb, nest_count); + + return 0; + +nfattr_failure: + return -1; +} +#else +#define ctnetlink_dump_counters(a, b, c) (0) +#endif + +#ifdef CONFIG_IP_NF_CONNTRACK_MARK +static inline int +ctnetlink_dump_mark(struct sk_buff *skb, const struct ip_conntrack *ct) +{ + u_int32_t mark = htonl(ct->mark); + + NFA_PUT(skb, CTA_MARK, sizeof(u_int32_t), &mark); + return 0; + +nfattr_failure: + return -1; +} +#else +#define ctnetlink_dump_mark(a, b) (0) +#endif + +static inline int +ctnetlink_dump_id(struct sk_buff *skb, const struct ip_conntrack *ct) +{ + u_int32_t id = htonl(ct->id); + NFA_PUT(skb, CTA_ID, sizeof(u_int32_t), &id); + return 0; + +nfattr_failure: + return -1; +} + +static inline int +ctnetlink_dump_use(struct sk_buff *skb, const struct ip_conntrack *ct) +{ + unsigned int use = htonl(atomic_read(&ct->ct_general.use)); + + NFA_PUT(skb, CTA_USE, sizeof(u_int32_t), &use); + return 0; + +nfattr_failure: + return -1; +} + +#define tuple(ct, dir) (&(ct)->tuplehash[dir].tuple) + +static int +ctnetlink_fill_info(struct sk_buff *skb, u32 pid, u32 seq, + int event, int nowait, + const struct ip_conntrack *ct) +{ + struct nlmsghdr *nlh; + struct nfgenmsg *nfmsg; + struct nfattr *nest_parms; + unsigned char *b; + + b = skb->tail; + + event |= NFNL_SUBSYS_CTNETLINK << 8; + nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(struct nfgenmsg)); + nfmsg = NLMSG_DATA(nlh); + + nlh->nlmsg_flags = (nowait && pid) ? 
NLM_F_MULTI : 0; + nfmsg->nfgen_family = AF_INET; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = 0; + + nest_parms = NFA_NEST(skb, CTA_TUPLE_ORIG); + if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_ORIGINAL)) < 0) + goto nfattr_failure; + NFA_NEST_END(skb, nest_parms); + + nest_parms = NFA_NEST(skb, CTA_TUPLE_REPLY); + if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_REPLY)) < 0) + goto nfattr_failure; + NFA_NEST_END(skb, nest_parms); + + if (ctnetlink_dump_status(skb, ct) < 0 || + ctnetlink_dump_timeout(skb, ct) < 0 || + ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL) < 0 || + ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY) < 0 || + ctnetlink_dump_protoinfo(skb, ct) < 0 || + ctnetlink_dump_helpinfo(skb, ct) < 0 || + ctnetlink_dump_mark(skb, ct) < 0 || + ctnetlink_dump_id(skb, ct) < 0 || + ctnetlink_dump_use(skb, ct) < 0) + goto nfattr_failure; + + nlh->nlmsg_len = skb->tail - b; + return skb->len; + +nlmsg_failure: +nfattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS +static int ctnetlink_conntrack_event(struct notifier_block *this, + unsigned long events, void *ptr) +{ + struct nlmsghdr *nlh; + struct nfgenmsg *nfmsg; + struct nfattr *nest_parms; + struct ip_conntrack *ct = (struct ip_conntrack *)ptr; + struct sk_buff *skb; + unsigned int type; + unsigned char *b; + unsigned int flags = 0, groups; + + /* ignore our fake conntrack entry */ + if (ct == &ip_conntrack_untracked) + return NOTIFY_DONE; + + if (events & IPCT_DESTROY) { + type = IPCTNL_MSG_CT_DELETE; + groups = NF_NETLINK_CONNTRACK_DESTROY; + goto alloc_skb; + } + if (events & (IPCT_NEW | IPCT_RELATED)) { + type = IPCTNL_MSG_CT_NEW; + flags = NLM_F_CREATE|NLM_F_EXCL; + /* dump everything */ + events = ~0UL; + groups = NF_NETLINK_CONNTRACK_NEW; + goto alloc_skb; + } + if (events & (IPCT_STATUS | + IPCT_PROTOINFO | + IPCT_HELPER | + IPCT_HELPINFO | + IPCT_NATINFO)) { + type = IPCTNL_MSG_CT_NEW; + groups = NF_NETLINK_CONNTRACK_UPDATE; + goto alloc_skb; + } + + return NOTIFY_DONE; + +alloc_skb: + /* FIXME: Check if there are any listeners before, don't hurt performance */ + + skb = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC); + if (!skb) + return NOTIFY_DONE; + + b = skb->tail; + + type |= NFNL_SUBSYS_CTNETLINK << 8; + nlh = NLMSG_PUT(skb, 0, 0, type, sizeof(struct nfgenmsg)); + nfmsg = NLMSG_DATA(nlh); + + nlh->nlmsg_flags = flags; + nfmsg->nfgen_family = AF_INET; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = 0; + + nest_parms = NFA_NEST(skb, CTA_TUPLE_ORIG); + if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_ORIGINAL)) < 0) + goto nfattr_failure; + NFA_NEST_END(skb, nest_parms); + + nest_parms = NFA_NEST(skb, CTA_TUPLE_REPLY); + if (ctnetlink_dump_tuples(skb, tuple(ct, IP_CT_DIR_REPLY)) < 0) + goto nfattr_failure; + NFA_NEST_END(skb, nest_parms); + + /* NAT stuff is now a status flag */ + if ((events & IPCT_STATUS || events & IPCT_NATINFO) + && ctnetlink_dump_status(skb, ct) < 0) + goto nfattr_failure; + if (events & IPCT_REFRESH + && ctnetlink_dump_timeout(skb, ct) < 0) + goto nfattr_failure; + if (events & IPCT_PROTOINFO + && ctnetlink_dump_protoinfo(skb, ct) < 0) + goto nfattr_failure; + if (events & IPCT_HELPINFO + && ctnetlink_dump_helpinfo(skb, ct) < 0) + goto nfattr_failure; + + if (ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL) < 0 || + ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY) < 0) + goto nfattr_failure; + + nlh->nlmsg_len = skb->tail - b; + nfnetlink_send(skb, 0, groups, 0); + return NOTIFY_DONE; + +nlmsg_failure: +nfattr_failure: + 
kfree_skb(skb); + return NOTIFY_DONE; +} +#endif /* CONFIG_IP_NF_CONNTRACK_EVENTS */ + +static int ctnetlink_done(struct netlink_callback *cb) +{ + DEBUGP("entered %s\n", __FUNCTION__); + return 0; +} + +static int +ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct ip_conntrack *ct = NULL; + struct ip_conntrack_tuple_hash *h; + struct list_head *i; + u_int32_t *id = (u_int32_t *) &cb->args[1]; + + DEBUGP("entered %s, last bucket=%lu id=%u\n", __FUNCTION__, + cb->args[0], *id); + + read_lock_bh(&ip_conntrack_lock); + for (; cb->args[0] < ip_conntrack_htable_size; cb->args[0]++, *id = 0) { + list_for_each(i, &ip_conntrack_hash[cb->args[0]]) { + h = (struct ip_conntrack_tuple_hash *) i; + if (DIRECTION(h) != IP_CT_DIR_ORIGINAL) + continue; + ct = tuplehash_to_ctrack(h); + if (ct->id <= *id) + continue; + if (ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, + IPCTNL_MSG_CT_NEW, + 1, ct) < 0) + goto out; + *id = ct->id; + } + } +out: + read_unlock_bh(&ip_conntrack_lock); + + DEBUGP("leaving, last bucket=%lu id=%u\n", cb->args[0], *id); + + return skb->len; +} + +#ifdef CONFIG_IP_NF_CT_ACCT +static int +ctnetlink_dump_table_w(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct ip_conntrack *ct = NULL; + struct ip_conntrack_tuple_hash *h; + struct list_head *i; + u_int32_t *id = (u_int32_t *) &cb->args[1]; + + DEBUGP("entered %s, last bucket=%u id=%u\n", __FUNCTION__, + cb->args[0], *id); + + write_lock_bh(&ip_conntrack_lock); + for (; cb->args[0] < ip_conntrack_htable_size; cb->args[0]++, *id = 0) { + list_for_each(i, &ip_conntrack_hash[cb->args[0]]) { + h = (struct ip_conntrack_tuple_hash *) i; + if (DIRECTION(h) != IP_CT_DIR_ORIGINAL) + continue; + ct = tuplehash_to_ctrack(h); + if (ct->id <= *id) + continue; + if (ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, + IPCTNL_MSG_CT_NEW, + 1, ct) < 0) + goto out; + *id = ct->id; + + memset(&ct->counters, 0, sizeof(ct->counters)); + } + } +out: + write_unlock_bh(&ip_conntrack_lock); + + DEBUGP("leaving, last bucket=%lu id=%u\n", cb->args[0], *id); + + return skb->len; +} +#endif + +static const int cta_min_ip[CTA_IP_MAX] = { + [CTA_IP_V4_SRC-1] = sizeof(u_int32_t), + [CTA_IP_V4_DST-1] = sizeof(u_int32_t), +}; + +static inline int +ctnetlink_parse_tuple_ip(struct nfattr *attr, struct ip_conntrack_tuple *tuple) +{ + struct nfattr *tb[CTA_IP_MAX]; + + DEBUGP("entered %s\n", __FUNCTION__); + + memset(tb, 0, CTA_IP_MAX * sizeof(tb)); + + if (nfattr_parse_nested(tb, CTA_IP_MAX, attr) < 0) + goto nfattr_failure; + + if (nfattr_bad_size(tb, CTA_IP_MAX, cta_min_ip)) + return -EINVAL; + + if (!tb[CTA_IP_V4_SRC-1]) + return -EINVAL; + tuple->src.ip = *(u_int32_t *)NFA_DATA(tb[CTA_IP_V4_SRC-1]); + + if (!tb[CTA_IP_V4_DST-1]) + return -EINVAL; + tuple->dst.ip = *(u_int32_t *)NFA_DATA(tb[CTA_IP_V4_DST-1]); + + DEBUGP("leaving\n"); + + return 0; + +nfattr_failure: + return -1; +} + +static const int cta_min_proto[CTA_PROTO_MAX] = { + [CTA_PROTO_NUM-1] = sizeof(u_int16_t), + [CTA_PROTO_SRC_PORT-1] = sizeof(u_int16_t), + [CTA_PROTO_DST_PORT-1] = sizeof(u_int16_t), + [CTA_PROTO_ICMP_TYPE-1] = sizeof(u_int8_t), + [CTA_PROTO_ICMP_CODE-1] = sizeof(u_int8_t), + [CTA_PROTO_ICMP_ID-1] = sizeof(u_int16_t), +}; + +static inline int +ctnetlink_parse_tuple_proto(struct nfattr *attr, + struct ip_conntrack_tuple *tuple) +{ + struct nfattr *tb[CTA_PROTO_MAX]; + struct ip_conntrack_protocol *proto; + int ret = 0; + + DEBUGP("entered %s\n", __FUNCTION__); + + memset(tb, 0, CTA_PROTO_MAX * 
sizeof(tb)); + + if (nfattr_parse_nested(tb, CTA_PROTO_MAX, attr) < 0) + goto nfattr_failure; + + if (nfattr_bad_size(tb, CTA_PROTO_MAX, cta_min_proto)) + return -EINVAL; + + if (!tb[CTA_PROTO_NUM-1]) + return -EINVAL; + tuple->dst.protonum = *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_NUM-1]); + + proto = ip_conntrack_proto_find_get(tuple->dst.protonum); + + if (likely(proto && proto->nfattr_to_tuple)) { + ret = proto->nfattr_to_tuple(tb, tuple); + ip_conntrack_proto_put(proto); + } + + return ret; + +nfattr_failure: + return -1; +} + +static inline int +ctnetlink_parse_tuple(struct nfattr *cda[], struct ip_conntrack_tuple *tuple, + enum ctattr_tuple type) +{ + struct nfattr *tb[CTA_TUPLE_MAX]; + int err; + + DEBUGP("entered %s\n", __FUNCTION__); + + memset(tb, 0, CTA_TUPLE_MAX * sizeof(tb)); + memset(tuple, 0, sizeof(*tuple)); + + if (nfattr_parse_nested(tb, CTA_TUPLE_MAX, cda[type-1]) < 0) + goto nfattr_failure; + + if (!tb[CTA_TUPLE_IP-1]) + return -EINVAL; + + err = ctnetlink_parse_tuple_ip(tb[CTA_TUPLE_IP-1], tuple); + if (err < 0) + return err; + + if (!tb[CTA_TUPLE_PROTO-1]) + return -EINVAL; + + err = ctnetlink_parse_tuple_proto(tb[CTA_TUPLE_PROTO-1], tuple); + if (err < 0) + return err; + + /* orig and expect tuples get DIR_ORIGINAL */ + if (type == CTA_TUPLE_REPLY) + tuple->dst.dir = IP_CT_DIR_REPLY; + else + tuple->dst.dir = IP_CT_DIR_ORIGINAL; + + DUMP_TUPLE(tuple); + + DEBUGP("leaving\n"); + + return 0; + +nfattr_failure: + return -1; +} + +#ifdef CONFIG_IP_NF_NAT_NEEDED +static const int cta_min_protonat[CTA_PROTONAT_MAX] = { + [CTA_PROTONAT_PORT_MIN-1] = sizeof(u_int16_t), + [CTA_PROTONAT_PORT_MAX-1] = sizeof(u_int16_t), +}; + +static int ctnetlink_parse_nat_proto(struct nfattr *attr, + const struct ip_conntrack *ct, + struct ip_nat_range *range) +{ + struct nfattr *tb[CTA_PROTONAT_MAX]; + struct ip_nat_protocol *npt; + + DEBUGP("entered %s\n", __FUNCTION__); + + memset(tb, 0, CTA_PROTONAT_MAX * sizeof(tb)); + + if (nfattr_parse_nested(tb, CTA_PROTONAT_MAX, attr) < 0) + goto nfattr_failure; + + if (nfattr_bad_size(tb, CTA_PROTONAT_MAX, cta_min_protonat)) + goto nfattr_failure; + + npt = ip_nat_proto_find_get(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum); + if (!npt) + return 0; + + if (!npt->nfattr_to_range) { + ip_nat_proto_put(npt); + return 0; + } + + /* nfattr_to_range returns 1 if it parsed, 0 if not, neg. 
on error */ + if (npt->nfattr_to_range(tb, range) > 0) + range->flags |= IP_NAT_RANGE_PROTO_SPECIFIED; + + ip_nat_proto_put(npt); + + DEBUGP("leaving\n"); + return 0; + +nfattr_failure: + return -1; +} + +static inline int +ctnetlink_parse_nat(struct nfattr *cda[], + const struct ip_conntrack *ct, struct ip_nat_range *range) +{ + struct nfattr *tb[CTA_NAT_MAX]; + int err; + + DEBUGP("entered %s\n", __FUNCTION__); + + memset(tb, 0, CTA_NAT_MAX * sizeof(tb)); + memset(range, 0, sizeof(*range)); + + if (nfattr_parse_nested(tb, CTA_NAT_MAX, cda[CTA_NAT-1]) < 0) + goto nfattr_failure; + + if (tb[CTA_NAT_MINIP-1]) + range->min_ip = *(u_int32_t *)NFA_DATA(tb[CTA_NAT_MINIP-1]); + + if (!tb[CTA_NAT_MAXIP-1]) + range->max_ip = range->min_ip; + else + range->max_ip = *(u_int32_t *)NFA_DATA(tb[CTA_NAT_MAXIP-1]); + + if (range->min_ip) + range->flags |= IP_NAT_RANGE_MAP_IPS; + + if (!tb[CTA_NAT_PROTO-1]) + return 0; + + err = ctnetlink_parse_nat_proto(tb[CTA_NAT_PROTO-1], ct, range); + if (err < 0) + return err; + + DEBUGP("leaving\n"); + return 0; + +nfattr_failure: + return -1; +} +#endif + +static inline int +ctnetlink_parse_help(struct nfattr *attr, char **helper_name) +{ + struct nfattr *tb[CTA_HELP_MAX]; + + DEBUGP("entered %s\n", __FUNCTION__); + memset(tb, 0, CTA_HELP_MAX * sizeof(tb)); + + if (nfattr_parse_nested(tb, CTA_HELP_MAX, attr) < 0) + goto nfattr_failure; + + if (!tb[CTA_HELP_NAME-1]) + return -EINVAL; + + *helper_name = NFA_DATA(tb[CTA_HELP_NAME-1]); + + return 0; + +nfattr_failure: + return -1; +} + +static int +ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb, + struct nlmsghdr *nlh, struct nfattr *cda[], int *errp) +{ + struct ip_conntrack_tuple_hash *h; + struct ip_conntrack_tuple tuple; + struct ip_conntrack *ct; + int err = 0; + + DEBUGP("entered %s\n", __FUNCTION__); + + if (cda[CTA_TUPLE_ORIG-1]) + err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG); + else if (cda[CTA_TUPLE_REPLY-1]) + err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY); + else { + /* Flush the whole table */ + ip_conntrack_flush(); + return 0; + } + + if (err < 0) + return err; + + h = ip_conntrack_find_get(&tuple, NULL); + if (!h) { + DEBUGP("tuple not found in conntrack hash\n"); + return -ENOENT; + } + + ct = tuplehash_to_ctrack(h); + + if (cda[CTA_ID-1]) { + u_int32_t id = ntohl(*(u_int32_t *)NFA_DATA(cda[CTA_ID-1])); + if (ct->id != id) { + ip_conntrack_put(ct); + return -ENOENT; + } + } + if (del_timer(&ct->timeout)) { + ip_conntrack_put(ct); + ct->timeout.function((unsigned long)ct); + return 0; + } + ip_conntrack_put(ct); + DEBUGP("leaving\n"); + + return 0; +} + +static int +ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb, + struct nlmsghdr *nlh, struct nfattr *cda[], int *errp) +{ + struct ip_conntrack_tuple_hash *h; + struct ip_conntrack_tuple tuple; + struct ip_conntrack *ct; + struct sk_buff *skb2 = NULL; + int err = 0; + + DEBUGP("entered %s\n", __FUNCTION__); + + if (nlh->nlmsg_flags & NLM_F_DUMP) { + struct nfgenmsg *msg = NLMSG_DATA(nlh); + u32 rlen; + + if (msg->nfgen_family != AF_INET) + return -EAFNOSUPPORT; + + if (NFNL_MSG_TYPE(nlh->nlmsg_type) == + IPCTNL_MSG_CT_GET_CTRZERO) { +#ifdef CONFIG_IP_NF_CT_ACCT + if ((*errp = netlink_dump_start(ctnl, skb, nlh, + ctnetlink_dump_table_w, + ctnetlink_done)) != 0) + return -EINVAL; +#else + return -ENOTSUPP; +#endif + } else { + if ((*errp = netlink_dump_start(ctnl, skb, nlh, + ctnetlink_dump_table, + ctnetlink_done)) != 0) + return -EINVAL; + } + + rlen = NLMSG_ALIGN(nlh->nlmsg_len); + if (rlen > 
skb->len) + rlen = skb->len; + skb_pull(skb, rlen); + return 0; + } + + if (cda[CTA_TUPLE_ORIG-1]) + err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG); + else if (cda[CTA_TUPLE_REPLY-1]) + err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY); + else + return -EINVAL; + + if (err < 0) + return err; + + h = ip_conntrack_find_get(&tuple, NULL); + if (!h) { + DEBUGP("tuple not found in conntrack hash"); + return -ENOENT; + } + DEBUGP("tuple found\n"); + ct = tuplehash_to_ctrack(h); + + err = -ENOMEM; + skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC); + if (!skb2) { + ip_conntrack_put(ct); + return -ENOMEM; + } + NETLINK_CB(skb2).dst_pid = NETLINK_CB(skb).pid; + + err = ctnetlink_fill_info(skb2, NETLINK_CB(skb).pid, nlh->nlmsg_seq, + IPCTNL_MSG_CT_NEW, 1, ct); + ip_conntrack_put(ct); + if (err <= 0) + goto out; + + err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT); + if (err < 0) + goto out; + + DEBUGP("leaving\n"); + return 0; + +out: + if (skb2) + kfree_skb(skb2); + return -1; +} + +static inline int +ctnetlink_change_status(struct ip_conntrack *ct, struct nfattr *cda[]) +{ + unsigned long d, status = *(u_int32_t *)NFA_DATA(cda[CTA_STATUS-1]); + d = ct->status ^ status; + + if (d & (IPS_EXPECTED|IPS_CONFIRMED|IPS_DYING)) + /* unchangeable */ + return -EINVAL; + + if (d & IPS_SEEN_REPLY && !(status & IPS_SEEN_REPLY)) + /* SEEN_REPLY bit can only be set */ + return -EINVAL; + + + if (d & IPS_ASSURED && !(status & IPS_ASSURED)) + /* ASSURED bit can only be set */ + return -EINVAL; + + if (cda[CTA_NAT-1]) { +#ifndef CONFIG_IP_NF_NAT_NEEDED + return -EINVAL; +#else + unsigned int hooknum; + struct ip_nat_range range; + + if (ctnetlink_parse_nat(cda, ct, &range) < 0) + return -EINVAL; + + DEBUGP("NAT: %u.%u.%u.%u-%u.%u.%u.%u:%u-%u\n", + NIPQUAD(range.min_ip), NIPQUAD(range.max_ip), + htons(range.min.all), htons(range.max.all)); + + /* This is tricky but it works. ip_nat_setup_info needs the + * hook number as parameter, so let's do the correct + * conversion and run away */ + if (status & IPS_SRC_NAT_DONE) + hooknum = NF_IP_POST_ROUTING; /* IP_NAT_MANIP_SRC */ + else if (status & IPS_DST_NAT_DONE) + hooknum = NF_IP_PRE_ROUTING; /* IP_NAT_MANIP_DST */ + else + return -EINVAL; /* Missing NAT flags */ + + DEBUGP("NAT status: %lu\n", + status & (IPS_NAT_MASK | IPS_NAT_DONE_MASK)); + + if (ip_nat_initialized(ct, hooknum)) + return -EEXIST; + ip_nat_setup_info(ct, &range, hooknum); + + DEBUGP("NAT status after setup_info: %lu\n", + ct->status & (IPS_NAT_MASK | IPS_NAT_DONE_MASK)); +#endif + } + + /* Be careful here, modifying NAT bits can screw up things, + * so don't let users modify them directly if they don't pass + * ip_nat_range. */ + ct->status |= status & ~(IPS_NAT_DONE_MASK | IPS_NAT_MASK); + return 0; +} + + +static inline int +ctnetlink_change_helper(struct ip_conntrack *ct, struct nfattr *cda[]) +{ + struct ip_conntrack_helper *helper; + char *helpname; + int err; + + DEBUGP("entered %s\n", __FUNCTION__); + + /* don't change helper of sibling connections */ + if (ct->master) + return -EINVAL; + + err = ctnetlink_parse_help(cda[CTA_HELP-1], &helpname); + if (err < 0) + return err; + + helper = __ip_conntrack_helper_find_byname(helpname); + if (!helper) { + if (!strcmp(helpname, "")) + helper = NULL; + else + return -EINVAL; + } + + if (ct->helper) { + if (!helper) { + /* we had a helper before ... 
*/ + ip_ct_remove_expectations(ct); + ct->helper = NULL; + } else { + /* need to zero data of old helper */ + memset(&ct->help, 0, sizeof(ct->help)); + } + } + + ct->helper = helper; + + return 0; +} + +static inline int +ctnetlink_change_timeout(struct ip_conntrack *ct, struct nfattr *cda[]) +{ + u_int32_t timeout = ntohl(*(u_int32_t *)NFA_DATA(cda[CTA_TIMEOUT-1])); + + if (!del_timer(&ct->timeout)) + return -ETIME; + + ct->timeout.expires = jiffies + timeout * HZ; + add_timer(&ct->timeout); + + return 0; +} + +static int +ctnetlink_change_conntrack(struct ip_conntrack *ct, struct nfattr *cda[]) +{ + int err; + + DEBUGP("entered %s\n", __FUNCTION__); + + if (cda[CTA_HELP-1]) { + err = ctnetlink_change_helper(ct, cda); + if (err < 0) + return err; + } + + if (cda[CTA_TIMEOUT-1]) { + err = ctnetlink_change_timeout(ct, cda); + if (err < 0) + return err; + } + + if (cda[CTA_STATUS-1]) { + err = ctnetlink_change_status(ct, cda); + if (err < 0) + return err; + } + + DEBUGP("all done\n"); + return 0; +} + +static int +ctnetlink_create_conntrack(struct nfattr *cda[], + struct ip_conntrack_tuple *otuple, + struct ip_conntrack_tuple *rtuple) +{ + struct ip_conntrack *ct; + int err = -EINVAL; + + DEBUGP("entered %s\n", __FUNCTION__); + + ct = ip_conntrack_alloc(otuple, rtuple); + if (ct == NULL || IS_ERR(ct)) + return -ENOMEM; + + if (!cda[CTA_TIMEOUT-1]) + goto err; + ct->timeout.expires = ntohl(*(u_int32_t *)NFA_DATA(cda[CTA_TIMEOUT-1])); + + ct->timeout.expires = jiffies + ct->timeout.expires * HZ; + ct->status |= IPS_CONFIRMED; + + err = ctnetlink_change_status(ct, cda); + if (err < 0) + goto err; + + ct->helper = ip_conntrack_helper_find_get(rtuple); + + add_timer(&ct->timeout); + ip_conntrack_hash_insert(ct); + + if (ct->helper) + ip_conntrack_helper_put(ct->helper); + + DEBUGP("conntrack with id %u inserted\n", ct->id); + return 0; + +err: + ip_conntrack_free(ct); + return err; +} + +static int +ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, + struct nlmsghdr *nlh, struct nfattr *cda[], int *errp) +{ + struct ip_conntrack_tuple otuple, rtuple; + struct ip_conntrack_tuple_hash *h = NULL; + int err = 0; + + DEBUGP("entered %s\n", __FUNCTION__); + + if (cda[CTA_TUPLE_ORIG-1]) { + err = ctnetlink_parse_tuple(cda, &otuple, CTA_TUPLE_ORIG); + if (err < 0) + return err; + } + + if (cda[CTA_TUPLE_REPLY-1]) { + err = ctnetlink_parse_tuple(cda, &rtuple, CTA_TUPLE_REPLY); + if (err < 0) + return err; + } + + write_lock_bh(&ip_conntrack_lock); + if (cda[CTA_TUPLE_ORIG-1]) + h = __ip_conntrack_find(&otuple, NULL); + else if (cda[CTA_TUPLE_REPLY-1]) + h = __ip_conntrack_find(&rtuple, NULL); + + if (h == NULL) { + write_unlock_bh(&ip_conntrack_lock); + DEBUGP("no such conntrack, create new\n"); + err = -ENOENT; + if (nlh->nlmsg_flags & NLM_F_CREATE) + err = ctnetlink_create_conntrack(cda, &otuple, &rtuple); + goto out_unlock; + } else { + /* we only allow nat config for new conntracks */ + if (cda[CTA_NAT-1]) { + err = -EINVAL; + goto out_unlock; + } + } + + /* We manipulate the conntrack inside the global conntrack table lock, + * so there's no need to increase the refcount */ + DEBUGP("conntrack found\n"); + err = -EEXIST; + if (!(nlh->nlmsg_flags & NLM_F_EXCL)) + err = ctnetlink_change_conntrack(tuplehash_to_ctrack(h), cda); + +out_unlock: + write_unlock_bh(&ip_conntrack_lock); + return err; +} + +/*********************************************************************** + * EXPECT + ***********************************************************************/ + +static inline int 
+ctnetlink_exp_dump_tuple(struct sk_buff *skb, + const struct ip_conntrack_tuple *tuple, + enum ctattr_expect type) +{ + struct nfattr *nest_parms = NFA_NEST(skb, type); + + if (ctnetlink_dump_tuples(skb, tuple) < 0) + goto nfattr_failure; + + NFA_NEST_END(skb, nest_parms); + + return 0; + +nfattr_failure: + return -1; +} + +static inline int +ctnetlink_exp_dump_expect(struct sk_buff *skb, + const struct ip_conntrack_expect *exp) +{ + u_int32_t timeout = htonl((exp->timeout.expires - jiffies) / HZ); + u_int32_t id = htonl(exp->id); + struct nfattr *nest_parms = NFA_NEST(skb, CTA_EXPECT); + + if (ctnetlink_exp_dump_tuple(skb, &exp->tuple, CTA_EXPECT_TUPLE) < 0) + goto nfattr_failure; + if (ctnetlink_exp_dump_tuple(skb, &exp->mask, CTA_EXPECT_MASK) < 0) + goto nfattr_failure; + + NFA_PUT(skb, CTA_EXPECT_TIMEOUT, sizeof(timeout), &timeout); + NFA_PUT(skb, CTA_EXPECT_ID, sizeof(u_int32_t), &id); + NFA_NEST_END(skb, nest_parms); + + return 0; + +nfattr_failure: + return -1; +} + +static int +ctnetlink_exp_fill_info(struct sk_buff *skb, u32 pid, u32 seq, + int event, + int nowait, + const struct ip_conntrack_expect *exp) +{ + struct nlmsghdr *nlh; + struct nfgenmsg *nfmsg; + unsigned char *b; + + b = skb->tail; + + event |= NFNL_SUBSYS_CTNETLINK_EXP << 8; + nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(struct nfgenmsg)); + nfmsg = NLMSG_DATA(nlh); + + nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0; + nfmsg->nfgen_family = AF_INET; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = 0; + + if (ctnetlink_exp_dump_expect(skb, exp) < 0) + goto nfattr_failure; + + nlh->nlmsg_len = skb->tail - b; + return skb->len; + +nlmsg_failure: +nfattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS +static int ctnetlink_expect_event(struct notifier_block *this, + unsigned long events, void *ptr) +{ + struct nlmsghdr *nlh; + struct nfgenmsg *nfmsg; + struct ip_conntrack_expect *exp = (struct ip_conntrack_expect *)ptr; + struct sk_buff *skb; + unsigned int type; + unsigned char *b; + int flags = 0; + u16 proto; + + if (events & IPEXP_NEW) { + type = IPCTNL_MSG_EXP_NEW; + flags = NLM_F_CREATE|NLM_F_EXCL; + } else + return NOTIFY_DONE; + + skb = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC); + if (!skb) + return NOTIFY_DONE; + + b = skb->tail; + + type |= NFNL_SUBSYS_CTNETLINK << 8; + nlh = NLMSG_PUT(skb, 0, 0, type, sizeof(struct nfgenmsg)); + nfmsg = NLMSG_DATA(nlh); + + nlh->nlmsg_flags = flags; + nfmsg->nfgen_family = AF_INET; + nfmsg->version = NFNETLINK_V0; + nfmsg->res_id = 0; + + if (ctnetlink_exp_dump_expect(skb, exp) < 0) + goto nfattr_failure; + + nlh->nlmsg_len = skb->tail - b; + proto = exp->tuple.dst.protonum; + nfnetlink_send(skb, 0, NF_NETLINK_CONNTRACK_EXP_NEW, 0); + return NOTIFY_DONE; + +nlmsg_failure: +nfattr_failure: + kfree_skb(skb); + return NOTIFY_DONE; +} +#endif + +static int +ctnetlink_exp_dump_table(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct ip_conntrack_expect *exp = NULL; + struct list_head *i; + u_int32_t *id = (u_int32_t *) &cb->args[0]; + + DEBUGP("entered %s, last id=%llu\n", __FUNCTION__, *id); + + read_lock_bh(&ip_conntrack_lock); + list_for_each(i, &ip_conntrack_expect_list) { + exp = (struct ip_conntrack_expect *) i; + if (exp->id <= *id) + continue; + if (ctnetlink_exp_fill_info(skb, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, + IPCTNL_MSG_EXP_NEW, + 1, exp) < 0) + goto out; + *id = exp->id; + } +out: + read_unlock_bh(&ip_conntrack_lock); + + DEBUGP("leaving, last id=%llu\n", *id); + + return skb->len; +} + 
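The dump callbacks above (ctnetlink_dump_table, ctnetlink_dump_table_w, ctnetlink_exp_dump_table) follow netlink's resumable-dump convention: each invocation fills one skb, records its position in cb->args[] (hash bucket in args[0], id of the last dumped entry in args[1]), and is invoked again until it reports completion; entries already sent are skipped by comparing ids. The id comparison, rather than a saved list pointer, matters because ip_conntrack_lock is dropped between invocations, so entries may be freed while the dump is suspended. A minimal standalone sketch of that resumption pattern — table sizes, names, and the budget parameter are all invented for illustration, this is not kernel code:

#include <stdio.h>

#define NBUCKETS   4
#define PER_BUCKET 3

struct entry { unsigned int id; };

/* toy hash table; ids are unique and ascending within a bucket,
 * as conntrack ids effectively are for this scheme to work */
static struct entry table[NBUCKETS][PER_BUCKET] = {
	{ { 1}, { 2}, { 3} }, { { 4}, { 5}, { 6} },
	{ { 7}, { 8}, { 9} }, { {10}, {11}, {12} },
};

/* one "dump callback": emit at most budget entries, then return 1 so
 * the caller invokes us again; cb->args[] is modelled by the
 * (bucket, last_id) cursor */
static int dump(unsigned long *bucket, unsigned int *last_id, int budget)
{
	int i;

	for (; *bucket < NBUCKETS; (*bucket)++, *last_id = 0) {
		for (i = 0; i < PER_BUCKET; i++) {
			struct entry *e = &table[*bucket][i];

			if (e->id <= *last_id)
				continue;	/* sent in an earlier call */
			if (budget-- == 0)
				return 1;	/* "skb full", resume later */
			printf("bucket=%lu id=%u\n", *bucket, e->id);
			*last_id = e->id;
		}
	}
	return 0;				/* dump complete */
}

int main(void)
{
	unsigned long bucket = 0;
	unsigned int last_id = 0;

	while (dump(&bucket, &last_id, 2))
		;	/* each iteration mimics one netlink callback */
	return 0;
}

Note that the cursor is only advanced after an entry is successfully emitted, mirroring how ctnetlink_dump_table() updates *id only when ctnetlink_fill_info() succeeds; a partially filled message is therefore retransmitted from the first unsent entry.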
+static int +ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb, + struct nlmsghdr *nlh, struct nfattr *cda[], int *errp) +{ + struct ip_conntrack_tuple tuple; + struct ip_conntrack_expect *exp; + struct sk_buff *skb2; + int err = 0; + + DEBUGP("entered %s\n", __FUNCTION__); + + if (nlh->nlmsg_flags & NLM_F_DUMP) { + struct nfgenmsg *msg = NLMSG_DATA(nlh); + u32 rlen; + + if (msg->nfgen_family != AF_INET) + return -EAFNOSUPPORT; + + if ((*errp = netlink_dump_start(ctnl, skb, nlh, + ctnetlink_exp_dump_table, + ctnetlink_done)) != 0) + return -EINVAL; + rlen = NLMSG_ALIGN(nlh->nlmsg_len); + if (rlen > skb->len) + rlen = skb->len; + skb_pull(skb, rlen); + return 0; + } + + if (cda[CTA_TUPLE_ORIG-1]) + err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG); + else if (cda[CTA_TUPLE_REPLY-1]) + err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY); + else + return -EINVAL; + + if (err < 0) + return err; + + exp = ip_conntrack_expect_find_get(&tuple); + if (!exp) + return -ENOENT; + + err = -ENOMEM; + skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb2) + goto out; + NETLINK_CB(skb2).dst_pid = NETLINK_CB(skb).pid; + + err = ctnetlink_exp_fill_info(skb2, NETLINK_CB(skb).pid, + nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW, + 1, exp); + if (err <= 0) + goto out; + + ip_conntrack_expect_put(exp); + + err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).pid, MSG_DONTWAIT); + if (err < 0) + goto free; + + return err; + +out: + ip_conntrack_expect_put(exp); +free: + if (skb2) + kfree_skb(skb2); + return err; +} + +static int +ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb, + struct nlmsghdr *nlh, struct nfattr *cda[], int *errp) +{ + struct ip_conntrack_expect *exp, *tmp; + struct ip_conntrack_tuple tuple; + struct ip_conntrack_helper *h; + int err; + + /* delete by tuple needs either orig or reply tuple */ + if (cda[CTA_TUPLE_ORIG-1]) + err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG); + else if (cda[CTA_TUPLE_REPLY-1]) + err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY); + else if (cda[CTA_HELP_NAME-1]) { + char *name = NFA_DATA(cda[CTA_HELP_NAME-1]); + + /* delete all expectations for this helper */ + write_lock_bh(&ip_conntrack_lock); + h = __ip_conntrack_helper_find_byname(name); + if (!h) { + write_unlock_bh(&ip_conntrack_lock); + return -EINVAL; + } + list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, + list) { + if (exp->master->helper == h + && del_timer(&exp->timeout)) + __ip_ct_expect_unlink_destroy(exp); + } + write_unlock(&ip_conntrack_lock); + return 0; + } else { + /* This basically means we have to flush everything*/ + write_lock_bh(&ip_conntrack_lock); + list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, + list) { + if (del_timer(&exp->timeout)) + __ip_ct_expect_unlink_destroy(exp); + } + write_unlock_bh(&ip_conntrack_lock); + return 0; + } + + if (err < 0) + return err; + + /* bump usage count to 2 */ + exp = ip_conntrack_expect_find_get(&tuple); + if (!exp) + return -ENOENT; + + if (cda[CTA_EXPECT_ID-1]) { + u_int32_t id = *(u_int32_t *)NFA_DATA(cda[CTA_EXPECT_ID-1]); + if (exp->id != ntohl(id)) { + ip_conntrack_expect_put(exp); + return -ENOENT; + } + } + + /* after list removal, usage count == 1 */ + ip_conntrack_unexpect_related(exp); + /* have to put what we 'get' above. 
after this line usage count == 0 */ + ip_conntrack_expect_put(exp); + + return 0; +} +static int +ctnetlink_change_expect(struct ip_conntrack_expect *x, struct nfattr *cda[]) +{ + return -EOPNOTSUPP; +} + +static int +ctnetlink_create_expect(struct nfattr *cda[]) +{ + struct ip_conntrack_tuple tuple, mask, master_tuple; + struct ip_conntrack_tuple_hash *h = NULL; + struct ip_conntrack_expect *exp; + struct ip_conntrack *ct; + int err = 0; + + DEBUGP("entered %s\n", __FUNCTION__); + + err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE); + if (err < 0) + return err; + err = ctnetlink_parse_tuple(cda, &mask, CTA_EXPECT_MASK); + if (err < 0) + return err; + + if (cda[CTA_TUPLE_ORIG-1]) + err = ctnetlink_parse_tuple(cda, &master_tuple, CTA_TUPLE_ORIG); + else if (cda[CTA_TUPLE_REPLY-1]) + err = ctnetlink_parse_tuple(cda, &master_tuple, + CTA_TUPLE_REPLY); + else + return -EINVAL; + + if (err < 0) + return err; + + /* Look for master conntrack of this expectation */ + h = ip_conntrack_find_get(&master_tuple, NULL); + if (!h) + return -ENOENT; + ct = tuplehash_to_ctrack(h); + + if (!ct->helper) { + /* such conntrack hasn't got any helper, abort */ + err = -EINVAL; + goto out; + } + + exp = ip_conntrack_expect_alloc(ct); + if (!exp) { + err = -ENOMEM; + goto out; + } + + exp->expectfn = NULL; + exp->master = ct; + memcpy(&exp->tuple, &tuple, sizeof(struct ip_conntrack_tuple)); + memcpy(&exp->mask, &mask, sizeof(struct ip_conntrack_tuple)); + + err = ip_conntrack_expect_related(exp); + ip_conntrack_expect_put(exp); + +out: + ip_conntrack_put(tuplehash_to_ctrack(h)); + return err; +} + +static int +ctnetlink_new_expect(struct sock *ctnl, struct sk_buff *skb, + struct nlmsghdr *nlh, struct nfattr *cda[], int *errp) +{ + struct ip_conntrack_tuple tuple; + struct ip_conntrack_expect *exp; + int err = 0; + + DEBUGP("entered %s\n", __FUNCTION__); + + if (!cda[CTA_EXPECT_TUPLE-1] || !cda[CTA_EXPECT_MASK-1]) + return -EINVAL; + + err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE); + if (err < 0) + return err; + + write_lock_bh(&ip_conntrack_lock); + exp = __ip_conntrack_expect_find(&tuple); + + if (!exp) { + write_unlock_bh(&ip_conntrack_lock); + err = -ENOENT; + if (nlh->nlmsg_flags & NLM_F_CREATE) + err = ctnetlink_create_expect(cda); + return err; + } + + err = -EEXIST; + if (!(nlh->nlmsg_flags & NLM_F_EXCL)) + err = ctnetlink_change_expect(exp, cda); + write_unlock_bh(&ip_conntrack_lock); + + DEBUGP("leaving\n"); + + return err; +} + +#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS +static struct notifier_block ctnl_notifier = { + .notifier_call = ctnetlink_conntrack_event, +}; + +static struct notifier_block ctnl_notifier_exp = { + .notifier_call = ctnetlink_expect_event, +}; +#endif + +static struct nfnl_callback ctnl_cb[IPCTNL_MSG_MAX] = { + [IPCTNL_MSG_CT_NEW] = { .call = ctnetlink_new_conntrack, + .cap_required = CAP_NET_ADMIN }, + [IPCTNL_MSG_CT_GET] = { .call = ctnetlink_get_conntrack, + .cap_required = CAP_NET_ADMIN }, + [IPCTNL_MSG_CT_DELETE] = { .call = ctnetlink_del_conntrack, + .cap_required = CAP_NET_ADMIN }, + [IPCTNL_MSG_CT_GET_CTRZERO] = { .call = ctnetlink_get_conntrack, + .cap_required = CAP_NET_ADMIN }, +}; + +static struct nfnl_callback ctnl_exp_cb[IPCTNL_MSG_MAX] = { + [IPCTNL_MSG_EXP_GET] = { .call = ctnetlink_get_expect, + .cap_required = CAP_NET_ADMIN }, + [IPCTNL_MSG_EXP_NEW] = { .call = ctnetlink_new_expect, + .cap_required = CAP_NET_ADMIN }, + [IPCTNL_MSG_EXP_DELETE] = { .call = ctnetlink_del_expect, + .cap_required = CAP_NET_ADMIN }, +}; + +static struct 
nfnetlink_subsystem ctnl_subsys = { + .name = "conntrack", + .subsys_id = NFNL_SUBSYS_CTNETLINK, + .cb_count = IPCTNL_MSG_MAX, + .attr_count = CTA_MAX, + .cb = ctnl_cb, +}; + +static struct nfnetlink_subsystem ctnl_exp_subsys = { + .name = "conntrack_expect", + .subsys_id = NFNL_SUBSYS_CTNETLINK_EXP, + .cb_count = IPCTNL_MSG_EXP_MAX, + .attr_count = CTA_MAX, + .cb = ctnl_exp_cb, +}; + +static int __init ctnetlink_init(void) +{ + int ret; + + printk("ctnetlink v%s: registering with nfnetlink.\n", version); + ret = nfnetlink_subsys_register(&ctnl_subsys); + if (ret < 0) { + printk("ctnetlink_init: cannot register with nfnetlink.\n"); + goto err_out; + } + + ret = nfnetlink_subsys_register(&ctnl_exp_subsys); + if (ret < 0) { + printk("ctnetlink_init: cannot register exp with nfnetlink.\n"); + goto err_unreg_subsys; + } + +#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS + ret = ip_conntrack_register_notifier(&ctnl_notifier); + if (ret < 0) { + printk("ctnetlink_init: cannot register notifier.\n"); + goto err_unreg_exp_subsys; + } + + ret = ip_conntrack_expect_register_notifier(&ctnl_notifier_exp); + if (ret < 0) { + printk("ctnetlink_init: cannot expect register notifier.\n"); + goto err_unreg_notifier; + } +#endif + + return 0; + +#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS +err_unreg_notifier: + ip_conntrack_unregister_notifier(&ctnl_notifier); +err_unreg_exp_subsys: + nfnetlink_subsys_unregister(&ctnl_exp_subsys); +#endif +err_unreg_subsys: + nfnetlink_subsys_unregister(&ctnl_subsys); +err_out: + return ret; +} + +static void __exit ctnetlink_exit(void) +{ + printk("ctnetlink: unregistering from nfnetlink.\n"); + +#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS + ip_conntrack_unregister_notifier(&ctnl_notifier_exp); + ip_conntrack_unregister_notifier(&ctnl_notifier); +#endif + + nfnetlink_subsys_unregister(&ctnl_exp_subsys); + nfnetlink_subsys_unregister(&ctnl_subsys); + return; +} + +module_init(ctnetlink_init); +module_exit(ctnetlink_exit); diff --git a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c index dca1f63d6f5..3f90cb9979a 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c @@ -109,16 +109,17 @@ static int icmp_packet(struct ip_conntrack *ct, return NF_ACCEPT; } +static u_int8_t valid_new[] = { + [ICMP_ECHO] = 1, + [ICMP_TIMESTAMP] = 1, + [ICMP_INFO_REQUEST] = 1, + [ICMP_ADDRESS] = 1 +}; + /* Called when a new connection for this protocol found. */ static int icmp_new(struct ip_conntrack *conntrack, const struct sk_buff *skb) { - static u_int8_t valid_new[] - = { [ICMP_ECHO] = 1, - [ICMP_TIMESTAMP] = 1, - [ICMP_INFO_REQUEST] = 1, - [ICMP_ADDRESS] = 1 }; - if (conntrack->tuplehash[0].tuple.dst.u.icmp.type >= sizeof(valid_new) || !valid_new[conntrack->tuplehash[0].tuple.dst.u.icmp.type]) { /* Can't create a new ICMP `conn' with this. */ @@ -159,11 +160,12 @@ icmp_error_message(struct sk_buff *skb, return NF_ACCEPT; } - innerproto = ip_ct_find_proto(inside->ip.protocol); + innerproto = ip_conntrack_proto_find_get(inside->ip.protocol); dataoff = skb->nh.iph->ihl*4 + sizeof(inside->icmp) + inside->ip.ihl*4; /* Are they talking about one of our connections? */ if (!ip_ct_get_tuple(&inside->ip, skb, dataoff, &origtuple, innerproto)) { DEBUGP("icmp_error: ! get_tuple p=%u", inside->ip.protocol); + ip_conntrack_proto_put(innerproto); return NF_ACCEPT; } @@ -171,8 +173,10 @@ icmp_error_message(struct sk_buff *skb, been preserved inside the ICMP. 
*/ if (!ip_ct_invert_tuple(&innertuple, &origtuple, innerproto)) { DEBUGP("icmp_error_track: Can't invert tuple\n"); + ip_conntrack_proto_put(innerproto); return NF_ACCEPT; } + ip_conntrack_proto_put(innerproto); *ctinfo = IP_CT_RELATED; @@ -266,6 +270,47 @@ checksum_skipped: return icmp_error_message(skb, ctinfo, hooknum); } +#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ + defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) +static int icmp_tuple_to_nfattr(struct sk_buff *skb, + const struct ip_conntrack_tuple *t) +{ + NFA_PUT(skb, CTA_PROTO_ICMP_ID, sizeof(u_int16_t), + &t->src.u.icmp.id); + NFA_PUT(skb, CTA_PROTO_ICMP_TYPE, sizeof(u_int8_t), + &t->dst.u.icmp.type); + NFA_PUT(skb, CTA_PROTO_ICMP_CODE, sizeof(u_int8_t), + &t->dst.u.icmp.code); + + if (t->dst.u.icmp.type >= sizeof(valid_new) + || !valid_new[t->dst.u.icmp.type]) + return -EINVAL; + + return 0; + +nfattr_failure: + return -1; +} + +static int icmp_nfattr_to_tuple(struct nfattr *tb[], + struct ip_conntrack_tuple *tuple) +{ + if (!tb[CTA_PROTO_ICMP_TYPE-1] + || !tb[CTA_PROTO_ICMP_CODE-1] + || !tb[CTA_PROTO_ICMP_ID-1]) + return -1; + + tuple->dst.u.icmp.type = + *(u_int8_t *)NFA_DATA(tb[CTA_PROTO_ICMP_TYPE-1]); + tuple->dst.u.icmp.code = + *(u_int8_t *)NFA_DATA(tb[CTA_PROTO_ICMP_CODE-1]); + tuple->src.u.icmp.id = + *(u_int8_t *)NFA_DATA(tb[CTA_PROTO_ICMP_ID-1]); + + return 0; +} +#endif + struct ip_conntrack_protocol ip_conntrack_protocol_icmp = { .proto = IPPROTO_ICMP, @@ -277,4 +322,9 @@ struct ip_conntrack_protocol ip_conntrack_protocol_icmp = .packet = icmp_packet, .new = icmp_new, .error = icmp_error, +#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ + defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) + .tuple_to_nfattr = icmp_tuple_to_nfattr, + .nfattr_to_tuple = icmp_nfattr_to_tuple, +#endif }; diff --git a/net/ipv4/netfilter/ip_conntrack_proto_sctp.c b/net/ipv4/netfilter/ip_conntrack_proto_sctp.c index 3d5f878a07d..a875f35e576 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_sctp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_sctp.c @@ -505,7 +505,12 @@ static struct ip_conntrack_protocol ip_conntrack_protocol_sctp = { .packet = sctp_packet, .new = sctp_new, .destroy = NULL, - .me = THIS_MODULE + .me = THIS_MODULE, +#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ + defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) + .tuple_to_nfattr = ip_ct_port_tuple_to_nfattr, + .nfattr_to_tuple = ip_ct_port_nfattr_to_tuple, +#endif }; #ifdef CONFIG_SYSCTL diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c index a569ad1ee4d..c2bce22d403 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c @@ -336,6 +336,23 @@ static int tcp_print_conntrack(struct seq_file *s, return seq_printf(s, "%s ", tcp_conntrack_names[state]); } +#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ + defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) +static int tcp_to_nfattr(struct sk_buff *skb, struct nfattr *nfa, + const struct ip_conntrack *ct) +{ + read_lock_bh(&tcp_lock); + NFA_PUT(skb, CTA_PROTOINFO_TCP_STATE, sizeof(u_int8_t), + &ct->proto.tcp.state); + read_unlock_bh(&tcp_lock); + + return 0; + +nfattr_failure: + return -1; +} +#endif + static unsigned int get_conntrack_index(const struct tcphdr *tcph) { if (tcph->rst) return TCP_RST_SET; @@ -1100,4 +1117,10 @@ struct ip_conntrack_protocol ip_conntrack_protocol_tcp = .packet = tcp_packet, .new = tcp_new, .error = tcp_error, +#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ + 
defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) + .to_nfattr = tcp_to_nfattr, + .tuple_to_nfattr = ip_ct_port_tuple_to_nfattr, + .nfattr_to_tuple = ip_ct_port_nfattr_to_tuple, +#endif }; diff --git a/net/ipv4/netfilter/ip_conntrack_proto_udp.c b/net/ipv4/netfilter/ip_conntrack_proto_udp.c index 6066eaf4d82..14130169cbf 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_udp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_udp.c @@ -145,4 +145,9 @@ struct ip_conntrack_protocol ip_conntrack_protocol_udp = .packet = udp_packet, .new = udp_new, .error = udp_error, +#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ + defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) + .tuple_to_nfattr = ip_ct_port_tuple_to_nfattr, + .nfattr_to_tuple = ip_ct_port_nfattr_to_tuple, +#endif }; diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c index f0880004115..ca97c3ac2f2 100644 --- a/net/ipv4/netfilter/ip_conntrack_standalone.c +++ b/net/ipv4/netfilter/ip_conntrack_standalone.c @@ -5,7 +5,7 @@ */ /* (C) 1999-2001 Paul `Rusty' Russell - * (C) 2002-2004 Netfilter Core Team + * (C) 2002-2005 Netfilter Core Team * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -147,8 +147,7 @@ static int ct_seq_show(struct seq_file *s, void *v) if (DIRECTION(hash)) return 0; - proto = ip_ct_find_proto(conntrack->tuplehash[IP_CT_DIR_ORIGINAL] - .tuple.dst.protonum); + proto = __ip_conntrack_proto_find(conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum); IP_NF_ASSERT(proto); if (seq_printf(s, "%-8s %u %ld ", @@ -283,7 +282,7 @@ static int exp_seq_show(struct seq_file *s, void *v) seq_printf(s, "proto=%u ", expect->tuple.dst.protonum); print_tuple(s, &expect->tuple, - ip_ct_find_proto(expect->tuple.dst.protonum)); + __ip_conntrack_proto_find(expect->tuple.dst.protonum)); return seq_putc(s, '\n'); } @@ -992,12 +991,16 @@ EXPORT_SYMBOL(ip_conntrack_helper_register); EXPORT_SYMBOL(ip_conntrack_helper_unregister); EXPORT_SYMBOL(ip_ct_iterate_cleanup); EXPORT_SYMBOL(ip_ct_refresh_acct); -EXPORT_SYMBOL(ip_ct_protos); -EXPORT_SYMBOL(ip_ct_find_proto); + EXPORT_SYMBOL(ip_conntrack_expect_alloc); EXPORT_SYMBOL(ip_conntrack_expect_put); +EXPORT_SYMBOL_GPL(ip_conntrack_expect_find_get); EXPORT_SYMBOL(ip_conntrack_expect_related); EXPORT_SYMBOL(ip_conntrack_unexpect_related); +EXPORT_SYMBOL_GPL(ip_conntrack_expect_list); +EXPORT_SYMBOL_GPL(__ip_conntrack_expect_find); +EXPORT_SYMBOL_GPL(__ip_ct_expect_unlink_destroy); + EXPORT_SYMBOL(ip_conntrack_tuple_taken); EXPORT_SYMBOL(ip_ct_gather_frags); EXPORT_SYMBOL(ip_conntrack_htable_size); @@ -1005,7 +1008,28 @@ EXPORT_SYMBOL(ip_conntrack_lock); EXPORT_SYMBOL(ip_conntrack_hash); EXPORT_SYMBOL(ip_conntrack_untracked); EXPORT_SYMBOL_GPL(ip_conntrack_find_get); -EXPORT_SYMBOL_GPL(ip_conntrack_put); #ifdef CONFIG_IP_NF_NAT_NEEDED EXPORT_SYMBOL(ip_conntrack_tcp_update); #endif + +EXPORT_SYMBOL_GPL(ip_conntrack_flush); +EXPORT_SYMBOL_GPL(__ip_conntrack_find); + +EXPORT_SYMBOL_GPL(ip_conntrack_alloc); +EXPORT_SYMBOL_GPL(ip_conntrack_free); +EXPORT_SYMBOL_GPL(ip_conntrack_hash_insert); + +EXPORT_SYMBOL_GPL(ip_ct_remove_expectations); + +EXPORT_SYMBOL_GPL(ip_conntrack_helper_find_get); +EXPORT_SYMBOL_GPL(ip_conntrack_helper_put); +EXPORT_SYMBOL_GPL(__ip_conntrack_helper_find_byname); + +EXPORT_SYMBOL_GPL(ip_conntrack_proto_find_get); +EXPORT_SYMBOL_GPL(ip_conntrack_proto_put); +EXPORT_SYMBOL_GPL(__ip_conntrack_proto_find); +#if 
defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ + defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) +EXPORT_SYMBOL_GPL(ip_ct_port_tuple_to_nfattr); +EXPORT_SYMBOL_GPL(ip_ct_port_nfattr_to_tuple); +#endif diff --git a/net/ipv4/netfilter/ip_nat_core.c b/net/ipv4/netfilter/ip_nat_core.c index ed4d731880f..567c802fecf 100644 --- a/net/ipv4/netfilter/ip_nat_core.c +++ b/net/ipv4/netfilter/ip_nat_core.c @@ -47,8 +47,39 @@ DEFINE_RWLOCK(ip_nat_lock); static unsigned int ip_nat_htable_size; static struct list_head *bysource; + +#define MAX_IP_NAT_PROTO 256 struct ip_nat_protocol *ip_nat_protos[MAX_IP_NAT_PROTO]; +static inline struct ip_nat_protocol * +__ip_nat_proto_find(u_int8_t protonum) +{ + return ip_nat_protos[protonum]; +} + +struct ip_nat_protocol * +ip_nat_proto_find_get(u_int8_t protonum) +{ + struct ip_nat_protocol *p; + + /* we need to disable preemption to make sure 'p' doesn't get + * removed until we've grabbed the reference */ + preempt_disable(); + p = __ip_nat_proto_find(protonum); + if (p) { + if (!try_module_get(p->me)) + p = &ip_nat_unknown_protocol; + } + preempt_enable(); + + return p; +} + +void +ip_nat_proto_put(struct ip_nat_protocol *p) +{ + module_put(p->me); +} /* We keep an extra hash for each conntrack, for fast searching. */ static inline unsigned int @@ -103,7 +134,8 @@ static int in_range(const struct ip_conntrack_tuple *tuple, const struct ip_nat_range *range) { - struct ip_nat_protocol *proto = ip_nat_find_proto(tuple->dst.protonum); + struct ip_nat_protocol *proto = + __ip_nat_proto_find(tuple->dst.protonum); /* If we are supposed to map IPs, then we must be in the range specified, otherwise let this drag us onto a new src IP. */ @@ -216,8 +248,7 @@ get_unique_tuple(struct ip_conntrack_tuple *tuple, struct ip_conntrack *conntrack, enum ip_nat_manip_type maniptype) { - struct ip_nat_protocol *proto - = ip_nat_find_proto(orig_tuple->dst.protonum); + struct ip_nat_protocol *proto; /* 1) If this srcip/proto/src-proto-part is currently mapped, and that same mapping gives a unique tuple within the given @@ -242,14 +273,20 @@ get_unique_tuple(struct ip_conntrack_tuple *tuple, /* 3) The per-protocol part of the manip is made to map into the range to make a unique tuple. */ + proto = ip_nat_proto_find_get(orig_tuple->dst.protonum); + /* Only bother mapping if it's not already in range and unique */ if ((!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED) || proto->in_range(tuple, maniptype, &range->min, &range->max)) - && !ip_nat_used_tuple(tuple, conntrack)) + && !ip_nat_used_tuple(tuple, conntrack)) { + ip_nat_proto_put(proto); return; + } /* Last change: get protocol to try to obtain unique tuple. */ proto->unique_tuple(tuple, range, maniptype, conntrack); + + ip_nat_proto_put(proto); } unsigned int @@ -320,6 +357,7 @@ manip_pkt(u_int16_t proto, enum ip_nat_manip_type maniptype) { struct iphdr *iph; + struct ip_nat_protocol *p; if (!skb_ip_make_writable(pskb, iphdroff + sizeof(*iph))) return 0; @@ -327,9 +365,12 @@ manip_pkt(u_int16_t proto, iph = (void *)(*pskb)->data + iphdroff; /* Manipulate protcol part. 
*/ - if (!ip_nat_find_proto(proto)->manip_pkt(pskb, iphdroff, - target, maniptype)) + p = ip_nat_proto_find_get(proto); + if (!p->manip_pkt(pskb, iphdroff, target, maniptype)) { + ip_nat_proto_put(p); return 0; + } + ip_nat_proto_put(p); iph = (void *)(*pskb)->data + iphdroff; @@ -425,7 +466,8 @@ int icmp_reply_translation(struct sk_buff **pskb, if (!ip_ct_get_tuple(&inside->ip, *pskb, (*pskb)->nh.iph->ihl*4 + sizeof(struct icmphdr) + inside->ip.ihl*4, - &inner, ip_ct_find_proto(inside->ip.protocol))) + &inner, + __ip_conntrack_proto_find(inside->ip.protocol))) return 0; /* Change inner back to look like incoming packet. We do the @@ -495,6 +537,49 @@ void ip_nat_protocol_unregister(struct ip_nat_protocol *proto) synchronize_net(); } +#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ + defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) +int +ip_nat_port_range_to_nfattr(struct sk_buff *skb, + const struct ip_nat_range *range) +{ + NFA_PUT(skb, CTA_PROTONAT_PORT_MIN, sizeof(u_int16_t), + &range->min.tcp.port); + NFA_PUT(skb, CTA_PROTONAT_PORT_MAX, sizeof(u_int16_t), + &range->max.tcp.port); + + return 0; + +nfattr_failure: + return -1; +} + +int +ip_nat_port_nfattr_to_range(struct nfattr *tb[], struct ip_nat_range *range) +{ + int ret = 0; + + /* we have to return whether we actually parsed something or not */ + + if (tb[CTA_PROTONAT_PORT_MIN-1]) { + ret = 1; + range->min.tcp.port = + *(u_int16_t *)NFA_DATA(tb[CTA_PROTONAT_PORT_MIN-1]); + } + + if (!tb[CTA_PROTONAT_PORT_MAX-1]) { + if (ret) + range->max.tcp.port = range->min.tcp.port; + } else { + ret = 1; + range->max.tcp.port = + *(u_int16_t *)NFA_DATA(tb[CTA_PROTONAT_PORT_MAX-1]); + } + + return ret; +} +#endif + int __init ip_nat_init(void) { size_t i; diff --git a/net/ipv4/netfilter/ip_nat_proto_icmp.c b/net/ipv4/netfilter/ip_nat_proto_icmp.c index 6596c9ee165..38fdfc2093c 100644 --- a/net/ipv4/netfilter/ip_nat_proto_icmp.c +++ b/net/ipv4/netfilter/ip_nat_proto_icmp.c @@ -107,10 +107,15 @@ icmp_print_range(char *buffer, const struct ip_nat_range *range) } struct ip_nat_protocol ip_nat_protocol_icmp -= { "ICMP", IPPROTO_ICMP, += { "ICMP", IPPROTO_ICMP, THIS_MODULE, icmp_manip_pkt, icmp_in_range, icmp_unique_tuple, icmp_print, - icmp_print_range + icmp_print_range, +#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ + defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) + ip_nat_port_range_to_nfattr, + ip_nat_port_nfattr_to_range, +#endif }; diff --git a/net/ipv4/netfilter/ip_nat_proto_tcp.c b/net/ipv4/netfilter/ip_nat_proto_tcp.c index a98e36d2b3c..f03cd0f0c2b 100644 --- a/net/ipv4/netfilter/ip_nat_proto_tcp.c +++ b/net/ipv4/netfilter/ip_nat_proto_tcp.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -170,10 +171,15 @@ tcp_print_range(char *buffer, const struct ip_nat_range *range) } struct ip_nat_protocol ip_nat_protocol_tcp -= { "TCP", IPPROTO_TCP, += { "TCP", IPPROTO_TCP, THIS_MODULE, tcp_manip_pkt, tcp_in_range, tcp_unique_tuple, tcp_print, - tcp_print_range + tcp_print_range, +#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ + defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) + ip_nat_port_range_to_nfattr, + ip_nat_port_nfattr_to_range, +#endif }; diff --git a/net/ipv4/netfilter/ip_nat_proto_udp.c b/net/ipv4/netfilter/ip_nat_proto_udp.c index 9f66e562566..7a4e66ecbc0 100644 --- a/net/ipv4/netfilter/ip_nat_proto_udp.c +++ b/net/ipv4/netfilter/ip_nat_proto_udp.c @@ -157,10 +157,15 @@ udp_print_range(char *buffer, const struct ip_nat_range *range) } struct ip_nat_protocol ip_nat_protocol_udp -= { "UDP", 
IPPROTO_UDP, += { "UDP", IPPROTO_UDP, THIS_MODULE, udp_manip_pkt, udp_in_range, udp_unique_tuple, udp_print, - udp_print_range + udp_print_range, +#if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ + defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) + ip_nat_port_range_to_nfattr, + ip_nat_port_nfattr_to_range, +#endif }; diff --git a/net/ipv4/netfilter/ip_nat_proto_unknown.c b/net/ipv4/netfilter/ip_nat_proto_unknown.c index f5525bd58d1..512d8f2fb82 100644 --- a/net/ipv4/netfilter/ip_nat_proto_unknown.c +++ b/net/ipv4/netfilter/ip_nat_proto_unknown.c @@ -61,7 +61,7 @@ unknown_print_range(char *buffer, const struct ip_nat_range *range) } struct ip_nat_protocol ip_nat_unknown_protocol = { - "unknown", 0, + "unknown", 0, THIS_MODULE, unknown_manip_pkt, unknown_in_range, unknown_unique_tuple, diff --git a/net/ipv4/netfilter/ip_nat_standalone.c b/net/ipv4/netfilter/ip_nat_standalone.c index 9ecba979033..89db052add8 100644 --- a/net/ipv4/netfilter/ip_nat_standalone.c +++ b/net/ipv4/netfilter/ip_nat_standalone.c @@ -394,6 +394,8 @@ module_exit(fini); EXPORT_SYMBOL(ip_nat_setup_info); EXPORT_SYMBOL(ip_nat_protocol_register); EXPORT_SYMBOL(ip_nat_protocol_unregister); +EXPORT_SYMBOL_GPL(ip_nat_proto_find_get); +EXPORT_SYMBOL_GPL(ip_nat_proto_put); EXPORT_SYMBOL(ip_nat_cheat_check); EXPORT_SYMBOL(ip_nat_mangle_tcp_packet); EXPORT_SYMBOL(ip_nat_mangle_udp_packet); -- cgit v1.2.3 From 83e3609eba3818f6e18b8bf9442195169ac306b7 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 9 Aug 2005 19:33:31 -0700 Subject: [REQSK]: Move the syn_table destroy from tcp_listen_stop to reqsk_queue_destroy Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 35 ++++++++++------------------------- 1 file changed, 10 insertions(+), 25 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index d2696af46c7..42a2e2ccd43 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -487,7 +487,7 @@ int tcp_listen_start(struct sock *sk) } sk->sk_state = TCP_CLOSE; - reqsk_queue_destroy(&tp->accept_queue); + __reqsk_queue_destroy(&tp->accept_queue); return -EADDRINUSE; } @@ -499,38 +499,23 @@ int tcp_listen_start(struct sock *sk) static void tcp_listen_stop (struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - struct listen_sock *lopt; struct request_sock *acc_req; struct request_sock *req; - int i; tcp_delete_keepalive_timer(sk); /* make all the listen_opt local to us */ - lopt = reqsk_queue_yank_listen_sk(&tp->accept_queue); acc_req = reqsk_queue_yank_acceptq(&tp->accept_queue); - if (lopt->qlen) { - for (i = 0; i < TCP_SYNQ_HSIZE; i++) { - while ((req = lopt->syn_table[i]) != NULL) { - lopt->syn_table[i] = req->dl_next; - lopt->qlen--; - reqsk_free(req); - - /* Following specs, it would be better either to send FIN - * (and enter FIN-WAIT-1, it is normal close) - * or to send active reset (abort). - * Certainly, it is pretty dangerous while synflood, but it is - * bad justification for our negligence 8) - * To be honest, we are not able to make either - * of the variants now. --ANK - */ - } - } - } - BUG_TRAP(!lopt->qlen); - - kfree(lopt); + /* Following specs, it would be better either to send FIN + * (and enter FIN-WAIT-1, it is normal close) + * or to send active reset (abort). + * Certainly, it is pretty dangerous while synflood, but it is + * bad justification for our negligence 8) + * To be honest, we are not able to make either + * of the variants now. 
--ANK + */ + reqsk_queue_destroy(&tp->accept_queue); while ((req = acc_req) != NULL) { struct sock *child = req->sk; -- cgit v1.2.3 From f2ccd8fa06c8e302116e71df372f5c1f83432e03 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 9 Aug 2005 19:34:12 -0700 Subject: [NET]: Kill skb->real_dev Bonding just wants the device before the skb_bond() decapsulation occurs, so simply pass that original device into packet_type->func() as an argument. It remains to be seen whether we can use this same exact thing to get rid of skb->input_dev as well. Signed-off-by: David S. Miller --- net/ipv4/arp.c | 4 ++-- net/ipv4/ip_input.c | 2 +- net/ipv4/ipconfig.c | 8 ++++---- 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index a642fd61285..6eb9c549d64 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -700,7 +700,7 @@ void arp_send(int type, int ptype, u32 dest_ip, static void parp_redo(struct sk_buff *skb) { nf_reset(skb); - arp_rcv(skb, skb->dev, NULL); + arp_rcv(skb, skb->dev, NULL, skb->dev); } /* @@ -927,7 +927,7 @@ out: * Receive an arp request from the device layer. */ -int arp_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) +int arp_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct arphdr *arp; diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index c703528e0bc..d603247bdfe 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -358,7 +358,7 @@ drop: /* * Main IP Receive routine. */ -int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) +int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct iphdr *iph; diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index d2bf8e1930a..63e106605f2 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -393,7 +393,7 @@ static int __init ic_defaults(void) #ifdef IPCONFIG_RARP -static int ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt); +static int ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev); static struct packet_type rarp_packet_type __initdata = { .type = __constant_htons(ETH_P_RARP), @@ -414,7 +414,7 @@ static inline void ic_rarp_cleanup(void) * Process received RARP packet. */ static int __init -ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) +ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct arphdr *rarp; unsigned char *rarp_ptr; @@ -555,7 +555,7 @@ struct bootp_pkt { /* BOOTP packet format */ #define DHCPRELEASE 7 #define DHCPINFORM 8 -static int ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt); +static int ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev); static struct packet_type bootp_packet_type __initdata = { .type = __constant_htons(ETH_P_IP), @@ -823,7 +823,7 @@ static void __init ic_do_bootp_ext(u8 *ext) /* * Receive BOOTP reply. 
*/ -static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) +static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct bootp_pkt *b; struct iphdr *h; -- cgit v1.2.3 From 0742fd53a3774781255bd1e471e7aa2e4a82d5f7 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Tue, 9 Aug 2005 19:35:47 -0700 Subject: [IPV4]: possible cleanups This patch contains the following possible cleanups: - make needlessly global code static - #if 0 the following unused global function: - xfrm4_state.c: xfrm4_state_fini - remove the following unneeded EXPORT_SYMBOL's: - ip_output.c: ip_finish_output - ip_output.c: sysctl_ip_default_ttl - fib_frontend.c: ip_dev_find - inetpeer.c: inet_peer_idlock - ip_options.c: ip_options_compile - ip_options.c: ip_options_undo - net/core/request_sock.c: sysctl_max_syn_backlog Signed-off-by: Adrian Bunk Signed-off-by: David S. Miller --- net/ipv4/fib_frontend.c | 1 - net/ipv4/inetpeer.c | 2 -- net/ipv4/ip_options.c | 3 --- net/ipv4/ip_output.c | 7 +------ net/ipv4/multipath_drr.c | 2 +- net/ipv4/route.c | 4 +++- net/ipv4/xfrm4_state.c | 2 ++ 7 files changed, 7 insertions(+), 14 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index cd8e45ab958..e5722084239 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -662,5 +662,4 @@ void __init ip_fib_init(void) } EXPORT_SYMBOL(inet_addr_type); -EXPORT_SYMBOL(ip_dev_find); EXPORT_SYMBOL(ip_rt_ioctl); diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index ab18a853d7c..3c513ceaca7 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -459,5 +459,3 @@ static void peer_check_expire(unsigned long dummy) peer_total / inet_peer_threshold * HZ; add_timer(&peer_periodic_timer); } - -EXPORT_SYMBOL(inet_peer_idlock); diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index 6d89f3f3e70..7e02ba58407 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -620,6 +620,3 @@ int ip_options_rcv_srr(struct sk_buff *skb) } return 0; } - -EXPORT_SYMBOL(ip_options_compile); -EXPORT_SYMBOL(ip_options_undo); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 766564cb420..c934f5316c3 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -205,7 +205,7 @@ static inline int ip_finish_output2(struct sk_buff *skb) return -EINVAL; } -int ip_finish_output(struct sk_buff *skb) +static int ip_finish_output(struct sk_buff *skb) { struct net_device *dev = skb->dst->dev; @@ -1328,12 +1328,7 @@ void __init ip_init(void) #endif } -EXPORT_SYMBOL(ip_finish_output); EXPORT_SYMBOL(ip_fragment); EXPORT_SYMBOL(ip_generic_getfrag); EXPORT_SYMBOL(ip_queue_xmit); EXPORT_SYMBOL(ip_send_check); - -#ifdef CONFIG_SYSCTL -EXPORT_SYMBOL(sysctl_ip_default_ttl); -#endif diff --git a/net/ipv4/multipath_drr.c b/net/ipv4/multipath_drr.c index c9cf8726051..db67373f9b3 100644 --- a/net/ipv4/multipath_drr.c +++ b/net/ipv4/multipath_drr.c @@ -107,7 +107,7 @@ static int drr_dev_event(struct notifier_block *this, return NOTIFY_DONE; } -struct notifier_block drr_dev_notifier = { +static struct notifier_block drr_dev_notifier = { .notifier_call = drr_dev_event, }; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index d675ff80b04..3aef0e15460 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -240,7 +240,9 @@ static unsigned rt_hash_mask; static int rt_hash_log; static unsigned int rt_hash_rnd; -struct rt_cache_stat *rt_cache_stat; +static struct rt_cache_stat 
*rt_cache_stat; +#define RT_CACHE_STAT_INC(field) \ + (per_cpu_ptr(rt_cache_stat, raw_smp_processor_id())->field++) static int rt_intern_hash(unsigned hash, struct rtable *rth, struct rtable **res); diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c index 050611d7a96..d23e07fc81f 100644 --- a/net/ipv4/xfrm4_state.c +++ b/net/ipv4/xfrm4_state.c @@ -128,8 +128,10 @@ void __init xfrm4_state_init(void) xfrm_state_register_afinfo(&xfrm4_state_afinfo); } +#if 0 void __exit xfrm4_state_fini(void) { xfrm_state_unregister_afinfo(&xfrm4_state_afinfo); } +#endif /* 0 */ -- cgit v1.2.3 From 373ac73595491b7c1f2f10cb37e9b7bae6901227 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Tue, 9 Aug 2005 19:36:53 -0700 Subject: [NETFILTER]: C99 initizalizers for NAT protocols Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_nat_proto_icmp.c | 20 +++++++++++--------- net/ipv4/netfilter/ip_nat_proto_tcp.c | 20 +++++++++++--------- net/ipv4/netfilter/ip_nat_proto_udp.c | 20 +++++++++++--------- net/ipv4/netfilter/ip_nat_proto_unknown.c | 13 +++++++------ 4 files changed, 40 insertions(+), 33 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ip_nat_proto_icmp.c b/net/ipv4/netfilter/ip_nat_proto_icmp.c index 38fdfc2093c..7ed2fdb5345 100644 --- a/net/ipv4/netfilter/ip_nat_proto_icmp.c +++ b/net/ipv4/netfilter/ip_nat_proto_icmp.c @@ -106,16 +106,18 @@ icmp_print_range(char *buffer, const struct ip_nat_range *range) else return 0; } -struct ip_nat_protocol ip_nat_protocol_icmp -= { "ICMP", IPPROTO_ICMP, THIS_MODULE, - icmp_manip_pkt, - icmp_in_range, - icmp_unique_tuple, - icmp_print, - icmp_print_range, +struct ip_nat_protocol ip_nat_protocol_icmp = { + .name = "ICMP", + .protonum = IPPROTO_ICMP, + .me = THIS_MODULE, + .manip_pkt = icmp_manip_pkt, + .in_range = icmp_in_range, + .unique_tuple = icmp_unique_tuple, + .print = icmp_print, + .print_range = icmp_print_range, #if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) - ip_nat_port_range_to_nfattr, - ip_nat_port_nfattr_to_range, + .range_to_nfattr = ip_nat_port_range_to_nfattr, + .nfattr_to_range = ip_nat_port_nfattr_to_range, #endif }; diff --git a/net/ipv4/netfilter/ip_nat_proto_tcp.c b/net/ipv4/netfilter/ip_nat_proto_tcp.c index f03cd0f0c2b..6113a16af86 100644 --- a/net/ipv4/netfilter/ip_nat_proto_tcp.c +++ b/net/ipv4/netfilter/ip_nat_proto_tcp.c @@ -170,16 +170,18 @@ tcp_print_range(char *buffer, const struct ip_nat_range *range) else return 0; } -struct ip_nat_protocol ip_nat_protocol_tcp -= { "TCP", IPPROTO_TCP, THIS_MODULE, - tcp_manip_pkt, - tcp_in_range, - tcp_unique_tuple, - tcp_print, - tcp_print_range, +struct ip_nat_protocol ip_nat_protocol_tcp = { + .name = "TCP", + .protonum = IPPROTO_TCP, + .me = THIS_MODULE, + .manip_pkt = tcp_manip_pkt, + .in_range = tcp_in_range, + .unique_tuple = tcp_unique_tuple, + .print = tcp_print, + .print_range = tcp_print_range, #if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) - ip_nat_port_range_to_nfattr, - ip_nat_port_nfattr_to_range, + .range_to_nfattr = ip_nat_port_range_to_nfattr, + .nfattr_to_range = ip_nat_port_nfattr_to_range, #endif }; diff --git a/net/ipv4/netfilter/ip_nat_proto_udp.c b/net/ipv4/netfilter/ip_nat_proto_udp.c index 7a4e66ecbc0..689478e637a 100644 --- a/net/ipv4/netfilter/ip_nat_proto_udp.c +++ b/net/ipv4/netfilter/ip_nat_proto_udp.c @@ -156,16 +156,18 @@ udp_print_range(char *buffer, const struct ip_nat_range *range) else return 0; } 
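/*
 * The conversion below (like the ICMP and TCP ones above) swaps
 * positional aggregate initializers for C99 designated initializers.
 * A minimal standalone sketch of what that buys: members may be named
 * in any order, members left out are zero-initialized, and adding or
 * reordering struct members no longer silently shifts every value.
 * The struct and values here are purely illustrative, not from the
 * kernel tree.
 */
struct demo_proto {
	const char *name;
	unsigned int protonum;
	int (*print)(char *buf);
};

static int demo_print(char *buf) { (void)buf; return 0; }

/* Old positional style: meaning depends entirely on member order. */
static struct demo_proto demo_old = { "demo", 17, demo_print };

/* C99 designated style: explicit, order-independent, gaps zeroed. */
static struct demo_proto demo_new = {
	.name		= "demo",
	.protonum	= 17,
	.print		= demo_print,
};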
-struct ip_nat_protocol ip_nat_protocol_udp -= { "UDP", IPPROTO_UDP, THIS_MODULE, - udp_manip_pkt, - udp_in_range, - udp_unique_tuple, - udp_print, - udp_print_range, +struct ip_nat_protocol ip_nat_protocol_udp = { + .name = "UDP", + .protonum = IPPROTO_UDP, + .me = THIS_MODULE, + .manip_pkt = udp_manip_pkt, + .in_range = udp_in_range, + .unique_tuple = udp_unique_tuple, + .print = udp_print, + .print_range = udp_print_range, #if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) - ip_nat_port_range_to_nfattr, - ip_nat_port_nfattr_to_range, + .range_to_nfattr = ip_nat_port_range_to_nfattr, + .nfattr_to_range = ip_nat_port_nfattr_to_range, #endif }; diff --git a/net/ipv4/netfilter/ip_nat_proto_unknown.c b/net/ipv4/netfilter/ip_nat_proto_unknown.c index 512d8f2fb82..99bbef56f84 100644 --- a/net/ipv4/netfilter/ip_nat_proto_unknown.c +++ b/net/ipv4/netfilter/ip_nat_proto_unknown.c @@ -61,10 +61,11 @@ unknown_print_range(char *buffer, const struct ip_nat_range *range) } struct ip_nat_protocol ip_nat_unknown_protocol = { - "unknown", 0, THIS_MODULE, - unknown_manip_pkt, - unknown_in_range, - unknown_unique_tuple, - unknown_print, - unknown_print_range + .name = "unknown", + .me = THIS_MODULE, + .manip_pkt = unknown_manip_pkt, + .in_range = unknown_in_range, + .unique_tuple = unknown_unique_tuple, + .print = unknown_print, + .print_range = unknown_print_range }; -- cgit v1.2.3 From 089af26c706d1473f641c909fee7c878d29c1f1a Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Tue, 9 Aug 2005 19:37:23 -0700 Subject: [NETFILTER]: Rename skb_ip_make_writable() to skb_make_writable() There is nothing IPv4-specific in it. In fact, it was already used by IPv6, too... Upcoming nfnetlink_queue code will use it for any kind of packet. Signed-off-by: Harald Welte Signed-off-by: David S. 
Miller --- net/ipv4/netfilter/ip_nat_core.c | 4 ++-- net/ipv4/netfilter/ip_nat_helper.c | 8 ++++---- net/ipv4/netfilter/ip_nat_proto_icmp.c | 2 +- net/ipv4/netfilter/ip_nat_proto_tcp.c | 2 +- net/ipv4/netfilter/ip_nat_proto_udp.c | 2 +- net/ipv4/netfilter/ip_nat_snmp_basic.c | 2 +- net/ipv4/netfilter/ip_queue.c | 2 +- net/ipv4/netfilter/ipt_DSCP.c | 2 +- net/ipv4/netfilter/ipt_ECN.c | 4 ++-- net/ipv4/netfilter/ipt_TCPMSS.c | 2 +- net/ipv4/netfilter/ipt_TOS.c | 2 +- 11 files changed, 16 insertions(+), 16 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ip_nat_core.c b/net/ipv4/netfilter/ip_nat_core.c index 567c802fecf..1adedb743f6 100644 --- a/net/ipv4/netfilter/ip_nat_core.c +++ b/net/ipv4/netfilter/ip_nat_core.c @@ -359,7 +359,7 @@ manip_pkt(u_int16_t proto, struct iphdr *iph; struct ip_nat_protocol *p; - if (!skb_ip_make_writable(pskb, iphdroff + sizeof(*iph))) + if (!skb_make_writable(pskb, iphdroff + sizeof(*iph))) return 0; iph = (void *)(*pskb)->data + iphdroff; @@ -431,7 +431,7 @@ int icmp_reply_translation(struct sk_buff **pskb, struct ip_conntrack_tuple inner, target; int hdrlen = (*pskb)->nh.iph->ihl * 4; - if (!skb_ip_make_writable(pskb, hdrlen + sizeof(*inside))) + if (!skb_make_writable(pskb, hdrlen + sizeof(*inside))) return 0; inside = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4; diff --git a/net/ipv4/netfilter/ip_nat_helper.c b/net/ipv4/netfilter/ip_nat_helper.c index 158f34f32c0..d2dd5d31355 100644 --- a/net/ipv4/netfilter/ip_nat_helper.c +++ b/net/ipv4/netfilter/ip_nat_helper.c @@ -168,7 +168,7 @@ ip_nat_mangle_tcp_packet(struct sk_buff **pskb, struct tcphdr *tcph; int datalen; - if (!skb_ip_make_writable(pskb, (*pskb)->len)) + if (!skb_make_writable(pskb, (*pskb)->len)) return 0; if (rep_len > match_len @@ -228,7 +228,7 @@ ip_nat_mangle_udp_packet(struct sk_buff **pskb, match_offset + match_len) return 0; - if (!skb_ip_make_writable(pskb, (*pskb)->len)) + if (!skb_make_writable(pskb, (*pskb)->len)) return 0; if (rep_len > match_len @@ -315,7 +315,7 @@ ip_nat_sack_adjust(struct sk_buff **pskb, optoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct tcphdr); optend = (*pskb)->nh.iph->ihl*4 + tcph->doff*4; - if (!skb_ip_make_writable(pskb, optend)) + if (!skb_make_writable(pskb, optend)) return 0; dir = CTINFO2DIR(ctinfo); @@ -363,7 +363,7 @@ ip_nat_seq_adjust(struct sk_buff **pskb, this_way = &ct->nat.info.seq[dir]; other_way = &ct->nat.info.seq[!dir]; - if (!skb_ip_make_writable(pskb, (*pskb)->nh.iph->ihl*4+sizeof(*tcph))) + if (!skb_make_writable(pskb, (*pskb)->nh.iph->ihl*4+sizeof(*tcph))) return 0; tcph = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4; diff --git a/net/ipv4/netfilter/ip_nat_proto_icmp.c b/net/ipv4/netfilter/ip_nat_proto_icmp.c index 7ed2fdb5345..93871904399 100644 --- a/net/ipv4/netfilter/ip_nat_proto_icmp.c +++ b/net/ipv4/netfilter/ip_nat_proto_icmp.c @@ -62,7 +62,7 @@ icmp_manip_pkt(struct sk_buff **pskb, struct icmphdr *hdr; unsigned int hdroff = iphdroff + iph->ihl*4; - if (!skb_ip_make_writable(pskb, hdroff + sizeof(*hdr))) + if (!skb_make_writable(pskb, hdroff + sizeof(*hdr))) return 0; hdr = (struct icmphdr *)((*pskb)->data + hdroff); diff --git a/net/ipv4/netfilter/ip_nat_proto_tcp.c b/net/ipv4/netfilter/ip_nat_proto_tcp.c index 6113a16af86..1d381bf6857 100644 --- a/net/ipv4/netfilter/ip_nat_proto_tcp.c +++ b/net/ipv4/netfilter/ip_nat_proto_tcp.c @@ -103,7 +103,7 @@ tcp_manip_pkt(struct sk_buff **pskb, if ((*pskb)->len >= hdroff + sizeof(struct tcphdr)) hdrsize = sizeof(struct tcphdr); - if (!skb_ip_make_writable(pskb, hdroff + 
hdrsize)) + if (!skb_make_writable(pskb, hdroff + hdrsize)) return 0; iph = (struct iphdr *)((*pskb)->data + iphdroff); diff --git a/net/ipv4/netfilter/ip_nat_proto_udp.c b/net/ipv4/netfilter/ip_nat_proto_udp.c index 689478e637a..c4906e1aa24 100644 --- a/net/ipv4/netfilter/ip_nat_proto_udp.c +++ b/net/ipv4/netfilter/ip_nat_proto_udp.c @@ -94,7 +94,7 @@ udp_manip_pkt(struct sk_buff **pskb, u32 oldip, newip; u16 *portptr, newport; - if (!skb_ip_make_writable(pskb, hdroff + sizeof(*hdr))) + if (!skb_make_writable(pskb, hdroff + sizeof(*hdr))) return 0; iph = (struct iphdr *)((*pskb)->data + iphdroff); diff --git a/net/ipv4/netfilter/ip_nat_snmp_basic.c b/net/ipv4/netfilter/ip_nat_snmp_basic.c index 2a48b6e635a..93b2c5111bb 100644 --- a/net/ipv4/netfilter/ip_nat_snmp_basic.c +++ b/net/ipv4/netfilter/ip_nat_snmp_basic.c @@ -1275,7 +1275,7 @@ static int help(struct sk_buff **pskb, return NF_DROP; } - if (!skb_ip_make_writable(pskb, (*pskb)->len)) + if (!skb_make_writable(pskb, (*pskb)->len)) return NF_DROP; spin_lock_bh(&snmp_lock); diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c index bc0af8d8e91..ae975ac59c6 100644 --- a/net/ipv4/netfilter/ip_queue.c +++ b/net/ipv4/netfilter/ip_queue.c @@ -388,7 +388,7 @@ ipq_mangle_ipv4(ipq_verdict_msg_t *v, struct ipq_queue_entry *e) } skb_put(e->skb, diff); } - if (!skb_ip_make_writable(&e->skb, v->data_len)) + if (!skb_make_writable(&e->skb, v->data_len)) return -ENOMEM; memcpy(e->skb->data, v->payload, v->data_len); e->skb->ip_summed = CHECKSUM_NONE; diff --git a/net/ipv4/netfilter/ipt_DSCP.c b/net/ipv4/netfilter/ipt_DSCP.c index 975476fef27..6e319570a28 100644 --- a/net/ipv4/netfilter/ipt_DSCP.c +++ b/net/ipv4/netfilter/ipt_DSCP.c @@ -39,7 +39,7 @@ target(struct sk_buff **pskb, if (((*pskb)->nh.iph->tos & IPT_DSCP_MASK) != sh_dscp) { u_int16_t diffs[2]; - if (!skb_ip_make_writable(pskb, sizeof(struct iphdr))) + if (!skb_make_writable(pskb, sizeof(struct iphdr))) return NF_DROP; diffs[0] = htons((*pskb)->nh.iph->tos) ^ 0xFFFF; diff --git a/net/ipv4/netfilter/ipt_ECN.c b/net/ipv4/netfilter/ipt_ECN.c index f63a9bc0e4d..a1319693f64 100644 --- a/net/ipv4/netfilter/ipt_ECN.c +++ b/net/ipv4/netfilter/ipt_ECN.c @@ -31,7 +31,7 @@ set_ect_ip(struct sk_buff **pskb, const struct ipt_ECN_info *einfo) != (einfo->ip_ect & IPT_ECN_IP_MASK)) { u_int16_t diffs[2]; - if (!skb_ip_make_writable(pskb, sizeof(struct iphdr))) + if (!skb_make_writable(pskb, sizeof(struct iphdr))) return 0; diffs[0] = htons((*pskb)->nh.iph->tos) ^ 0xFFFF; @@ -66,7 +66,7 @@ set_ect_tcp(struct sk_buff **pskb, const struct ipt_ECN_info *einfo, int inward) tcph->cwr == einfo->proto.tcp.cwr))) return 1; - if (!skb_ip_make_writable(pskb, (*pskb)->nh.iph->ihl*4+sizeof(*tcph))) + if (!skb_make_writable(pskb, (*pskb)->nh.iph->ihl*4+sizeof(*tcph))) return 0; tcph = (void *)(*pskb)->nh.iph + (*pskb)->nh.iph->ihl*4; diff --git a/net/ipv4/netfilter/ipt_TCPMSS.c b/net/ipv4/netfilter/ipt_TCPMSS.c index 949288319ca..8db70d6908c 100644 --- a/net/ipv4/netfilter/ipt_TCPMSS.c +++ b/net/ipv4/netfilter/ipt_TCPMSS.c @@ -58,7 +58,7 @@ ipt_tcpmss_target(struct sk_buff **pskb, unsigned int i; u_int8_t *opt; - if (!skb_ip_make_writable(pskb, (*pskb)->len)) + if (!skb_make_writable(pskb, (*pskb)->len)) return NF_DROP; if ((*pskb)->ip_summed == CHECKSUM_HW && diff --git a/net/ipv4/netfilter/ipt_TOS.c b/net/ipv4/netfilter/ipt_TOS.c index 49abb7eef0a..deadb36d442 100644 --- a/net/ipv4/netfilter/ipt_TOS.c +++ b/net/ipv4/netfilter/ipt_TOS.c @@ -33,7 +33,7 @@ target(struct sk_buff **pskb, if 
(((*pskb)->nh.iph->tos & IPTOS_TOS_MASK) != tosinfo->tos) { u_int16_t diffs[2]; - if (!skb_ip_make_writable(pskb, sizeof(struct iphdr))) + if (!skb_make_writable(pskb, sizeof(struct iphdr))) return NF_DROP; diffs[0] = htons((*pskb)->nh.iph->tos) ^ 0xFFFF; -- cgit v1.2.3 From 020b4c12dbe3868d792a01d7c1470cd837abe10f Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Tue, 9 Aug 2005 19:39:00 -0700 Subject: [NETFILTER]: Move ipv4 specific code from net/core/netfilter.c to net/ipv4/netfilter.c Netfilter cleanup - Move ipv4 code from net/core/netfilter.c to net/ipv4/netfilter.c - Move ipv6 netfilter code from net/ipv6/ip6_output.c to net/ipv6/netfilter.c Signed-off-by: Harald Welte Signed-off-by: David S. Miller --- net/ipv4/Makefile | 2 +- net/ipv4/netfilter.c | 79 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 80 insertions(+), 1 deletion(-) create mode 100644 net/ipv4/netfilter.c (limited to 'net/ipv4') diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 55dc6cca1e7..61c7386bcd2 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -8,7 +8,7 @@ obj-y := route.o inetpeer.o protocol.o \ tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \ tcp_minisocks.o tcp_cong.o \ datagram.o raw.o udp.o arp.o icmp.o devinet.o af_inet.o igmp.o \ - sysctl_net_ipv4.o fib_frontend.o fib_semantics.o + sysctl_net_ipv4.o fib_frontend.o fib_semantics.o netfilter.o obj-$(CONFIG_IP_FIB_HASH) += fib_hash.o obj-$(CONFIG_IP_FIB_TRIE) += fib_trie.o diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c new file mode 100644 index 00000000000..6594d1c9697 --- /dev/null +++ b/net/ipv4/netfilter.c @@ -0,0 +1,79 @@ +#include + +#ifdef CONFIG_NETFILTER + +/* IPv4 specific functions of netfilter core */ +#include +#include + +#include +#include +#include +#include +#include + +/* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */ +int ip_route_me_harder(struct sk_buff **pskb) +{ + struct iphdr *iph = (*pskb)->nh.iph; + struct rtable *rt; + struct flowi fl = {}; + struct dst_entry *odst; + unsigned int hh_len; + + /* some non-standard hacks like ipt_REJECT.c:send_reset() can cause + * packets with foreign saddr to appear on the NF_IP_LOCAL_OUT hook. + */ + if (inet_addr_type(iph->saddr) == RTN_LOCAL) { + fl.nl_u.ip4_u.daddr = iph->daddr; + fl.nl_u.ip4_u.saddr = iph->saddr; + fl.nl_u.ip4_u.tos = RT_TOS(iph->tos); + fl.oif = (*pskb)->sk ? (*pskb)->sk->sk_bound_dev_if : 0; +#ifdef CONFIG_IP_ROUTE_FWMARK + fl.nl_u.ip4_u.fwmark = (*pskb)->nfmark; +#endif + fl.proto = iph->protocol; + if (ip_route_output_key(&rt, &fl) != 0) + return -1; + + /* Drop old route. */ + dst_release((*pskb)->dst); + (*pskb)->dst = &rt->u.dst; + } else { + /* non-local src, find valid iif to satisfy + * rp-filter when calling ip_route_input. */ + fl.nl_u.ip4_u.daddr = iph->saddr; + if (ip_route_output_key(&rt, &fl) != 0) + return -1; + + odst = (*pskb)->dst; + if (ip_route_input(*pskb, iph->daddr, iph->saddr, + RT_TOS(iph->tos), rt->u.dst.dev) != 0) { + dst_release(&rt->u.dst); + return -1; + } + dst_release(&rt->u.dst); + dst_release(odst); + } + + if ((*pskb)->dst->error) + return -1; + + /* Change in oif may mean change in hh_len. 
*/ + hh_len = (*pskb)->dst->dev->hard_header_len; + if (skb_headroom(*pskb) < hh_len) { + struct sk_buff *nskb; + + nskb = skb_realloc_headroom(*pskb, hh_len); + if (!nskb) + return -1; + if ((*pskb)->sk) + skb_set_owner_w(nskb, (*pskb)->sk); + kfree_skb(*pskb); + *pskb = nskb; + } + + return 0; +} +EXPORT_SYMBOL(ip_route_me_harder); +#endif /* CONFIG_NETFILTER */ -- cgit v1.2.3 From 4fdb3bb723db469717c6d38fda667d8b0fa86ebd Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Tue, 9 Aug 2005 19:40:55 -0700 Subject: [NETLINK]: Add properly module refcounting for kernel netlink sockets. - Remove bogus code for compiling netlink as module - Add module refcounting support for modules implementing a netlink protocol - Add support for autoloading modules that implement a netlink protocol as soon as someone opens a socket for that protocol Signed-off-by: Harald Welte Signed-off-by: David S. Miller --- net/ipv4/fib_frontend.c | 2 +- net/ipv4/netfilter/ip_queue.c | 3 ++- net/ipv4/netfilter/ipt_ULOG.c | 3 ++- net/ipv4/tcp_diag.c | 3 ++- 4 files changed, 7 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index e5722084239..b5e2f1550c9 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -567,7 +567,7 @@ static void nl_fib_input(struct sock *sk, int len) static void nl_fib_lookup_init(void) { - netlink_kernel_create(NETLINK_FIB_LOOKUP, nl_fib_input); + netlink_kernel_create(NETLINK_FIB_LOOKUP, nl_fib_input, THIS_MODULE); } static void fib_disable_ip(struct net_device *dev, int force) diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c index ae975ac59c6..b237f7fcad9 100644 --- a/net/ipv4/netfilter/ip_queue.c +++ b/net/ipv4/netfilter/ip_queue.c @@ -692,7 +692,8 @@ init_or_cleanup(int init) goto cleanup; netlink_register_notifier(&ipq_nl_notifier); - ipqnl = netlink_kernel_create(NETLINK_FIREWALL, ipq_rcv_sk); + ipqnl = netlink_kernel_create(NETLINK_FIREWALL, ipq_rcv_sk, + THIS_MODULE); if (ipqnl == NULL) { printk(KERN_ERR "ip_queue: failed to create netlink socket\n"); goto cleanup_netlink_notifier; diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c index 52a0076302a..4ea8371ab27 100644 --- a/net/ipv4/netfilter/ipt_ULOG.c +++ b/net/ipv4/netfilter/ipt_ULOG.c @@ -62,6 +62,7 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Harald Welte "); MODULE_DESCRIPTION("iptables userspace logging module"); +MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_NFLOG); #define ULOG_NL_EVENT 111 /* Harald's favorite number */ #define ULOG_MAXNLGROUPS 32 /* numer of nlgroups */ @@ -372,7 +373,7 @@ static int __init init(void) ulog_buffers[i].timer.data = i; } - nflognl = netlink_kernel_create(NETLINK_NFLOG, NULL); + nflognl = netlink_kernel_create(NETLINK_NFLOG, NULL, THIS_MODULE); if (!nflognl) return -ENOMEM; diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index f66945cb158..f79bd11a470 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -774,7 +774,8 @@ static void tcpdiag_rcv(struct sock *sk, int len) static int __init tcpdiag_init(void) { - tcpnl = netlink_kernel_create(NETLINK_TCPDIAG, tcpdiag_rcv); + tcpnl = netlink_kernel_create(NETLINK_TCPDIAG, tcpdiag_rcv, + THIS_MODULE); if (tcpnl == NULL) return -ENOMEM; return 0; -- cgit v1.2.3 From 2cc7d5730957c4a3f3659d17d2ba5e06d5581c1f Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Tue, 9 Aug 2005 19:42:34 -0700 Subject: [NETFILTER]: Move reroute-after-queue code up to the nf_queue layer. 
The rerouting functionality is required by the core, therefore it has to be implemented by the core and not in individual queue handlers. Signed-off-by: Harald Welte Signed-off-by: David S. Miller --- net/ipv4/netfilter.c | 64 +++++++++++++++++++++++++++++++++++++++++-- net/ipv4/netfilter/ip_queue.c | 27 ------------------ 2 files changed, 62 insertions(+), 29 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index 6594d1c9697..ae0779d82c5 100644 --- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c @@ -1,10 +1,11 @@ -#include +/* IPv4 specific functions of netfilter core */ +#include #ifdef CONFIG_NETFILTER -/* IPv4 specific functions of netfilter core */ #include #include +#include #include #include @@ -76,4 +77,63 @@ int ip_route_me_harder(struct sk_buff **pskb) return 0; } EXPORT_SYMBOL(ip_route_me_harder); + +/* + * Extra routing may needed on local out, as the QUEUE target never + * returns control to the table. + */ + +struct ip_rt_info { + u_int32_t daddr; + u_int32_t saddr; + u_int8_t tos; +}; + +static void queue_save(const struct sk_buff *skb, struct nf_info *info) +{ + struct ip_rt_info *rt_info = nf_info_reroute(info); + + if (info->hook == NF_IP_LOCAL_OUT) { + const struct iphdr *iph = skb->nh.iph; + + rt_info->tos = iph->tos; + rt_info->daddr = iph->daddr; + rt_info->saddr = iph->saddr; + } +} + +static int queue_reroute(struct sk_buff **pskb, const struct nf_info *info) +{ + const struct ip_rt_info *rt_info = nf_info_reroute(info); + + if (info->hook == NF_IP_LOCAL_OUT) { + struct iphdr *iph = (*pskb)->nh.iph; + + if (!(iph->tos == rt_info->tos + && iph->daddr == rt_info->daddr + && iph->saddr == rt_info->saddr)) + return ip_route_me_harder(pskb); + } + return 0; +} + +static struct nf_queue_rerouter ip_reroute = { + .rer_size = sizeof(struct ip_rt_info), + .save = queue_save, + .reroute = queue_reroute, +}; + +static int init(void) +{ + return nf_register_queue_rerouter(PF_INET, &ip_reroute); +} + +static void fini(void) +{ + nf_unregister_queue_rerouter(PF_INET); +} + +module_init(init); +module_exit(fini); + #endif /* CONFIG_NETFILTER */ diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c index b237f7fcad9..78892980f42 100644 --- a/net/ipv4/netfilter/ip_queue.c +++ b/net/ipv4/netfilter/ip_queue.c @@ -43,17 +43,10 @@ #define NET_IPQ_QMAX 2088 #define NET_IPQ_QMAX_NAME "ip_queue_maxlen" -struct ipq_rt_info { - __u8 tos; - __u32 daddr; - __u32 saddr; -}; - struct ipq_queue_entry { struct list_head list; struct nf_info *info; struct sk_buff *skb; - struct ipq_rt_info rt_info; }; typedef int (*ipq_cmpfn)(struct ipq_queue_entry *, unsigned long); @@ -305,14 +298,6 @@ ipq_enqueue_packet(struct sk_buff *skb, struct nf_info *info, void *data) entry->info = info; entry->skb = skb; - if (entry->info->hook == NF_IP_LOCAL_OUT) { - struct iphdr *iph = skb->nh.iph; - - entry->rt_info.tos = iph->tos; - entry->rt_info.daddr = iph->daddr; - entry->rt_info.saddr = iph->saddr; - } - nskb = ipq_build_packet_message(entry, &status); if (nskb == NULL) goto err_out_free; @@ -393,18 +378,6 @@ ipq_mangle_ipv4(ipq_verdict_msg_t *v, struct ipq_queue_entry *e) memcpy(e->skb->data, v->payload, v->data_len); e->skb->ip_summed = CHECKSUM_NONE; - /* - * Extra routing may needed on local out, as the QUEUE target never - * returns control to the table. 
- */ - if (e->info->hook == NF_IP_LOCAL_OUT) { - struct iphdr *iph = e->skb->nh.iph; - - if (!(iph->tos == e->rt_info.tos - && iph->daddr == e->rt_info.daddr - && iph->saddr == e->rt_info.saddr)) - return ip_route_me_harder(&e->skb); - } return 0; } -- cgit v1.2.3 From 0ab43f84995f2c2fcc5cc58a9accaa1095e1317f Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Tue, 9 Aug 2005 19:43:44 -0700 Subject: [NETFILTER]: Core changes required by upcoming nfnetlink_queue code - split netfiler verdict in 16bit verdict and 16bit queue number - add 'queuenum' argument to nf_queue_outfn_t and its users ip[6]_queue - move NFNL_SUBSYS_ definitions from enum to #define - introduce autoloading for nfnetlink subsystem modules - add MODULE_ALIAS_NFNL_SUBSYS macro - add nf_unregister_queue_handlers() to register all handlers for a given nf_queue_outfn_t - add more verbose DEBUGP macro definition to nfnetlink.c - make nfnetlink_subsys_register fail if subsys already exists - add some more comments and debug statements to nfnetlink.c Signed-off-by: Harald Welte Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_queue.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c index 78892980f42..cfc886f382a 100644 --- a/net/ipv4/netfilter/ip_queue.c +++ b/net/ipv4/netfilter/ip_queue.c @@ -280,7 +280,8 @@ nlmsg_failure: } static int -ipq_enqueue_packet(struct sk_buff *skb, struct nf_info *info, void *data) +ipq_enqueue_packet(struct sk_buff *skb, struct nf_info *info, + unsigned int queuenum, void *data) { int status = -EINVAL; struct sk_buff *nskb; -- cgit v1.2.3 From 7af4cc3fa158ff1dda6e7451c7e6afa6b0bb85cb Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Tue, 9 Aug 2005 19:44:15 -0700 Subject: [NETFILTER]: Add "nfnetlink_queue" netfilter queue handler over nfnetlink - Add new nfnetlink_queue module - Add new ipt_NFQUEUE and ip6t_NFQUEUE modules to access queue numbers 1-65535 - Mark ip_queue and ip6_queue Kconfig options as OBSOLETE - Update feature-removal-schedule to remove ip[6]_queue in December Signed-off-by: Harald Welte Signed-off-by: David S. Miller --- net/ipv4/netfilter/Kconfig | 6 +++- net/ipv4/netfilter/Makefile | 1 + net/ipv4/netfilter/ipt_NFQUEUE.c | 70 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 76 insertions(+), 1 deletion(-) create mode 100644 net/ipv4/netfilter/ipt_NFQUEUE.c (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index e47ba39eb65..2fa26a41fa4 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -110,11 +110,15 @@ config IP_NF_AMANDA To compile it as a module, choose M here. If unsure, say Y. config IP_NF_QUEUE - tristate "Userspace queueing via NETLINK" + tristate "IP Userspace queueing via NETLINK (OBSOLETE)" help Netfilter has the ability to queue packets to user space: the netlink device can be used to access them using this driver. + This option enables the old IPv4-only "ip_queue" implementation + which has been obsoleted by the new "nfnetlink_queue" code (see + CONFIG_NETFILTER_NETLINK_QUEUE). + To compile it as a module, choose M here. If unsure, say N. 
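The NFQUEUE target added below relies on the verdict split described above: the 32-bit netfilter verdict now carries a 16-bit verdict code plus a 16-bit queue number, and NF_QUEUE_NR() packs the two together. A minimal userspace sketch of that packing scheme follows; the constants and helper name are illustrative, not the kernel's actual macro definitions:

#include <stdio.h>
#include <stdint.h>

#define DEMO_NF_QUEUE	3	/* illustrative verdict code */

/* Pack a queue number into the upper 16 bits of a queue verdict. */
static uint32_t demo_queue_nr(uint16_t queuenum)
{
	return ((uint32_t)queuenum << 16) | DEMO_NF_QUEUE;
}

int main(void)
{
	uint32_t verdict = demo_queue_nr(42);

	printf("verdict code=%u, queue=%u\n",
	       verdict & 0xffff, verdict >> 16);
	return 0;
}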
config IP_NF_IPTABLES diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index abf2a7d1a58..c2ae663b723 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -91,3 +91,4 @@ obj-$(CONFIG_IP_NF_ARP_MANGLE) += arpt_mangle.o obj-$(CONFIG_IP_NF_ARPFILTER) += arptable_filter.o obj-$(CONFIG_IP_NF_QUEUE) += ip_queue.o +obj-$(CONFIG_NETFILTER_NETLINK_QUEUE) += ipt_NFQUEUE.o diff --git a/net/ipv4/netfilter/ipt_NFQUEUE.c b/net/ipv4/netfilter/ipt_NFQUEUE.c new file mode 100644 index 00000000000..3cedc9be880 --- /dev/null +++ b/net/ipv4/netfilter/ipt_NFQUEUE.c @@ -0,0 +1,70 @@ +/* iptables module for using new netfilter netlink queue + * + * (C) 2005 by Harald Welte + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ + +#include +#include + +#include +#include +#include + +MODULE_AUTHOR("Harald Welte "); +MODULE_DESCRIPTION("iptables NFQUEUE target"); +MODULE_LICENSE("GPL"); + +static unsigned int +target(struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + unsigned int hooknum, + const void *targinfo, + void *userinfo) +{ + const struct ipt_NFQ_info *tinfo = targinfo; + + return NF_QUEUE_NR(tinfo->queuenum); +} + +static int +checkentry(const char *tablename, + const struct ipt_entry *e, + void *targinfo, + unsigned int targinfosize, + unsigned int hook_mask) +{ + if (targinfosize != IPT_ALIGN(sizeof(struct ipt_NFQ_info))) { + printk(KERN_WARNING "NFQUEUE: targinfosize %u != %Zu\n", + targinfosize, + IPT_ALIGN(sizeof(struct ipt_NFQ_info))); + return 0; + } + + return 1; +} + +static struct ipt_target ipt_NFQ_reg = { + .name = "NFQUEUE", + .target = target, + .checkentry = checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ipt_register_target(&ipt_NFQ_reg); +} + +static void __exit fini(void) +{ + ipt_unregister_target(&ipt_NFQ_reg); +} + +module_init(init); +module_exit(fini); -- cgit v1.2.3 From d13964f4490157b8a290903362bfbc54f750a6bc Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Tue, 9 Aug 2005 19:45:02 -0700 Subject: [IPV4/6]: Check if packet was actually delivered to a raw socket to decide whether to send an ICMP unreachable Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/ip_input.c | 4 ++-- net/ipv4/raw.c | 5 ++++- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index d603247bdfe..81e18023dc1 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -225,8 +225,8 @@ static inline int ip_local_deliver_finish(struct sk_buff *skb) /* If there maybe a raw socket we must check - if not we * don't care less */ - if (raw_sk) - raw_v4_input(skb, skb->nh.iph, hash); + if (raw_sk && !raw_v4_input(skb, skb->nh.iph, hash)) + raw_sk = NULL; if ((ipprot = rcu_dereference(inet_protos[hash])) != NULL) { int ret; diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index d1835b1bc8c..e222c5c26b3 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -150,10 +150,11 @@ static __inline__ int icmp_filter(struct sock *sk, struct sk_buff *skb) * RFC 1122: SHOULD pass TOS value up to the transport layer. * -> It does. And not only TOS, but all IP header. 
*/ -void raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash) +int raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash) { struct sock *sk; struct hlist_head *head; + int delivered = 0; read_lock(&raw_v4_lock); head = &raw_v4_htable[hash]; @@ -164,6 +165,7 @@ void raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash) skb->dev->ifindex); while (sk) { + delivered = 1; if (iph->protocol != IPPROTO_ICMP || !icmp_filter(sk, skb)) { struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC); @@ -177,6 +179,7 @@ void raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash) } out: read_unlock(&raw_v4_lock); + return delivered; } void raw_err (struct sock *sk, struct sk_buff *skb, u32 info) -- cgit v1.2.3 From e6848976b721eeb5551cd94673faafeef78d9f35 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 9 Aug 2005 19:45:38 -0700 Subject: [NET]: Cleanup INET_REFCNT_DEBUG code Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- net/ipv4/af_inet.c | 18 ++---------------- net/ipv4/tcp.c | 7 +------ net/ipv4/tcp_minisocks.c | 20 ++++++++++++++++---- 3 files changed, 19 insertions(+), 26 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 163ae4068b5..9e83d7773d8 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -114,10 +114,6 @@ DEFINE_SNMP_STAT(struct linux_mib, net_statistics); -#ifdef INET_REFCNT_DEBUG -atomic_t inet_sock_nr; -#endif - extern void ip_mc_drop_socket(struct sock *sk); /* The inetsw table contains everything that inet_create needs to @@ -153,11 +149,7 @@ void inet_sock_destruct(struct sock *sk) if (inet->opt) kfree(inet->opt); dst_release(sk->sk_dst_cache); -#ifdef INET_REFCNT_DEBUG - atomic_dec(&inet_sock_nr); - printk(KERN_DEBUG "INET socket %p released, %d are still alive\n", - sk, atomic_read(&inet_sock_nr)); -#endif + sk_refcnt_debug_dec(sk); } /* @@ -317,9 +309,7 @@ static int inet_create(struct socket *sock, int protocol) inet->mc_index = 0; inet->mc_list = NULL; -#ifdef INET_REFCNT_DEBUG - atomic_inc(&inet_sock_nr); -#endif + sk_refcnt_debug_inc(sk); if (inet->num) { /* It assumes that any protocol which allows @@ -1205,7 +1195,3 @@ EXPORT_SYMBOL(inet_stream_ops); EXPORT_SYMBOL(inet_unregister_protosw); EXPORT_SYMBOL(net_statistics); EXPORT_SYMBOL(sysctl_ip_nonlocal_bind); - -#ifdef INET_REFCNT_DEBUG -EXPORT_SYMBOL(inet_sock_nr); -#endif diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 42a2e2ccd43..20159a3dafb 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1580,12 +1580,7 @@ void tcp_destroy_sock(struct sock *sk) xfrm_sk_free_policy(sk); -#ifdef INET_REFCNT_DEBUG - if (atomic_read(&sk->sk_refcnt) != 1) { - printk(KERN_DEBUG "Destruction TCP %p delayed, c=%d\n", - sk, atomic_read(&sk->sk_refcnt)); - } -#endif + sk_refcnt_debug_release(sk); atomic_dec(&tcp_orphan_count); sock_put(sk); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index f42a284164b..f8e288c8d69 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -84,7 +84,7 @@ static void tcp_timewait_kill(struct tcp_tw_bucket *tw) tcp_bucket_destroy(tb); spin_unlock(&bhead->lock); -#ifdef INET_REFCNT_DEBUG +#ifdef SOCK_REFCNT_DEBUG if (atomic_read(&tw->tw_refcnt) != 1) { printk(KERN_DEBUG "tw_bucket %p refcnt=%d\n", tw, atomic_read(&tw->tw_refcnt)); @@ -799,9 +799,21 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, newsk->sk_err = 0; newsk->sk_priority = 0; atomic_set(&newsk->sk_refcnt, 2); -#ifdef INET_REFCNT_DEBUG - 
atomic_inc(&inet_sock_nr); -#endif + + /* + * Increment the counter in the same struct proto as the master + * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that + * is the same as sk->sk_prot->socks, as this field was copied + * with memcpy), same rationale as the first comment in this + * function. + * + * This _changes_ the previous behaviour, where + * tcp_create_openreq_child always was incrementing the + * equivalent to tcp_prot->socks (inet_sock_nr), so this have + * to be taken into account in all callers. -acme + */ + sk_refcnt_debug_inc(newsk); + atomic_inc(&tcp_sockets_allocated); if (sock_flag(newsk, SOCK_KEEPOPEN)) -- cgit v1.2.3 From 614c6cb4f225a7da9f13e5dd0fac3b531078eb9f Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 9 Aug 2005 19:47:37 -0700 Subject: [SOCK]: Rename __tcp_v4_rehash to __sk_prot_rehash This operation was already generic and DCCP will use it. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 67c670886c1..c7c99d33636 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1834,15 +1834,6 @@ do_time_wait: goto discard_it; } -/* With per-bucket locks this operation is not-atomic, so that - * this version is not worse. - */ -static void __tcp_v4_rehash(struct sock *sk) -{ - sk->sk_prot->unhash(sk); - sk->sk_prot->hash(sk); -} - static int tcp_v4_reselect_saddr(struct sock *sk) { struct inet_sock *inet = inet_sk(sk); @@ -1889,7 +1880,7 @@ static int tcp_v4_reselect_saddr(struct sock *sk) * Besides that, it does not check for connection * uniqueness. Wait for troubles. */ - __tcp_v4_rehash(sk); + __sk_prot_rehash(sk); return 0; } -- cgit v1.2.3 From 6cbb0df788b90777a7ed0f9d8261260353f48076 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 9 Aug 2005 19:49:02 -0700 Subject: [SOCK]: Introduce sk_setup_caps From tcp_v4_setup_caps, that always is preceded by a call to __sk_dst_set, so coalesce this sequence into sk_setup_caps, removing one call to a TCP function in the IP layer. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- net/ipv4/ip_output.c | 7 ++----- net/ipv4/tcp_ipv4.c | 12 ++++-------- net/ipv4/tcp_minisocks.c | 1 + 3 files changed, 7 insertions(+), 13 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index c934f5316c3..c72fc878f06 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -69,13 +69,10 @@ #include #include #include -#include -#include #include #include #include #include -#include #include #include #include @@ -84,6 +81,7 @@ #include #include #include +#include /* * Shall we try to damage output packets if routing dev changes? @@ -329,8 +327,7 @@ int ip_queue_xmit(struct sk_buff *skb, int ipfragok) if (ip_route_output_flow(&rt, &fl, sk, 0)) goto no_route; } - __sk_dst_set(sk, &rt->u.dst); - tcp_v4_setup_caps(sk, &rt->u.dst); + sk_setup_caps(sk, &rt->u.dst); } skb->dst = dst_clone(&rt->u.dst); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index c7c99d33636..4a5daecbd2a 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -837,8 +837,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) goto failure; /* OK, now commit destination to socket. 
*/ - __sk_dst_set(sk, &rt->u.dst); - tcp_v4_setup_caps(sk, &rt->u.dst); + sk_setup_caps(sk, &rt->u.dst); if (!tp->write_seq) tp->write_seq = secure_tcp_sequence_number(inet->saddr, @@ -1553,8 +1552,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, if (!newsk) goto exit; - newsk->sk_dst_cache = dst; - tcp_v4_setup_caps(newsk, dst); + sk_setup_caps(newsk, dst); newtp = tcp_sk(newsk); newinet = inet_sk(newsk); @@ -1855,8 +1853,7 @@ static int tcp_v4_reselect_saddr(struct sock *sk) if (err) return err; - __sk_dst_set(sk, &rt->u.dst); - tcp_v4_setup_caps(sk, &rt->u.dst); + sk_setup_caps(sk, &rt->u.dst); new_saddr = rt->rt_src; @@ -1914,8 +1911,7 @@ int tcp_v4_rebuild_header(struct sock *sk) err = ip_route_output_flow(&rt, &fl, sk, 0); } if (!err) { - __sk_dst_set(sk, &rt->u.dst); - tcp_v4_setup_caps(sk, &rt->u.dst); + sk_setup_caps(sk, &rt->u.dst); return 0; } diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index f8e288c8d69..7c46a553c4a 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -711,6 +711,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, bh_lock_sock(newsk); rwlock_init(&newsk->sk_dst_lock); + newsk->sk_dst_cache = NULL; atomic_set(&newsk->sk_rmem_alloc, 0); skb_queue_head_init(&newsk->sk_receive_queue); atomic_set(&newsk->sk_wmem_alloc, 0); -- cgit v1.2.3 From 32519f11d38ea8f4f60896763bacec7db1760f9c Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 9 Aug 2005 19:50:02 -0700 Subject: [INET]: Introduce inet_sk_rebuild_header From tcp_v4_rebuild_header, that already was pretty generic, I only needed to use sk->sk_protocol instead of the hardcoded IPPROTO_TCP and establish the requirement that INET transport layer protocols that want to use this function map TCP_SYN_SENT to its equivalent state. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- net/ipv4/af_inet.c | 113 +++++++++++++++++++++++++++++++++++++++++++++++++++ net/ipv4/ip_output.c | 5 --- net/ipv4/tcp_ipv4.c | 98 +------------------------------------------- 3 files changed, 114 insertions(+), 102 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 9e83d7773d8..7137e6420d6 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -951,6 +951,119 @@ void inet_unregister_protosw(struct inet_protosw *p) } } +/* + * Shall we try to damage output packets if routing dev changes? + */ + +int sysctl_ip_dynaddr; + +static int inet_sk_reselect_saddr(struct sock *sk) +{ + struct inet_sock *inet = inet_sk(sk); + int err; + struct rtable *rt; + __u32 old_saddr = inet->saddr; + __u32 new_saddr; + __u32 daddr = inet->daddr; + + if (inet->opt && inet->opt->srr) + daddr = inet->opt->faddr; + + /* Query new route. */ + err = ip_route_connect(&rt, daddr, 0, + RT_CONN_FLAGS(sk), + sk->sk_bound_dev_if, + sk->sk_protocol, + inet->sport, inet->dport, sk); + if (err) + return err; + + sk_setup_caps(sk, &rt->u.dst); + + new_saddr = rt->rt_src; + + if (new_saddr == old_saddr) + return 0; + + if (sysctl_ip_dynaddr > 1) { + printk(KERN_INFO "%s(): shifting inet->" + "saddr from %d.%d.%d.%d to %d.%d.%d.%d\n", + __FUNCTION__, + NIPQUAD(old_saddr), + NIPQUAD(new_saddr)); + } + + inet->saddr = inet->rcv_saddr = new_saddr; + + /* + * XXX The only one ugly spot where we need to + * XXX really change the sockets identity after + * XXX it has entered the hashes. -DaveM + * + * Besides that, it does not check for connection + * uniqueness. Wait for troubles. 
+ */ + __sk_prot_rehash(sk); + return 0; +} + +int inet_sk_rebuild_header(struct sock *sk) +{ + struct inet_sock *inet = inet_sk(sk); + struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0); + u32 daddr; + int err; + + /* Route is OK, nothing to do. */ + if (rt) + return 0; + + /* Reroute. */ + daddr = inet->daddr; + if (inet->opt && inet->opt->srr) + daddr = inet->opt->faddr; +{ + struct flowi fl = { + .oif = sk->sk_bound_dev_if, + .nl_u = { + .ip4_u = { + .daddr = daddr, + .saddr = inet->saddr, + .tos = RT_CONN_FLAGS(sk), + }, + }, + .proto = sk->sk_protocol, + .uli_u = { + .ports = { + .sport = inet->sport, + .dport = inet->dport, + }, + }, + }; + + err = ip_route_output_flow(&rt, &fl, sk, 0); +} + if (!err) + sk_setup_caps(sk, &rt->u.dst); + else { + /* Routing failed... */ + sk->sk_route_caps = 0; + /* + * Other protocols have to map its equivalent state to TCP_SYN_SENT. + * DCCP maps its DCCP_REQUESTING state to TCP_SYN_SENT. -acme + */ + if (!sysctl_ip_dynaddr || + sk->sk_state != TCP_SYN_SENT || + (sk->sk_userlocks & SOCK_BINDADDR_LOCK) || + (err = inet_sk_reselect_saddr(sk)) != 0) + sk->sk_err_soft = -err; + } + + return err; +} + +EXPORT_SYMBOL(inet_sk_rebuild_header); + #ifdef CONFIG_IP_MULTICAST static struct net_protocol igmp_protocol = { .handler = igmp_rcv, diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index c72fc878f06..dd568b0b706 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -83,11 +83,6 @@ #include #include -/* - * Shall we try to damage output packets if routing dev changes? - */ - -int sysctl_ip_dynaddr; int sysctl_ip_default_ttl = IPDEFTTL; /* Generate a checksum for an outgoing IP datagram. */ diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 4a5daecbd2a..ae6fad99a9a 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1832,101 +1832,6 @@ do_time_wait: goto discard_it; } -static int tcp_v4_reselect_saddr(struct sock *sk) -{ - struct inet_sock *inet = inet_sk(sk); - int err; - struct rtable *rt; - __u32 old_saddr = inet->saddr; - __u32 new_saddr; - __u32 daddr = inet->daddr; - - if (inet->opt && inet->opt->srr) - daddr = inet->opt->faddr; - - /* Query new route. */ - err = ip_route_connect(&rt, daddr, 0, - RT_CONN_FLAGS(sk), - sk->sk_bound_dev_if, - IPPROTO_TCP, - inet->sport, inet->dport, sk); - if (err) - return err; - - sk_setup_caps(sk, &rt->u.dst); - - new_saddr = rt->rt_src; - - if (new_saddr == old_saddr) - return 0; - - if (sysctl_ip_dynaddr > 1) { - printk(KERN_INFO "tcp_v4_rebuild_header(): shifting inet->" - "saddr from %d.%d.%d.%d to %d.%d.%d.%d\n", - NIPQUAD(old_saddr), - NIPQUAD(new_saddr)); - } - - inet->saddr = new_saddr; - inet->rcv_saddr = new_saddr; - - /* XXX The only one ugly spot where we need to - * XXX really change the sockets identity after - * XXX it has entered the hashes. -DaveM - * - * Besides that, it does not check for connection - * uniqueness. Wait for troubles. - */ - __sk_prot_rehash(sk); - return 0; -} - -int tcp_v4_rebuild_header(struct sock *sk) -{ - struct inet_sock *inet = inet_sk(sk); - struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0); - u32 daddr; - int err; - - /* Route is OK, nothing to do. */ - if (rt) - return 0; - - /* Reroute. 
*/ - daddr = inet->daddr; - if (inet->opt && inet->opt->srr) - daddr = inet->opt->faddr; - - { - struct flowi fl = { .oif = sk->sk_bound_dev_if, - .nl_u = { .ip4_u = - { .daddr = daddr, - .saddr = inet->saddr, - .tos = RT_CONN_FLAGS(sk) } }, - .proto = IPPROTO_TCP, - .uli_u = { .ports = - { .sport = inet->sport, - .dport = inet->dport } } }; - - err = ip_route_output_flow(&rt, &fl, sk, 0); - } - if (!err) { - sk_setup_caps(sk, &rt->u.dst); - return 0; - } - - /* Routing failed... */ - sk->sk_route_caps = 0; - - if (!sysctl_ip_dynaddr || - sk->sk_state != TCP_SYN_SENT || - (sk->sk_userlocks & SOCK_BINDADDR_LOCK) || - (err = tcp_v4_reselect_saddr(sk)) != 0) - sk->sk_err_soft = -err; - - return err; -} - static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr) { struct sockaddr_in *sin = (struct sockaddr_in *) uaddr; @@ -1998,7 +1903,7 @@ int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw) struct tcp_func ipv4_specific = { .queue_xmit = ip_queue_xmit, .send_check = tcp_v4_send_check, - .rebuild_header = tcp_v4_rebuild_header, + .rebuild_header = inet_sk_rebuild_header, .conn_request = tcp_v4_conn_request, .syn_recv_sock = tcp_v4_syn_recv_sock, .remember_stamp = tcp_v4_remember_stamp, @@ -2630,7 +2535,6 @@ EXPORT_SYMBOL(tcp_unhash); EXPORT_SYMBOL(tcp_v4_conn_request); EXPORT_SYMBOL(tcp_v4_connect); EXPORT_SYMBOL(tcp_v4_do_rcv); -EXPORT_SYMBOL(tcp_v4_rebuild_header); EXPORT_SYMBOL(tcp_v4_remember_stamp); EXPORT_SYMBOL(tcp_v4_send_check); EXPORT_SYMBOL(tcp_v4_syn_recv_sock); -- cgit v1.2.3 From 608c8e4f7b6e61cc783283e9dff8a465a5ad59bb Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Tue, 9 Aug 2005 19:58:27 -0700 Subject: [NETFILTER]: Extend netfilter logging API This patch is in preparation to nfnetlink_log: - loggers now have to register struct nf_logger instead of nf_logfn - nf_log_unregister() replaced by nf_log_unregister_pf() and nf_log_unregister_logger() - add comment to ip[6]t_LOG.h to assure nobody redefines flags - add /proc/net/netfilter/nf_log to tell user which logger is currently registered for which address family - if user has configured logging, but no logging backend (logger) is available, always spit a message to syslog, not just the first time. - split ip[6]t_LOG.c into two parts: Backend: Always try to register as logger for the respective address family Frontend: Always log via nf_log_packet() API - modify all users of nf_log_packet() to accomodate additional argument Signed-off-by: Harald Welte Signed-off-by: David S. 
Miller --- net/ipv4/netfilter/ip_conntrack_proto_icmp.c | 8 +-- net/ipv4/netfilter/ip_conntrack_proto_tcp.c | 21 +++---- net/ipv4/netfilter/ip_conntrack_proto_udp.c | 6 +- net/ipv4/netfilter/ipt_LOG.c | 86 ++++++++++++++++------------ net/ipv4/netfilter/ipt_ULOG.c | 33 ++++++++--- 5 files changed, 90 insertions(+), 64 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c index 3f90cb9979a..838d1d69b36 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c @@ -217,7 +217,7 @@ icmp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo, icmph = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_ih), &_ih); if (icmph == NULL) { if (LOG_INVALID(IPPROTO_ICMP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, + nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, "ip_ct_icmp: short packet "); return -NF_ACCEPT; } @@ -231,13 +231,13 @@ icmp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo, if (!(u16)csum_fold(skb->csum)) break; if (LOG_INVALID(IPPROTO_ICMP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, + nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, "ip_ct_icmp: bad HW ICMP checksum "); return -NF_ACCEPT; case CHECKSUM_NONE: if ((u16)csum_fold(skb_checksum(skb, 0, skb->len, 0))) { if (LOG_INVALID(IPPROTO_ICMP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, + nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, "ip_ct_icmp: bad ICMP checksum "); return -NF_ACCEPT; } @@ -254,7 +254,7 @@ checksum_skipped: */ if (icmph->type > NR_ICMP_TYPES) { if (LOG_INVALID(IPPROTO_ICMP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, + nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, "ip_ct_icmp: invalid ICMP type "); return -NF_ACCEPT; } diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c index c2bce22d403..f23ef1f88c4 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c @@ -716,7 +716,7 @@ static int tcp_in_window(struct ip_ct_tcp *state, res = 1; } else { if (LOG_INVALID(IPPROTO_TCP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, + nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, "ip_ct_tcp: %s ", before(seq, sender->td_maxend + 1) ? after(end, sender->td_end - receiver->td_maxwin - 1) ? @@ -815,7 +815,7 @@ static int tcp_error(struct sk_buff *skb, sizeof(_tcph), &_tcph); if (th == NULL) { if (LOG_INVALID(IPPROTO_TCP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, + nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, "ip_ct_tcp: short packet "); return -NF_ACCEPT; } @@ -823,7 +823,7 @@ static int tcp_error(struct sk_buff *skb, /* Not whole TCP header or malformed packet */ if (th->doff*4 < sizeof(struct tcphdr) || tcplen < th->doff*4) { if (LOG_INVALID(IPPROTO_TCP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, + nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, "ip_ct_tcp: truncated/malformed packet "); return -NF_ACCEPT; } @@ -840,7 +840,7 @@ static int tcp_error(struct sk_buff *skb, skb->ip_summed == CHECKSUM_HW ? 
skb->csum : skb_checksum(skb, iph->ihl*4, tcplen, 0))) { if (LOG_INVALID(IPPROTO_TCP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, + nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, "ip_ct_tcp: bad TCP checksum "); return -NF_ACCEPT; } @@ -849,7 +849,7 @@ static int tcp_error(struct sk_buff *skb, tcpflags = (((u_int8_t *)th)[13] & ~(TH_ECE|TH_CWR)); if (!tcp_valid_flags[tcpflags]) { if (LOG_INVALID(IPPROTO_TCP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, + nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, "ip_ct_tcp: invalid TCP flag combination "); return -NF_ACCEPT; } @@ -897,8 +897,9 @@ static int tcp_packet(struct ip_conntrack *conntrack, */ write_unlock_bh(&tcp_lock); if (LOG_INVALID(IPPROTO_TCP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, - "ip_ct_tcp: killing out of sync session "); + nf_log_packet(PF_INET, 0, skb, NULL, NULL, + NULL, "ip_ct_tcp: " + "killing out of sync session "); if (del_timer(&conntrack->timeout)) conntrack->timeout.function((unsigned long) conntrack); @@ -912,7 +913,7 @@ static int tcp_packet(struct ip_conntrack *conntrack, write_unlock_bh(&tcp_lock); if (LOG_INVALID(IPPROTO_TCP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, + nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, "ip_ct_tcp: invalid packet ignored "); return NF_ACCEPT; case TCP_CONNTRACK_MAX: @@ -922,7 +923,7 @@ static int tcp_packet(struct ip_conntrack *conntrack, old_state); write_unlock_bh(&tcp_lock); if (LOG_INVALID(IPPROTO_TCP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, + nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, "ip_ct_tcp: invalid state "); return -NF_ACCEPT; case TCP_CONNTRACK_SYN_SENT: @@ -943,7 +944,7 @@ static int tcp_packet(struct ip_conntrack *conntrack, write_unlock_bh(&tcp_lock); if (LOG_INVALID(IPPROTO_TCP)) nf_log_packet(PF_INET, 0, skb, NULL, NULL, - "ip_ct_tcp: invalid SYN"); + NULL, "ip_ct_tcp: invalid SYN"); return -NF_ACCEPT; } case TCP_CONNTRACK_CLOSE: diff --git a/net/ipv4/netfilter/ip_conntrack_proto_udp.c b/net/ipv4/netfilter/ip_conntrack_proto_udp.c index 14130169cbf..f2dcac7c766 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_udp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_udp.c @@ -98,7 +98,7 @@ static int udp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo, hdr = skb_header_pointer(skb, iph->ihl*4, sizeof(_hdr), &_hdr); if (hdr == NULL) { if (LOG_INVALID(IPPROTO_UDP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, + nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, "ip_ct_udp: short packet "); return -NF_ACCEPT; } @@ -106,7 +106,7 @@ static int udp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo, /* Truncated/malformed packets */ if (ntohs(hdr->len) > udplen || ntohs(hdr->len) < sizeof(*hdr)) { if (LOG_INVALID(IPPROTO_UDP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, + nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, "ip_ct_udp: truncated/malformed packet "); return -NF_ACCEPT; } @@ -126,7 +126,7 @@ static int udp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo, skb->ip_summed == CHECKSUM_HW ? 
skb->csum : skb_checksum(skb, iph->ihl*4, udplen, 0))) { if (LOG_INVALID(IPPROTO_UDP)) - nf_log_packet(PF_INET, 0, skb, NULL, NULL, + nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, "ip_ct_udp: bad UDP checksum "); return -NF_ACCEPT; } diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c index ef08733d26d..92ed050fac6 100644 --- a/net/ipv4/netfilter/ipt_LOG.c +++ b/net/ipv4/netfilter/ipt_LOG.c @@ -27,10 +27,6 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team "); MODULE_DESCRIPTION("iptables syslog logging module"); -static unsigned int nflog = 1; -module_param(nflog, int, 0400); -MODULE_PARM_DESC(nflog, "register as internal netfilter logging module"); - #if 0 #define DEBUGP printk #else @@ -41,11 +37,17 @@ MODULE_PARM_DESC(nflog, "register as internal netfilter logging module"); static DEFINE_SPINLOCK(log_lock); /* One level of recursion won't kill us */ -static void dump_packet(const struct ipt_log_info *info, +static void dump_packet(const struct nf_loginfo *info, const struct sk_buff *skb, unsigned int iphoff) { struct iphdr _iph, *ih; + unsigned int logflags; + + if (info->type == NF_LOG_TYPE_LOG) + logflags = info->u.log.logflags; + else + logflags = NF_LOG_MASK; ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph); if (ih == NULL) { @@ -76,7 +78,7 @@ static void dump_packet(const struct ipt_log_info *info, if (ntohs(ih->frag_off) & IP_OFFSET) printk("FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET); - if ((info->logflags & IPT_LOG_IPOPT) + if ((logflags & IPT_LOG_IPOPT) && ih->ihl * 4 > sizeof(struct iphdr)) { unsigned char _opt[4 * 15 - sizeof(struct iphdr)], *op; unsigned int i, optsize; @@ -119,7 +121,7 @@ static void dump_packet(const struct ipt_log_info *info, printk("SPT=%u DPT=%u ", ntohs(th->source), ntohs(th->dest)); /* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */ - if (info->logflags & IPT_LOG_TCPSEQ) + if (logflags & IPT_LOG_TCPSEQ) printk("SEQ=%u ACK=%u ", ntohl(th->seq), ntohl(th->ack_seq)); /* Max length: 13 "WINDOW=65535 " */ @@ -146,7 +148,7 @@ static void dump_packet(const struct ipt_log_info *info, /* Max length: 11 "URGP=65535 " */ printk("URGP=%u ", ntohs(th->urg_ptr)); - if ((info->logflags & IPT_LOG_TCPOPT) + if ((logflags & IPT_LOG_TCPOPT) && th->doff * 4 > sizeof(struct tcphdr)) { unsigned char _opt[4 * 15 - sizeof(struct tcphdr)]; unsigned char *op; @@ -328,7 +330,7 @@ static void dump_packet(const struct ipt_log_info *info, } /* Max length: 15 "UID=4294967295 " */ - if ((info->logflags & IPT_LOG_UID) && !iphoff && skb->sk) { + if ((logflags & IPT_LOG_UID) && !iphoff && skb->sk) { read_lock_bh(&skb->sk->sk_callback_lock); if (skb->sk->sk_socket && skb->sk->sk_socket->file) printk("UID=%u ", skb->sk->sk_socket->file->f_uid); @@ -349,19 +351,31 @@ static void dump_packet(const struct ipt_log_info *info, /* maxlen = 230+ 91 + 230 + 252 = 803 */ } +struct nf_loginfo default_loginfo = { + .type = NF_LOG_TYPE_LOG, + .u = { + .log = { + .level = 0, + .logflags = NF_LOG_MASK, + }, + }, +}; + static void -ipt_log_packet(unsigned int hooknum, +ipt_log_packet(unsigned int pf, + unsigned int hooknum, const struct sk_buff *skb, const struct net_device *in, const struct net_device *out, - const struct ipt_log_info *loginfo, - const char *level_string, + const struct nf_loginfo *loginfo, const char *prefix) { + if (!loginfo) + loginfo = &default_loginfo; + spin_lock_bh(&log_lock); - printk(level_string); - printk("%sIN=%s OUT=%s ", - prefix == NULL ? 
loginfo->prefix : prefix, + printk("<%d>%sIN=%s OUT=%s ", loginfo->u.log.level, + prefix, in ? in->name : "", out ? out->name : ""); #ifdef CONFIG_BRIDGE_NETFILTER @@ -405,28 +419,15 @@ ipt_log_target(struct sk_buff **pskb, void *userinfo) { const struct ipt_log_info *loginfo = targinfo; - char level_string[4] = "< >"; + struct nf_loginfo li; - level_string[1] = '0' + (loginfo->level % 8); - ipt_log_packet(hooknum, *pskb, in, out, loginfo, level_string, NULL); + li.type = NF_LOG_TYPE_LOG; + li.u.log.level = loginfo->level; + li.u.log.logflags = loginfo->logflags; - return IPT_CONTINUE; -} + nf_log_packet(PF_INET, hooknum, *pskb, in, out, &li, loginfo->prefix); -static void -ipt_logfn(unsigned int hooknum, - const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const char *prefix) -{ - struct ipt_log_info loginfo = { - .level = 0, - .logflags = IPT_LOG_MASK, - .prefix = "" - }; - - ipt_log_packet(hooknum, skb, in, out, &loginfo, KERN_WARNING, prefix); + return IPT_CONTINUE; } static int ipt_log_checkentry(const char *tablename, @@ -464,20 +465,29 @@ static struct ipt_target ipt_log_reg = { .me = THIS_MODULE, }; +static struct nf_logger ipt_log_logger ={ + .name = "ipt_LOG", + .logfn = &ipt_log_packet, + .me = THIS_MODULE, +}; + static int __init init(void) { if (ipt_register_target(&ipt_log_reg)) return -EINVAL; - if (nflog) - nf_log_register(PF_INET, &ipt_logfn); + if (nf_log_register(PF_INET, &ipt_log_logger) < 0) { + printk(KERN_WARNING "ipt_LOG: not logging via system console " + "since somebody else already registered for PF_INET\n"); + /* we cannot make module load fail here, since otherwise + * iptables userspace would abort */ + } return 0; } static void __exit fini(void) { - if (nflog) - nf_log_unregister(PF_INET, &ipt_logfn); + nf_log_unregister_logger(&ipt_log_logger); ipt_unregister_target(&ipt_log_reg); } diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c index 4ea8371ab27..b86f06ec976 100644 --- a/net/ipv4/netfilter/ipt_ULOG.c +++ b/net/ipv4/netfilter/ipt_ULOG.c @@ -304,18 +304,27 @@ static unsigned int ipt_ulog_target(struct sk_buff **pskb, return IPT_CONTINUE; } -static void ipt_logfn(unsigned int hooknum, +static void ipt_logfn(unsigned int pf, + unsigned int hooknum, const struct sk_buff *skb, const struct net_device *in, const struct net_device *out, + const struct nf_loginfo *li, const char *prefix) { - struct ipt_ulog_info loginfo = { - .nl_group = ULOG_DEFAULT_NLGROUP, - .copy_range = 0, - .qthreshold = ULOG_DEFAULT_QTHRESHOLD, - .prefix = "" - }; + struct ipt_ulog_info loginfo; + + if (!li || li->type != NF_LOG_TYPE_ULOG) { + loginfo.nl_group = ULOG_DEFAULT_NLGROUP; + loginfo.copy_range = 0; + loginfo.qthreshold = ULOG_DEFAULT_QTHRESHOLD; + loginfo.prefix[0] = '\0'; + } else { + loginfo.nl_group = li->u.ulog.group; + loginfo.copy_range = li->u.ulog.copy_len; + loginfo.qthreshold = li->u.ulog.qthreshold; + strlcpy(loginfo.prefix, prefix, sizeof(loginfo.prefix)); + } ipt_ulog_packet(hooknum, skb, in, out, &loginfo, prefix); } @@ -355,6 +364,12 @@ static struct ipt_target ipt_ulog_reg = { .me = THIS_MODULE, }; +static struct nf_logger ipt_ulog_logger = { + .name = "ipt_ULOG", + .logfn = &ipt_logfn, + .me = THIS_MODULE, +}; + static int __init init(void) { int i; @@ -382,7 +397,7 @@ static int __init init(void) return -EINVAL; } if (nflog) - nf_log_register(PF_INET, &ipt_logfn); + nf_log_register(PF_INET, &ipt_ulog_logger); return 0; } @@ -395,7 +410,7 @@ static void __exit fini(void) DEBUGP("ipt_ULOG: 
cleanup_module\n"); if (nflog) - nf_log_unregister(PF_INET, &ipt_logfn); + nf_log_unregister_logger(&ipt_ulog_logger); ipt_unregister_target(&ipt_ulog_reg); sock_release(nflognl->sk_socket); -- cgit v1.2.3 From 304a16180fb6d2b153b45f6fbbcec1fa814496e5 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 9 Aug 2005 19:59:20 -0700 Subject: [INET]: Move the TCP ehash functions to include/net/inet_hashtables.h To be shared with DCCP (and others), this is the start of a series of patches that will expose the already generic TCP hash table routines. The few changes noticed when calling gcc -S before/after on a pentium4 were of this type: movl 40(%esp), %edx cmpl %esi, 472(%edx) je .L168 - pushl $291 + pushl $272 pushl $.LC0 pushl $.LC1 pushl $.LC2 [acme@toy net-2.6.14]$ size net/ipv4/tcp_ipv4.before.o net/ipv4/tcp_ipv4.after.o text data bss dec hex filename 17804 516 140 18460 481c net/ipv4/tcp_ipv4.before.o 17804 516 140 18460 481c net/ipv4/tcp_ipv4.after.o Holler if some weird architecture has issues with things like this 8) Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 28 +++++----------------------- 1 file changed, 5 insertions(+), 23 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index ae6fad99a9a..c03d7e9688c 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -64,6 +64,7 @@ #include #include +#include #include #include #include @@ -104,26 +105,6 @@ struct tcp_hashinfo __cacheline_aligned tcp_hashinfo = { int sysctl_local_port_range[2] = { 1024, 4999 }; int tcp_port_rover = 1024 - 1; -static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport, - __u32 faddr, __u16 fport) -{ - int h = (laddr ^ lport) ^ (faddr ^ fport); - h ^= h >> 16; - h ^= h >> 8; - return h & (tcp_ehash_size - 1); -} - -static __inline__ int tcp_sk_hashfn(struct sock *sk) -{ - struct inet_sock *inet = inet_sk(sk); - __u32 laddr = inet->rcv_saddr; - __u16 lport = inet->num; - __u32 faddr = inet->daddr; - __u16 fport = inet->dport; - - return tcp_hashfn(laddr, lport, faddr, fport); -} - /* Allocate and initialize a new TCP local port bind bucket. * The bindhash mutex for snum's hash chain must be held here. */ @@ -367,7 +348,8 @@ static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible) lock = &tcp_lhash_lock; tcp_listen_wlock(); } else { - list = &tcp_ehash[(sk->sk_hashent = tcp_sk_hashfn(sk))].chain; + sk->sk_hashent = inet_sk_ehashfn(sk, tcp_ehash_size); + list = &tcp_ehash[sk->sk_hashent].chain; lock = &tcp_ehash[sk->sk_hashent].lock; write_lock(lock); } @@ -500,7 +482,7 @@ static inline struct sock *__tcp_v4_lookup_established(u32 saddr, u16 sport, /* Optimize here for direct hit, only listening connections can * have wildcards anyways. 
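 * so the lookup here can key directly on the complete four-tuple.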
*/ - int hash = tcp_hashfn(daddr, hnum, saddr, sport); + const int hash = inet_ehashfn(daddr, hnum, saddr, sport, tcp_ehash_size); head = &tcp_ehash[hash]; read_lock(&head->lock); sk_for_each(sk, node, &head->chain) { @@ -563,7 +545,7 @@ static int __tcp_v4_check_established(struct sock *sk, __u16 lport, int dif = sk->sk_bound_dev_if; TCP_V4_ADDR_COOKIE(acookie, saddr, daddr) __u32 ports = TCP_COMBINED_PORTS(inet->dport, lport); - int hash = tcp_hashfn(daddr, lport, saddr, inet->dport); + const int hash = inet_ehashfn(daddr, lport, saddr, inet->dport, tcp_ehash_size); struct tcp_ehash_bucket *head = &tcp_ehash[hash]; struct sock *sk2; struct hlist_node *node; -- cgit v1.2.3 From 0f7ff9274e72fd254fbd1ab117bbc1db6e7cdb34 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 9 Aug 2005 19:59:44 -0700 Subject: [INET]: Just rename the TCP hashtable functions/structs to inet_ This is to break down the complexity of the series of patches, making it very clear that this one just does: 1. renames tcp_ prefixed hashtable functions and data structures that were already mostly generic to inet_ to share it with DCCP and other INET transport protocols. 2. Removes not used functions (__tb_head & tb_head) 3. Removes some leftover prototypes in the headers (tcp_bucket_unlock & tcp_v4_build_header) Next changesets will move tcp_sk(sk)->bind_hash to inet_sock so that we can make functions such as tcp_inherit_port, __tcp_inherit_port, tcp_v4_get_port, __tcp_put_port, generic and get others like tcp_destroy_sock closer to generic (tcp_orphan_count will go to sk->sk_prot to allow this). Eventually most of these functions will be used passing the transport protocol inet_hashinfo structure. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 15 ++++--- net/ipv4/tcp_diag.c | 4 +- net/ipv4/tcp_ipv4.c | 106 +++++++++++++++++++++++++---------------------- net/ipv4/tcp_minisocks.c | 16 +++---- 4 files changed, 75 insertions(+), 66 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 20159a3dafb..1ec03db7dcd 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -272,6 +272,9 @@ int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics); kmem_cache_t *tcp_bucket_cachep; + +EXPORT_SYMBOL_GPL(tcp_bucket_cachep); + kmem_cache_t *tcp_timewait_cachep; atomic_t tcp_orphan_count = ATOMIC_INIT(0); @@ -2259,7 +2262,7 @@ void __init tcp_init(void) sizeof(skb->cb)); tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket", - sizeof(struct tcp_bind_bucket), + sizeof(struct inet_bind_bucket), 0, SLAB_HWCACHE_ALIGN, NULL, NULL); if (!tcp_bucket_cachep) @@ -2277,9 +2280,9 @@ void __init tcp_init(void) * * The methodology is similar to that of the buffer cache. */ - tcp_ehash = (struct tcp_ehash_bucket *) + tcp_ehash = alloc_large_system_hash("TCP established", - sizeof(struct tcp_ehash_bucket), + sizeof(struct inet_ehash_bucket), thash_entries, (num_physpages >= 128 * 1024) ? (25 - PAGE_SHIFT) : @@ -2294,9 +2297,9 @@ void __init tcp_init(void) INIT_HLIST_HEAD(&tcp_ehash[i].chain); } - tcp_bhash = (struct tcp_bind_hashbucket *) + tcp_bhash = alloc_large_system_hash("TCP bind", - sizeof(struct tcp_bind_hashbucket), + sizeof(struct inet_bind_hashbucket), tcp_ehash_size, (num_physpages >= 128 * 1024) ? (25 - PAGE_SHIFT) : @@ -2315,7 +2318,7 @@ void __init tcp_init(void) * on available memory. 
*/ for (order = 0; ((1 << order) << PAGE_SHIFT) < - (tcp_bhash_size * sizeof(struct tcp_bind_hashbucket)); + (tcp_bhash_size * sizeof(struct inet_bind_hashbucket)); order++) ; if (order >= 4) { diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index f79bd11a470..5bb6a0f1c77 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -590,7 +590,7 @@ static int tcpdiag_dump(struct sk_buff *skb, struct netlink_callback *cb) if (!(r->tcpdiag_states&(TCPF_LISTEN|TCPF_SYN_RECV))) goto skip_listen_ht; tcp_listen_lock(); - for (i = s_i; i < TCP_LHTABLE_SIZE; i++) { + for (i = s_i; i < INET_LHTABLE_SIZE; i++) { struct sock *sk; struct hlist_node *node; @@ -646,7 +646,7 @@ skip_listen_ht: return skb->len; for (i = s_i; i < tcp_ehash_size; i++) { - struct tcp_ehash_bucket *head = &tcp_ehash[i]; + struct inet_ehash_bucket *head = &tcp_ehash[i]; struct sock *sk; struct hlist_node *node; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index c03d7e9688c..4138630556e 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -89,12 +89,11 @@ static struct socket *tcp_socket; void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len, struct sk_buff *skb); -struct tcp_hashinfo __cacheline_aligned tcp_hashinfo = { - .__tcp_lhash_lock = RW_LOCK_UNLOCKED, - .__tcp_lhash_users = ATOMIC_INIT(0), - .__tcp_lhash_wait - = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.__tcp_lhash_wait), - .__tcp_portalloc_lock = SPIN_LOCK_UNLOCKED +struct inet_hashinfo __cacheline_aligned tcp_hashinfo = { + .lhash_lock = RW_LOCK_UNLOCKED, + .lhash_users = ATOMIC_INIT(0), + .lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait), + .portalloc_lock = SPIN_LOCK_UNLOCKED, }; /* @@ -105,14 +104,14 @@ struct tcp_hashinfo __cacheline_aligned tcp_hashinfo = { int sysctl_local_port_range[2] = { 1024, 4999 }; int tcp_port_rover = 1024 - 1; -/* Allocate and initialize a new TCP local port bind bucket. +/* Allocate and initialize a new local port bind bucket. * The bindhash mutex for snum's hash chain must be held here. */ -struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head, - unsigned short snum) +struct inet_bind_bucket *inet_bind_bucket_create(kmem_cache_t *cachep, + struct inet_bind_hashbucket *head, + const unsigned short snum) { - struct tcp_bind_bucket *tb = kmem_cache_alloc(tcp_bucket_cachep, - SLAB_ATOMIC); + struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, SLAB_ATOMIC); if (tb) { tb->port = snum; tb->fastreuse = 0; @@ -123,20 +122,21 @@ struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head, } /* Caller must hold hashbucket lock for this tb with local BH disabled */ -void tcp_bucket_destroy(struct tcp_bind_bucket *tb) +void inet_bind_bucket_destroy(kmem_cache_t *cachep, struct inet_bind_bucket *tb) { if (hlist_empty(&tb->owners)) { __hlist_del(&tb->node); - kmem_cache_free(tcp_bucket_cachep, tb); + kmem_cache_free(cachep, tb); } } /* Caller must disable local BH processing. 
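 * The bucket lock is taken with a plain spin_lock() here; the tcp_inherit_port() wrapper below provides the local_bh_disable()/local_bh_enable() pair.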
*/ static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child) { - struct tcp_bind_hashbucket *head = - &tcp_bhash[tcp_bhashfn(inet_sk(child)->num)]; - struct tcp_bind_bucket *tb; + struct inet_bind_hashbucket *head = + &tcp_bhash[inet_bhashfn(inet_sk(child)->num, + tcp_bhash_size)]; + struct inet_bind_bucket *tb; spin_lock(&head->lock); tb = tcp_sk(sk)->bind_hash; @@ -152,15 +152,15 @@ inline void tcp_inherit_port(struct sock *sk, struct sock *child) local_bh_enable(); } -void tcp_bind_hash(struct sock *sk, struct tcp_bind_bucket *tb, - unsigned short snum) +void tcp_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, + const unsigned short snum) { inet_sk(sk)->num = snum; sk_add_bind_node(sk, &tb->owners); tcp_sk(sk)->bind_hash = tb; } -static inline int tcp_bind_conflict(struct sock *sk, struct tcp_bind_bucket *tb) +static inline int tcp_bind_conflict(struct sock *sk, struct inet_bind_bucket *tb) { const u32 sk_rcv_saddr = tcp_v4_rcv_saddr(sk); struct sock *sk2; @@ -190,9 +190,9 @@ static inline int tcp_bind_conflict(struct sock *sk, struct tcp_bind_bucket *tb) */ static int tcp_v4_get_port(struct sock *sk, unsigned short snum) { - struct tcp_bind_hashbucket *head; + struct inet_bind_hashbucket *head; struct hlist_node *node; - struct tcp_bind_bucket *tb; + struct inet_bind_bucket *tb; int ret; local_bh_disable(); @@ -211,9 +211,9 @@ static int tcp_v4_get_port(struct sock *sk, unsigned short snum) rover++; if (rover > high) rover = low; - head = &tcp_bhash[tcp_bhashfn(rover)]; + head = &tcp_bhash[inet_bhashfn(rover, tcp_bhash_size)]; spin_lock(&head->lock); - tb_for_each(tb, node, &head->chain) + inet_bind_bucket_for_each(tb, node, &head->chain) if (tb->port == rover) goto next; break; @@ -238,9 +238,9 @@ static int tcp_v4_get_port(struct sock *sk, unsigned short snum) */ snum = rover; } else { - head = &tcp_bhash[tcp_bhashfn(snum)]; + head = &tcp_bhash[inet_bhashfn(snum, tcp_bhash_size)]; spin_lock(&head->lock); - tb_for_each(tb, node, &head->chain) + inet_bind_bucket_for_each(tb, node, &head->chain) if (tb->port == snum) goto tb_found; } @@ -261,7 +261,7 @@ tb_found: } tb_not_found: ret = 1; - if (!tb && (tb = tcp_bucket_create(head, snum)) == NULL) + if (!tb && (tb = inet_bind_bucket_create(tcp_bucket_cachep, head, snum)) == NULL) goto fail_unlock; if (hlist_empty(&tb->owners)) { if (sk->sk_reuse && sk->sk_state != TCP_LISTEN) @@ -290,15 +290,16 @@ fail: static void __tcp_put_port(struct sock *sk) { struct inet_sock *inet = inet_sk(sk); - struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(inet->num)]; - struct tcp_bind_bucket *tb; + struct inet_bind_hashbucket *head = &tcp_bhash[inet_bhashfn(inet->num, + tcp_bhash_size)]; + struct inet_bind_bucket *tb; spin_lock(&head->lock); tb = tcp_sk(sk)->bind_hash; __sk_del_bind_node(sk); tcp_sk(sk)->bind_hash = NULL; inet->num = 0; - tcp_bucket_destroy(tb); + inet_bind_bucket_destroy(tcp_bucket_cachep, tb); spin_unlock(&head->lock); } @@ -344,7 +345,7 @@ static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible) BUG_TRAP(sk_unhashed(sk)); if (listen_possible && sk->sk_state == TCP_LISTEN) { - list = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)]; + list = &tcp_listening_hash[inet_sk_listen_hashfn(sk)]; lock = &tcp_lhash_lock; tcp_listen_wlock(); } else { @@ -381,7 +382,7 @@ void tcp_unhash(struct sock *sk) tcp_listen_wlock(); lock = &tcp_lhash_lock; } else { - struct tcp_ehash_bucket *head = &tcp_ehash[sk->sk_hashent]; + struct inet_ehash_bucket *head = &tcp_ehash[sk->sk_hashent]; lock = 
&head->lock; write_lock_bh(&head->lock); } @@ -401,8 +402,10 @@ void tcp_unhash(struct sock *sk) * connection. So always assume those are both wildcarded * during the search since they can never be otherwise. */ -static struct sock *__tcp_v4_lookup_listener(struct hlist_head *head, u32 daddr, - unsigned short hnum, int dif) +static struct sock *__tcp_v4_lookup_listener(struct hlist_head *head, + const u32 daddr, + const unsigned short hnum, + const int dif) { struct sock *result = NULL, *sk; struct hlist_node *node; @@ -438,14 +441,15 @@ static struct sock *__tcp_v4_lookup_listener(struct hlist_head *head, u32 daddr, } /* Optimize the common listener case. */ -static inline struct sock *tcp_v4_lookup_listener(u32 daddr, - unsigned short hnum, int dif) +static inline struct sock *tcp_v4_lookup_listener(const u32 daddr, + const unsigned short hnum, + const int dif) { struct sock *sk = NULL; struct hlist_head *head; read_lock(&tcp_lhash_lock); - head = &tcp_listening_hash[tcp_lhashfn(hnum)]; + head = &tcp_listening_hash[inet_lhashfn(hnum)]; if (!hlist_empty(head)) { struct inet_sock *inet = inet_sk((sk = __sk_head(head))); @@ -470,11 +474,13 @@ sherry_cache: * Local BH must be disabled here. */ -static inline struct sock *__tcp_v4_lookup_established(u32 saddr, u16 sport, - u32 daddr, u16 hnum, - int dif) +static inline struct sock *__tcp_v4_lookup_established(const u32 saddr, + const u16 sport, + const u32 daddr, + const u16 hnum, + const int dif) { - struct tcp_ehash_bucket *head; + struct inet_ehash_bucket *head; TCP_V4_ADDR_COOKIE(acookie, saddr, daddr) __u32 ports = TCP_COMBINED_PORTS(sport, hnum); struct sock *sk; @@ -546,7 +552,7 @@ static int __tcp_v4_check_established(struct sock *sk, __u16 lport, TCP_V4_ADDR_COOKIE(acookie, saddr, daddr) __u32 ports = TCP_COMBINED_PORTS(inet->dport, lport); const int hash = inet_ehashfn(daddr, lport, saddr, inet->dport, tcp_ehash_size); - struct tcp_ehash_bucket *head = &tcp_ehash[hash]; + struct inet_ehash_bucket *head = &tcp_ehash[hash]; struct sock *sk2; struct hlist_node *node; struct tcp_tw_bucket *tw; @@ -639,9 +645,9 @@ static inline u32 connect_port_offset(const struct sock *sk) */ static inline int tcp_v4_hash_connect(struct sock *sk) { - unsigned short snum = inet_sk(sk)->num; - struct tcp_bind_hashbucket *head; - struct tcp_bind_bucket *tb; + const unsigned short snum = inet_sk(sk)->num; + struct inet_bind_hashbucket *head; + struct inet_bind_bucket *tb; int ret; if (!snum) { @@ -658,14 +664,14 @@ static inline int tcp_v4_hash_connect(struct sock *sk) local_bh_disable(); for (i = 1; i <= range; i++) { port = low + (i + offset) % range; - head = &tcp_bhash[tcp_bhashfn(port)]; + head = &tcp_bhash[inet_bhashfn(port, tcp_bhash_size)]; spin_lock(&head->lock); /* Does not bother with rcv_saddr checks, * because the established check is already * unique enough. 
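 * (an actual clash on the complete four-tuple would still be caught by the __tcp_v4_check_established() call later in the connect path).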
*/ - tb_for_each(tb, node, &head->chain) { + inet_bind_bucket_for_each(tb, node, &head->chain) { if (tb->port == port) { BUG_TRAP(!hlist_empty(&tb->owners)); if (tb->fastreuse >= 0) @@ -678,7 +684,7 @@ static inline int tcp_v4_hash_connect(struct sock *sk) } } - tb = tcp_bucket_create(head, port); + tb = inet_bind_bucket_create(tcp_bucket_cachep, head, port); if (!tb) { spin_unlock(&head->lock); break; @@ -713,7 +719,7 @@ ok: goto out; } - head = &tcp_bhash[tcp_bhashfn(snum)]; + head = &tcp_bhash[inet_bhashfn(snum, tcp_bhash_size)]; tb = tcp_sk(sk)->bind_hash; spin_lock_bh(&head->lock); if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { @@ -2055,7 +2061,7 @@ start_req: } read_unlock_bh(&tp->accept_queue.syn_wait_lock); } - if (++st->bucket < TCP_LHTABLE_SIZE) { + if (++st->bucket < INET_LHTABLE_SIZE) { sk = sk_head(&tcp_listening_hash[st->bucket]); goto get_sk; } @@ -2506,7 +2512,7 @@ void __init tcp_v4_init(struct net_proto_family *ops) EXPORT_SYMBOL(ipv4_specific); EXPORT_SYMBOL(tcp_bind_hash); -EXPORT_SYMBOL(tcp_bucket_create); +EXPORT_SYMBOL(inet_bind_bucket_create); EXPORT_SYMBOL(tcp_hashinfo); EXPORT_SYMBOL(tcp_inherit_port); EXPORT_SYMBOL(tcp_listen_wlock); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 7c46a553c4a..1df6cd46066 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -60,9 +60,9 @@ int tcp_tw_count; /* Must be called with locally disabled BHs. */ static void tcp_timewait_kill(struct tcp_tw_bucket *tw) { - struct tcp_ehash_bucket *ehead; - struct tcp_bind_hashbucket *bhead; - struct tcp_bind_bucket *tb; + struct inet_ehash_bucket *ehead; + struct inet_bind_hashbucket *bhead; + struct inet_bind_bucket *tb; /* Unlink from established hashes. */ ehead = &tcp_ehash[tw->tw_hashent]; @@ -76,12 +76,12 @@ static void tcp_timewait_kill(struct tcp_tw_bucket *tw) write_unlock(&ehead->lock); /* Disassociate with bind bucket. */ - bhead = &tcp_bhash[tcp_bhashfn(tw->tw_num)]; + bhead = &tcp_bhash[inet_bhashfn(tw->tw_num, tcp_bhash_size)]; spin_lock(&bhead->lock); tb = tw->tw_tb; __hlist_del(&tw->tw_bind_node); tw->tw_tb = NULL; - tcp_bucket_destroy(tb); + inet_bind_bucket_destroy(tcp_bucket_cachep, tb); spin_unlock(&bhead->lock); #ifdef SOCK_REFCNT_DEBUG @@ -296,14 +296,14 @@ kill: */ static void __tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *tw) { - struct tcp_ehash_bucket *ehead = &tcp_ehash[sk->sk_hashent]; - struct tcp_bind_hashbucket *bhead; + struct inet_ehash_bucket *ehead = &tcp_ehash[sk->sk_hashent]; + struct inet_bind_hashbucket *bhead; /* Step 1: Put TW into bind hash. Original socket stays there too. Note, that any socket with inet_sk(sk)->num != 0 MUST be bound in binding cache, even if it is closed. */ - bhead = &tcp_bhash[tcp_bhashfn(inet_sk(sk)->num)]; + bhead = &tcp_bhash[inet_bhashfn(inet_sk(sk)->num, tcp_bhash_size)]; spin_lock(&bhead->lock); tw->tw_tb = tcp_sk(sk)->bind_hash; BUG_TRAP(tcp_sk(sk)->bind_hash); -- cgit v1.2.3 From 77d8bf9c6208eb535f05718168ffcc476be0ca8c Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 9 Aug 2005 20:00:51 -0700 Subject: [INET]: Move the TCP hashtable functions/structs to inet_hashtables.[ch] Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. 
Miller --- net/ipv4/Makefile | 2 +- net/ipv4/inet_hashtables.c | 51 ++++++++++++++++++++++++++++++++++++++++++++++ net/ipv4/tcp_ipv4.c | 26 ----------------------- 3 files changed, 52 insertions(+), 27 deletions(-) create mode 100644 net/ipv4/inet_hashtables.c (limited to 'net/ipv4') diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 61c7386bcd2..2d8d30e83eb 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -4,7 +4,7 @@ obj-y := route.o inetpeer.o protocol.o \ ip_input.o ip_fragment.o ip_forward.o ip_options.o \ - ip_output.o ip_sockglue.o \ + ip_output.o ip_sockglue.o inet_hashtables.o \ tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \ tcp_minisocks.o tcp_cong.o \ datagram.o raw.o udp.o arp.o icmp.o devinet.o af_inet.o igmp.o \ diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c new file mode 100644 index 00000000000..343a890bd61 --- /dev/null +++ b/net/ipv4/inet_hashtables.c @@ -0,0 +1,51 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Generic INET transport hashtables + * + * Authors: Lotsa people, from code originally in tcp + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include + +#include + +/* + * Allocate and initialize a new local port bind bucket. + * The bindhash mutex for snum's hash chain must be held here. + */ +struct inet_bind_bucket *inet_bind_bucket_create(kmem_cache_t *cachep, + struct inet_bind_hashbucket *head, + const unsigned short snum) +{ + struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, SLAB_ATOMIC); + + if (tb != NULL) { + tb->port = snum; + tb->fastreuse = 0; + INIT_HLIST_HEAD(&tb->owners); + hlist_add_head(&tb->node, &head->chain); + } + return tb; +} + +EXPORT_SYMBOL(inet_bind_bucket_create); + +/* + * Caller must hold hashbucket lock for this tb with local BH disabled + */ +void inet_bind_bucket_destroy(kmem_cache_t *cachep, struct inet_bind_bucket *tb) +{ + if (hlist_empty(&tb->owners)) { + __hlist_del(&tb->node); + kmem_cache_free(cachep, tb); + } +} diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 4138630556e..58e36ed88f2 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -104,32 +104,6 @@ struct inet_hashinfo __cacheline_aligned tcp_hashinfo = { int sysctl_local_port_range[2] = { 1024, 4999 }; int tcp_port_rover = 1024 - 1; -/* Allocate and initialize a new local port bind bucket. - * The bindhash mutex for snum's hash chain must be held here. - */ -struct inet_bind_bucket *inet_bind_bucket_create(kmem_cache_t *cachep, - struct inet_bind_hashbucket *head, - const unsigned short snum) -{ - struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, SLAB_ATOMIC); - if (tb) { - tb->port = snum; - tb->fastreuse = 0; - INIT_HLIST_HEAD(&tb->owners); - hlist_add_head(&tb->node, &head->chain); - } - return tb; -} - -/* Caller must hold hashbucket lock for this tb with local BH disabled */ -void inet_bind_bucket_destroy(kmem_cache_t *cachep, struct inet_bind_bucket *tb) -{ - if (hlist_empty(&tb->owners)) { - __hlist_del(&tb->node); - kmem_cache_free(cachep, tb); - } -} - /* Caller must disable local BH processing. 
*/ static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child) { -- cgit v1.2.3 From a55ebcc4c4532107ad9eee1c9bb698ab5f12c00f Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 9 Aug 2005 20:01:14 -0700 Subject: [INET]: Move bind_hash from tcp_sk to inet_sk This should really be in an inet_connection_sock, but I'm leaving it for a later optimization, when some more fields common to INET transport protocols now in tcp_sk or inet_sk will be chunked out into inet_connection_sock; for now it's better to concentrate on getting the changes in the core merged to leave the DCCP tree with only DCCP specific code. Next changesets will take advantage of this move to generalise things like tcp_bind_hash, tcp_put_port, tcp_inherit_port, making the latter receive an inet_hashinfo parameter, and even __tcp_tw_hashdance, etc in the future, when tcp_tw_bucket gets transformed into the struct timewait_sock hierarchy. tcp_destroy_sock also is eligible as soon as tcp_orphan_count gets moved to sk_prot. A cascade of incremental changes will ultimately make the tcp_lookup functions fully generic. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 4 ++-- net/ipv4/tcp_ipv4.c | 21 +++++++++++---------- net/ipv4/tcp_minisocks.c | 15 ++++++++------- 3 files changed, 21 insertions(+), 19 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 1ec03db7dcd..e54a410ca70 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1575,7 +1575,7 @@ void tcp_destroy_sock(struct sock *sk) BUG_TRAP(sk_unhashed(sk)); /* If it has not 0 inet_sk(sk)->num, it must be bound */ - BUG_TRAP(!inet_sk(sk)->num || tcp_sk(sk)->bind_hash); + BUG_TRAP(!inet_sk(sk)->num || inet_sk(sk)->bind_hash); sk->sk_prot->destroy(sk); @@ -1802,7 +1802,7 @@ int tcp_disconnect(struct sock *sk, int flags) tcp_sack_reset(&tp->rx_opt); __sk_dst_reset(sk); - BUG_TRAP(!inet->num || tp->bind_hash); + BUG_TRAP(!inet->num || inet->bind_hash); sk->sk_error_report(sk); return err; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 58e36ed88f2..10a9b3ae344 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -113,9 +113,9 @@ static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child) struct inet_bind_bucket *tb; spin_lock(&head->lock); - tb = tcp_sk(sk)->bind_hash; + tb = inet_sk(sk)->bind_hash; sk_add_bind_node(child, &tb->owners); - tcp_sk(child)->bind_hash = tb; + inet_sk(child)->bind_hash = tb; spin_unlock(&head->lock); } @@ -129,9 +129,10 @@ inline void tcp_inherit_port(struct sock *sk, struct sock *child) void tcp_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, const unsigned short snum) { - inet_sk(sk)->num = snum; + struct inet_sock *inet = inet_sk(sk); + inet->num = snum; sk_add_bind_node(sk, &tb->owners); - tcp_sk(sk)->bind_hash = tb; + inet->bind_hash = tb; } static inline int tcp_bind_conflict(struct sock *sk, struct inet_bind_bucket *tb) @@ -246,9 +247,9 @@ tb_not_found: (!sk->sk_reuse || sk->sk_state == TCP_LISTEN)) tb->fastreuse = 0; success: - if (!tcp_sk(sk)->bind_hash) + if (!inet_sk(sk)->bind_hash) tcp_bind_hash(sk, tb, snum); - BUG_TRAP(tcp_sk(sk)->bind_hash == tb); + BUG_TRAP(inet_sk(sk)->bind_hash == tb); ret = 0; fail_unlock: @@ -269,9 +270,9 @@ static void __tcp_put_port(struct sock *sk) struct inet_bind_bucket *tb; spin_lock(&head->lock); - tb = tcp_sk(sk)->bind_hash; + tb = inet->bind_hash; __sk_del_bind_node(sk); - tcp_sk(sk)->bind_hash = NULL; + inet->bind_hash = NULL; inet->num = 0; 
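/* the bucket itself is only freed if no other socket still owns the port, i.e. tb->owners is empty */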
inet_bind_bucket_destroy(tcp_bucket_cachep, tb); spin_unlock(&head->lock); @@ -694,7 +695,7 @@ ok: } head = &tcp_bhash[inet_bhashfn(snum, tcp_bhash_size)]; - tb = tcp_sk(sk)->bind_hash; + tb = inet_sk(sk)->bind_hash; spin_lock_bh(&head->lock); if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { __tcp_v4_hash(sk, 0); @@ -1940,7 +1941,7 @@ int tcp_v4_destroy_sock(struct sock *sk) __skb_queue_purge(&tp->ucopy.prequeue); /* Clean up a referenced TCP bind bucket. */ - if (tp->bind_hash) + if (inet_sk(sk)->bind_hash) tcp_put_port(sk); /* diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 1df6cd46066..267cea1087e 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -296,17 +296,17 @@ kill: */ static void __tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *tw) { + const struct inet_sock *inet = inet_sk(sk); struct inet_ehash_bucket *ehead = &tcp_ehash[sk->sk_hashent]; struct inet_bind_hashbucket *bhead; - /* Step 1: Put TW into bind hash. Original socket stays there too. Note, that any socket with inet_sk(sk)->num != 0 MUST be bound in + Note, that any socket with inet->num != 0 MUST be bound in binding cache, even if it is closed. */ - bhead = &tcp_bhash[inet_bhashfn(inet_sk(sk)->num, tcp_bhash_size)]; + bhead = &tcp_bhash[inet_bhashfn(inet->num, tcp_bhash_size)]; spin_lock(&bhead->lock); - tw->tw_tb = tcp_sk(sk)->bind_hash; - BUG_TRAP(tcp_sk(sk)->bind_hash); + tw->tw_tb = inet->bind_hash; + BUG_TRAP(inet->bind_hash); tw_add_bind_node(tw, &tw->tw_tb->owners); spin_unlock(&bhead->lock); @@ -694,6 +694,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, if(newsk != NULL) { struct inet_request_sock *ireq = inet_rsk(req); struct tcp_request_sock *treq = tcp_rsk(req); + struct inet_sock *newinet = inet_sk(newsk); struct tcp_sock *newtp; struct sk_filter *filter; @@ -702,10 +703,10 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, /* SANITY */ sk_node_init(&newsk->sk_node); - tcp_sk(newsk)->bind_hash = NULL; + newinet->bind_hash = NULL; /* Clone the TCP header template */ - inet_sk(newsk)->dport = ireq->rmt_port; + newinet->dport = ireq->rmt_port; sock_lock_init(newsk); bh_lock_sock(newsk); -- cgit v1.2.3 From a86888b925299330053d20e0eba03ac4d2648c4b Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Tue, 9 Aug 2005 20:02:13 -0700 Subject: [NETFILTER]: Fix multiple problems with the conntrack event cache refcnt underflow: the reference count is decremented when a conntrack entry is removed from the hash but it is not incremented when entering new entries. missing protection of process context against softirq context: all cache operations need to locally disable softirqs to avoid races. Additionally the event cache can't be initialized when a packet enters the conntrack code but needs to be initialized whenever we cache an event and the stored conntrack entry doesn't match the current one. incorrect flushing of the event cache in ip_ct_iterate_cleanup: without real locking we can't flush the cache for different CPUs without incurring races. The cache for different CPUs can only be flushed when no packets are going through the code. ip_ct_iterate_cleanup doesn't need to drop all references, so flushing is moved to the cleanup path. Signed-off-by: Patrick McHardy Signed-off-by: David S. 
Miller --- net/ipv4/netfilter/ip_conntrack_core.c | 105 ++++++++++----------------- net/ipv4/netfilter/ip_conntrack_standalone.c | 3 +- 2 files changed, 39 insertions(+), 69 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c index d9fddae8d78..5c3f16eae2d 100644 --- a/net/ipv4/netfilter/ip_conntrack_core.c +++ b/net/ipv4/netfilter/ip_conntrack_core.c @@ -85,73 +85,62 @@ struct notifier_block *ip_conntrack_expect_chain; DEFINE_PER_CPU(struct ip_conntrack_ecache, ip_conntrack_ecache); -static inline void __deliver_cached_events(struct ip_conntrack_ecache *ecache) +/* deliver cached events and clear cache entry - must be called with locally + * disabled softirqs */ +static inline void +__ip_ct_deliver_cached_events(struct ip_conntrack_ecache *ecache) { + DEBUGP("ecache: delivering events for %p\n", ecache->ct); if (is_confirmed(ecache->ct) && !is_dying(ecache->ct) && ecache->events) notifier_call_chain(&ip_conntrack_chain, ecache->events, ecache->ct); ecache->events = 0; -} - -void __ip_ct_deliver_cached_events(struct ip_conntrack_ecache *ecache) -{ - __deliver_cached_events(ecache); + ip_conntrack_put(ecache->ct); + ecache->ct = NULL; } /* Deliver all cached events for a particular conntrack. This is called * by code prior to async packet handling or freeing the skb */ -void -ip_conntrack_deliver_cached_events_for(const struct ip_conntrack *ct) +void ip_ct_deliver_cached_events(const struct ip_conntrack *ct) { - struct ip_conntrack_ecache *ecache = - &__get_cpu_var(ip_conntrack_ecache); - - if (!ct) - return; + struct ip_conntrack_ecache *ecache; + + local_bh_disable(); + ecache = &__get_cpu_var(ip_conntrack_ecache); + if (ecache->ct == ct) + __ip_ct_deliver_cached_events(ecache); + local_bh_enable(); +} - if (ecache->ct == ct) { - DEBUGP("ecache: delivering event for %p\n", ct); - __deliver_cached_events(ecache); - } else { - if (net_ratelimit()) - printk(KERN_WARNING "ecache: want to deliver for %p, " - "but cache has %p\n", ct, ecache->ct); - } +void __ip_ct_event_cache_init(struct ip_conntrack *ct) +{ + struct ip_conntrack_ecache *ecache; - /* signalize that events have already been delivered */ - ecache->ct = NULL; + /* take care of delivering potentially old events */ + ecache = &__get_cpu_var(ip_conntrack_ecache); + BUG_ON(ecache->ct == ct); + if (ecache->ct) + __ip_ct_deliver_cached_events(ecache); + /* initialize for this conntrack/packet */ + ecache->ct = ct; + nf_conntrack_get(&ct->ct_general); } -/* Deliver cached events for old pending events, if current conntrack != old */ -void ip_conntrack_event_cache_init(const struct sk_buff *skb) +/* flush the event cache - touches other CPU's data and must not be called while + * packets are still passing through the code */ +static void ip_ct_event_cache_flush(void) { - struct ip_conntrack *ct = (struct ip_conntrack *) skb->nfct; - struct ip_conntrack_ecache *ecache = - &__get_cpu_var(ip_conntrack_ecache); + struct ip_conntrack_ecache *ecache; + int cpu; - /* take care of delivering potentially old events */ - if (ecache->ct != ct) { - enum ip_conntrack_info ctinfo; - /* we have to check, since at startup the cache is NULL */ - if (likely(ecache->ct)) { - DEBUGP("ecache: entered for different conntrack: " - "ecache->ct=%p, skb->nfct=%p. 
delivering " - "events\n", ecache->ct, ct); - __deliver_cached_events(ecache); + for_each_cpu(cpu) { + ecache = &per_cpu(ip_conntrack_ecache, cpu); + if (ecache->ct) ip_conntrack_put(ecache->ct); - } else { - DEBUGP("ecache: entered for conntrack %p, " - "cache was clean before\n", ct); - } - - /* initialize for this conntrack/packet */ - ecache->ct = ip_conntrack_get(skb, &ctinfo); - /* ecache->events cleared by __deliver_cached_devents() */ - } else { - DEBUGP("ecache: re-entered for conntrack %p.\n", ct); } } - +#else +static inline void ip_ct_event_cache_flush(void) {} #endif /* CONFIG_IP_NF_CONNTRACK_EVENTS */ DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat); @@ -878,8 +867,6 @@ unsigned int ip_conntrack_in(unsigned int hooknum, IP_NF_ASSERT((*pskb)->nfct); - ip_conntrack_event_cache_init(*pskb); - ret = proto->packet(ct, *pskb, ctinfo); if (ret < 0) { /* Invalid: inverse of the return code tells @@ -1278,23 +1265,6 @@ ip_ct_iterate_cleanup(int (*iter)(struct ip_conntrack *i, void *), void *data) ip_conntrack_put(ct); } - -#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS - { - /* we need to deliver all cached events in order to drop - * the reference counts */ - int cpu; - for_each_cpu(cpu) { - struct ip_conntrack_ecache *ecache = - &per_cpu(ip_conntrack_ecache, cpu); - if (ecache->ct) { - __ip_ct_deliver_cached_events(ecache); - ip_conntrack_put(ecache->ct); - ecache->ct = NULL; - } - } - } -#endif } /* Fast function for those who don't want to parse /proc (and I don't @@ -1381,6 +1351,7 @@ void ip_conntrack_flush() delete... */ synchronize_net(); + ip_ct_event_cache_flush(); i_see_dead_people: ip_ct_iterate_cleanup(kill_all, NULL); if (atomic_read(&ip_conntrack_count) != 0) { diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c index ca97c3ac2f2..ee5895afd0c 100644 --- a/net/ipv4/netfilter/ip_conntrack_standalone.c +++ b/net/ipv4/netfilter/ip_conntrack_standalone.c @@ -401,7 +401,6 @@ static unsigned int ip_confirm(unsigned int hooknum, const struct net_device *out, int (*okfn)(struct sk_buff *)) { - ip_conntrack_event_cache_init(*pskb); /* We've seen it coming out the other side: confirm it */ return ip_conntrack_confirm(pskb); } @@ -419,7 +418,6 @@ static unsigned int ip_conntrack_help(unsigned int hooknum, ct = ip_conntrack_get(*pskb, &ctinfo); if (ct && ct->helper) { unsigned int ret; - ip_conntrack_event_cache_init(*pskb); ret = ct->helper->help(pskb, ct, ctinfo); if (ret != NF_ACCEPT) return ret; @@ -978,6 +976,7 @@ EXPORT_SYMBOL_GPL(ip_conntrack_chain); EXPORT_SYMBOL_GPL(ip_conntrack_expect_chain); EXPORT_SYMBOL_GPL(ip_conntrack_register_notifier); EXPORT_SYMBOL_GPL(ip_conntrack_unregister_notifier); +EXPORT_SYMBOL_GPL(__ip_ct_event_cache_init); EXPORT_PER_CPU_SYMBOL_GPL(ip_conntrack_ecache); #endif EXPORT_SYMBOL(ip_conntrack_protocol_register); -- cgit v1.2.3 From 94cd2b67641e7ddc2e6ed71d76e00116957423db Mon Sep 17 00:00:00 2001 From: Pablo Neira Date: Tue, 9 Aug 2005 20:02:36 -0700 Subject: [NETFILTER]: remove bogus memset() calls from ip_conntrack_netlink.c nfattr_parse_nested() calls nfattr_parse() which in turn does a memset on the 'tb' array. All callers therefore don't need to memset before calling it. Signed-off-by: Pablo Neira Signed-off-by: Harald Welte Signed-off-by: David S. 
Miller --- net/ipv4/netfilter/ip_conntrack_netlink.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c index f43ec18c916..36a046f2210 100644 --- a/net/ipv4/netfilter/ip_conntrack_netlink.c +++ b/net/ipv4/netfilter/ip_conntrack_netlink.c @@ -479,7 +479,6 @@ ctnetlink_parse_tuple_ip(struct nfattr *attr, struct ip_conntrack_tuple *tuple) DEBUGP("entered %s\n", __FUNCTION__); - memset(tb, 0, CTA_IP_MAX * sizeof(tb)); if (nfattr_parse_nested(tb, CTA_IP_MAX, attr) < 0) goto nfattr_failure; @@ -522,8 +521,6 @@ ctnetlink_parse_tuple_proto(struct nfattr *attr, DEBUGP("entered %s\n", __FUNCTION__); - memset(tb, 0, CTA_PROTO_MAX * sizeof(tb)); - if (nfattr_parse_nested(tb, CTA_PROTO_MAX, attr) < 0) goto nfattr_failure; @@ -556,7 +553,6 @@ ctnetlink_parse_tuple(struct nfattr *cda[], struct ip_conntrack_tuple *tuple, DEBUGP("entered %s\n", __FUNCTION__); - memset(tb, 0, CTA_TUPLE_MAX * sizeof(tb)); memset(tuple, 0, sizeof(*tuple)); if (nfattr_parse_nested(tb, CTA_TUPLE_MAX, cda[type-1]) < 0) @@ -607,8 +603,6 @@ static int ctnetlink_parse_nat_proto(struct nfattr *attr, DEBUGP("entered %s\n", __FUNCTION__); - memset(tb, 0, CTA_PROTONAT_MAX * sizeof(tb)); - if (nfattr_parse_nested(tb, CTA_PROTONAT_MAX, attr) < 0) goto nfattr_failure; @@ -646,7 +640,6 @@ ctnetlink_parse_nat(struct nfattr *cda[], DEBUGP("entered %s\n", __FUNCTION__); - memset(tb, 0, CTA_NAT_MAX * sizeof(tb)); memset(range, 0, sizeof(*range)); if (nfattr_parse_nested(tb, CTA_NAT_MAX, cda[CTA_NAT-1]) < 0) @@ -684,7 +677,6 @@ ctnetlink_parse_help(struct nfattr *attr, char **helper_name) struct nfattr *tb[CTA_HELP_MAX]; DEBUGP("entered %s\n", __FUNCTION__); - memset(tb, 0, CTA_HELP_MAX * sizeof(tb)); if (nfattr_parse_nested(tb, CTA_HELP_MAX, attr) < 0) goto nfattr_failure; -- cgit v1.2.3 From 88aa0429048d08c18f2772782588f953bbbd79be Mon Sep 17 00:00:00 2001 From: Pablo Neira Date: Tue, 9 Aug 2005 20:02:55 -0700 Subject: [NETFILTER]: conntrack_netlink: Fix locking during conntrack_create The current codepath allowed for ip_conntrack_lock to be unlock'ed twice. Signed-off-by: Pablo Neira Signed-off-by: Harald Welte Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_netlink.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c index 36a046f2210..0ab2d7df6bc 100644 --- a/net/ipv4/netfilter/ip_conntrack_netlink.c +++ b/net/ipv4/netfilter/ip_conntrack_netlink.c @@ -1052,13 +1052,14 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, err = -ENOENT; if (nlh->nlmsg_flags & NLM_F_CREATE) err = ctnetlink_create_conntrack(cda, &otuple, &rtuple); + return err; + } + /* implicit 'else' */ + + /* we only allow nat config for new conntracks */ + if (cda[CTA_NAT-1]) { + err = -EINVAL; goto out_unlock; - } else { - /* we only allow nat config for new conntracks */ - if (cda[CTA_NAT-1]) { - err = -EINVAL; - goto out_unlock; - } } /* We manipulate the conntrack inside the global conntrack table lock, -- cgit v1.2.3 From bd9a26b7f2ee7567571bb5b7acc1a256c544a0dd Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Tue, 9 Aug 2005 20:03:22 -0700 Subject: [NETFILTER]: fix ctnetlink 'create_expect' parsing There was a stupid copy+paste mistake where we parse the MASK nfattr into the "tuple" variable instead of the "mask" variable. This patch fixes it. Thanks to Pablo Neira. 
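A condensed sketch of the parsing sequence shows the effect (declarations and error handling trimmed down to the two calls in question):

	struct ip_conntrack_tuple tuple, mask;

	/* before: the second call overwrites 'tuple' with the mask data,
	 * and 'mask' is never filled in */
	err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE);
	err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASK);

	/* after: tuple and mask each get their own destination */
	err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE);
	err = ctnetlink_parse_tuple(cda, &mask, CTA_EXPECT_MASK);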
Signed-off-by: Harald Welte Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_netlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c index 0ab2d7df6bc..23f18f6a553 100644 --- a/net/ipv4/netfilter/ip_conntrack_netlink.c +++ b/net/ipv4/netfilter/ip_conntrack_netlink.c @@ -1388,7 +1388,7 @@ ctnetlink_create_expect(struct nfattr *cda[]) err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE); if (err < 0) return err; - err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASK); + err = ctnetlink_parse_tuple(cda, &mask, CTA_EXPECT_MASK); if (err < 0) return err; -- cgit v1.2.3 From 927ccbcc28dceee29dad876982768cca29738564 Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Tue, 9 Aug 2005 20:03:40 -0700 Subject: [NETFILTER]: attribute count is an attribute of message type, not subsystem Prior to this patch, every nfnetlink subsystem had to specify its attribute count. However, in reality the attribute count depends on the message type within the subsystem, not the subsystem itself. This patch moves 'attr_count' from 'struct nfnetlink_subsys' into nfnl_callback to fix this. Signed-off-by: Harald Welte Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_netlink.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c index 23f18f6a553..53d98974dcf 100644 --- a/net/ipv4/netfilter/ip_conntrack_netlink.c +++ b/net/ipv4/netfilter/ip_conntrack_netlink.c @@ -1484,21 +1484,28 @@ static struct notifier_block ctnl_notifier_exp = { static struct nfnl_callback ctnl_cb[IPCTNL_MSG_MAX] = { [IPCTNL_MSG_CT_NEW] = { .call = ctnetlink_new_conntrack, + .attr_count = CTA_MAX, .cap_required = CAP_NET_ADMIN }, [IPCTNL_MSG_CT_GET] = { .call = ctnetlink_get_conntrack, + .attr_count = CTA_MAX, .cap_required = CAP_NET_ADMIN }, [IPCTNL_MSG_CT_DELETE] = { .call = ctnetlink_del_conntrack, + .attr_count = CTA_MAX, .cap_required = CAP_NET_ADMIN }, [IPCTNL_MSG_CT_GET_CTRZERO] = { .call = ctnetlink_get_conntrack, + .attr_count = CTA_MAX, .cap_required = CAP_NET_ADMIN }, }; static struct nfnl_callback ctnl_exp_cb[IPCTNL_MSG_MAX] = { [IPCTNL_MSG_EXP_GET] = { .call = ctnetlink_get_expect, + .attr_count = CTA_EXPECT_MAX, .cap_required = CAP_NET_ADMIN }, [IPCTNL_MSG_EXP_NEW] = { .call = ctnetlink_new_expect, + .attr_count = CTA_EXPECT_MAX, .cap_required = CAP_NET_ADMIN }, [IPCTNL_MSG_EXP_DELETE] = { .call = ctnetlink_del_expect, + .attr_count = CTA_EXPECT_MAX, .cap_required = CAP_NET_ADMIN }, }; @@ -1506,7 +1513,6 @@ static struct nfnetlink_subsystem ctnl_subsys = { .name = "conntrack", .subsys_id = NFNL_SUBSYS_CTNETLINK, .cb_count = IPCTNL_MSG_MAX, - .attr_count = CTA_MAX, .cb = ctnl_cb, }; @@ -1514,7 +1520,6 @@ static struct nfnetlink_subsystem ctnl_exp_subsys = { .name = "conntrack_expect", .subsys_id = NFNL_SUBSYS_CTNETLINK_EXP, .cb_count = IPCTNL_MSG_EXP_MAX, - .attr_count = CTA_MAX, .cb = ctnl_exp_cb, }; -- cgit v1.2.3 From 1444fc559b01aa5d4fedf4ee4f306a9e9cd56f95 Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Tue, 9 Aug 2005 20:04:07 -0700 Subject: [NETFILTER]: don't use nested attributes for conntrack_expect We used to use nested nfattr structures for ip_conntrack_expect. This is bogus, since ip_conntrack and ip_conntrack_expect are communicated in different netlink message types. 
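As a rough sketch of the resulting wire format (attribute payloads omitted), the old code wrapped everything in one CTA_EXPECT nest:

	CTA_EXPECT
		CTA_EXPECT_TUPLE
		CTA_EXPECT_MASK
		CTA_EXPECT_TIMEOUT
		CTA_EXPECT_ID

whereas the flat encoding used from now on is simply

	CTA_EXPECT_TUPLE
	CTA_EXPECT_MASK
	CTA_EXPECT_MASTER
	CTA_EXPECT_TIMEOUT
	CTA_EXPECT_ID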
Both object types should have their attributes encoded at the top level; no extra nesting is required. This patch addresses the issue. Signed-off-by: Harald Welte Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_netlink.c | 85 ++++++++++++++----------------- 1 file changed, 39 insertions(+), 46 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c index 53d98974dcf..f5bda82c287 100644 --- a/net/ipv4/netfilter/ip_conntrack_netlink.c +++ b/net/ipv4/netfilter/ip_conntrack_netlink.c @@ -1100,18 +1100,21 @@ static inline int ctnetlink_exp_dump_expect(struct sk_buff *skb, const struct ip_conntrack_expect *exp) { + struct ip_conntrack *master = exp->master; u_int32_t timeout = htonl((exp->timeout.expires - jiffies) / HZ); u_int32_t id = htonl(exp->id); - struct nfattr *nest_parms = NFA_NEST(skb, CTA_EXPECT); if (ctnetlink_exp_dump_tuple(skb, &exp->tuple, CTA_EXPECT_TUPLE) < 0) goto nfattr_failure; if (ctnetlink_exp_dump_tuple(skb, &exp->mask, CTA_EXPECT_MASK) < 0) goto nfattr_failure; + if (ctnetlink_exp_dump_tuple(skb, + &master->tuplehash[IP_CT_DIR_ORIGINAL].tuple, + CTA_EXPECT_MASTER) < 0) + goto nfattr_failure; NFA_PUT(skb, CTA_EXPECT_TIMEOUT, sizeof(timeout), &timeout); NFA_PUT(skb, CTA_EXPECT_ID, sizeof(u_int32_t), &id); - NFA_NEST_END(skb, nest_parms); return 0; @@ -1259,10 +1262,8 @@ ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb, return 0; } - if (cda[CTA_TUPLE_ORIG-1]) - err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG); - else if (cda[CTA_TUPLE_REPLY-1]) - err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY); + if (cda[CTA_EXPECT_MASTER-1]) + err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER); else return -EINVAL; @@ -1310,13 +1311,33 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb, struct ip_conntrack_helper *h; int err; - /* delete by tuple needs either orig or reply tuple */ - if (cda[CTA_TUPLE_ORIG-1]) - err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_ORIG); - else if (cda[CTA_TUPLE_REPLY-1]) - err = ctnetlink_parse_tuple(cda, &tuple, CTA_TUPLE_REPLY); - else if (cda[CTA_HELP_NAME-1]) { - char *name = NFA_DATA(cda[CTA_HELP_NAME-1]); + if (cda[CTA_EXPECT_TUPLE-1]) { + /* delete a single expect by tuple */ + err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE); + if (err < 0) + return err; + + /* bump usage count to 2 */ + exp = ip_conntrack_expect_find_get(&tuple); + if (!exp) + return -ENOENT; + + if (cda[CTA_EXPECT_ID-1]) { + u_int32_t id = + *(u_int32_t *)NFA_DATA(cda[CTA_EXPECT_ID-1]); + if (exp->id != ntohl(id)) { + ip_conntrack_expect_put(exp); + return -ENOENT; + } + } + + /* after list removal, usage count == 1 */ + ip_conntrack_unexpect_related(exp); + /* have to put what we 'get' above. 
+ * after this line usage count == 0 */ + ip_conntrack_expect_put(exp); + } else if (cda[CTA_EXPECT_HELP_NAME-1]) { + char *name = NFA_DATA(cda[CTA_EXPECT_HELP_NAME-1]); /* delete all expectations for this helper */ write_lock_bh(&ip_conntrack_lock); @@ -1332,7 +1353,6 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb, __ip_ct_expect_unlink_destroy(exp); } write_unlock(&ip_conntrack_lock); - return 0; } else { /* This basically means we have to flush everything*/ write_lock_bh(&ip_conntrack_lock); @@ -1342,30 +1362,8 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb, __ip_ct_expect_unlink_destroy(exp); } write_unlock_bh(&ip_conntrack_lock); - return 0; } - if (err < 0) - return err; - - /* bump usage count to 2 */ - exp = ip_conntrack_expect_find_get(&tuple); - if (!exp) - return -ENOENT; - - if (cda[CTA_EXPECT_ID-1]) { - u_int32_t id = *(u_int32_t *)NFA_DATA(cda[CTA_EXPECT_ID-1]); - if (exp->id != ntohl(id)) { - ip_conntrack_expect_put(exp); - return -ENOENT; - } - } - - /* after list removal, usage count == 1 */ - ip_conntrack_unexpect_related(exp); - /* have to put what we 'get' above. after this line usage count == 0 */ - ip_conntrack_expect_put(exp); - return 0; } static int @@ -1385,21 +1383,14 @@ ctnetlink_create_expect(struct nfattr *cda[]) DEBUGP("entered %s\n", __FUNCTION__); + /* caller guarantees that those three CTA_EXPECT_* exist */ err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE); if (err < 0) return err; err = ctnetlink_parse_tuple(cda, &mask, CTA_EXPECT_MASK); if (err < 0) return err; - - if (cda[CTA_TUPLE_ORIG-1]) - err = ctnetlink_parse_tuple(cda, &master_tuple, CTA_TUPLE_ORIG); - else if (cda[CTA_TUPLE_REPLY-1]) - err = ctnetlink_parse_tuple(cda, &master_tuple, - CTA_TUPLE_REPLY); - else - return -EINVAL; - + err = ctnetlink_parse_tuple(cda, &master_tuple, CTA_EXPECT_MASTER); if (err < 0) return err; @@ -1444,7 +1435,9 @@ ctnetlink_new_expect(struct sock *ctnl, struct sk_buff *skb, DEBUGP("entered %s\n", __FUNCTION__); - if (!cda[CTA_EXPECT_TUPLE-1] || !cda[CTA_EXPECT_MASK-1]) + if (!cda[CTA_EXPECT_TUPLE-1] + || !cda[CTA_EXPECT_MASK-1] + || !cda[CTA_EXPECT_MASTER-1]) return -EINVAL; err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_TUPLE); -- cgit v1.2.3 From 14a50bbaa51202b676a95e9b41bc5ed6c77aa9cc Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 9 Aug 2005 20:05:52 -0700 Subject: [NETFILTER]: ctnetlink: make sure event order is correct The following sequence is displayed during events dumping of an ICMP connection: [NEW] [DESTROY] [UPDATE] This happens because the event IPCT_DESTROY is delivered in death_by_timeout(), that is called from the icmp protocol helper (ct->timeout.function) once we see the reply. To fix this, we move this event to destroy_conntrack(). Signed-off-by: Pablo Neira Ayuso Signed-off-by: Harald Welte Signed-off-by: David S. 
Miller --- net/ipv4/netfilter/ip_conntrack_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c index 5c3f16eae2d..dace93eacc5 100644 --- a/net/ipv4/netfilter/ip_conntrack_core.c +++ b/net/ipv4/netfilter/ip_conntrack_core.c @@ -316,6 +316,7 @@ destroy_conntrack(struct nf_conntrack *nfct) IP_NF_ASSERT(atomic_read(&nfct->use) == 0); IP_NF_ASSERT(!timer_pending(&ct->timeout)); + ip_conntrack_event(IPCT_DESTROY, ct); set_bit(IPS_DYING_BIT, &ct->status); /* To make sure we don't get any weird locking issues here: @@ -355,7 +356,6 @@ static void death_by_timeout(unsigned long ul_conntrack) { struct ip_conntrack *ct = (void *)ul_conntrack; - ip_conntrack_event(IPCT_DESTROY, ct); write_lock_bh(&ip_conntrack_lock); /* Inside lock so preempt is disabled on module removal path. * Otherwise we can get spurious warnings. */ -- cgit v1.2.3 From 37012f7fd326eb3c959428a4fe7e203e6304fe43 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 9 Aug 2005 20:06:11 -0700 Subject: [NETFILTER]: fix conntrack refcount leak in unlink_expect() In unlink_expect(), the expectation is removed from the list so the refcount must be dropped as well. Signed-off-by: Pablo Neira Ayuso Signed-off-by: Harald Welte Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c index dace93eacc5..9261388d5ac 100644 --- a/net/ipv4/netfilter/ip_conntrack_core.c +++ b/net/ipv4/netfilter/ip_conntrack_core.c @@ -204,6 +204,7 @@ static void unlink_expect(struct ip_conntrack_expect *exp) list_del(&exp->list); CONNTRACK_STAT_INC(expect_delete); exp->master->expecting--; + ip_conntrack_expect_put(exp); } void __ip_ct_expect_unlink_destroy(struct ip_conntrack_expect *exp) -- cgit v1.2.3 From 28b19d99ac6d92e4fb2fe34144c495019a638a64 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 9 Aug 2005 20:06:27 -0700 Subject: [NETFILTER]: Fix typo in ctnl_exp_cb array (no bug, just memory waste) This fixes the size of the ctnl_exp_cb array so that it is IPCTNL_MSG_EXP_MAX instead of IPCTNL_MSG_MAX. Simple typo. Signed-off-by: Pablo Neira Ayuso Signed-off-by: Harald Welte Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_netlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c index f5bda82c287..e3ba449e3e1 100644 --- a/net/ipv4/netfilter/ip_conntrack_netlink.c +++ b/net/ipv4/netfilter/ip_conntrack_netlink.c @@ -1490,7 +1490,7 @@ static struct nfnl_callback ctnl_cb[IPCTNL_MSG_MAX] = { .cap_required = CAP_NET_ADMIN }, }; -static struct nfnl_callback ctnl_exp_cb[IPCTNL_MSG_MAX] = { +static struct nfnl_callback ctnl_exp_cb[IPCTNL_MSG_EXP_MAX] = { [IPCTNL_MSG_EXP_GET] = { .call = ctnetlink_get_expect, .attr_count = CTA_EXPECT_MAX, .cap_required = CAP_NET_ADMIN }, -- cgit v1.2.3 From ff21d5774b4a186c98be6398eacde75d896db804 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 9 Aug 2005 20:06:42 -0700 Subject: [NETFILTER]: fix list traversal order in ctnetlink Currently conntracks are inserted after the head. That means that conntracks are sorted from the biggest to the smallest id. This happens because we use list_prepend (list_add) instead of list_add_tail. This can result in problems during the list iteration. 
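Since list_add() inserts at the head, a bucket that receives conntracks with ids 1, 2 and 3 ends up ordered 3, 2, 1, i.e. descending by id (a simplified sketch; ct1..ct3 stand in for real hash entries):

	list_add(&ct1->list, bucket);	/* bucket: 1 */
	list_add(&ct2->list, bucket);	/* bucket: 2 1 */
	list_add(&ct3->list, bucket);	/* bucket: 3 2 1 */

The dump code resumes from the last dumped id and skips everything at or below it: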
list_for_each(i, &ip_conntrack_hash[cb->args[0]]) { h = (struct ip_conntrack_tuple_hash *) i; if (DIRECTION(h) != IP_CT_DIR_ORIGINAL) continue; ct = tuplehash_to_ctrack(h); if (ct->id <= *id) continue; In that case just the first conntrack in the bucket will be dumped. To fix this, we iterate the list from the tail to the head via list_for_each_prev. Same thing for the list of expectations. Signed-off-by: Pablo Neira Ayuso Signed-off-by: Harald Welte Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_netlink.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c index e3ba449e3e1..1221a9c8bac 100644 --- a/net/ipv4/netfilter/ip_conntrack_netlink.c +++ b/net/ipv4/netfilter/ip_conntrack_netlink.c @@ -404,7 +404,7 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) read_lock_bh(&ip_conntrack_lock); for (; cb->args[0] < ip_conntrack_htable_size; cb->args[0]++, *id = 0) { - list_for_each(i, &ip_conntrack_hash[cb->args[0]]) { + list_for_each_prev(i, &ip_conntrack_hash[cb->args[0]]) { h = (struct ip_conntrack_tuple_hash *) i; if (DIRECTION(h) != IP_CT_DIR_ORIGINAL) continue; @@ -441,7 +441,7 @@ ctnetlink_dump_table_w(struct sk_buff *skb, struct netlink_callback *cb) write_lock_bh(&ip_conntrack_lock); for (; cb->args[0] < ip_conntrack_htable_size; cb->args[0]++, *id = 0) { - list_for_each(i, &ip_conntrack_hash[cb->args[0]]) { + list_for_each_prev(i, &ip_conntrack_hash[cb->args[0]]) { h = (struct ip_conntrack_tuple_hash *) i; if (DIRECTION(h) != IP_CT_DIR_ORIGINAL) continue; @@ -1214,7 +1214,7 @@ ctnetlink_exp_dump_table(struct sk_buff *skb, struct netlink_callback *cb) DEBUGP("entered %s, last id=%llu\n", __FUNCTION__, *id); read_lock_bh(&ip_conntrack_lock); - list_for_each(i, &ip_conntrack_expect_list) { + list_for_each_prev(i, &ip_conntrack_expect_list) { exp = (struct ip_conntrack_expect *) i; if (exp->id <= *id) continue; -- cgit v1.2.3 From 2d8c4ce51903636ce0f60addc8134aa50ab8fa76 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 9 Aug 2005 20:07:13 -0700 Subject: [INET]: Generalise tcp_bind_hash & tcp_inherit_port This required moving tcp_bucket_cachep to inet_hashinfo. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- net/ipv4/inet_hashtables.c | 40 +++++++++++++++++++++++++++ net/ipv4/tcp.c | 4 --- net/ipv4/tcp_ipv4.c | 68 +++------------------------------------------- 3 files changed, 44 insertions(+), 68 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 343a890bd61..33d6cbe32cd 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -14,6 +14,7 @@ */ #include +#include #include #include @@ -49,3 +50,42 @@ void inet_bind_bucket_destroy(kmem_cache_t *cachep, struct inet_bind_bucket *tb) kmem_cache_free(cachep, tb); } } + +void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, + const unsigned short snum) +{ + struct inet_sock *inet = inet_sk(sk); + inet->num = snum; + sk_add_bind_node(sk, &tb->owners); + inet->bind_hash = tb; +} + +EXPORT_SYMBOL(inet_bind_hash); + +/* + * Get rid of any references to a local port held by the given sock. 
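+ * (the bind bucket itself is released via inet_bind_bucket_destroy() once its owner list is empty)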
+ */ +static void __inet_put_port(struct inet_hashinfo *hashinfo, struct sock *sk) +{ + struct inet_sock *inet = inet_sk(sk); + const int bhash = inet_bhashfn(inet->num, hashinfo->bhash_size); + struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash]; + struct inet_bind_bucket *tb; + + spin_lock(&head->lock); + tb = inet->bind_hash; + __sk_del_bind_node(sk); + inet->bind_hash = NULL; + inet->num = 0; + inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); + spin_unlock(&head->lock); +} + +void inet_put_port(struct inet_hashinfo *hashinfo, struct sock *sk) +{ + local_bh_disable(); + __inet_put_port(hashinfo, sk); + local_bh_enable(); +} + +EXPORT_SYMBOL(inet_put_port); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index e54a410ca70..38c04c1a754 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -271,10 +271,6 @@ int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics); -kmem_cache_t *tcp_bucket_cachep; - -EXPORT_SYMBOL_GPL(tcp_bucket_cachep); - kmem_cache_t *tcp_timewait_cachep; atomic_t tcp_orphan_count = ATOMIC_INIT(0); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 10a9b3ae344..40fe4f5fca1 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -104,37 +104,6 @@ struct inet_hashinfo __cacheline_aligned tcp_hashinfo = { int sysctl_local_port_range[2] = { 1024, 4999 }; int tcp_port_rover = 1024 - 1; -/* Caller must disable local BH processing. */ -static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child) -{ - struct inet_bind_hashbucket *head = - &tcp_bhash[inet_bhashfn(inet_sk(child)->num, - tcp_bhash_size)]; - struct inet_bind_bucket *tb; - - spin_lock(&head->lock); - tb = inet_sk(sk)->bind_hash; - sk_add_bind_node(child, &tb->owners); - inet_sk(child)->bind_hash = tb; - spin_unlock(&head->lock); -} - -inline void tcp_inherit_port(struct sock *sk, struct sock *child) -{ - local_bh_disable(); - __tcp_inherit_port(sk, child); - local_bh_enable(); -} - -void tcp_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, - const unsigned short snum) -{ - struct inet_sock *inet = inet_sk(sk); - inet->num = snum; - sk_add_bind_node(sk, &tb->owners); - inet->bind_hash = tb; -} - static inline int tcp_bind_conflict(struct sock *sk, struct inet_bind_bucket *tb) { const u32 sk_rcv_saddr = tcp_v4_rcv_saddr(sk); @@ -248,7 +217,7 @@ tb_not_found: tb->fastreuse = 0; success: if (!inet_sk(sk)->bind_hash) - tcp_bind_hash(sk, tb, snum); + inet_bind_hash(sk, tb, snum); BUG_TRAP(inet_sk(sk)->bind_hash == tb); ret = 0; @@ -259,32 +228,6 @@ fail: return ret; } -/* Get rid of any references to a local port held by the - * given sock. - */ -static void __tcp_put_port(struct sock *sk) -{ - struct inet_sock *inet = inet_sk(sk); - struct inet_bind_hashbucket *head = &tcp_bhash[inet_bhashfn(inet->num, - tcp_bhash_size)]; - struct inet_bind_bucket *tb; - - spin_lock(&head->lock); - tb = inet->bind_hash; - __sk_del_bind_node(sk); - inet->bind_hash = NULL; - inet->num = 0; - inet_bind_bucket_destroy(tcp_bucket_cachep, tb); - spin_unlock(&head->lock); -} - -void tcp_put_port(struct sock *sk) -{ - local_bh_disable(); - __tcp_put_port(sk); - local_bh_enable(); -} - /* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP. * Look, when several writers sleep and reader wakes them up, all but one * immediately hit write lock and grab all the cpus. 
Exclusive sleep solves @@ -678,7 +621,7 @@ ok: hint += i; /* Head lock still held and bh's disabled */ - tcp_bind_hash(sk, tb, port); + inet_bind_hash(sk, tb, port); if (sk_unhashed(sk)) { inet_sk(sk)->sport = htons(port); __tcp_v4_hash(sk, 0); @@ -1537,7 +1480,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, tcp_initialize_rcv_mss(newsk); __tcp_v4_hash(newsk, 0); - __tcp_inherit_port(sk, newsk); + __inet_inherit_port(&tcp_hashinfo, sk, newsk); return newsk; @@ -1942,7 +1885,7 @@ int tcp_v4_destroy_sock(struct sock *sk) /* Clean up a referenced TCP bind bucket. */ if (inet_sk(sk)->bind_hash) - tcp_put_port(sk); + inet_put_port(&tcp_hashinfo, sk); /* * If sendmsg cached page exists, toss it. @@ -2486,14 +2429,11 @@ void __init tcp_v4_init(struct net_proto_family *ops) } EXPORT_SYMBOL(ipv4_specific); -EXPORT_SYMBOL(tcp_bind_hash); EXPORT_SYMBOL(inet_bind_bucket_create); EXPORT_SYMBOL(tcp_hashinfo); -EXPORT_SYMBOL(tcp_inherit_port); EXPORT_SYMBOL(tcp_listen_wlock); EXPORT_SYMBOL(tcp_port_rover); EXPORT_SYMBOL(tcp_prot); -EXPORT_SYMBOL(tcp_put_port); EXPORT_SYMBOL(tcp_unhash); EXPORT_SYMBOL(tcp_v4_conn_request); EXPORT_SYMBOL(tcp_v4_connect); -- cgit v1.2.3 From 6e04e02165a7209a71db553b7bc48d68421e5ebf Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 9 Aug 2005 20:07:35 -0700 Subject: [INET]: Move tcp_port_rover to inet_hashinfo Also expose all of the tcp_hashinfo members, i.e. killing those tcp_ehash, etc macros, this will more clearly expose already generic functions and some that need just a bit of work to become generic, as we'll see in the upcoming changesets. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 42 ++++++++++---------- net/ipv4/tcp_diag.c | 8 ++-- net/ipv4/tcp_ipv4.c | 101 +++++++++++++++++++++++------------------------ net/ipv4/tcp_minisocks.c | 15 ++++--- 4 files changed, 82 insertions(+), 84 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 38c04c1a754..2f4b1a374bb 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2257,11 +2257,11 @@ void __init tcp_init(void) __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb), sizeof(skb->cb)); - tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket", - sizeof(struct inet_bind_bucket), - 0, SLAB_HWCACHE_ALIGN, - NULL, NULL); - if (!tcp_bucket_cachep) + tcp_hashinfo.bind_bucket_cachep = + kmem_cache_create("tcp_bind_bucket", + sizeof(struct inet_bind_bucket), 0, + SLAB_HWCACHE_ALIGN, NULL, NULL); + if (!tcp_hashinfo.bind_bucket_cachep) panic("tcp_init: Cannot alloc tcp_bind_bucket cache."); tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket", @@ -2276,7 +2276,7 @@ void __init tcp_init(void) * * The methodology is similar to that of the buffer cache. 
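 * Layout sketch, as the loops below assume: the table is sized in two
 * halves, chain i holds established sockets and chain i + ehash_size
 * their TIME_WAIT counterparts, so a full lookup walks both:
 *
 *	head = &tcp_hashinfo.ehash[hash];
 *	sk_for_each(sk, node, &head->chain)		the established half
 *	sk_for_each(sk, node, &(head + tcp_hashinfo.ehash_size)->chain)
 *							the TIME_WAIT half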
*/ - tcp_ehash = + tcp_hashinfo.ehash = alloc_large_system_hash("TCP established", sizeof(struct inet_ehash_bucket), thash_entries, @@ -2284,37 +2284,37 @@ void __init tcp_init(void) (25 - PAGE_SHIFT) : (27 - PAGE_SHIFT), HASH_HIGHMEM, - &tcp_ehash_size, + &tcp_hashinfo.ehash_size, NULL, 0); - tcp_ehash_size = (1 << tcp_ehash_size) >> 1; - for (i = 0; i < (tcp_ehash_size << 1); i++) { - rwlock_init(&tcp_ehash[i].lock); - INIT_HLIST_HEAD(&tcp_ehash[i].chain); + tcp_hashinfo.ehash_size = (1 << tcp_hashinfo.ehash_size) >> 1; + for (i = 0; i < (tcp_hashinfo.ehash_size << 1); i++) { + rwlock_init(&tcp_hashinfo.ehash[i].lock); + INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].chain); } - tcp_bhash = + tcp_hashinfo.bhash = alloc_large_system_hash("TCP bind", sizeof(struct inet_bind_hashbucket), - tcp_ehash_size, + tcp_hashinfo.ehash_size, (num_physpages >= 128 * 1024) ? (25 - PAGE_SHIFT) : (27 - PAGE_SHIFT), HASH_HIGHMEM, - &tcp_bhash_size, + &tcp_hashinfo.bhash_size, NULL, 64 * 1024); - tcp_bhash_size = 1 << tcp_bhash_size; - for (i = 0; i < tcp_bhash_size; i++) { - spin_lock_init(&tcp_bhash[i].lock); - INIT_HLIST_HEAD(&tcp_bhash[i].chain); + tcp_hashinfo.bhash_size = 1 << tcp_hashinfo.bhash_size; + for (i = 0; i < tcp_hashinfo.bhash_size; i++) { + spin_lock_init(&tcp_hashinfo.bhash[i].lock); + INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain); } /* Try to be a bit smarter and adjust defaults depending * on available memory. */ for (order = 0; ((1 << order) << PAGE_SHIFT) < - (tcp_bhash_size * sizeof(struct inet_bind_hashbucket)); + (tcp_hashinfo.bhash_size * sizeof(struct inet_bind_hashbucket)); order++) ; if (order >= 4) { @@ -2329,7 +2329,7 @@ void __init tcp_init(void) sysctl_tcp_max_orphans >>= (3 - order); sysctl_max_syn_backlog = 128; } - tcp_port_rover = sysctl_local_port_range[0] - 1; + tcp_hashinfo.port_rover = sysctl_local_port_range[0] - 1; sysctl_tcp_mem[0] = 768 << order; sysctl_tcp_mem[1] = 1024 << order; @@ -2344,7 +2344,7 @@ void __init tcp_init(void) printk(KERN_INFO "TCP: Hash tables configured " "(established %d bind %d)\n", - tcp_ehash_size << 1, tcp_bhash_size); + tcp_hashinfo.ehash_size << 1, tcp_hashinfo.bhash_size); tcp_register_congestion_control(&tcp_reno); } diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index 5bb6a0f1c77..0ae738b455f 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -595,7 +595,7 @@ static int tcpdiag_dump(struct sk_buff *skb, struct netlink_callback *cb) struct hlist_node *node; num = 0; - sk_for_each(sk, node, &tcp_listening_hash[i]) { + sk_for_each(sk, node, &tcp_hashinfo.listening_hash[i]) { struct inet_sock *inet = inet_sk(sk); if (num < s_num) { @@ -645,8 +645,8 @@ skip_listen_ht: if (!(r->tcpdiag_states&~(TCPF_LISTEN|TCPF_SYN_RECV))) return skb->len; - for (i = s_i; i < tcp_ehash_size; i++) { - struct inet_ehash_bucket *head = &tcp_ehash[i]; + for (i = s_i; i < tcp_hashinfo.ehash_size; i++) { + struct inet_ehash_bucket *head = &tcp_hashinfo.ehash[i]; struct sock *sk; struct hlist_node *node; @@ -678,7 +678,7 @@ next_normal: if (r->tcpdiag_states&TCPF_TIME_WAIT) { sk_for_each(sk, node, - &tcp_ehash[i + tcp_ehash_size].chain) { + &tcp_hashinfo.ehash[i + tcp_hashinfo.ehash_size].chain) { struct inet_sock *inet = inet_sk(sk); if (num < s_num) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 40fe4f5fca1..f5373f9f00a 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -94,6 +94,7 @@ struct inet_hashinfo __cacheline_aligned tcp_hashinfo = { .lhash_users = ATOMIC_INIT(0), .lhash_wait = 
__WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait), .portalloc_lock = SPIN_LOCK_UNLOCKED, + .port_rover = 1024 - 1, }; /* @@ -102,7 +103,6 @@ struct inet_hashinfo __cacheline_aligned tcp_hashinfo = { * 32768-61000 */ int sysctl_local_port_range[2] = { 1024, 4999 }; -int tcp_port_rover = 1024 - 1; static inline int tcp_bind_conflict(struct sock *sk, struct inet_bind_bucket *tb) { @@ -146,16 +146,16 @@ static int tcp_v4_get_port(struct sock *sk, unsigned short snum) int remaining = (high - low) + 1; int rover; - spin_lock(&tcp_portalloc_lock); - if (tcp_port_rover < low) + spin_lock(&tcp_hashinfo.portalloc_lock); + if (tcp_hashinfo.port_rover < low) rover = low; else - rover = tcp_port_rover; + rover = tcp_hashinfo.port_rover; do { rover++; if (rover > high) rover = low; - head = &tcp_bhash[inet_bhashfn(rover, tcp_bhash_size)]; + head = &tcp_hashinfo.bhash[inet_bhashfn(rover, tcp_hashinfo.bhash_size)]; spin_lock(&head->lock); inet_bind_bucket_for_each(tb, node, &head->chain) if (tb->port == rover) @@ -164,8 +164,8 @@ static int tcp_v4_get_port(struct sock *sk, unsigned short snum) next: spin_unlock(&head->lock); } while (--remaining > 0); - tcp_port_rover = rover; - spin_unlock(&tcp_portalloc_lock); + tcp_hashinfo.port_rover = rover; + spin_unlock(&tcp_hashinfo.portalloc_lock); /* Exhausted local port range during search? It is not * possible for us to be holding one of the bind hash @@ -182,7 +182,7 @@ static int tcp_v4_get_port(struct sock *sk, unsigned short snum) */ snum = rover; } else { - head = &tcp_bhash[inet_bhashfn(snum, tcp_bhash_size)]; + head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)]; spin_lock(&head->lock); inet_bind_bucket_for_each(tb, node, &head->chain) if (tb->port == snum) @@ -205,7 +205,7 @@ tb_found: } tb_not_found: ret = 1; - if (!tb && (tb = inet_bind_bucket_create(tcp_bucket_cachep, head, snum)) == NULL) + if (!tb && (tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, snum)) == NULL) goto fail_unlock; if (hlist_empty(&tb->owners)) { if (sk->sk_reuse && sk->sk_state != TCP_LISTEN) @@ -237,22 +237,22 @@ fail: void tcp_listen_wlock(void) { - write_lock(&tcp_lhash_lock); + write_lock(&tcp_hashinfo.lhash_lock); - if (atomic_read(&tcp_lhash_users)) { + if (atomic_read(&tcp_hashinfo.lhash_users)) { DEFINE_WAIT(wait); for (;;) { - prepare_to_wait_exclusive(&tcp_lhash_wait, + prepare_to_wait_exclusive(&tcp_hashinfo.lhash_wait, &wait, TASK_UNINTERRUPTIBLE); - if (!atomic_read(&tcp_lhash_users)) + if (!atomic_read(&tcp_hashinfo.lhash_users)) break; - write_unlock_bh(&tcp_lhash_lock); + write_unlock_bh(&tcp_hashinfo.lhash_lock); schedule(); - write_lock_bh(&tcp_lhash_lock); + write_lock_bh(&tcp_hashinfo.lhash_lock); } - finish_wait(&tcp_lhash_wait, &wait); + finish_wait(&tcp_hashinfo.lhash_wait, &wait); } } @@ -263,20 +263,20 @@ static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible) BUG_TRAP(sk_unhashed(sk)); if (listen_possible && sk->sk_state == TCP_LISTEN) { - list = &tcp_listening_hash[inet_sk_listen_hashfn(sk)]; - lock = &tcp_lhash_lock; + list = &tcp_hashinfo.listening_hash[inet_sk_listen_hashfn(sk)]; + lock = &tcp_hashinfo.lhash_lock; tcp_listen_wlock(); } else { - sk->sk_hashent = inet_sk_ehashfn(sk, tcp_ehash_size); - list = &tcp_ehash[sk->sk_hashent].chain; - lock = &tcp_ehash[sk->sk_hashent].lock; + sk->sk_hashent = inet_sk_ehashfn(sk, tcp_hashinfo.ehash_size); + list = &tcp_hashinfo.ehash[sk->sk_hashent].chain; + lock = &tcp_hashinfo.ehash[sk->sk_hashent].lock; write_lock(lock); } 
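	/* Both branches above leave the chosen chain lock write-held; the
	 * node insert and use-count bump below run under that lock, and
	 * the listen path additionally wakes the lhash waiters afterwards.
	 */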
__sk_add_node(sk, list); sock_prot_inc_use(sk->sk_prot); write_unlock(lock); if (listen_possible && sk->sk_state == TCP_LISTEN) - wake_up(&tcp_lhash_wait); + wake_up(&tcp_hashinfo.lhash_wait); } static void tcp_v4_hash(struct sock *sk) @@ -298,9 +298,9 @@ void tcp_unhash(struct sock *sk) if (sk->sk_state == TCP_LISTEN) { local_bh_disable(); tcp_listen_wlock(); - lock = &tcp_lhash_lock; + lock = &tcp_hashinfo.lhash_lock; } else { - struct inet_ehash_bucket *head = &tcp_ehash[sk->sk_hashent]; + struct inet_ehash_bucket *head = &tcp_hashinfo.ehash[sk->sk_hashent]; lock = &head->lock; write_lock_bh(&head->lock); } @@ -311,7 +311,7 @@ void tcp_unhash(struct sock *sk) ende: if (sk->sk_state == TCP_LISTEN) - wake_up(&tcp_lhash_wait); + wake_up(&tcp_hashinfo.lhash_wait); } /* Don't inline this cruft. Here are some nice properties to @@ -366,8 +366,8 @@ static inline struct sock *tcp_v4_lookup_listener(const u32 daddr, struct sock *sk = NULL; struct hlist_head *head; - read_lock(&tcp_lhash_lock); - head = &tcp_listening_hash[inet_lhashfn(hnum)]; + read_lock(&tcp_hashinfo.lhash_lock); + head = &tcp_hashinfo.listening_hash[inet_lhashfn(hnum)]; if (!hlist_empty(head)) { struct inet_sock *inet = inet_sk((sk = __sk_head(head))); @@ -382,7 +382,7 @@ static inline struct sock *tcp_v4_lookup_listener(const u32 daddr, sherry_cache: sock_hold(sk); } - read_unlock(&tcp_lhash_lock); + read_unlock(&tcp_hashinfo.lhash_lock); return sk; } @@ -406,8 +406,8 @@ static inline struct sock *__tcp_v4_lookup_established(const u32 saddr, /* Optimize here for direct hit, only listening connections can * have wildcards anyways. */ - const int hash = inet_ehashfn(daddr, hnum, saddr, sport, tcp_ehash_size); - head = &tcp_ehash[hash]; + const int hash = inet_ehashfn(daddr, hnum, saddr, sport, tcp_hashinfo.ehash_size); + head = &tcp_hashinfo.ehash[hash]; read_lock(&head->lock); sk_for_each(sk, node, &head->chain) { if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif)) @@ -415,7 +415,7 @@ static inline struct sock *__tcp_v4_lookup_established(const u32 saddr, } /* Must check for a TIME_WAIT'er before going to listener hash. */ - sk_for_each(sk, node, &(head + tcp_ehash_size)->chain) { + sk_for_each(sk, node, &(head + tcp_hashinfo.ehash_size)->chain) { if (TCP_IPV4_TW_MATCH(sk, acookie, saddr, daddr, ports, dif)) goto hit; } @@ -469,8 +469,8 @@ static int __tcp_v4_check_established(struct sock *sk, __u16 lport, int dif = sk->sk_bound_dev_if; TCP_V4_ADDR_COOKIE(acookie, saddr, daddr) __u32 ports = TCP_COMBINED_PORTS(inet->dport, lport); - const int hash = inet_ehashfn(daddr, lport, saddr, inet->dport, tcp_ehash_size); - struct inet_ehash_bucket *head = &tcp_ehash[hash]; + const int hash = inet_ehashfn(daddr, lport, saddr, inet->dport, tcp_hashinfo.ehash_size); + struct inet_ehash_bucket *head = &tcp_hashinfo.ehash[hash]; struct sock *sk2; struct hlist_node *node; struct tcp_tw_bucket *tw; @@ -478,7 +478,7 @@ static int __tcp_v4_check_established(struct sock *sk, __u16 lport, write_lock(&head->lock); /* Check TIME-WAIT sockets first. 
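 * TIME_WAIT entries live in the second half of ehash, hence the scan of
 * head + tcp_ehash_size below; a matching TIME_WAIT slot is not simply a
 * conflict, it may be taken over under the timestamp rule sketched here,
 * assuming sysctl_tcp_tw_reuse and a timestamp at least one second old:
 *
 *	tp->write_seq = tw->tw_snd_nxt + 65535 + 2;
 *	if (tp->write_seq == 0)
 *		tp->write_seq = 1;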
*/ - sk_for_each(sk2, node, &(head + tcp_ehash_size)->chain) { + sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) { tw = (struct tcp_tw_bucket *)sk2; if (TCP_IPV4_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) { @@ -582,7 +582,7 @@ static inline int tcp_v4_hash_connect(struct sock *sk) local_bh_disable(); for (i = 1; i <= range; i++) { port = low + (i + offset) % range; - head = &tcp_bhash[inet_bhashfn(port, tcp_bhash_size)]; + head = &tcp_hashinfo.bhash[inet_bhashfn(port, tcp_hashinfo.bhash_size)]; spin_lock(&head->lock); /* Does not bother with rcv_saddr checks, @@ -602,7 +602,7 @@ static inline int tcp_v4_hash_connect(struct sock *sk) } } - tb = inet_bind_bucket_create(tcp_bucket_cachep, head, port); + tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, port); if (!tb) { spin_unlock(&head->lock); break; @@ -637,7 +637,7 @@ ok: goto out; } - head = &tcp_bhash[inet_bhashfn(snum, tcp_bhash_size)]; + head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)]; tb = inet_sk(sk)->bind_hash; spin_lock_bh(&head->lock); if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { @@ -1926,7 +1926,7 @@ static void *listening_get_next(struct seq_file *seq, void *cur) if (!sk) { st->bucket = 0; - sk = sk_head(&tcp_listening_hash[0]); + sk = sk_head(&tcp_hashinfo.listening_hash[0]); goto get_sk; } @@ -1980,7 +1980,7 @@ start_req: read_unlock_bh(&tp->accept_queue.syn_wait_lock); } if (++st->bucket < INET_LHTABLE_SIZE) { - sk = sk_head(&tcp_listening_hash[st->bucket]); + sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]); goto get_sk; } cur = NULL; @@ -2004,7 +2004,7 @@ static void *established_get_first(struct seq_file *seq) struct tcp_iter_state* st = seq->private; void *rc = NULL; - for (st->bucket = 0; st->bucket < tcp_ehash_size; ++st->bucket) { + for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) { struct sock *sk; struct hlist_node *node; struct tcp_tw_bucket *tw; @@ -2012,8 +2012,8 @@ static void *established_get_first(struct seq_file *seq) /* We can reschedule _before_ having picked the target: */ cond_resched_softirq(); - read_lock(&tcp_ehash[st->bucket].lock); - sk_for_each(sk, node, &tcp_ehash[st->bucket].chain) { + read_lock(&tcp_hashinfo.ehash[st->bucket].lock); + sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) { if (sk->sk_family != st->family) { continue; } @@ -2022,14 +2022,14 @@ static void *established_get_first(struct seq_file *seq) } st->state = TCP_SEQ_STATE_TIME_WAIT; tw_for_each(tw, node, - &tcp_ehash[st->bucket + tcp_ehash_size].chain) { + &tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain) { if (tw->tw_family != st->family) { continue; } rc = tw; goto out; } - read_unlock(&tcp_ehash[st->bucket].lock); + read_unlock(&tcp_hashinfo.ehash[st->bucket].lock); st->state = TCP_SEQ_STATE_ESTABLISHED; } out: @@ -2056,15 +2056,15 @@ get_tw: cur = tw; goto out; } - read_unlock(&tcp_ehash[st->bucket].lock); + read_unlock(&tcp_hashinfo.ehash[st->bucket].lock); st->state = TCP_SEQ_STATE_ESTABLISHED; /* We can reschedule between buckets: */ cond_resched_softirq(); - if (++st->bucket < tcp_ehash_size) { - read_lock(&tcp_ehash[st->bucket].lock); - sk = sk_head(&tcp_ehash[st->bucket].chain); + if (++st->bucket < tcp_hashinfo.ehash_size) { + read_lock(&tcp_hashinfo.ehash[st->bucket].lock); + sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain); } else { cur = NULL; goto out; @@ -2078,7 +2078,7 @@ get_tw: } st->state = TCP_SEQ_STATE_TIME_WAIT; - tw = tw_head(&tcp_ehash[st->bucket + 
tcp_ehash_size].chain); + tw = tw_head(&tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain); goto get_tw; found: cur = sk; @@ -2173,7 +2173,7 @@ static void tcp_seq_stop(struct seq_file *seq, void *v) case TCP_SEQ_STATE_TIME_WAIT: case TCP_SEQ_STATE_ESTABLISHED: if (v) - read_unlock(&tcp_ehash[st->bucket].lock); + read_unlock(&tcp_hashinfo.ehash[st->bucket].lock); local_bh_enable(); break; } @@ -2432,7 +2432,6 @@ EXPORT_SYMBOL(ipv4_specific); EXPORT_SYMBOL(inet_bind_bucket_create); EXPORT_SYMBOL(tcp_hashinfo); EXPORT_SYMBOL(tcp_listen_wlock); -EXPORT_SYMBOL(tcp_port_rover); EXPORT_SYMBOL(tcp_prot); EXPORT_SYMBOL(tcp_unhash); EXPORT_SYMBOL(tcp_v4_conn_request); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 267cea1087e..f29e2f6ebe1 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -60,12 +60,11 @@ int tcp_tw_count; /* Must be called with locally disabled BHs. */ static void tcp_timewait_kill(struct tcp_tw_bucket *tw) { - struct inet_ehash_bucket *ehead; struct inet_bind_hashbucket *bhead; struct inet_bind_bucket *tb; - /* Unlink from established hashes. */ - ehead = &tcp_ehash[tw->tw_hashent]; + struct inet_ehash_bucket *ehead = &tcp_hashinfo.ehash[tw->tw_hashent]; + write_lock(&ehead->lock); if (hlist_unhashed(&tw->tw_node)) { write_unlock(&ehead->lock); @@ -76,12 +75,12 @@ static void tcp_timewait_kill(struct tcp_tw_bucket *tw) write_unlock(&ehead->lock); /* Disassociate with bind bucket. */ - bhead = &tcp_bhash[inet_bhashfn(tw->tw_num, tcp_bhash_size)]; + bhead = &tcp_hashinfo.bhash[inet_bhashfn(tw->tw_num, tcp_hashinfo.bhash_size)]; spin_lock(&bhead->lock); tb = tw->tw_tb; __hlist_del(&tw->tw_bind_node); tw->tw_tb = NULL; - inet_bind_bucket_destroy(tcp_bucket_cachep, tb); + inet_bind_bucket_destroy(tcp_hashinfo.bind_bucket_cachep, tb); spin_unlock(&bhead->lock); #ifdef SOCK_REFCNT_DEBUG @@ -297,13 +296,13 @@ kill: static void __tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *tw) { const struct inet_sock *inet = inet_sk(sk); - struct inet_ehash_bucket *ehead = &tcp_ehash[sk->sk_hashent]; + struct inet_ehash_bucket *ehead = &tcp_hashinfo.ehash[sk->sk_hashent]; struct inet_bind_hashbucket *bhead; /* Step 1: Put TW into bind hash. Original socket stays there too. Note, that any socket with inet->num != 0 MUST be bound in binding cache, even if it is closed. */ - bhead = &tcp_bhash[inet_bhashfn(inet->num, tcp_bhash_size)]; + bhead = &tcp_hashinfo.bhash[inet_bhashfn(inet->num, tcp_hashinfo.bhash_size)]; spin_lock(&bhead->lock); tw->tw_tb = inet->bind_hash; BUG_TRAP(inet->bind_hash); @@ -317,7 +316,7 @@ static void __tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *tw) sock_prot_dec_use(sk->sk_prot); /* Step 3: Hash TW into TIMEWAIT half of established hash table. */ - tw_add_node(tw, &(ehead + tcp_ehash_size)->chain); + tw_add_node(tw, &(ehead + tcp_hashinfo.ehash_size)->chain); atomic_inc(&tw->tw_refcnt); write_unlock(&ehead->lock); -- cgit v1.2.3 From f3f05f7046e7c85b04af390d95a82a27160dd5d0 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 9 Aug 2005 20:08:09 -0700 Subject: [INET]: Generalise the tcp_listen_ lock routines Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. 
Miller --- net/ipv4/inet_hashtables.c | 32 +++++++++++++++++++++ net/ipv4/tcp_diag.c | 8 +++--- net/ipv4/tcp_ipv4.c | 70 ++++++---------------------------------------- 3 files changed, 45 insertions(+), 65 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 33d6cbe32cd..06cbc6f689c 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -15,7 +15,9 @@ #include #include +#include #include +#include #include @@ -89,3 +91,33 @@ void inet_put_port(struct inet_hashinfo *hashinfo, struct sock *sk) } EXPORT_SYMBOL(inet_put_port); + +/* + * This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP. + * Look, when several writers sleep and reader wakes them up, all but one + * immediately hit write lock and grab all the cpus. Exclusive sleep solves + * this, _but_ remember, it adds useless work on UP machines (wake up each + * exclusive lock release). It should be ifdefed really. + */ +void inet_listen_wlock(struct inet_hashinfo *hashinfo) +{ + write_lock(&hashinfo->lhash_lock); + + if (atomic_read(&hashinfo->lhash_users)) { + DEFINE_WAIT(wait); + + for (;;) { + prepare_to_wait_exclusive(&hashinfo->lhash_wait, + &wait, TASK_UNINTERRUPTIBLE); + if (!atomic_read(&hashinfo->lhash_users)) + break; + write_unlock_bh(&hashinfo->lhash_lock); + schedule(); + write_lock_bh(&hashinfo->lhash_lock); + } + + finish_wait(&hashinfo->lhash_wait, &wait); + } +} + +EXPORT_SYMBOL(inet_listen_wlock); diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index 0ae738b455f..1a89a03c449 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -589,7 +589,7 @@ static int tcpdiag_dump(struct sk_buff *skb, struct netlink_callback *cb) if (cb->args[0] == 0) { if (!(r->tcpdiag_states&(TCPF_LISTEN|TCPF_SYN_RECV))) goto skip_listen_ht; - tcp_listen_lock(); + inet_listen_lock(&tcp_hashinfo); for (i = s_i; i < INET_LHTABLE_SIZE; i++) { struct sock *sk; struct hlist_node *node; @@ -613,7 +613,7 @@ static int tcpdiag_dump(struct sk_buff *skb, struct netlink_callback *cb) goto syn_recv; if (tcpdiag_dump_sock(skb, sk, cb) < 0) { - tcp_listen_unlock(); + inet_listen_unlock(&tcp_hashinfo); goto done; } @@ -622,7 +622,7 @@ syn_recv: goto next_listen; if (tcpdiag_dump_reqs(skb, sk, cb) < 0) { - tcp_listen_unlock(); + inet_listen_unlock(&tcp_hashinfo); goto done; } @@ -636,7 +636,7 @@ next_listen: cb->args[3] = 0; cb->args[4] = 0; } - tcp_listen_unlock(); + inet_listen_unlock(&tcp_hashinfo); skip_listen_ht: cb->args[0] = 1; s_i = num = s_num = 0; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index f5373f9f00a..5f9ad95304c 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -228,62 +228,11 @@ fail: return ret; } -/* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP. - * Look, when several writers sleep and reader wakes them up, all but one - * immediately hit write lock and grab all the cpus. Exclusive sleep solves - * this, _but_ remember, it adds useless work on UP machines (wake up each - * exclusive lock release). It should be ifdefed really. 
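 * (Writer-side usage sketch for the generalised lock; the caller below
 * is hypothetical but mirrors tcp_unhash later in this series:
 *
 *	local_bh_disable();
 *	inet_listen_wlock(hashinfo);	returns with lhash_lock write-held
 *	...	modify hashinfo->listening_hash	...
 *	write_unlock_bh(&hashinfo->lhash_lock);
 *	wake_up(&hashinfo->lhash_wait);)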
- */ - -void tcp_listen_wlock(void) -{ - write_lock(&tcp_hashinfo.lhash_lock); - - if (atomic_read(&tcp_hashinfo.lhash_users)) { - DEFINE_WAIT(wait); - - for (;;) { - prepare_to_wait_exclusive(&tcp_hashinfo.lhash_wait, - &wait, TASK_UNINTERRUPTIBLE); - if (!atomic_read(&tcp_hashinfo.lhash_users)) - break; - write_unlock_bh(&tcp_hashinfo.lhash_lock); - schedule(); - write_lock_bh(&tcp_hashinfo.lhash_lock); - } - - finish_wait(&tcp_hashinfo.lhash_wait, &wait); - } -} - -static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible) -{ - struct hlist_head *list; - rwlock_t *lock; - - BUG_TRAP(sk_unhashed(sk)); - if (listen_possible && sk->sk_state == TCP_LISTEN) { - list = &tcp_hashinfo.listening_hash[inet_sk_listen_hashfn(sk)]; - lock = &tcp_hashinfo.lhash_lock; - tcp_listen_wlock(); - } else { - sk->sk_hashent = inet_sk_ehashfn(sk, tcp_hashinfo.ehash_size); - list = &tcp_hashinfo.ehash[sk->sk_hashent].chain; - lock = &tcp_hashinfo.ehash[sk->sk_hashent].lock; - write_lock(lock); - } - __sk_add_node(sk, list); - sock_prot_inc_use(sk->sk_prot); - write_unlock(lock); - if (listen_possible && sk->sk_state == TCP_LISTEN) - wake_up(&tcp_hashinfo.lhash_wait); -} - static void tcp_v4_hash(struct sock *sk) { if (sk->sk_state != TCP_CLOSE) { local_bh_disable(); - __tcp_v4_hash(sk, 1); + __inet_hash(&tcp_hashinfo, sk, 1); local_bh_enable(); } } @@ -297,7 +246,7 @@ void tcp_unhash(struct sock *sk) if (sk->sk_state == TCP_LISTEN) { local_bh_disable(); - tcp_listen_wlock(); + inet_listen_wlock(&tcp_hashinfo); lock = &tcp_hashinfo.lhash_lock; } else { struct inet_ehash_bucket *head = &tcp_hashinfo.ehash[sk->sk_hashent]; @@ -624,7 +573,7 @@ ok: inet_bind_hash(sk, tb, port); if (sk_unhashed(sk)) { inet_sk(sk)->sport = htons(port); - __tcp_v4_hash(sk, 0); + __inet_hash(&tcp_hashinfo, sk, 0); } spin_unlock(&head->lock); @@ -641,7 +590,7 @@ ok: tb = inet_sk(sk)->bind_hash; spin_lock_bh(&head->lock); if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { - __tcp_v4_hash(sk, 0); + __inet_hash(&tcp_hashinfo, sk, 0); spin_unlock_bh(&head->lock); return 0; } else { @@ -1479,7 +1428,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newtp->advmss = dst_metric(dst, RTAX_ADVMSS); tcp_initialize_rcv_mss(newsk); - __tcp_v4_hash(newsk, 0); + __inet_hash(&tcp_hashinfo, newsk, 0); __inet_inherit_port(&tcp_hashinfo, sk, newsk); return newsk; @@ -2102,12 +2051,12 @@ static void *tcp_get_idx(struct seq_file *seq, loff_t pos) void *rc; struct tcp_iter_state* st = seq->private; - tcp_listen_lock(); + inet_listen_lock(&tcp_hashinfo); st->state = TCP_SEQ_STATE_LISTENING; rc = listening_get_idx(seq, &pos); if (!rc) { - tcp_listen_unlock(); + inet_listen_unlock(&tcp_hashinfo); local_bh_disable(); st->state = TCP_SEQ_STATE_ESTABLISHED; rc = established_get_idx(seq, pos); @@ -2140,7 +2089,7 @@ static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) case TCP_SEQ_STATE_LISTENING: rc = listening_get_next(seq, v); if (!rc) { - tcp_listen_unlock(); + inet_listen_unlock(&tcp_hashinfo); local_bh_disable(); st->state = TCP_SEQ_STATE_ESTABLISHED; rc = established_get_first(seq); @@ -2168,7 +2117,7 @@ static void tcp_seq_stop(struct seq_file *seq, void *v) } case TCP_SEQ_STATE_LISTENING: if (v != SEQ_START_TOKEN) - tcp_listen_unlock(); + inet_listen_unlock(&tcp_hashinfo); break; case TCP_SEQ_STATE_TIME_WAIT: case TCP_SEQ_STATE_ESTABLISHED: @@ -2431,7 +2380,6 @@ void __init tcp_v4_init(struct net_proto_family *ops) EXPORT_SYMBOL(ipv4_specific); EXPORT_SYMBOL(inet_bind_bucket_create); 
EXPORT_SYMBOL(tcp_hashinfo); -EXPORT_SYMBOL(tcp_listen_wlock); EXPORT_SYMBOL(tcp_prot); EXPORT_SYMBOL(tcp_unhash); EXPORT_SYMBOL(tcp_v4_conn_request); -- cgit v1.2.3 From c752f0739f09b803aed191c4765a3b6650a08653 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 9 Aug 2005 20:08:28 -0700 Subject: [TCP]: Move the tcp sock states to net/tcp_states.h Lots of places just needs the states, not even linux/tcp.h, where this enum was, needs it. This speeds up development of the refactorings as less sources are rebuilt when things get moved from net/tcp.h. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- net/ipv4/datagram.c | 2 +- net/ipv4/ipvs/ip_vs_app.c | 1 + net/ipv4/protocol.c | 1 - net/ipv4/raw.c | 2 +- net/ipv4/udp.c | 3 ++- 5 files changed, 5 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c index b1db561f254..3fd49f4282a 100644 --- a/net/ipv4/datagram.c +++ b/net/ipv4/datagram.c @@ -17,8 +17,8 @@ #include #include #include -#include #include +#include int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { diff --git a/net/ipv4/ipvs/ip_vs_app.c b/net/ipv4/ipvs/ip_vs_app.c index d9212addd19..6e092dadb38 100644 --- a/net/ipv4/ipvs/ip_vs_app.c +++ b/net/ipv4/ipvs/ip_vs_app.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c index 0db405a869f..291831e792a 100644 --- a/net/ipv4/protocol.c +++ b/net/ipv4/protocol.c @@ -40,7 +40,6 @@ #include #include #include -#include #include #include #include diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index e222c5c26b3..304bb0a1d4f 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -59,7 +59,6 @@ #include #include #include -#include #include #include #include @@ -71,6 +70,7 @@ #include #include #include +#include #include #include #include diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index dc4d07357e3..a8135e1f528 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -95,7 +95,8 @@ #include #include #include -#include +#include +#include #include #include #include -- cgit v1.2.3 From 81849d106b1fb97f8e2d311c0c4d36347def55b8 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 9 Aug 2005 20:08:50 -0700 Subject: [INET]: Generalise tcp_v4_hash & tcp_unhash It really just makes the existing code be a helper function that tcp_v4_hash and tcp_unhash uses, specifying the right inet_hashinfo, tcp_hashinfo. One thing I'll investigate at some point is to have the inet_hashinfo pointer in sk_prot, so that we get all the hashtable information from the sk pointer, this can lead to some extra indirections that may well hurt performance/code size, we'll see. Ultimate idea would be that sk_prot would provide _all_ the information about a protocol implementation. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. 
Miller --- net/ipv4/tcp_ipv4.c | 29 ++--------------------------- 1 file changed, 2 insertions(+), 27 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 5f9ad95304c..dca1be67164 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -230,37 +230,12 @@ fail: static void tcp_v4_hash(struct sock *sk) { - if (sk->sk_state != TCP_CLOSE) { - local_bh_disable(); - __inet_hash(&tcp_hashinfo, sk, 1); - local_bh_enable(); - } + inet_hash(&tcp_hashinfo, sk); } void tcp_unhash(struct sock *sk) { - rwlock_t *lock; - - if (sk_unhashed(sk)) - goto ende; - - if (sk->sk_state == TCP_LISTEN) { - local_bh_disable(); - inet_listen_wlock(&tcp_hashinfo); - lock = &tcp_hashinfo.lhash_lock; - } else { - struct inet_ehash_bucket *head = &tcp_hashinfo.ehash[sk->sk_hashent]; - lock = &head->lock; - write_lock_bh(&head->lock); - } - - if (__sk_del_node_init(sk)) - sock_prot_dec_use(sk->sk_prot); - write_unlock_bh(lock); - - ende: - if (sk->sk_state == TCP_LISTEN) - wake_up(&tcp_hashinfo.lhash_wait); + inet_unhash(&tcp_hashinfo, sk); } /* Don't inline this cruft. Here are some nice properties to -- cgit v1.2.3 From 33b62231908c58ae04185e4f1063d1e35a7c8576 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 9 Aug 2005 20:09:06 -0700 Subject: [INET]: Generalise tcp_v4_lookup_listener [acme@toy net-2.6.14]$ grep built-in /tmp/before /tmp/after /tmp/before: 282560 13122 9312 304994 4a762 net/ipv4/built-in.o /tmp/after: 282560 13122 9312 304994 4a762 net/ipv4/built-in.o Will be used in DCCP, not exporting it right now not to get in Adrian Bunk's exported-but-not-used-on-modules radar 8) Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- net/ipv4/inet_hashtables.c | 41 +++++++++++++++++++++++ net/ipv4/tcp_ipv4.c | 81 +++------------------------------------------- 2 files changed, 46 insertions(+), 76 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 06cbc6f689c..88fcba05b7d 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -121,3 +121,44 @@ void inet_listen_wlock(struct inet_hashinfo *hashinfo) } EXPORT_SYMBOL(inet_listen_wlock); + +/* + * Don't inline this cruft. Here are some nice properties to exploit here. The + * BSD API does not allow a listening sock to specify the remote port nor the + * remote address for the connection. So always assume those are both + * wildcarded during the search since they can never be otherwise. + */ +struct sock *__inet_lookup_listener(const struct hlist_head *head, const u32 daddr, + const unsigned short hnum, const int dif) +{ + struct sock *result = NULL, *sk; + const struct hlist_node *node; + int hiscore = -1; + + sk_for_each(sk, node, head) { + const struct inet_sock *inet = inet_sk(sk); + + if (inet->num == hnum && !ipv6_only_sock(sk)) { + const __u32 rcv_saddr = inet->rcv_saddr; + int score = sk->sk_family == PF_INET ? 1 : 0; + + if (rcv_saddr) { + if (rcv_saddr != daddr) + continue; + score += 2; + } + if (sk->sk_bound_dev_if) { + if (sk->sk_bound_dev_if != dif) + continue; + score += 2; + } + if (score == 5) + return sk; + if (score > hiscore) { + hiscore = score; + result = sk; + } + } + } + return result; +} diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index dca1be67164..a678709b36f 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -238,78 +238,6 @@ void tcp_unhash(struct sock *sk) inet_unhash(&tcp_hashinfo, sk); } -/* Don't inline this cruft. 
Here are some nice properties to - * exploit here. The BSD API does not allow a listening TCP - * to specify the remote port nor the remote address for the - * connection. So always assume those are both wildcarded - * during the search since they can never be otherwise. - */ -static struct sock *__tcp_v4_lookup_listener(struct hlist_head *head, - const u32 daddr, - const unsigned short hnum, - const int dif) -{ - struct sock *result = NULL, *sk; - struct hlist_node *node; - int score, hiscore; - - hiscore=-1; - sk_for_each(sk, node, head) { - struct inet_sock *inet = inet_sk(sk); - - if (inet->num == hnum && !ipv6_only_sock(sk)) { - __u32 rcv_saddr = inet->rcv_saddr; - - score = (sk->sk_family == PF_INET ? 1 : 0); - if (rcv_saddr) { - if (rcv_saddr != daddr) - continue; - score+=2; - } - if (sk->sk_bound_dev_if) { - if (sk->sk_bound_dev_if != dif) - continue; - score+=2; - } - if (score == 5) - return sk; - if (score > hiscore) { - hiscore = score; - result = sk; - } - } - } - return result; -} - -/* Optimize the common listener case. */ -static inline struct sock *tcp_v4_lookup_listener(const u32 daddr, - const unsigned short hnum, - const int dif) -{ - struct sock *sk = NULL; - struct hlist_head *head; - - read_lock(&tcp_hashinfo.lhash_lock); - head = &tcp_hashinfo.listening_hash[inet_lhashfn(hnum)]; - if (!hlist_empty(head)) { - struct inet_sock *inet = inet_sk((sk = __sk_head(head))); - - if (inet->num == hnum && !sk->sk_node.next && - (!inet->rcv_saddr || inet->rcv_saddr == daddr) && - (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) && - !sk->sk_bound_dev_if) - goto sherry_cache; - sk = __tcp_v4_lookup_listener(head, daddr, hnum, dif); - } - if (sk) { -sherry_cache: - sock_hold(sk); - } - read_unlock(&tcp_hashinfo.lhash_lock); - return sk; -} - /* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM * @@ -358,7 +286,7 @@ static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport, struct sock *sk = __tcp_v4_lookup_established(saddr, sport, daddr, hnum, dif); - return sk ? : tcp_v4_lookup_listener(daddr, hnum, dif); + return sk ? 
: inet_lookup_listener(&tcp_hashinfo, daddr, hnum, dif); } inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, @@ -1641,9 +1569,10 @@ do_time_wait: switch (tcp_timewait_state_process((struct tcp_tw_bucket *)sk, skb, th, skb->len)) { case TCP_TW_SYN: { - struct sock *sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr, - ntohs(th->dest), - tcp_v4_iif(skb)); + struct sock *sk2 = inet_lookup_listener(&tcp_hashinfo, + skb->nh.iph->daddr, + ntohs(th->dest), + tcp_v4_iif(skb)); if (sk2) { tcp_tw_deschedule((struct tcp_tw_bucket *)sk); tcp_tw_put((struct tcp_tw_bucket *)sk); -- cgit v1.2.3 From 8feaf0c0a5488b3d898a9c207eb6678f44ba3f26 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 9 Aug 2005 20:09:30 -0700 Subject: [INET]: Generalise tcp_tw_bucket, aka TIME_WAIT sockets This paves the way to generalise the rest of the sock ID lookup routines and saves some bytes in TCPv4 TIME_WAIT sockets on distro kernels (where IPv6 is always built as a module): [root@qemu ~]# grep tw_sock /proc/slabinfo tw_sock_TCPv6 0 0 128 31 1 tw_sock_TCP 0 0 96 41 1 [root@qemu ~]# Now if a protocol wants to use the TIME_WAIT generic infrastructure it only has to set the sk_prot->twsk_obj_size field with the size of its inet_timewait_sock derived sock and proto_register will create sk_prot->twsk_slab, for now it's only for INET sockets, but we can introduce timewait_sock later if some non-INET transport protocol wants to use this stuff. Next changesets will take advantage of this new infrastructure to generalise even more TCP code. [acme@toy net-2.6.14]$ grep built-in /tmp/before.size /tmp/after.size /tmp/before.size: 188646 11764 5068 205478 322a6 net/ipv4/built-in.o /tmp/after.size: 188144 11764 5068 204976 320b0 net/ipv4/built-in.o [acme@toy net-2.6.14]$ Tested with both IPv4 & IPv6 (::1 (localhost) & ::ffff:172.20.0.1 (qemu host)). Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 10 ---- net/ipv4/tcp_diag.c | 10 ++-- net/ipv4/tcp_ipv4.c | 107 ++++++++++++++++++----------------- net/ipv4/tcp_minisocks.c | 142 ++++++++++++++++++++++++----------------------- 4 files changed, 134 insertions(+), 135 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 2f4b1a374bb..f1a708bf7a9 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -271,8 +271,6 @@ int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics); -kmem_cache_t *tcp_timewait_cachep; - atomic_t tcp_orphan_count = ATOMIC_INIT(0); int sysctl_tcp_mem[3]; @@ -2264,13 +2262,6 @@ void __init tcp_init(void) if (!tcp_hashinfo.bind_bucket_cachep) panic("tcp_init: Cannot alloc tcp_bind_bucket cache."); - tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket", - sizeof(struct tcp_tw_bucket), - 0, SLAB_HWCACHE_ALIGN, - NULL, NULL); - if (!tcp_timewait_cachep) - panic("tcp_init: Cannot alloc tcp_tw_bucket cache."); - /* Size and allocate the main established and bind bucket * hash tables.
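 * (With tcp_timewait_cachep gone, TIME_WAIT buckets come from the
 * per-protocol slab that proto_register creates from twsk_obj_size;
 * allocation sketch, as done in tcp_time_wait() further below:
 *
 *	tw = kmem_cache_alloc(sk->sk_prot_creator->twsk_slab, SLAB_ATOMIC);)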
* @@ -2363,4 +2354,3 @@ EXPORT_SYMBOL(tcp_sendpage); EXPORT_SYMBOL(tcp_setsockopt); EXPORT_SYMBOL(tcp_shutdown); EXPORT_SYMBOL(tcp_statistics); -EXPORT_SYMBOL(tcp_timewait_cachep); diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index 1a89a03c449..6f2d6f2276b 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -81,7 +81,7 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, r->id.tcpdiag_cookie[1] = (u32)(((unsigned long)sk >> 31) >> 1); if (r->tcpdiag_state == TCP_TIME_WAIT) { - struct tcp_tw_bucket *tw = (struct tcp_tw_bucket*)sk; + const struct inet_timewait_sock *tw = inet_twsk(sk); long tmo = tw->tw_ttd - jiffies; if (tmo < 0) tmo = 0; @@ -99,10 +99,12 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, r->tcpdiag_inode = 0; #ifdef CONFIG_IP_TCPDIAG_IPV6 if (r->tcpdiag_family == AF_INET6) { + const struct tcp6_timewait_sock *tcp6tw = tcp6_twsk(sk); + ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_src, - &tw->tw_v6_rcv_saddr); + &tcp6tw->tw_v6_rcv_saddr); ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_dst, - &tw->tw_v6_daddr); + &tcp6tw->tw_v6_daddr); } #endif nlh->nlmsg_len = skb->tail - b; @@ -239,7 +241,7 @@ static int tcpdiag_get_exact(struct sk_buff *in_skb, const struct nlmsghdr *nlh) out: if (sk) { if (sk->sk_state == TCP_TIME_WAIT) - tcp_tw_put((struct tcp_tw_bucket*)sk); + inet_twsk_put((struct inet_timewait_sock *)sk); else sock_put(sk); } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index a678709b36f..ce423e48ebe 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -106,7 +106,7 @@ int sysctl_local_port_range[2] = { 1024, 4999 }; static inline int tcp_bind_conflict(struct sock *sk, struct inet_bind_bucket *tb) { - const u32 sk_rcv_saddr = tcp_v4_rcv_saddr(sk); + const u32 sk_rcv_saddr = inet_rcv_saddr(sk); struct sock *sk2; struct hlist_node *node; int reuse = sk->sk_reuse; @@ -119,7 +119,7 @@ static inline int tcp_bind_conflict(struct sock *sk, struct inet_bind_bucket *tb sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) { if (!reuse || !sk2->sk_reuse || sk2->sk_state == TCP_LISTEN) { - const u32 sk2_rcv_saddr = tcp_v4_rcv_saddr(sk2); + const u32 sk2_rcv_saddr = inet_rcv_saddr(sk2); if (!sk2_rcv_saddr || !sk_rcv_saddr || sk2_rcv_saddr == sk_rcv_saddr) break; @@ -251,10 +251,10 @@ static inline struct sock *__tcp_v4_lookup_established(const u32 saddr, const int dif) { struct inet_ehash_bucket *head; - TCP_V4_ADDR_COOKIE(acookie, saddr, daddr) - __u32 ports = TCP_COMBINED_PORTS(sport, hnum); + INET_ADDR_COOKIE(acookie, saddr, daddr) + const __u32 ports = INET_COMBINED_PORTS(sport, hnum); struct sock *sk; - struct hlist_node *node; + const struct hlist_node *node; /* Optimize here for direct hit, only listening connections can * have wildcards anyways. */ @@ -262,13 +262,13 @@ static inline struct sock *__tcp_v4_lookup_established(const u32 saddr, head = &tcp_hashinfo.ehash[hash]; read_lock(&head->lock); sk_for_each(sk, node, &head->chain) { - if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif)) + if (INET_MATCH(sk, acookie, saddr, daddr, ports, dif)) goto hit; /* You sunk my battleship! */ } /* Must check for a TIME_WAIT'er before going to listener hash. 
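 * The INET_MATCH/INET_TW_MATCH macros keep the old compare trick:
 * INET_ADDR_COOKIE folds saddr and daddr into one __u64 on 64bit so both
 * addresses are checked with a single load (two plain compares on 32bit);
 * this sketch assumes that layout rather than spelling the macro out.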
*/ sk_for_each(sk, node, &(head + tcp_hashinfo.ehash_size)->chain) { - if (TCP_IPV4_TW_MATCH(sk, acookie, saddr, daddr, ports, dif)) + if (INET_TW_MATCH(sk, acookie, saddr, daddr, ports, dif)) goto hit; } sk = NULL; @@ -313,27 +313,28 @@ static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb) /* called with local bh disabled */ static int __tcp_v4_check_established(struct sock *sk, __u16 lport, - struct tcp_tw_bucket **twp) + struct inet_timewait_sock **twp) { struct inet_sock *inet = inet_sk(sk); u32 daddr = inet->rcv_saddr; u32 saddr = inet->daddr; int dif = sk->sk_bound_dev_if; - TCP_V4_ADDR_COOKIE(acookie, saddr, daddr) - __u32 ports = TCP_COMBINED_PORTS(inet->dport, lport); + INET_ADDR_COOKIE(acookie, saddr, daddr) + const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport); const int hash = inet_ehashfn(daddr, lport, saddr, inet->dport, tcp_hashinfo.ehash_size); struct inet_ehash_bucket *head = &tcp_hashinfo.ehash[hash]; struct sock *sk2; - struct hlist_node *node; - struct tcp_tw_bucket *tw; + const struct hlist_node *node; + struct inet_timewait_sock *tw; write_lock(&head->lock); /* Check TIME-WAIT sockets first. */ sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) { - tw = (struct tcp_tw_bucket *)sk2; + tw = inet_twsk(sk2); - if (TCP_IPV4_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) { + if (INET_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) { + const struct tcp_timewait_sock *tcptw = tcp_twsk(sk2); struct tcp_sock *tp = tcp_sk(sk); /* With PAWS, it is safe from the viewpoint @@ -350,15 +351,15 @@ static int __tcp_v4_check_established(struct sock *sk, __u16 lport, fall back to VJ's scheme and use initial timestamp retrieved from peer table. */ - if (tw->tw_ts_recent_stamp && + if (tcptw->tw_ts_recent_stamp && (!twp || (sysctl_tcp_tw_reuse && xtime.tv_sec - - tw->tw_ts_recent_stamp > 1))) { - if ((tp->write_seq = - tw->tw_snd_nxt + 65535 + 2) == 0) + tcptw->tw_ts_recent_stamp > 1))) { + tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2; + if (tp->write_seq == 0) tp->write_seq = 1; - tp->rx_opt.ts_recent = tw->tw_ts_recent; - tp->rx_opt.ts_recent_stamp = tw->tw_ts_recent_stamp; + tp->rx_opt.ts_recent = tcptw->tw_ts_recent; + tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; sock_hold(sk2); goto unique; } else @@ -369,7 +370,7 @@ static int __tcp_v4_check_established(struct sock *sk, __u16 lport, /* And established part... 
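 * The scan below only has to prove 4-tuple uniqueness; when nothing
 * INET_MATCHes, the new socket can be hashed at the unique: label while
 * head->lock is still write-held.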
*/ sk_for_each(sk2, node, &head->chain) { - if (TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif)) + if (INET_MATCH(sk2, acookie, saddr, daddr, ports, dif)) goto not_unique; } @@ -392,7 +393,7 @@ unique: tcp_tw_deschedule(tw); NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); - tcp_tw_put(tw); + inet_twsk_put(tw); } return 0; @@ -429,7 +430,7 @@ static inline int tcp_v4_hash_connect(struct sock *sk) static u32 hint; u32 offset = hint + connect_port_offset(sk); struct hlist_node *node; - struct tcp_tw_bucket *tw = NULL; + struct inet_timewait_sock *tw = NULL; local_bh_disable(); for (i = 1; i <= range; i++) { @@ -482,7 +483,7 @@ ok: if (tw) { tcp_tw_deschedule(tw); - tcp_tw_put(tw); + inet_twsk_put(tw); } ret = 0; @@ -757,7 +758,7 @@ void tcp_v4_err(struct sk_buff *skb, u32 info) return; } if (sk->sk_state == TCP_TIME_WAIT) { - tcp_tw_put((struct tcp_tw_bucket *)sk); + inet_twsk_put((struct inet_timewait_sock *)sk); return; } @@ -1002,12 +1003,13 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) { - struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk; + struct inet_timewait_sock *tw = inet_twsk(sk); + const struct tcp_timewait_sock *tcptw = tcp_twsk(sk); - tcp_v4_send_ack(skb, tw->tw_snd_nxt, tw->tw_rcv_nxt, - tw->tw_rcv_wnd >> tw->tw_rcv_wscale, tw->tw_ts_recent); + tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, + tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcptw->tw_ts_recent); - tcp_tw_put(tw); + inet_twsk_put(tw); } static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req) @@ -1368,7 +1370,7 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) bh_lock_sock(nsk); return nsk; } - tcp_tw_put((struct tcp_tw_bucket *)nsk); + inet_twsk_put((struct inet_timewait_sock *)nsk); return NULL; } @@ -1557,25 +1559,25 @@ discard_and_relse: do_time_wait: if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { - tcp_tw_put((struct tcp_tw_bucket *) sk); + inet_twsk_put((struct inet_timewait_sock *) sk); goto discard_it; } if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) { TCP_INC_STATS_BH(TCP_MIB_INERRS); - tcp_tw_put((struct tcp_tw_bucket *) sk); + inet_twsk_put((struct inet_timewait_sock *) sk); goto discard_it; } - switch (tcp_timewait_state_process((struct tcp_tw_bucket *)sk, - skb, th, skb->len)) { + switch (tcp_timewait_state_process((struct inet_timewait_sock *)sk, + skb, th)) { case TCP_TW_SYN: { struct sock *sk2 = inet_lookup_listener(&tcp_hashinfo, skb->nh.iph->daddr, ntohs(th->dest), tcp_v4_iif(skb)); if (sk2) { - tcp_tw_deschedule((struct tcp_tw_bucket *)sk); - tcp_tw_put((struct tcp_tw_bucket *)sk); + tcp_tw_deschedule((struct inet_timewait_sock *)sk); + inet_twsk_put((struct inet_timewait_sock *)sk); sk = sk2; goto process; } @@ -1639,18 +1641,18 @@ int tcp_v4_remember_stamp(struct sock *sk) return 0; } -int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw) +int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw) { - struct inet_peer *peer = NULL; - - peer = inet_getpeer(tw->tw_daddr, 1); + struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1); if (peer) { - if ((s32)(peer->tcp_ts - tw->tw_ts_recent) <= 0 || + const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); + + if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 || (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec && - peer->tcp_ts_stamp <= tw->tw_ts_recent_stamp)) { - peer->tcp_ts_stamp = tw->tw_ts_recent_stamp; - peer->tcp_ts = tw->tw_ts_recent; + 
peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) { + peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp; + peer->tcp_ts = tcptw->tw_ts_recent; } inet_putpeer(peer); return 1; @@ -1758,13 +1760,13 @@ EXPORT_SYMBOL(tcp_v4_destroy_sock); #ifdef CONFIG_PROC_FS /* Proc filesystem TCP sock list dumping. */ -static inline struct tcp_tw_bucket *tw_head(struct hlist_head *head) +static inline struct inet_timewait_sock *tw_head(struct hlist_head *head) { return hlist_empty(head) ? NULL : - list_entry(head->first, struct tcp_tw_bucket, tw_node); + list_entry(head->first, struct inet_timewait_sock, tw_node); } -static inline struct tcp_tw_bucket *tw_next(struct tcp_tw_bucket *tw) +static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw) { return tw->tw_node.next ? hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL; @@ -1860,7 +1862,7 @@ static void *established_get_first(struct seq_file *seq) for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) { struct sock *sk; struct hlist_node *node; - struct tcp_tw_bucket *tw; + struct inet_timewait_sock *tw; /* We can reschedule _before_ having picked the target: */ cond_resched_softirq(); @@ -1874,8 +1876,8 @@ static void *established_get_first(struct seq_file *seq) goto out; } st->state = TCP_SEQ_STATE_TIME_WAIT; - tw_for_each(tw, node, - &tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain) { + inet_twsk_for_each(tw, node, + &tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain) { if (tw->tw_family != st->family) { continue; } @@ -1892,7 +1894,7 @@ out: static void *established_get_next(struct seq_file *seq, void *cur) { struct sock *sk = cur; - struct tcp_tw_bucket *tw; + struct inet_timewait_sock *tw; struct hlist_node *node; struct tcp_iter_state* st = seq->private; @@ -2159,7 +2161,7 @@ static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i) tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh); } -static void get_timewait4_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i) +static void get_timewait4_sock(struct inet_timewait_sock *tw, char *tmpbuf, int i) { unsigned int dest, src; __u16 destp, srcp; @@ -2261,6 +2263,7 @@ struct proto tcp_prot = { .sysctl_rmem = sysctl_tcp_rmem, .max_header = MAX_TCP_HEADER, .obj_size = sizeof(struct tcp_sock), + .twsk_obj_size = sizeof(struct tcp_timewait_sock), .rsk_prot = &tcp_request_sock_ops, }; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index f29e2f6ebe1..5b5a49335fb 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -41,7 +41,7 @@ int sysctl_tcp_max_tw_buckets = NR_FILE*2; int sysctl_tcp_syncookies = SYNC_INIT; int sysctl_tcp_abort_on_overflow; -static void tcp_tw_schedule(struct tcp_tw_bucket *tw, int timeo); +static void tcp_tw_schedule(struct inet_timewait_sock *tw, int timeo); static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) { @@ -58,7 +58,7 @@ int tcp_tw_count; /* Must be called with locally disabled BHs. 
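 * Callers either run from the TIME_WAIT timers, where softirq context
 * already has BHs off, or bracket the call themselves; a minimal sketch
 * of the direct-call pattern (hypothetical caller):
 *
 *	local_bh_disable();
 *	tcp_timewait_kill(tw);
 *	local_bh_enable();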
*/ -static void tcp_timewait_kill(struct tcp_tw_bucket *tw) +static void tcp_timewait_kill(struct inet_timewait_sock *tw) { struct inet_bind_hashbucket *bhead; struct inet_bind_bucket *tb; @@ -85,11 +85,11 @@ static void tcp_timewait_kill(struct tcp_tw_bucket *tw) #ifdef SOCK_REFCNT_DEBUG if (atomic_read(&tw->tw_refcnt) != 1) { - printk(KERN_DEBUG "tw_bucket %p refcnt=%d\n", tw, - atomic_read(&tw->tw_refcnt)); + printk(KERN_DEBUG "%s timewait_sock %p refcnt=%d\n", + tw->tw_prot->name, tw, atomic_read(&tw->tw_refcnt)); } #endif - tcp_tw_put(tw); + inet_twsk_put(tw); } /* @@ -121,19 +121,20 @@ static void tcp_timewait_kill(struct tcp_tw_bucket *tw) * to avoid misread sequence numbers, states etc. --ANK */ enum tcp_tw_status -tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb, - struct tcphdr *th, unsigned len) +tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, + const struct tcphdr *th) { + struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); struct tcp_options_received tmp_opt; int paws_reject = 0; tmp_opt.saw_tstamp = 0; - if (th->doff > (sizeof(struct tcphdr) >> 2) && tw->tw_ts_recent_stamp) { + if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) { tcp_parse_options(skb, &tmp_opt, 0); if (tmp_opt.saw_tstamp) { - tmp_opt.ts_recent = tw->tw_ts_recent; - tmp_opt.ts_recent_stamp = tw->tw_ts_recent_stamp; + tmp_opt.ts_recent = tcptw->tw_ts_recent; + tmp_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; paws_reject = tcp_paws_check(&tmp_opt, th->rst); } } @@ -144,20 +145,20 @@ tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb, /* Out of window, send ACK */ if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, - tw->tw_rcv_nxt, - tw->tw_rcv_nxt + tw->tw_rcv_wnd)) + tcptw->tw_rcv_nxt, + tcptw->tw_rcv_nxt + tcptw->tw_rcv_wnd)) return TCP_TW_ACK; if (th->rst) goto kill; - if (th->syn && !before(TCP_SKB_CB(skb)->seq, tw->tw_rcv_nxt)) + if (th->syn && !before(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt)) goto kill_with_rst; /* Dup ACK? */ - if (!after(TCP_SKB_CB(skb)->end_seq, tw->tw_rcv_nxt) || + if (!after(TCP_SKB_CB(skb)->end_seq, tcptw->tw_rcv_nxt) || TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) { - tcp_tw_put(tw); + inet_twsk_put(tw); return TCP_TW_SUCCESS; } @@ -165,19 +166,19 @@ tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb, * reset. */ if (!th->fin || - TCP_SKB_CB(skb)->end_seq != tw->tw_rcv_nxt + 1) { + TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1) { kill_with_rst: tcp_tw_deschedule(tw); - tcp_tw_put(tw); + inet_twsk_put(tw); return TCP_TW_RST; } /* FIN arrived, enter true time-wait state. */ - tw->tw_substate = TCP_TIME_WAIT; - tw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq; + tw->tw_substate = TCP_TIME_WAIT; + tcptw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq; if (tmp_opt.saw_tstamp) { - tw->tw_ts_recent_stamp = xtime.tv_sec; - tw->tw_ts_recent = tmp_opt.rcv_tsval; + tcptw->tw_ts_recent_stamp = xtime.tv_sec; + tcptw->tw_ts_recent = tmp_opt.rcv_tsval; } /* I am shamed, but failed to make it more elegant. @@ -186,7 +187,7 @@ kill_with_rst: * do not undertsnad recycling in any case, it not * a big problem in practice. 
--ANK */ if (tw->tw_family == AF_INET && - sysctl_tcp_tw_recycle && tw->tw_ts_recent_stamp && + sysctl_tcp_tw_recycle && tcptw->tw_ts_recent_stamp && tcp_v4_tw_remember_stamp(tw)) tcp_tw_schedule(tw, tw->tw_timeout); else @@ -212,7 +213,7 @@ kill_with_rst: */ if (!paws_reject && - (TCP_SKB_CB(skb)->seq == tw->tw_rcv_nxt && + (TCP_SKB_CB(skb)->seq == tcptw->tw_rcv_nxt && (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) { /* In window segment, it may be only reset or bare ack. */ @@ -224,18 +225,18 @@ kill_with_rst: if (sysctl_tcp_rfc1337 == 0) { kill: tcp_tw_deschedule(tw); - tcp_tw_put(tw); + inet_twsk_put(tw); return TCP_TW_SUCCESS; } } tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN); if (tmp_opt.saw_tstamp) { - tw->tw_ts_recent = tmp_opt.rcv_tsval; - tw->tw_ts_recent_stamp = xtime.tv_sec; + tcptw->tw_ts_recent = tmp_opt.rcv_tsval; + tcptw->tw_ts_recent_stamp = xtime.tv_sec; } - tcp_tw_put(tw); + inet_twsk_put(tw); return TCP_TW_SUCCESS; } @@ -257,9 +258,10 @@ kill: */ if (th->syn && !th->rst && !th->ack && !paws_reject && - (after(TCP_SKB_CB(skb)->seq, tw->tw_rcv_nxt) || - (tmp_opt.saw_tstamp && (s32)(tw->tw_ts_recent - tmp_opt.rcv_tsval) < 0))) { - u32 isn = tw->tw_snd_nxt + 65535 + 2; + (after(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt) || + (tmp_opt.saw_tstamp && + (s32)(tcptw->tw_ts_recent - tmp_opt.rcv_tsval) < 0))) { + u32 isn = tcptw->tw_snd_nxt + 65535 + 2; if (isn == 0) isn++; TCP_SKB_CB(skb)->when = isn; @@ -284,7 +286,7 @@ kill: */ return TCP_TW_ACK; } - tcp_tw_put(tw); + inet_twsk_put(tw); return TCP_TW_SUCCESS; } @@ -293,7 +295,7 @@ kill: * relevant info into it from the SK, and mess with hash chains * and list linkage. */ -static void __tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *tw) +static void __tcp_tw_hashdance(struct sock *sk, struct inet_timewait_sock *tw) { const struct inet_sock *inet = inet_sk(sk); struct inet_ehash_bucket *ehead = &tcp_hashinfo.ehash[sk->sk_hashent]; @@ -306,7 +308,7 @@ static void __tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *tw) spin_lock(&bhead->lock); tw->tw_tb = inet->bind_hash; BUG_TRAP(inet->bind_hash); - tw_add_bind_node(tw, &tw->tw_tb->owners); + inet_twsk_add_bind_node(tw, &tw->tw_tb->owners); spin_unlock(&bhead->lock); write_lock(&ehead->lock); @@ -316,7 +318,7 @@ static void __tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *tw) sock_prot_dec_use(sk->sk_prot); /* Step 3: Hash TW into TIMEWAIT half of established hash table. 
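 * Dance summary: step 1 shares sk's bind bucket, step 2 unhashes sk from
 * chain i, step 3 adds tw to chain i + ehash_size; steps 2 and 3 sit
 * under the same ehead write lock, so a lookup sees either the socket or
 * its TIME_WAIT twin, never neither.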
*/ - tw_add_node(tw, &(ehead + tcp_hashinfo.ehash_size)->chain); + inet_twsk_add_node(tw, &(ehead + tcp_hashinfo.ehash_size)->chain); atomic_inc(&tw->tw_refcnt); write_unlock(&ehead->lock); @@ -327,19 +329,23 @@ static void __tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *tw) */ void tcp_time_wait(struct sock *sk, int state, int timeo) { - struct tcp_tw_bucket *tw = NULL; - struct tcp_sock *tp = tcp_sk(sk); + struct inet_timewait_sock *tw = NULL; + const struct tcp_sock *tp = tcp_sk(sk); int recycle_ok = 0; if (sysctl_tcp_tw_recycle && tp->rx_opt.ts_recent_stamp) recycle_ok = tp->af_specific->remember_stamp(sk); if (tcp_tw_count < sysctl_tcp_max_tw_buckets) - tw = kmem_cache_alloc(tcp_timewait_cachep, SLAB_ATOMIC); + tw = kmem_cache_alloc(sk->sk_prot_creator->twsk_slab, SLAB_ATOMIC); + + if (tw != NULL) { + struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); + const struct inet_sock *inet = inet_sk(sk); + const int rto = (tp->rto << 2) - (tp->rto >> 1); - if(tw != NULL) { - struct inet_sock *inet = inet_sk(sk); - int rto = (tp->rto<<2) - (tp->rto>>1); + /* Remember our protocol */ + tw->tw_prot = sk->sk_prot_creator; /* Give us an identity. */ tw->tw_daddr = inet->daddr; @@ -356,25 +362,23 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) atomic_set(&tw->tw_refcnt, 1); tw->tw_hashent = sk->sk_hashent; - tw->tw_rcv_nxt = tp->rcv_nxt; - tw->tw_snd_nxt = tp->snd_nxt; - tw->tw_rcv_wnd = tcp_receive_window(tp); - tw->tw_ts_recent = tp->rx_opt.ts_recent; - tw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp; - tw_dead_node_init(tw); + tcptw->tw_rcv_nxt = tp->rcv_nxt; + tcptw->tw_snd_nxt = tp->snd_nxt; + tcptw->tw_rcv_wnd = tcp_receive_window(tp); + tcptw->tw_ts_recent = tp->rx_opt.ts_recent; + tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp; + inet_twsk_dead_node_init(tw); #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) if (tw->tw_family == PF_INET6) { struct ipv6_pinfo *np = inet6_sk(sk); + struct tcp6_timewait_sock *tcp6tw = tcp6_twsk((struct sock *)tw); - ipv6_addr_copy(&tw->tw_v6_daddr, &np->daddr); - ipv6_addr_copy(&tw->tw_v6_rcv_saddr, &np->rcv_saddr); - tw->tw_v6_ipv6only = np->ipv6only; - } else { - memset(&tw->tw_v6_daddr, 0, sizeof(tw->tw_v6_daddr)); - memset(&tw->tw_v6_rcv_saddr, 0, sizeof(tw->tw_v6_rcv_saddr)); - tw->tw_v6_ipv6only = 0; - } + ipv6_addr_copy(&tcp6tw->tw_v6_daddr, &np->daddr); + ipv6_addr_copy(&tcp6tw->tw_v6_rcv_saddr, &np->rcv_saddr); + tw->tw_ipv6only = np->ipv6only; + } else + tw->tw_ipv6only = 0; #endif /* Linkage updates. */ __tcp_tw_hashdance(sk, tw); @@ -392,7 +396,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) } tcp_tw_schedule(tw, timeo); - tcp_tw_put(tw); + inet_twsk_put(tw); } else { /* Sorry, if we're out of memory, just CLOSE this * socket up. We've got bigger problems than @@ -427,7 +431,7 @@ static u32 twkill_thread_slots; /* Returns non-zero if quota exceeded. 
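 * Timer-side usage sketch, assuming the deferral scheme visible below
 * via twkill_thread_slots and twkill_work:
 *
 *	if (tcp_do_twkill_work(slot, quota))
 *		mark the slot in twkill_thread_slots and let the
 *		twkill_work workqueue finish it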
*/ static int tcp_do_twkill_work(int slot, unsigned int quota) { - struct tcp_tw_bucket *tw; + struct inet_timewait_sock *tw; struct hlist_node *node; unsigned int killed; int ret; @@ -441,11 +445,11 @@ static int tcp_do_twkill_work(int slot, unsigned int quota) killed = 0; ret = 0; rescan: - tw_for_each_inmate(tw, node, &tcp_tw_death_row[slot]) { - __tw_del_dead_node(tw); + inet_twsk_for_each_inmate(tw, node, &tcp_tw_death_row[slot]) { + __inet_twsk_del_dead_node(tw); spin_unlock(&tw_death_lock); tcp_timewait_kill(tw); - tcp_tw_put(tw); + inet_twsk_put(tw); killed++; spin_lock(&tw_death_lock); if (killed > quota) { @@ -531,11 +535,11 @@ static void twkill_work(void *dummy) */ /* This is for handling early-kills of TIME_WAIT sockets. */ -void tcp_tw_deschedule(struct tcp_tw_bucket *tw) +void tcp_tw_deschedule(struct inet_timewait_sock *tw) { spin_lock(&tw_death_lock); - if (tw_del_dead_node(tw)) { - tcp_tw_put(tw); + if (inet_twsk_del_dead_node(tw)) { + inet_twsk_put(tw); if (--tcp_tw_count == 0) del_timer(&tcp_tw_timer); } @@ -552,7 +556,7 @@ static struct timer_list tcp_twcal_timer = TIMER_INITIALIZER(tcp_twcal_tick, 0, 0); static struct hlist_head tcp_twcal_row[TCP_TW_RECYCLE_SLOTS]; -static void tcp_tw_schedule(struct tcp_tw_bucket *tw, int timeo) +static void tcp_tw_schedule(struct inet_timewait_sock *tw, const int timeo) { struct hlist_head *list; int slot; @@ -586,7 +590,7 @@ static void tcp_tw_schedule(struct tcp_tw_bucket *tw, int timeo) spin_lock(&tw_death_lock); /* Unlink it, if it was scheduled */ - if (tw_del_dead_node(tw)) + if (inet_twsk_del_dead_node(tw)) tcp_tw_count--; else atomic_inc(&tw->tw_refcnt); @@ -644,13 +648,13 @@ void tcp_twcal_tick(unsigned long dummy) for (n=0; n Date: Tue, 9 Aug 2005 20:09:46 -0700 Subject: [INET]: Generalise the TCP sock ID lookup routines And also some TIME_WAIT functions. [acme@toy net-2.6.14]$ grep built-in /tmp/before.size /tmp/after.size /tmp/before.size: 282955 13122 9312 305389 4a8ed net/ipv4/built-in.o /tmp/after.size: 281566 13122 9312 304000 4a380 net/ipv4/built-in.o [acme@toy net-2.6.14]$ I kept them still inlined, will uninline at some point to see what would be the performance difference. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. 
Miller --- net/ipv4/Makefile | 1 + net/ipv4/inet_hashtables.c | 2 ++ net/ipv4/inet_timewait_sock.c | 83 +++++++++++++++++++++++++++++++++++++++++++ net/ipv4/tcp_diag.c | 8 ++--- net/ipv4/tcp_ipv4.c | 83 +++++-------------------------------------- net/ipv4/tcp_minisocks.c | 78 +++------------------------------------- 6 files changed, 101 insertions(+), 154 deletions(-) create mode 100644 net/ipv4/inet_timewait_sock.c (limited to 'net/ipv4') diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 2d8d30e83eb..6650d18e400 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -5,6 +5,7 @@ obj-y := route.o inetpeer.o protocol.o \ ip_input.o ip_fragment.o ip_forward.o ip_options.o \ ip_output.o ip_sockglue.o inet_hashtables.o \ + inet_timewait_sock.o \ tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \ tcp_minisocks.o tcp_cong.o \ datagram.o raw.o udp.o arp.o icmp.o devinet.o af_inet.o igmp.o \ diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 88fcba05b7d..d94e962958a 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -162,3 +162,5 @@ struct sock *__inet_lookup_listener(const struct hlist_head *head, const u32 dad } return result; } + +EXPORT_SYMBOL_GPL(__inet_lookup_listener); diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c new file mode 100644 index 00000000000..d38d160faeb --- /dev/null +++ b/net/ipv4/inet_timewait_sock.c @@ -0,0 +1,83 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Generic TIME_WAIT sockets functions + * + * From code originally in TCP + */ + +#include + +#include +#include + +/* Must be called with locally disabled BHs. */ +void __inet_twsk_kill(struct inet_timewait_sock *tw, struct inet_hashinfo *hashinfo) +{ + struct inet_bind_hashbucket *bhead; + struct inet_bind_bucket *tb; + /* Unlink from established hashes. */ + struct inet_ehash_bucket *ehead = &hashinfo->ehash[tw->tw_hashent]; + + write_lock(&ehead->lock); + if (hlist_unhashed(&tw->tw_node)) { + write_unlock(&ehead->lock); + return; + } + __hlist_del(&tw->tw_node); + sk_node_init(&tw->tw_node); + write_unlock(&ehead->lock); + + /* Disassociate with bind bucket. */ + bhead = &hashinfo->bhash[inet_bhashfn(tw->tw_num, hashinfo->bhash_size)]; + spin_lock(&bhead->lock); + tb = tw->tw_tb; + __hlist_del(&tw->tw_bind_node); + tw->tw_tb = NULL; + inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); + spin_unlock(&bhead->lock); +#ifdef SOCK_REFCNT_DEBUG + if (atomic_read(&tw->tw_refcnt) != 1) { + printk(KERN_DEBUG "%s timewait_sock %p refcnt=%d\n", + tw->tw_prot->name, tw, atomic_read(&tw->tw_refcnt)); + } +#endif + inet_twsk_put(tw); +} + +/* + * Enter the time wait state. This is called with locally disabled BH. + * Essentially we whip up a timewait bucket, copy the relevant info into it + * from the SK, and mess with hash chains and list linkage. + */ +void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, + struct inet_hashinfo *hashinfo) +{ + const struct inet_sock *inet = inet_sk(sk); + struct inet_ehash_bucket *ehead = &hashinfo->ehash[sk->sk_hashent]; + struct inet_bind_hashbucket *bhead; + /* Step 1: Put TW into bind hash. Original socket stays there too. Note, that any socket with inet->num != 0 MUST be bound in binding cache, even if it is closed.
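What makes the two functions just added generic is visible in their signatures alone: the hash-table context that the TCP versions reached through the file-scope tcp_hashinfo is now an explicit struct inet_hashinfo argument, so DCCP (or any other INET transport) can run the same TIME_WAIT code against its own tables. The shape of the change, sketched with illustrative my_* stand-ins:

    struct my_ehash_bucket;                      /* opaque for the sketch */
    struct my_tw;

    struct my_hashinfo {
            struct my_ehash_bucket *ehash;       /* established + TIME_WAIT halves */
            unsigned int            ehash_size;
    };

    /* Before: static void tcp_timewait_kill(struct my_tw *tw);
     *         only usable by TCP, since it reads the global tcp_hashinfo.
     * After:  the caller supplies the context instead:                   */
    void my_twsk_kill(struct my_tw *tw, struct my_hashinfo *hashinfo);

TCP keeps its behaviour by passing &tcp_hashinfo at every call site, as the tcp_minisocks.c hunks below show.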
+ */ + bhead = &hashinfo->bhash[inet_bhashfn(inet->num, hashinfo->bhash_size)]; + spin_lock(&bhead->lock); + tw->tw_tb = inet->bind_hash; + BUG_TRAP(inet->bind_hash); + inet_twsk_add_bind_node(tw, &tw->tw_tb->owners); + spin_unlock(&bhead->lock); + + write_lock(&ehead->lock); + + /* Step 2: Remove SK from established hash. */ + if (__sk_del_node_init(sk)) + sock_prot_dec_use(sk->sk_prot); + + /* Step 3: Hash TW into TIMEWAIT half of established hash table. */ + inet_twsk_add_node(tw, &(ehead + hashinfo->ehash_size)->chain); + atomic_inc(&tw->tw_refcnt); + + write_unlock(&ehead->lock); +} diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index 6f2d6f2276b..60c6a797cc5 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -174,8 +174,6 @@ nlmsg_failure: return -1; } -extern struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport, - int dif); #ifdef CONFIG_IP_TCPDIAG_IPV6 extern struct sock *tcp_v6_lookup(struct in6_addr *saddr, u16 sport, struct in6_addr *daddr, u16 dport, @@ -197,9 +195,9 @@ static int tcpdiag_get_exact(struct sk_buff *in_skb, const struct nlmsghdr *nlh) struct sk_buff *rep; if (req->tcpdiag_family == AF_INET) { - sk = tcp_v4_lookup(req->id.tcpdiag_dst[0], req->id.tcpdiag_dport, - req->id.tcpdiag_src[0], req->id.tcpdiag_sport, - req->id.tcpdiag_if); + sk = inet_lookup(&tcp_hashinfo, req->id.tcpdiag_dst[0], + req->id.tcpdiag_dport, req->id.tcpdiag_src[0], + req->id.tcpdiag_sport, req->id.tcpdiag_if); } #ifdef CONFIG_IP_TCPDIAG_IPV6 else if (req->tcpdiag_family == AF_INET6) { diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index ce423e48ebe..e7e91e60ac7 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -238,71 +238,6 @@ void tcp_unhash(struct sock *sk) inet_unhash(&tcp_hashinfo, sk); } -/* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so - * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM - * - * Local BH must be disabled here. - */ - -static inline struct sock *__tcp_v4_lookup_established(const u32 saddr, - const u16 sport, - const u32 daddr, - const u16 hnum, - const int dif) -{ - struct inet_ehash_bucket *head; - INET_ADDR_COOKIE(acookie, saddr, daddr) - const __u32 ports = INET_COMBINED_PORTS(sport, hnum); - struct sock *sk; - const struct hlist_node *node; - /* Optimize here for direct hit, only listening connections can - * have wildcards anyways. - */ - const int hash = inet_ehashfn(daddr, hnum, saddr, sport, tcp_hashinfo.ehash_size); - head = &tcp_hashinfo.ehash[hash]; - read_lock(&head->lock); - sk_for_each(sk, node, &head->chain) { - if (INET_MATCH(sk, acookie, saddr, daddr, ports, dif)) - goto hit; /* You sunk my battleship! */ - } - - /* Must check for a TIME_WAIT'er before going to listener hash. */ - sk_for_each(sk, node, &(head + tcp_hashinfo.ehash_size)->chain) { - if (INET_TW_MATCH(sk, acookie, saddr, daddr, ports, dif)) - goto hit; - } - sk = NULL; -out: - read_unlock(&head->lock); - return sk; -hit: - sock_hold(sk); - goto out; -} - -static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport, - u32 daddr, u16 hnum, int dif) -{ - struct sock *sk = __tcp_v4_lookup_established(saddr, sport, - daddr, hnum, dif); - - return sk ? 
: inet_lookup_listener(&tcp_hashinfo, daddr, hnum, dif); -} - -inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, - u16 dport, int dif) -{ - struct sock *sk; - - local_bh_disable(); - sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif); - local_bh_enable(); - - return sk; -} - -EXPORT_SYMBOL_GPL(tcp_v4_lookup); - static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb) { return secure_tcp_sequence_number(skb->nh.iph->daddr, @@ -751,8 +686,8 @@ void tcp_v4_err(struct sk_buff *skb, u32 info) return; } - sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr, - th->source, tcp_v4_iif(skb)); + sk = inet_lookup(&tcp_hashinfo, iph->daddr, th->dest, iph->saddr, + th->source, tcp_v4_iif(skb)); if (!sk) { ICMP_INC_STATS_BH(ICMP_MIB_INERRORS); return; @@ -1359,11 +1294,9 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) if (req) return tcp_check_req(sk, skb, req, prev); - nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr, - th->source, - skb->nh.iph->daddr, - ntohs(th->dest), - tcp_v4_iif(skb)); + nsk = __inet_lookup_established(&tcp_hashinfo, skb->nh.iph->saddr, + th->source, skb->nh.iph->daddr, + ntohs(th->dest), tcp_v4_iif(skb)); if (nsk) { if (nsk->sk_state != TCP_TIME_WAIT) { @@ -1505,9 +1438,9 @@ int tcp_v4_rcv(struct sk_buff *skb) TCP_SKB_CB(skb)->flags = skb->nh.iph->tos; TCP_SKB_CB(skb)->sacked = 0; - sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source, - skb->nh.iph->daddr, ntohs(th->dest), - tcp_v4_iif(skb)); + sk = __inet_lookup(&tcp_hashinfo, skb->nh.iph->saddr, th->source, + skb->nh.iph->daddr, ntohs(th->dest), + tcp_v4_iif(skb)); if (!sk) goto no_tcp_socket; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 5b5a49335fb..4112f7a6d10 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -56,42 +56,6 @@ static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) int tcp_tw_count; - -/* Must be called with locally disabled BHs. */ -static void tcp_timewait_kill(struct inet_timewait_sock *tw) -{ - struct inet_bind_hashbucket *bhead; - struct inet_bind_bucket *tb; - /* Unlink from established hashes. */ - struct inet_ehash_bucket *ehead = &tcp_hashinfo.ehash[tw->tw_hashent]; - - write_lock(&ehead->lock); - if (hlist_unhashed(&tw->tw_node)) { - write_unlock(&ehead->lock); - return; - } - __hlist_del(&tw->tw_node); - sk_node_init(&tw->tw_node); - write_unlock(&ehead->lock); - - /* Disassociate with bind bucket. */ - bhead = &tcp_hashinfo.bhash[inet_bhashfn(tw->tw_num, tcp_hashinfo.bhash_size)]; - spin_lock(&bhead->lock); - tb = tw->tw_tb; - __hlist_del(&tw->tw_bind_node); - tw->tw_tb = NULL; - inet_bind_bucket_destroy(tcp_hashinfo.bind_bucket_cachep, tb); - spin_unlock(&bhead->lock); - -#ifdef SOCK_REFCNT_DEBUG - if (atomic_read(&tw->tw_refcnt) != 1) { - printk(KERN_DEBUG "%s timewait_sock %p refcnt=%d\n", - tw->tw_prot->name, tw, atomic_read(&tw->tw_refcnt)); - } -#endif - inet_twsk_put(tw); -} - /* * * Main purpose of TIME-WAIT state is to close connection gracefully, * when one of ends sits in LAST-ACK or CLOSING retransmitting FIN @@ -290,40 +254,6 @@ kill: return TCP_TW_SUCCESS; } -/* Enter the time wait state. This is called with locally disabled BH. - * Essentially we whip up a timewait bucket, copy the - * relevant info into it from the SK, and mess with hash chains - * and list linkage. 
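Each caller converts mechanically, as the tcp_v4_err() and tcp_v4_rcv() hunks above show: the removed TCP-only entry points are expressible as the generic ones with &tcp_hashinfo bound as the first argument. For instance, anything that still wanted the deleted exported symbol could get it back with a one-line wrapper along these lines (a hypothetical shim for illustration, not part of the patch):

    /* tcp_v4_lookup() is gone after this commit; its body would now be: */
    static inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport,
                                             u32 daddr, u16 dport, int dif)
    {
            return inet_lookup(&tcp_hashinfo, saddr, sport, daddr, dport, dif);
    }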
- */ -static void __tcp_tw_hashdance(struct sock *sk, struct inet_timewait_sock *tw) -{ - const struct inet_sock *inet = inet_sk(sk); - struct inet_ehash_bucket *ehead = &tcp_hashinfo.ehash[sk->sk_hashent]; - struct inet_bind_hashbucket *bhead; - /* Step 1: Put TW into bind hash. Original socket stays there too. - Note, that any socket with inet->num != 0 MUST be bound in - binding cache, even if it is closed. - */ - bhead = &tcp_hashinfo.bhash[inet_bhashfn(inet->num, tcp_hashinfo.bhash_size)]; - spin_lock(&bhead->lock); - tw->tw_tb = inet->bind_hash; - BUG_TRAP(inet->bind_hash); - inet_twsk_add_bind_node(tw, &tw->tw_tb->owners); - spin_unlock(&bhead->lock); - - write_lock(&ehead->lock); - - /* Step 2: Remove SK from established hash. */ - if (__sk_del_node_init(sk)) - sock_prot_dec_use(sk->sk_prot); - - /* Step 3: Hash TW into TIMEWAIT half of established hash table. */ - inet_twsk_add_node(tw, &(ehead + tcp_hashinfo.ehash_size)->chain); - atomic_inc(&tw->tw_refcnt); - - write_unlock(&ehead->lock); -} - /* * Move a socket to time-wait or dead fin-wait-2 state. */ @@ -381,7 +311,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) tw->tw_ipv6only = 0; #endif /* Linkage updates. */ - __tcp_tw_hashdance(sk, tw); + __inet_twsk_hashdance(tw, sk, &tcp_hashinfo); /* Get the TIME_WAIT timeout firing. */ if (timeo < rto) @@ -448,7 +378,7 @@ rescan: inet_twsk_for_each_inmate(tw, node, &tcp_tw_death_row[slot]) { __inet_twsk_del_dead_node(tw); spin_unlock(&tw_death_lock); - tcp_timewait_kill(tw); + __inet_twsk_kill(tw, &tcp_hashinfo); inet_twsk_put(tw); killed++; spin_lock(&tw_death_lock); @@ -544,7 +474,7 @@ void tcp_tw_deschedule(struct inet_timewait_sock *tw) del_timer(&tcp_tw_timer); } spin_unlock(&tw_death_lock); - tcp_timewait_kill(tw); + __inet_twsk_kill(tw, &tcp_hashinfo); } /* Short-time timewait calendar */ @@ -653,7 +583,7 @@ void tcp_twcal_tick(unsigned long dummy) inet_twsk_for_each_inmate_safe(tw, node, safe, &tcp_twcal_row[slot]) { __inet_twsk_del_dead_node(tw); - tcp_timewait_kill(tw); + __inet_twsk_kill(tw, &tcp_hashinfo); inet_twsk_put(tw); killed++; } -- cgit v1.2.3 From c676270bcd25015b978722ec0352c330dcc87883 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 9 Aug 2005 20:09:59 -0700 Subject: [INET_TWSK]: Introduce inet_twsk_alloc With the parts of tcp_time_wait that are not TCP specific, tcp_time_wait uses it and so will dccp_time_wait. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- net/ipv4/inet_timewait_sock.c | 28 ++++++++++++++++++++++++++++ net/ipv4/tcp_minisocks.c | 24 ++---------------------- 2 files changed, 30 insertions(+), 22 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index d38d160faeb..ceb577c7423 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -81,3 +81,31 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, write_unlock(&ehead->lock); } + +struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int state) +{ + struct inet_timewait_sock *tw = kmem_cache_alloc(sk->sk_prot_creator->twsk_slab, + SLAB_ATOMIC); + if (tw != NULL) { + const struct inet_sock *inet = inet_sk(sk); + + /* Give us an identity. 
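Everything inet_twsk_alloc() initialises here is derivable from the generic struct inet_sock, which is exactly why it could leave tcp_minisocks.c. Once the helper lands, a protocol's time-wait entry only has to fill in its own fields; the caller side, in simplified form assembled from the tcp_minisocks.c hunk below, becomes:

    struct inet_timewait_sock *tw = inet_twsk_alloc(sk, state);

    if (tw != NULL) {
            /* generic identity is already set up; only TCP state remains */
            struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);

            tcptw->tw_rcv_nxt = tp->rcv_nxt;
            tcptw->tw_snd_nxt = tp->snd_nxt;
            /* ... timestamps, receive window ... */
    }

dccp_time_wait() gets the same split for free, which is the stated point of the commit.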
*/ + tw->tw_daddr = inet->daddr; + tw->tw_rcv_saddr = inet->rcv_saddr; + tw->tw_bound_dev_if = sk->sk_bound_dev_if; + tw->tw_num = inet->num; + tw->tw_state = TCP_TIME_WAIT; + tw->tw_substate = state; + tw->tw_sport = inet->sport; + tw->tw_dport = inet->dport; + tw->tw_family = sk->sk_family; + tw->tw_reuse = sk->sk_reuse; + tw->tw_hashent = sk->sk_hashent; + tw->tw_ipv6only = 0; + tw->tw_prot = sk->sk_prot_creator; + atomic_set(&tw->tw_refcnt, 1); + inet_twsk_dead_node_init(tw); + } + + return tw; +} diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 4112f7a6d10..66ce1790a94 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -267,37 +267,18 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) recycle_ok = tp->af_specific->remember_stamp(sk); if (tcp_tw_count < sysctl_tcp_max_tw_buckets) - tw = kmem_cache_alloc(sk->sk_prot_creator->twsk_slab, SLAB_ATOMIC); + tw = inet_twsk_alloc(sk, state); if (tw != NULL) { struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); - const struct inet_sock *inet = inet_sk(sk); const int rto = (tp->rto << 2) - (tp->rto >> 1); - /* Remember our protocol */ - tw->tw_prot = sk->sk_prot_creator; - - /* Give us an identity. */ - tw->tw_daddr = inet->daddr; - tw->tw_rcv_saddr = inet->rcv_saddr; - tw->tw_bound_dev_if = sk->sk_bound_dev_if; - tw->tw_num = inet->num; - tw->tw_state = TCP_TIME_WAIT; - tw->tw_substate = state; - tw->tw_sport = inet->sport; - tw->tw_dport = inet->dport; - tw->tw_family = sk->sk_family; - tw->tw_reuse = sk->sk_reuse; tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale; - atomic_set(&tw->tw_refcnt, 1); - - tw->tw_hashent = sk->sk_hashent; tcptw->tw_rcv_nxt = tp->rcv_nxt; tcptw->tw_snd_nxt = tp->snd_nxt; tcptw->tw_rcv_wnd = tcp_receive_window(tp); tcptw->tw_ts_recent = tp->rx_opt.ts_recent; tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp; - inet_twsk_dead_node_init(tw); #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) if (tw->tw_family == PF_INET6) { @@ -307,8 +288,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) ipv6_addr_copy(&tcp6tw->tw_v6_daddr, &np->daddr); ipv6_addr_copy(&tcp6tw->tw_v6_rcv_saddr, &np->rcv_saddr); tw->tw_ipv6only = np->ipv6only; - } else - tw->tw_ipv6only = 0; + } #endif /* Linkage updates. */ __inet_twsk_hashdance(tw, sk, &tcp_hashinfo); -- cgit v1.2.3 From 87d11ceb9deb7a3f13fdee6e89d9bb6be7d27a71 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 9 Aug 2005 20:10:12 -0700 Subject: [SOCK]: Introduce sk_clone Out of tcp_create_openreq_child, will be used in dccp_create_openreq_child, and is a nice sock function anyway. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- net/ipv4/tcp_minisocks.c | 70 +++--------------------------------------------- 1 file changed, 3 insertions(+), 67 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 66ce1790a94..8b6cd8d8066 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -599,67 +599,26 @@ out: */ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct sk_buff *skb) { - /* allocate the newsk from the same slab of the master sock, - * if not, at sk_free time we'll try to free it from the wrong - * slabcache (i.e. 
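The comment being deleted in this hunk documents the trap that sk_clone closes: the child socket must come from the same slab cache as its parent (hence sk_prot_creator), or the eventual sk_free() would return memory to the wrong cache. The clone idiom itself is copy-everything-then-reinitialise-the-per-instance-parts, sketched below as a plain C stand-in (struct conn and conn_clone are illustrative, not the kernel function):

    #include <stdlib.h>
    #include <string.h>

    struct conn {
            int refcnt;
            int state;
            /* ... a large template inherited from the parent ... */
    };

    static struct conn *conn_clone(const struct conn *parent)
    {
            /* the kernel allocates from the parent's own slab cache here */
            struct conn *child = malloc(sizeof(*child));

            if (child != NULL) {
                    memcpy(child, parent, sizeof(*child)); /* inherit template */
                    child->refcnt = 2;                     /* per-instance reinit */
                    /* locks, queues and callbacks are re-initialised as well */
            }
            return child;
    }

The dozens of deleted lines below are precisely that per-instance reinitialisation, absorbed into sk_clone where DCCP can share it.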
is it TCPv4 or v6?), this is handled thru sk->sk_prot -acme */ - struct sock *newsk = sk_alloc(PF_INET, GFP_ATOMIC, sk->sk_prot, 0); + struct sock *newsk = sk_clone(sk, GFP_ATOMIC); - if(newsk != NULL) { + if (newsk != NULL) { struct inet_request_sock *ireq = inet_rsk(req); struct tcp_request_sock *treq = tcp_rsk(req); struct inet_sock *newinet = inet_sk(newsk); struct tcp_sock *newtp; - struct sk_filter *filter; - memcpy(newsk, sk, sizeof(struct tcp_sock)); newsk->sk_state = TCP_SYN_RECV; - - /* SANITY */ - sk_node_init(&newsk->sk_node); newinet->bind_hash = NULL; /* Clone the TCP header template */ newinet->dport = ireq->rmt_port; - - sock_lock_init(newsk); - bh_lock_sock(newsk); - - rwlock_init(&newsk->sk_dst_lock); - newsk->sk_dst_cache = NULL; - atomic_set(&newsk->sk_rmem_alloc, 0); - skb_queue_head_init(&newsk->sk_receive_queue); - atomic_set(&newsk->sk_wmem_alloc, 0); - skb_queue_head_init(&newsk->sk_write_queue); - atomic_set(&newsk->sk_omem_alloc, 0); - newsk->sk_wmem_queued = 0; - newsk->sk_forward_alloc = 0; - - sock_reset_flag(newsk, SOCK_DONE); - newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; - newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL; - newsk->sk_send_head = NULL; - rwlock_init(&newsk->sk_callback_lock); - skb_queue_head_init(&newsk->sk_error_queue); newsk->sk_write_space = sk_stream_write_space; - if ((filter = newsk->sk_filter) != NULL) - sk_filter_charge(newsk, filter); - - if (unlikely(xfrm_sk_clone_policy(newsk))) { - /* It is still raw copy of parent, so invalidate - * destructor and make plain sk_free() */ - newsk->sk_destruct = NULL; - sk_free(newsk); - return NULL; - } - /* Now setup tcp_sock */ newtp = tcp_sk(newsk); newtp->pred_flags = 0; newtp->rcv_nxt = treq->rcv_isn + 1; - newtp->snd_nxt = treq->snt_isn + 1; - newtp->snd_una = treq->snt_isn + 1; - newtp->snd_sml = treq->snt_isn + 1; + newtp->snd_nxt = newtp->snd_una = newtp->snd_sml = treq->snt_isn + 1; tcp_prequeue_init(newtp); @@ -710,32 +669,9 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, /* Deinitialize accept_queue to trap illegal accesses. */ memset(&newtp->accept_queue, 0, sizeof(newtp->accept_queue)); - /* Back to base struct sock members. */ - newsk->sk_err = 0; - newsk->sk_priority = 0; - atomic_set(&newsk->sk_refcnt, 2); - - /* - * Increment the counter in the same struct proto as the master - * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that - * is the same as sk->sk_prot->socks, as this field was copied - * with memcpy), same rationale as the first comment in this - * function. - * - * This _changes_ the previous behaviour, where - * tcp_create_openreq_child always was incrementing the - * equivalent to tcp_prot->socks (inet_sock_nr), so this have - * to be taken into account in all callers. 
-acme - */ - sk_refcnt_debug_inc(newsk); - - atomic_inc(&tcp_sockets_allocated); - if (sock_flag(newsk, SOCK_KEEPOPEN)) tcp_reset_keepalive_timer(newsk, keepalive_time_when(newtp)); - newsk->sk_socket = NULL; - newsk->sk_sleep = NULL; newtp->rx_opt.tstamp_ok = ireq->tstamp_ok; if((newtp->rx_opt.sack_ok = ireq->sack_ok) != 0) { -- cgit v1.2.3 From 463c84b97f24010a67cd871746d6a7e4c925a5f9 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 9 Aug 2005 20:10:42 -0700 Subject: [NET]: Introduce inet_connection_sock This creates struct inet_connection_sock, moving members out of struct tcp_sock that are shareable with other INET connection oriented protocols, such as DCCP, that in my private tree already uses most of these members. The functions that operate on these members were renamed, using a inet_csk_ prefix while not being moved yet to a new file, so as to ease the review of these changes. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- net/ipv4/inet_hashtables.c | 15 ++- net/ipv4/inet_timewait_sock.c | 5 +- net/ipv4/syncookies.c | 2 +- net/ipv4/tcp.c | 90 +++++++------- net/ipv4/tcp_diag.c | 21 ++-- net/ipv4/tcp_input.c | 266 ++++++++++++++++++++++-------------------- net/ipv4/tcp_ipv4.c | 158 +++++++++++++------------ net/ipv4/tcp_minisocks.c | 28 ++--- net/ipv4/tcp_output.c | 86 +++++++------- net/ipv4/tcp_timer.c | 165 +++++++++++++------------- 10 files changed, 440 insertions(+), 396 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index d94e962958a..e8d29fe736d 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -19,6 +19,7 @@ #include #include +#include #include /* @@ -56,10 +57,9 @@ void inet_bind_bucket_destroy(kmem_cache_t *cachep, struct inet_bind_bucket *tb) void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, const unsigned short snum) { - struct inet_sock *inet = inet_sk(sk); - inet->num = snum; + inet_sk(sk)->num = snum; sk_add_bind_node(sk, &tb->owners); - inet->bind_hash = tb; + inet_csk(sk)->icsk_bind_hash = tb; } EXPORT_SYMBOL(inet_bind_hash); @@ -69,16 +69,15 @@ EXPORT_SYMBOL(inet_bind_hash); */ static void __inet_put_port(struct inet_hashinfo *hashinfo, struct sock *sk) { - struct inet_sock *inet = inet_sk(sk); - const int bhash = inet_bhashfn(inet->num, hashinfo->bhash_size); + const int bhash = inet_bhashfn(inet_sk(sk)->num, hashinfo->bhash_size); struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash]; struct inet_bind_bucket *tb; spin_lock(&head->lock); - tb = inet->bind_hash; + tb = inet_csk(sk)->icsk_bind_hash; __sk_del_bind_node(sk); - inet->bind_hash = NULL; - inet->num = 0; + inet_csk(sk)->icsk_bind_hash = NULL; + inet_sk(sk)->num = 0; inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); spin_unlock(&head->lock); } diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index ceb577c7423..5cba59b869f 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -56,6 +56,7 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, struct inet_hashinfo *hashinfo) { const struct inet_sock *inet = inet_sk(sk); + const struct inet_connection_sock *icsk = inet_csk(sk); struct inet_ehash_bucket *ehead = &hashinfo->ehash[sk->sk_hashent]; struct inet_bind_hashbucket *bhead; /* Step 1: Put TW into bind hash. Original socket stays there too. 
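inet_csk() works by the same first-member embedding already used for the timewait variants earlier in this series: each specialisation starts with its ancestor, so every accessor is a free cast and code that touches only icsk_* fields is protocol-neutral by construction. The chain in miniature (abbreviated, illustrative fields; the my_* names are not kernel API):

    struct my_sock           { int sk_state; };

    struct my_inet_sock      { struct my_sock sk; unsigned short num; };

    struct my_inet_conn_sock { struct my_inet_sock inet; int icsk_retransmits; };

    struct my_tcp_sock       { struct my_inet_conn_sock icsk; unsigned int snd_cwnd; };

    static inline struct my_inet_conn_sock *my_inet_csk(struct my_sock *sk)
    {
            return (struct my_inet_conn_sock *)sk;  /* first-member embedding */
    }

That is the entire trick behind the bulk renames that follow: tp->ack becomes icsk->icsk_ack, tp->rto becomes icsk->icsk_rto, and so on, with no change in layout or cost.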
@@ -64,8 +65,8 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, */ bhead = &hashinfo->bhash[inet_bhashfn(inet->num, hashinfo->bhash_size)]; spin_lock(&bhead->lock); - tw->tw_tb = inet->bind_hash; - BUG_TRAP(inet->bind_hash); + tw->tw_tb = icsk->icsk_bind_hash; + BUG_TRAP(icsk->icsk_bind_hash); inet_twsk_add_bind_node(tw, &tw->tw_tb->owners); spin_unlock(&bhead->lock); diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 72d01444218..8692cb9d4bd 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -180,7 +180,7 @@ static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb, child = tp->af_specific->syn_recv_sock(sk, skb, req, dst); if (child) - tcp_acceptq_queue(sk, req, child); + inet_csk_reqsk_queue_add(sk, req, child); else reqsk_free(req); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index f1a708bf7a9..8177b86570d 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -313,7 +313,7 @@ EXPORT_SYMBOL(tcp_enter_memory_pressure); static __inline__ unsigned int tcp_listen_poll(struct sock *sk, poll_table *wait) { - return !reqsk_queue_empty(&tcp_sk(sk)->accept_queue) ? (POLLIN | POLLRDNORM) : 0; + return !reqsk_queue_empty(&inet_csk(sk)->icsk_accept_queue) ? (POLLIN | POLLRDNORM) : 0; } /* @@ -458,15 +458,15 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) int tcp_listen_start(struct sock *sk) { struct inet_sock *inet = inet_sk(sk); - struct tcp_sock *tp = tcp_sk(sk); - int rc = reqsk_queue_alloc(&tp->accept_queue, TCP_SYNQ_HSIZE); + struct inet_connection_sock *icsk = inet_csk(sk); + int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, TCP_SYNQ_HSIZE); if (rc != 0) return rc; sk->sk_max_ack_backlog = 0; sk->sk_ack_backlog = 0; - tcp_delack_init(tp); + inet_csk_delack_init(sk); /* There is race window here: we announce ourselves listening, * but this transition is still not validated by get_port(). @@ -484,7 +484,7 @@ int tcp_listen_start(struct sock *sk) } sk->sk_state = TCP_CLOSE; - __reqsk_queue_destroy(&tp->accept_queue); + __reqsk_queue_destroy(&icsk->icsk_accept_queue); return -EADDRINUSE; } @@ -495,14 +495,14 @@ int tcp_listen_start(struct sock *sk) static void tcp_listen_stop (struct sock *sk) { - struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); struct request_sock *acc_req; struct request_sock *req; - tcp_delete_keepalive_timer(sk); + inet_csk_delete_keepalive_timer(sk); /* make all the listen_opt local to us */ - acc_req = reqsk_queue_yank_acceptq(&tp->accept_queue); + acc_req = reqsk_queue_yank_acceptq(&icsk->icsk_accept_queue); /* Following specs, it would be better either to send FIN * (and enter FIN-WAIT-1, it is normal close) @@ -512,7 +512,7 @@ static void tcp_listen_stop (struct sock *sk) * To be honest, we are not able to make either * of the variants now. --ANK */ - reqsk_queue_destroy(&tp->accept_queue); + reqsk_queue_destroy(&icsk->icsk_accept_queue); while ((req = acc_req) != NULL) { struct sock *child = req->sk; @@ -1039,20 +1039,21 @@ static void cleanup_rbuf(struct sock *sk, int copied) BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)); #endif - if (tcp_ack_scheduled(tp)) { + if (inet_csk_ack_scheduled(sk)) { + const struct inet_connection_sock *icsk = inet_csk(sk); /* Delayed ACKs frequently hit locked sockets during bulk * receive. 
*/ - if (tp->ack.blocked || + if (icsk->icsk_ack.blocked || /* Once-per-two-segments ACK was not sent by tcp_input.c */ - tp->rcv_nxt - tp->rcv_wup > tp->ack.rcv_mss || + tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss || /* * If this read emptied read buffer, we send ACK, if * connection is not bidirectional, user drained * receive buffer and there was a small segment * in queue. */ - (copied > 0 && (tp->ack.pending & TCP_ACK_PUSHED) && - !tp->ack.pingpong && !atomic_read(&sk->sk_rmem_alloc))) + (copied > 0 && (icsk->icsk_ack.pending & ICSK_ACK_PUSHED) && + !icsk->icsk_ack.pingpong && !atomic_read(&sk->sk_rmem_alloc))) time_to_ack = 1; } @@ -1569,7 +1570,7 @@ void tcp_destroy_sock(struct sock *sk) BUG_TRAP(sk_unhashed(sk)); /* If it has not 0 inet_sk(sk)->num, it must be bound */ - BUG_TRAP(!inet_sk(sk)->num || inet_sk(sk)->bind_hash); + BUG_TRAP(!inet_sk(sk)->num || inet_csk(sk)->icsk_bind_hash); sk->sk_prot->destroy(sk); @@ -1698,10 +1699,10 @@ adjudge_to_death: tcp_send_active_reset(sk, GFP_ATOMIC); NET_INC_STATS_BH(LINUX_MIB_TCPABORTONLINGER); } else { - int tmo = tcp_fin_time(tp); + const int tmo = tcp_fin_time(sk); if (tmo > TCP_TIMEWAIT_LEN) { - tcp_reset_keepalive_timer(sk, tcp_fin_time(tp)); + inet_csk_reset_keepalive_timer(sk, tcp_fin_time(sk)); } else { atomic_inc(&tcp_orphan_count); tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); @@ -1746,6 +1747,7 @@ static inline int tcp_need_reset(int state) int tcp_disconnect(struct sock *sk, int flags) { struct inet_sock *inet = inet_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); int err = 0; int old_state = sk->sk_state; @@ -1782,7 +1784,7 @@ int tcp_disconnect(struct sock *sk, int flags) tp->srtt = 0; if ((tp->write_seq += tp->max_window + 2) == 0) tp->write_seq = 1; - tp->backoff = 0; + icsk->icsk_backoff = 0; tp->snd_cwnd = 2; tp->probes_out = 0; tp->packets_out = 0; @@ -1790,13 +1792,13 @@ int tcp_disconnect(struct sock *sk, int flags) tp->snd_cwnd_cnt = 0; tcp_set_ca_state(tp, TCP_CA_Open); tcp_clear_retrans(tp); - tcp_delack_init(tp); + inet_csk_delack_init(sk); sk->sk_send_head = NULL; tp->rx_opt.saw_tstamp = 0; tcp_sack_reset(&tp->rx_opt); __sk_dst_reset(sk); - BUG_TRAP(!inet->num || inet->bind_hash); + BUG_TRAP(!inet->num || icsk->icsk_bind_hash); sk->sk_error_report(sk); return err; @@ -1808,7 +1810,7 @@ int tcp_disconnect(struct sock *sk, int flags) */ static int wait_for_connect(struct sock *sk, long timeo) { - struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); DEFINE_WAIT(wait); int err; @@ -1830,11 +1832,11 @@ static int wait_for_connect(struct sock *sk, long timeo) prepare_to_wait_exclusive(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); release_sock(sk); - if (reqsk_queue_empty(&tp->accept_queue)) + if (reqsk_queue_empty(&icsk->icsk_accept_queue)) timeo = schedule_timeout(timeo); lock_sock(sk); err = 0; - if (!reqsk_queue_empty(&tp->accept_queue)) + if (!reqsk_queue_empty(&icsk->icsk_accept_queue)) break; err = -EINVAL; if (sk->sk_state != TCP_LISTEN) @@ -1854,9 +1856,9 @@ static int wait_for_connect(struct sock *sk, long timeo) * This will accept the next outstanding connection. 
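The reworked cleanup_rbuf() test above is worth reading as arithmetic: rcv_nxt - rcv_wup is exactly how much received data the peer has not yet seen acknowledged, and comparing it against rcv_mss implements the "ACK at least every second full-sized segment" rule. The subtraction is intentionally done on unsigned 32-bit sequence numbers so it stays correct across wraparound. As a standalone predicate (sketch):

    #include <stdint.h>

    /* "Once-per-two-segments ACK was not sent by tcp_input.c" */
    static inline int ack_overdue(uint32_t rcv_nxt, uint32_t rcv_wup,
                                  uint32_t rcv_mss)
    {
            return rcv_nxt - rcv_wup > rcv_mss;   /* unsigned wrap is intended */
    }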
*/ -struct sock *tcp_accept(struct sock *sk, int flags, int *err) +struct sock *inet_csk_accept(struct sock *sk, int flags, int *err) { - struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); struct sock *newsk; int error; @@ -1870,7 +1872,7 @@ struct sock *tcp_accept(struct sock *sk, int flags, int *err) goto out_err; /* Find already established connection */ - if (reqsk_queue_empty(&tp->accept_queue)) { + if (reqsk_queue_empty(&icsk->icsk_accept_queue)) { long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); /* If this is a non blocking socket don't sleep */ @@ -1883,7 +1885,7 @@ struct sock *tcp_accept(struct sock *sk, int flags, int *err) goto out_err; } - newsk = reqsk_queue_get_child(&tp->accept_queue, sk); + newsk = reqsk_queue_get_child(&icsk->icsk_accept_queue, sk); BUG_TRAP(newsk->sk_state != TCP_SYN_RECV); out: release_sock(sk); @@ -1901,6 +1903,7 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, int optlen) { struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); int val; int err = 0; @@ -1999,7 +2002,7 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, elapsed = tp->keepalive_time - elapsed; else elapsed = 0; - tcp_reset_keepalive_timer(sk, elapsed); + inet_csk_reset_keepalive_timer(sk, elapsed); } } break; @@ -2019,7 +2022,7 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, if (val < 1 || val > MAX_TCP_SYNCNT) err = -EINVAL; else - tp->syn_retries = val; + icsk->icsk_syn_retries = val; break; case TCP_LINGER2: @@ -2058,16 +2061,16 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, case TCP_QUICKACK: if (!val) { - tp->ack.pingpong = 1; + icsk->icsk_ack.pingpong = 1; } else { - tp->ack.pingpong = 0; + icsk->icsk_ack.pingpong = 0; if ((1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) && - tcp_ack_scheduled(tp)) { - tp->ack.pending |= TCP_ACK_PUSHED; + inet_csk_ack_scheduled(sk)) { + icsk->icsk_ack.pending |= ICSK_ACK_PUSHED; cleanup_rbuf(sk, 1); if (!(val & 1)) - tp->ack.pingpong = 1; + icsk->icsk_ack.pingpong = 1; } } break; @@ -2084,15 +2087,16 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, void tcp_get_info(struct sock *sk, struct tcp_info *info) { struct tcp_sock *tp = tcp_sk(sk); + const struct inet_connection_sock *icsk = inet_csk(sk); u32 now = tcp_time_stamp; memset(info, 0, sizeof(*info)); info->tcpi_state = sk->sk_state; info->tcpi_ca_state = tp->ca_state; - info->tcpi_retransmits = tp->retransmits; + info->tcpi_retransmits = icsk->icsk_retransmits; info->tcpi_probes = tp->probes_out; - info->tcpi_backoff = tp->backoff; + info->tcpi_backoff = icsk->icsk_backoff; if (tp->rx_opt.tstamp_ok) info->tcpi_options |= TCPI_OPT_TIMESTAMPS; @@ -2107,10 +2111,10 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) if (tp->ecn_flags&TCP_ECN_OK) info->tcpi_options |= TCPI_OPT_ECN; - info->tcpi_rto = jiffies_to_usecs(tp->rto); - info->tcpi_ato = jiffies_to_usecs(tp->ack.ato); + info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto); + info->tcpi_ato = jiffies_to_usecs(icsk->icsk_ack.ato); info->tcpi_snd_mss = tp->mss_cache; - info->tcpi_rcv_mss = tp->ack.rcv_mss; + info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss; info->tcpi_unacked = tp->packets_out; info->tcpi_sacked = tp->sacked_out; @@ -2119,7 +2123,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_fackets = tp->fackets_out; info->tcpi_last_data_sent = 
jiffies_to_msecs(now - tp->lsndtime); - info->tcpi_last_data_recv = jiffies_to_msecs(now - tp->ack.lrcvtime); + info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime); info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp); info->tcpi_pmtu = tp->pmtu_cookie; @@ -2179,7 +2183,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes; break; case TCP_SYNCNT: - val = tp->syn_retries ? : sysctl_tcp_syn_retries; + val = inet_csk(sk)->icsk_syn_retries ? : sysctl_tcp_syn_retries; break; case TCP_LINGER2: val = tp->linger2; @@ -2209,7 +2213,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, return 0; } case TCP_QUICKACK: - val = !tp->ack.pingpong; + val = !inet_csk(sk)->icsk_ack.pingpong; break; case TCP_CONGESTION: @@ -2340,7 +2344,7 @@ void __init tcp_init(void) tcp_register_congestion_control(&tcp_reno); } -EXPORT_SYMBOL(tcp_accept); +EXPORT_SYMBOL(inet_csk_accept); EXPORT_SYMBOL(tcp_close); EXPORT_SYMBOL(tcp_destroy_sock); EXPORT_SYMBOL(tcp_disconnect); diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index 60c6a797cc5..5f4c74f45e8 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -48,8 +48,9 @@ static struct sock *tcpnl; static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, int ext, u32 pid, u32 seq, u16 nlmsg_flags) { - struct inet_sock *inet = inet_sk(sk); + const struct inet_sock *inet = inet_sk(sk); struct tcp_sock *tp = tcp_sk(sk); + const struct inet_connection_sock *icsk = inet_csk(sk); struct tcpdiagmsg *r; struct nlmsghdr *nlh; struct tcp_info *info = NULL; @@ -129,14 +130,14 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, #define EXPIRES_IN_MS(tmo) ((tmo-jiffies)*1000+HZ-1)/HZ - if (tp->pending == TCP_TIME_RETRANS) { + if (icsk->icsk_pending == ICSK_TIME_RETRANS) { r->tcpdiag_timer = 1; - r->tcpdiag_retrans = tp->retransmits; - r->tcpdiag_expires = EXPIRES_IN_MS(tp->timeout); - } else if (tp->pending == TCP_TIME_PROBE0) { + r->tcpdiag_retrans = icsk->icsk_retransmits; + r->tcpdiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout); + } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { r->tcpdiag_timer = 4; r->tcpdiag_retrans = tp->probes_out; - r->tcpdiag_expires = EXPIRES_IN_MS(tp->timeout); + r->tcpdiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout); } else if (timer_pending(&sk->sk_timer)) { r->tcpdiag_timer = 2; r->tcpdiag_retrans = tp->probes_out; @@ -497,7 +498,7 @@ static int tcpdiag_dump_reqs(struct sk_buff *skb, struct sock *sk, { struct tcpdiag_entry entry; struct tcpdiagreq *r = NLMSG_DATA(cb->nlh); - struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); struct listen_sock *lopt; struct rtattr *bc = NULL; struct inet_sock *inet = inet_sk(sk); @@ -513,9 +514,9 @@ static int tcpdiag_dump_reqs(struct sk_buff *skb, struct sock *sk, entry.family = sk->sk_family; - read_lock_bh(&tp->accept_queue.syn_wait_lock); + read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock); - lopt = tp->accept_queue.listen_opt; + lopt = icsk->icsk_accept_queue.listen_opt; if (!lopt || !lopt->qlen) goto out; @@ -572,7 +573,7 @@ static int tcpdiag_dump_reqs(struct sk_buff *skb, struct sock *sk, } out: - read_unlock_bh(&tp->accept_queue.syn_wait_lock); + read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); return err; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index ffa24025cd0..8a8c5c2d90c 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -114,20 +114,21 
@@ int sysctl_tcp_moderate_rcvbuf = 1; /* Adapt the MSS value used to make delayed ack decision to the * real world. */ -static inline void tcp_measure_rcv_mss(struct tcp_sock *tp, - struct sk_buff *skb) +static inline void tcp_measure_rcv_mss(struct sock *sk, + const struct sk_buff *skb) { - unsigned int len, lss; + struct inet_connection_sock *icsk = inet_csk(sk); + const unsigned int lss = icsk->icsk_ack.last_seg_size; + unsigned int len; - lss = tp->ack.last_seg_size; - tp->ack.last_seg_size = 0; + icsk->icsk_ack.last_seg_size = 0; /* skb->len may jitter because of SACKs, even if peer * sends good full-sized frames. */ len = skb->len; - if (len >= tp->ack.rcv_mss) { - tp->ack.rcv_mss = len; + if (len >= icsk->icsk_ack.rcv_mss) { + icsk->icsk_ack.rcv_mss = len; } else { /* Otherwise, we make more careful check taking into account, * that SACKs block is variable. @@ -147,41 +148,44 @@ static inline void tcp_measure_rcv_mss(struct tcp_sock *tp, * tcp header plus fixed timestamp option length. * Resulting "len" is MSS free of SACK jitter. */ - len -= tp->tcp_header_len; - tp->ack.last_seg_size = len; + len -= tcp_sk(sk)->tcp_header_len; + icsk->icsk_ack.last_seg_size = len; if (len == lss) { - tp->ack.rcv_mss = len; + icsk->icsk_ack.rcv_mss = len; return; } } - tp->ack.pending |= TCP_ACK_PUSHED; + icsk->icsk_ack.pending |= ICSK_ACK_PUSHED; } } -static void tcp_incr_quickack(struct tcp_sock *tp) +static void tcp_incr_quickack(struct sock *sk) { - unsigned quickacks = tp->rcv_wnd/(2*tp->ack.rcv_mss); + struct inet_connection_sock *icsk = inet_csk(sk); + unsigned quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss); if (quickacks==0) quickacks=2; - if (quickacks > tp->ack.quick) - tp->ack.quick = min(quickacks, TCP_MAX_QUICKACKS); + if (quickacks > icsk->icsk_ack.quick) + icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS); } -void tcp_enter_quickack_mode(struct tcp_sock *tp) +void tcp_enter_quickack_mode(struct sock *sk) { - tcp_incr_quickack(tp); - tp->ack.pingpong = 0; - tp->ack.ato = TCP_ATO_MIN; + struct inet_connection_sock *icsk = inet_csk(sk); + tcp_incr_quickack(sk); + icsk->icsk_ack.pingpong = 0; + icsk->icsk_ack.ato = TCP_ATO_MIN; } /* Send ACKs quickly, if "quick" count is not exhausted * and the session is not interactive. */ -static __inline__ int tcp_in_quickack_mode(struct tcp_sock *tp) +static inline int tcp_in_quickack_mode(const struct sock *sk) { - return (tp->ack.quick && !tp->ack.pingpong); + const struct inet_connection_sock *icsk = inet_csk(sk); + return icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong; } /* Buffer size and advertised window tuning. @@ -224,8 +228,8 @@ static void tcp_fixup_sndbuf(struct sock *sk) */ /* Slow part of check#2. */ -static int __tcp_grow_window(struct sock *sk, struct tcp_sock *tp, - struct sk_buff *skb) +static int __tcp_grow_window(const struct sock *sk, struct tcp_sock *tp, + const struct sk_buff *skb) { /* Optimize this! 
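tcp_incr_quickack() above grants a budget of forced ACKs sized to the receive window: one per two receive-MSS, never zero, and clamped by TCP_MAX_QUICKACKS so quickack mode decays on its own as the budget is spent. In isolation (a sketch; the cap of 16 is assumed here for TCP_MAX_QUICKACKS and is illustrative):

    static unsigned int quickack_credit(unsigned int rcv_wnd,
                                        unsigned int rcv_mss)
    {
            unsigned int quickacks = rcv_wnd / (2 * rcv_mss);

            if (quickacks == 0)
                    quickacks = 2;                  /* always grant a couple */
            return quickacks < 16 ? quickacks : 16; /* TCP_MAX_QUICKACKS cap */
    }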
*/ int truesize = tcp_win_from_space(skb->truesize)/2; @@ -233,7 +237,7 @@ static int __tcp_grow_window(struct sock *sk, struct tcp_sock *tp, while (tp->rcv_ssthresh <= window) { if (truesize <= skb->len) - return 2*tp->ack.rcv_mss; + return 2 * inet_csk(sk)->icsk_ack.rcv_mss; truesize >>= 1; window >>= 1; @@ -260,7 +264,7 @@ static inline void tcp_grow_window(struct sock *sk, struct tcp_sock *tp, if (incr) { tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr, tp->window_clamp); - tp->ack.quick |= 1; + inet_csk(sk)->icsk_ack.quick |= 1; } } } @@ -325,7 +329,7 @@ static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp) unsigned int app_win = tp->rcv_nxt - tp->copied_seq; int ofo_win = 0; - tp->ack.quick = 0; + inet_csk(sk)->icsk_ack.quick = 0; skb_queue_walk(&tp->out_of_order_queue, skb) { ofo_win += skb->len; @@ -346,8 +350,8 @@ static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp) app_win += ofo_win; if (atomic_read(&sk->sk_rmem_alloc) >= 2 * sk->sk_rcvbuf) app_win >>= 1; - if (app_win > tp->ack.rcv_mss) - app_win -= tp->ack.rcv_mss; + if (app_win > inet_csk(sk)->icsk_ack.rcv_mss) + app_win -= inet_csk(sk)->icsk_ack.rcv_mss; app_win = max(app_win, 2U*tp->advmss); if (!ofo_win) @@ -415,11 +419,12 @@ new_measure: tp->rcv_rtt_est.time = tcp_time_stamp; } -static inline void tcp_rcv_rtt_measure_ts(struct tcp_sock *tp, struct sk_buff *skb) +static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, const struct sk_buff *skb) { + struct tcp_sock *tp = tcp_sk(sk); if (tp->rx_opt.rcv_tsecr && (TCP_SKB_CB(skb)->end_seq - - TCP_SKB_CB(skb)->seq >= tp->ack.rcv_mss)) + TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss)) tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rx_opt.rcv_tsecr, 0); } @@ -492,41 +497,42 @@ new_measure: */ static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb) { + struct inet_connection_sock *icsk = inet_csk(sk); u32 now; - tcp_schedule_ack(tp); + inet_csk_schedule_ack(sk); - tcp_measure_rcv_mss(tp, skb); + tcp_measure_rcv_mss(sk, skb); tcp_rcv_rtt_measure(tp); now = tcp_time_stamp; - if (!tp->ack.ato) { + if (!icsk->icsk_ack.ato) { /* The _first_ data packet received, initialize * delayed ACK engine. */ - tcp_incr_quickack(tp); - tp->ack.ato = TCP_ATO_MIN; + tcp_incr_quickack(sk); + icsk->icsk_ack.ato = TCP_ATO_MIN; } else { - int m = now - tp->ack.lrcvtime; + int m = now - icsk->icsk_ack.lrcvtime; if (m <= TCP_ATO_MIN/2) { /* The fastest case is the first. */ - tp->ack.ato = (tp->ack.ato>>1) + TCP_ATO_MIN/2; - } else if (m < tp->ack.ato) { - tp->ack.ato = (tp->ack.ato>>1) + m; - if (tp->ack.ato > tp->rto) - tp->ack.ato = tp->rto; - } else if (m > tp->rto) { + icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + TCP_ATO_MIN / 2; + } else if (m < icsk->icsk_ack.ato) { + icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + m; + if (icsk->icsk_ack.ato > icsk->icsk_rto) + icsk->icsk_ack.ato = icsk->icsk_rto; + } else if (m > icsk->icsk_rto) { /* Too long gap. Apparently sender falled to * restart window, so that we send ACKs quickly. */ - tcp_incr_quickack(tp); + tcp_incr_quickack(sk); sk_stream_mem_reclaim(sk); } } - tp->ack.lrcvtime = now; + icsk->icsk_ack.lrcvtime = now; TCP_ECN_check_ce(tp, skb); @@ -611,8 +617,9 @@ static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt, u32 *usrtt) /* Calculate rto without backoff. This is the second half of Van Jacobson's * routine referred to above. 
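The ato update in tcp_event_data_recv() above is a shift-based moving average of the inter-arrival gap m: fast senders pull the delayed-ACK timeout down toward TCP_ATO_MIN, slower ones are tracked, and the value never exceeds the RTO. Stripped of the icsk plumbing (a sketch; the constants are parameters here):

    static unsigned int update_ato(unsigned int ato, unsigned int m,
                                   unsigned int rto, unsigned int ato_min)
    {
            if (m <= ato_min / 2)
                    ato = (ato >> 1) + ato_min / 2; /* fast sender: decay to min */
            else if (m < ato)
                    ato = (ato >> 1) + m;           /* track a slower sender */
            /* m > rto means the sender stalled; the real code enters
             * quickack mode there rather than adjusting ato. */
            return ato < rto ? ato : rto;           /* clamp to the RTO */
    }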
*/ -static inline void tcp_set_rto(struct tcp_sock *tp) +static inline void tcp_set_rto(struct sock *sk) { + const struct tcp_sock *tp = tcp_sk(sk); /* Old crap is replaced with new one. 8) * * More seriously: @@ -623,7 +630,7 @@ static inline void tcp_set_rto(struct tcp_sock *tp) * is invisible. Actually, Linux-2.4 also generates erratic * ACKs in some curcumstances. */ - tp->rto = (tp->srtt >> 3) + tp->rttvar; + inet_csk(sk)->icsk_rto = (tp->srtt >> 3) + tp->rttvar; /* 2. Fixups made earlier cannot be right. * If we do not estimate RTO correctly without them, @@ -635,10 +642,10 @@ static inline void tcp_set_rto(struct tcp_sock *tp) /* NOTE: clamping at TCP_RTO_MIN is not required, current algo * guarantees that rto is higher. */ -static inline void tcp_bound_rto(struct tcp_sock *tp) +static inline void tcp_bound_rto(struct sock *sk) { - if (tp->rto > TCP_RTO_MAX) - tp->rto = TCP_RTO_MAX; + if (inet_csk(sk)->icsk_rto > TCP_RTO_MAX) + inet_csk(sk)->icsk_rto = TCP_RTO_MAX; } /* Save metrics learned by this TCP session. @@ -658,7 +665,7 @@ void tcp_update_metrics(struct sock *sk) if (dst && (dst->flags&DST_HOST)) { int m; - if (tp->backoff || !tp->srtt) { + if (inet_csk(sk)->icsk_backoff || !tp->srtt) { /* This session failed to estimate rtt. Why? * Probably, no packets returned in time. * Reset our results. @@ -801,9 +808,9 @@ static void tcp_init_metrics(struct sock *sk) tp->mdev = dst_metric(dst, RTAX_RTTVAR); tp->mdev_max = tp->rttvar = max(tp->mdev, TCP_RTO_MIN); } - tcp_set_rto(tp); - tcp_bound_rto(tp); - if (tp->rto < TCP_TIMEOUT_INIT && !tp->rx_opt.saw_tstamp) + tcp_set_rto(sk); + tcp_bound_rto(sk); + if (inet_csk(sk)->icsk_rto < TCP_TIMEOUT_INIT && !tp->rx_opt.saw_tstamp) goto reset; tp->snd_cwnd = tcp_init_cwnd(tp, dst); tp->snd_cwnd_stamp = tcp_time_stamp; @@ -817,7 +824,7 @@ reset: if (!tp->rx_opt.saw_tstamp && tp->srtt) { tp->srtt = 0; tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_INIT; - tp->rto = TCP_TIMEOUT_INIT; + inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT; } } @@ -1118,7 +1125,7 @@ void tcp_enter_frto(struct sock *sk) if (tp->ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq || - (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) { + (tp->ca_state == TCP_CA_Loss && !inet_csk(sk)->icsk_retransmits)) { tp->prior_ssthresh = tcp_current_ssthresh(tp); tp->snd_ssthresh = tp->ca_ops->ssthresh(tp); tcp_ca_event(tp, CA_EVENT_FRTO); @@ -1214,7 +1221,7 @@ void tcp_enter_loss(struct sock *sk, int how) /* Reduce ssthresh if it has not yet been made inside this window. 
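tcp_set_rto(), now reaching its state through icsk, is the classic Van Jacobson timeout computed from pre-scaled estimator state: srtt is kept multiplied by 8, so srtt >> 3 recovers the smoothed RTT, and rttvar (maintained with its own scaling folded in) supplies the safety margin. Together with the TCP_RTO_MAX clamp from tcp_bound_rto(), as a pure function (sketch):

    static inline unsigned int vj_rto(unsigned int srtt_x8, unsigned int rttvar,
                                      unsigned int rto_max)
    {
            unsigned int rto = (srtt_x8 >> 3) + rttvar; /* SRTT + variance term */

            return rto < rto_max ? rto : rto_max;       /* tcp_bound_rto() */
    }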
*/ if (tp->ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq || - (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) { + (tp->ca_state == TCP_CA_Loss && !inet_csk(sk)->icsk_retransmits)) { tp->prior_ssthresh = tcp_current_ssthresh(tp); tp->snd_ssthresh = tp->ca_ops->ssthresh(tp); tcp_ca_event(tp, CA_EVENT_LOSS); @@ -1253,7 +1260,7 @@ void tcp_enter_loss(struct sock *sk, int how) TCP_ECN_queue_cwr(tp); } -static int tcp_check_sack_reneging(struct sock *sk, struct tcp_sock *tp) +static int tcp_check_sack_reneging(struct sock *sk) { struct sk_buff *skb; @@ -1268,9 +1275,10 @@ static int tcp_check_sack_reneging(struct sock *sk, struct tcp_sock *tp) NET_INC_STATS_BH(LINUX_MIB_TCPSACKRENEGING); tcp_enter_loss(sk, 1); - tp->retransmits++; + inet_csk(sk)->icsk_retransmits++; tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue)); - tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + inet_csk(sk)->icsk_rto); return 1; } return 0; @@ -1281,15 +1289,15 @@ static inline int tcp_fackets_out(struct tcp_sock *tp) return IsReno(tp) ? tp->sacked_out+1 : tp->fackets_out; } -static inline int tcp_skb_timedout(struct tcp_sock *tp, struct sk_buff *skb) +static inline int tcp_skb_timedout(struct sock *sk, struct sk_buff *skb) { - return (tcp_time_stamp - TCP_SKB_CB(skb)->when > tp->rto); + return (tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto); } static inline int tcp_head_timedout(struct sock *sk, struct tcp_sock *tp) { return tp->packets_out && - tcp_skb_timedout(tp, skb_peek(&sk->sk_write_queue)); + tcp_skb_timedout(sk, skb_peek(&sk->sk_write_queue)); } /* Linux NewReno/SACK/FACK/ECN state machine. @@ -1509,7 +1517,7 @@ static void tcp_update_scoreboard(struct sock *sk, struct tcp_sock *tp) struct sk_buff *skb; sk_stream_for_retrans_queue(skb, sk) { - if (tcp_skb_timedout(tp, skb) && + if (tcp_skb_timedout(sk, skb) && !(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) { TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; tp->lost_out += tcp_skb_pcount(skb); @@ -1676,7 +1684,7 @@ static int tcp_try_undo_loss(struct sock *sk, struct tcp_sock *tp) tp->left_out = tp->sacked_out; tcp_undo_cwr(tp, 1); NET_INC_STATS_BH(LINUX_MIB_TCPLOSSUNDO); - tp->retransmits = 0; + inet_csk(sk)->icsk_retransmits = 0; tp->undo_marker = 0; if (!IsReno(tp)) tcp_set_ca_state(tp, TCP_CA_Open); @@ -1750,7 +1758,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, tp->prior_ssthresh = 0; /* B. In all the states check for reneging SACKs. */ - if (tp->sacked_out && tcp_check_sack_reneging(sk, tp)) + if (tp->sacked_out && tcp_check_sack_reneging(sk)) return; /* C. Process data loss notification, provided it is valid. */ @@ -1774,7 +1782,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, } else if (!before(tp->snd_una, tp->high_seq)) { switch (tp->ca_state) { case TCP_CA_Loss: - tp->retransmits = 0; + inet_csk(sk)->icsk_retransmits = 0; if (tcp_try_undo_recovery(sk, tp)) return; break; @@ -1824,7 +1832,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, break; case TCP_CA_Loss: if (flag&FLAG_DATA_ACKED) - tp->retransmits = 0; + inet_csk(sk)->icsk_retransmits = 0; if (!tcp_try_undo_loss(sk, tp)) { tcp_moderate_cwnd(tp); tcp_xmit_retransmit_queue(sk); @@ -1881,10 +1889,8 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, /* Read draft-ietf-tcplw-high-performance before mucking * with this code. 
(Superceeds RFC1323) */ -static void tcp_ack_saw_tstamp(struct tcp_sock *tp, u32 *usrtt, int flag) +static void tcp_ack_saw_tstamp(struct sock *sk, u32 *usrtt, int flag) { - __u32 seq_rtt; - /* RTTM Rule: A TSecr value received in a segment is used to * update the averaged RTT measurement only if the segment * acknowledges some new data, i.e., only if it advances the @@ -1900,14 +1906,15 @@ static void tcp_ack_saw_tstamp(struct tcp_sock *tp, u32 *usrtt, int flag) * answer arrives rto becomes 120 seconds! If at least one of segments * in window is lost... Voila. --ANK (010210) */ - seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr; + struct tcp_sock *tp = tcp_sk(sk); + const __u32 seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr; tcp_rtt_estimator(tp, seq_rtt, usrtt); - tcp_set_rto(tp); - tp->backoff = 0; - tcp_bound_rto(tp); + tcp_set_rto(sk); + inet_csk(sk)->icsk_backoff = 0; + tcp_bound_rto(sk); } -static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, u32 *usrtt, int flag) +static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, u32 *usrtt, int flag) { /* We don't have a timestamp. Can only use * packets that are not retransmitted to determine @@ -1921,20 +1928,21 @@ static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, u32 *usrtt, int if (flag & FLAG_RETRANS_DATA_ACKED) return; - tcp_rtt_estimator(tp, seq_rtt, usrtt); - tcp_set_rto(tp); - tp->backoff = 0; - tcp_bound_rto(tp); + tcp_rtt_estimator(tcp_sk(sk), seq_rtt, usrtt); + tcp_set_rto(sk); + inet_csk(sk)->icsk_backoff = 0; + tcp_bound_rto(sk); } -static inline void tcp_ack_update_rtt(struct tcp_sock *tp, - int flag, s32 seq_rtt, u32 *usrtt) +static inline void tcp_ack_update_rtt(struct sock *sk, const int flag, + const s32 seq_rtt, u32 *usrtt) { + const struct tcp_sock *tp = tcp_sk(sk); /* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */ if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) - tcp_ack_saw_tstamp(tp, usrtt, flag); + tcp_ack_saw_tstamp(sk, usrtt, flag); else if (seq_rtt >= 0) - tcp_ack_no_tstamp(tp, seq_rtt, usrtt, flag); + tcp_ack_no_tstamp(sk, seq_rtt, usrtt, flag); } static inline void tcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, @@ -1951,9 +1959,9 @@ static inline void tcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, static inline void tcp_ack_packets_out(struct sock *sk, struct tcp_sock *tp) { if (!tp->packets_out) { - tcp_clear_xmit_timer(sk, TCP_TIME_RETRANS); + inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); } else { - tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto); } } @@ -2090,7 +2098,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt } if (acked&FLAG_ACKED) { - tcp_ack_update_rtt(tp, acked, seq_rtt, seq_usrtt); + tcp_ack_update_rtt(sk, acked, seq_rtt, seq_usrtt); tcp_ack_packets_out(sk, tp); if (tp->ca_ops->pkts_acked) @@ -2125,20 +2133,21 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt static void tcp_ack_probe(struct sock *sk) { - struct tcp_sock *tp = tcp_sk(sk); + const struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); /* Was it a usable window open? */ if (!after(TCP_SKB_CB(sk->sk_send_head)->end_seq, tp->snd_una + tp->snd_wnd)) { - tp->backoff = 0; - tcp_clear_xmit_timer(sk, TCP_TIME_PROBE0); + icsk->icsk_backoff = 0; + inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0); /* Socket must be waked up by subsequent tcp_data_snd_check(). 
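With RFC 1323 timestamps, the RTT sample taken in tcp_ack_saw_tstamp() above is a single subtraction, and it remains valid across tick-counter wraparound because both operands are u32. A standalone rendering (sketch):

    #include <stdint.h>

    /* now and the echoed tsecr are 32-bit tick counters */
    static inline uint32_t ts_rtt_sample(uint32_t now, uint32_t rcv_tsecr)
    {
            return now - rcv_tsecr;   /* wrap-safe unsigned difference */
    }

The timestamp-less path next to it has to be more conservative: it throws away samples from retransmitted segments (Karn's rule), an ambiguity the timestamp path avoids because tsecr identifies exactly which transmission is being acknowledged.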
* This function is not for random using! */ } else { - tcp_reset_xmit_timer(sk, TCP_TIME_PROBE0, - min(tp->rto << tp->backoff, TCP_RTO_MAX)); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, + min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX)); } } @@ -2157,8 +2166,8 @@ static inline int tcp_may_raise_cwnd(struct tcp_sock *tp, int flag) /* Check that window update is acceptable. * The function assumes that snd_una<=ack<=snd_next. */ -static inline int tcp_may_update_window(struct tcp_sock *tp, u32 ack, - u32 ack_seq, u32 nwin) +static inline int tcp_may_update_window(const struct tcp_sock *tp, const u32 ack, + const u32 ack_seq, const u32 nwin) { return (after(ack, tp->snd_una) || after(ack_seq, tp->snd_wl1) || @@ -2500,8 +2509,9 @@ static inline void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) * up to bandwidth of 18Gigabit/sec. 8) ] */ -static int tcp_disordered_ack(struct tcp_sock *tp, struct sk_buff *skb) +static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb) { + struct tcp_sock *tp = tcp_sk(sk); struct tcphdr *th = skb->h.th; u32 seq = TCP_SKB_CB(skb)->seq; u32 ack = TCP_SKB_CB(skb)->ack_seq; @@ -2516,14 +2526,15 @@ static int tcp_disordered_ack(struct tcp_sock *tp, struct sk_buff *skb) !tcp_may_update_window(tp, ack, seq, ntohs(th->window) << tp->rx_opt.snd_wscale) && /* 4. ... and sits in replay window. */ - (s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (tp->rto*1024)/HZ); + (s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (inet_csk(sk)->icsk_rto * 1024) / HZ); } -static inline int tcp_paws_discard(struct tcp_sock *tp, struct sk_buff *skb) +static inline int tcp_paws_discard(const struct sock *sk, const struct sk_buff *skb) { + const struct tcp_sock *tp = tcp_sk(sk); return ((s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) > TCP_PAWS_WINDOW && xtime.tv_sec < tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS && - !tcp_disordered_ack(tp, skb)); + !tcp_disordered_ack(sk, skb)); } /* Check segment sequence number for validity. @@ -2586,7 +2597,7 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) { struct tcp_sock *tp = tcp_sk(sk); - tcp_schedule_ack(tp); + inet_csk_schedule_ack(sk); sk->sk_shutdown |= RCV_SHUTDOWN; sock_set_flag(sk, SOCK_DONE); @@ -2596,7 +2607,7 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) case TCP_ESTABLISHED: /* Move to CLOSE_WAIT */ tcp_set_state(sk, TCP_CLOSE_WAIT); - tp->ack.pingpong = 1; + inet_csk(sk)->icsk_ack.pingpong = 1; break; case TCP_CLOSE_WAIT: @@ -2694,7 +2705,7 @@ static void tcp_send_dupack(struct sock *sk, struct sk_buff *skb) if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOST); - tcp_enter_quickack_mode(tp); + tcp_enter_quickack_mode(sk); if (tp->rx_opt.sack_ok && sysctl_tcp_dsack) { u32 end_seq = TCP_SKB_CB(skb)->end_seq; @@ -2942,7 +2953,7 @@ queue_and_out: * gap in queue is filled. 
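tcp_paws_discard() above compares 32-bit timestamps by casting the difference to s32, the serial-number-arithmetic trick that keeps the test correct when the peer's timestamp clock wraps. Isolated (sketch; the threshold is TCP_PAWS_WINDOW in the real code):

    #include <stdint.h>

    /* Non-zero if ts_recent is more than "window" ticks newer than the
     * arriving segment's tsval, i.e. the segment fails the PAWS check. */
    static inline int paws_too_old(uint32_t ts_recent, uint32_t rcv_tsval,
                                   int32_t window)
    {
            return (int32_t)(ts_recent - rcv_tsval) > window;
    }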
*/ if (skb_queue_empty(&tp->out_of_order_queue)) - tp->ack.pingpong = 0; + inet_csk(sk)->icsk_ack.pingpong = 0; } if (tp->rx_opt.num_sacks) @@ -2963,8 +2974,8 @@ queue_and_out: tcp_dsack_set(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); out_of_window: - tcp_enter_quickack_mode(tp); - tcp_schedule_ack(tp); + tcp_enter_quickack_mode(sk); + inet_csk_schedule_ack(sk); drop: __kfree_skb(skb); return; @@ -2974,7 +2985,7 @@ drop: if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt + tcp_receive_window(tp))) goto out_of_window; - tcp_enter_quickack_mode(tp); + tcp_enter_quickack_mode(sk); if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { /* Partial packet, seq < rcv_next < end_seq */ @@ -3003,7 +3014,7 @@ drop: /* Disable header prediction. */ tp->pred_flags = 0; - tcp_schedule_ack(tp); + inet_csk_schedule_ack(sk); SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n", tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); @@ -3373,13 +3384,13 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) struct tcp_sock *tp = tcp_sk(sk); /* More than one full frame received... */ - if (((tp->rcv_nxt - tp->rcv_wup) > tp->ack.rcv_mss + if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss /* ... and right edge of window advances far enough. * (tcp_recvmsg() will send ACK otherwise). Or... */ && __tcp_select_window(sk) >= tp->rcv_wnd) || /* We ACK each frame or... */ - tcp_in_quickack_mode(tp) || + tcp_in_quickack_mode(sk) || /* We have out of order data. */ (ofo_possible && skb_peek(&tp->out_of_order_queue))) { @@ -3393,8 +3404,7 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) static __inline__ void tcp_ack_snd_check(struct sock *sk) { - struct tcp_sock *tp = tcp_sk(sk); - if (!tcp_ack_scheduled(tp)) { + if (!inet_csk_ack_scheduled(sk)) { /* We sent a data segment already. */ return; } @@ -3648,7 +3658,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, tp->rcv_nxt == tp->rcv_wup) tcp_store_ts_recent(tp); - tcp_rcv_rtt_measure_ts(tp, skb); + tcp_rcv_rtt_measure_ts(sk, skb); /* We know that such packets are checksummed * on entry. @@ -3681,7 +3691,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, tp->rcv_nxt == tp->rcv_wup) tcp_store_ts_recent(tp); - tcp_rcv_rtt_measure_ts(tp, skb); + tcp_rcv_rtt_measure_ts(sk, skb); __skb_pull(skb, tcp_header_len); tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; @@ -3702,7 +3712,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, tp->rcv_nxt == tp->rcv_wup) tcp_store_ts_recent(tp); - tcp_rcv_rtt_measure_ts(tp, skb); + tcp_rcv_rtt_measure_ts(sk, skb); if ((int)skb->truesize > sk->sk_forward_alloc) goto step5; @@ -3722,7 +3732,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, /* Well, only one small jumplet in fast path... */ tcp_ack(sk, skb, FLAG_DATA); tcp_data_snd_check(sk, tp); - if (!tcp_ack_scheduled(tp)) + if (!inet_csk_ack_scheduled(sk)) goto no_ack; } @@ -3744,7 +3754,7 @@ slow_path: * RFC1323: H1. Apply PAWS check first. */ if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp && - tcp_paws_discard(tp, skb)) { + tcp_paws_discard(sk, skb)) { if (!th->rst) { NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED); tcp_send_dupack(sk, skb); @@ -3791,7 +3801,7 @@ step5: if(th->ack) tcp_ack(sk, skb, FLAG_SLOWPATH); - tcp_rcv_rtt_measure_ts(tp, skb); + tcp_rcv_rtt_measure_ts(sk, skb); /* Process urgent data. 
*/ tcp_urg(sk, skb, th); @@ -3933,7 +3943,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, tcp_init_buffer_space(sk); if (sock_flag(sk, SOCK_KEEPOPEN)) - tcp_reset_keepalive_timer(sk, keepalive_time_when(tp)); + inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp)); if (!tp->rx_opt.snd_wscale) __tcp_fast_path_on(tp, tp->snd_wnd); @@ -3945,7 +3955,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, sk_wake_async(sk, 0, POLL_OUT); } - if (sk->sk_write_pending || tp->defer_accept || tp->ack.pingpong) { + if (sk->sk_write_pending || tp->defer_accept || inet_csk(sk)->icsk_ack.pingpong) { /* Save one ACK. Data will be ready after * several ticks, if write_pending is set. * @@ -3953,12 +3963,12 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, * look so _wonderfully_ clever, that I was not able * to stand against the temptation 8) --ANK */ - tcp_schedule_ack(tp); - tp->ack.lrcvtime = tcp_time_stamp; - tp->ack.ato = TCP_ATO_MIN; - tcp_incr_quickack(tp); - tcp_enter_quickack_mode(tp); - tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MAX); + inet_csk_schedule_ack(sk); + inet_csk(sk)->icsk_ack.lrcvtime = tcp_time_stamp; + inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN; + tcp_incr_quickack(sk); + tcp_enter_quickack_mode(sk); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, TCP_DELACK_MAX); discard: __kfree_skb(skb); @@ -4114,7 +4124,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, } if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp && - tcp_paws_discard(tp, skb)) { + tcp_paws_discard(sk, skb)) { if (!th->rst) { NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED); tcp_send_dupack(sk, skb); @@ -4183,7 +4193,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, */ if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && !tp->srtt) - tcp_ack_saw_tstamp(tp, 0, 0); + tcp_ack_saw_tstamp(sk, 0, 0); if (tp->rx_opt.tstamp_ok) tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; @@ -4230,9 +4240,9 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, return 1; } - tmo = tcp_fin_time(tp); + tmo = tcp_fin_time(sk); if (tmo > TCP_TIMEWAIT_LEN) { - tcp_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN); + inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN); } else if (th->fin || sock_owned_by_user(sk)) { /* Bad case. We could lose such FIN otherwise. * It is not a big problem, but it looks confusing @@ -4240,7 +4250,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, * if it spins in bh_lock_sock(), but it is really * marginal case. 
*/ - tcp_reset_keepalive_timer(sk, tmo); + inet_csk_reset_keepalive_timer(sk, tmo); } else { tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); goto discard; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index e7e91e60ac7..2cd41265d17 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -104,7 +104,7 @@ struct inet_hashinfo __cacheline_aligned tcp_hashinfo = { */ int sysctl_local_port_range[2] = { 1024, 4999 }; -static inline int tcp_bind_conflict(struct sock *sk, struct inet_bind_bucket *tb) +static inline int inet_csk_bind_conflict(struct sock *sk, struct inet_bind_bucket *tb) { const u32 sk_rcv_saddr = inet_rcv_saddr(sk); struct sock *sk2; @@ -113,7 +113,7 @@ static inline int tcp_bind_conflict(struct sock *sk, struct inet_bind_bucket *tb sk_for_each_bound(sk2, node, &tb->owners) { if (sk != sk2 && - !tcp_v6_ipv6only(sk2) && + !inet_v6_ipv6only(sk2) && (!sk->sk_bound_dev_if || !sk2->sk_bound_dev_if || sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) { @@ -132,7 +132,8 @@ static inline int tcp_bind_conflict(struct sock *sk, struct inet_bind_bucket *tb /* Obtain a reference to a local port for the given sock, * if snum is zero it means select any available local port. */ -static int tcp_v4_get_port(struct sock *sk, unsigned short snum) +int inet_csk_get_port(struct inet_hashinfo *hashinfo, + struct sock *sk, unsigned short snum) { struct inet_bind_hashbucket *head; struct hlist_node *node; @@ -146,16 +147,16 @@ static int tcp_v4_get_port(struct sock *sk, unsigned short snum) int remaining = (high - low) + 1; int rover; - spin_lock(&tcp_hashinfo.portalloc_lock); - if (tcp_hashinfo.port_rover < low) + spin_lock(&hashinfo->portalloc_lock); + if (hashinfo->port_rover < low) rover = low; else - rover = tcp_hashinfo.port_rover; + rover = hashinfo->port_rover; do { rover++; if (rover > high) rover = low; - head = &tcp_hashinfo.bhash[inet_bhashfn(rover, tcp_hashinfo.bhash_size)]; + head = &hashinfo->bhash[inet_bhashfn(rover, hashinfo->bhash_size)]; spin_lock(&head->lock); inet_bind_bucket_for_each(tb, node, &head->chain) if (tb->port == rover) @@ -164,8 +165,8 @@ static int tcp_v4_get_port(struct sock *sk, unsigned short snum) next: spin_unlock(&head->lock); } while (--remaining > 0); - tcp_hashinfo.port_rover = rover; - spin_unlock(&tcp_hashinfo.portalloc_lock); + hashinfo->port_rover = rover; + spin_unlock(&hashinfo->portalloc_lock); /* Exhausted local port range during search? 
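The rover walk above (and the explicit-snum path that follows) pick a bind-hash chain via inet_bhashfn(). For orientation: the hash of this era is, to the best of my knowledge, a simple mask, which is also why bhash_size is sized as a power of two; a sketch under that assumption, not the literal header:

	static inline int inet_bhashfn(const unsigned short lport, const int bhash_size)
	{
		return lport & (bhash_size - 1);	/* bhash_size: power of two */
	}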
It is not * possible for us to be holding one of the bind hash @@ -182,7 +183,7 @@ static int tcp_v4_get_port(struct sock *sk, unsigned short snum) */ snum = rover; } else { - head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)]; + head = &hashinfo->bhash[inet_bhashfn(snum, hashinfo->bhash_size)]; spin_lock(&head->lock); inet_bind_bucket_for_each(tb, node, &head->chain) if (tb->port == snum) @@ -199,13 +200,13 @@ tb_found: goto success; } else { ret = 1; - if (tcp_bind_conflict(sk, tb)) + if (inet_csk_bind_conflict(sk, tb)) goto fail_unlock; } } tb_not_found: ret = 1; - if (!tb && (tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, snum)) == NULL) + if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep, head, snum)) == NULL) goto fail_unlock; if (hlist_empty(&tb->owners)) { if (sk->sk_reuse && sk->sk_state != TCP_LISTEN) @@ -216,9 +217,9 @@ tb_not_found: (!sk->sk_reuse || sk->sk_state == TCP_LISTEN)) tb->fastreuse = 0; success: - if (!inet_sk(sk)->bind_hash) + if (!inet_csk(sk)->icsk_bind_hash) inet_bind_hash(sk, tb, snum); - BUG_TRAP(inet_sk(sk)->bind_hash == tb); + BUG_TRAP(inet_csk(sk)->icsk_bind_hash == tb); ret = 0; fail_unlock: @@ -228,6 +229,11 @@ fail: return ret; } +static int tcp_v4_get_port(struct sock *sk, unsigned short snum) +{ + return inet_csk_get_port(&tcp_hashinfo, sk, snum); +} + static void tcp_v4_hash(struct sock *sk) { inet_hash(&tcp_hashinfo, sk); @@ -426,7 +432,7 @@ ok: } head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)]; - tb = inet_sk(sk)->bind_hash; + tb = inet_csk(sk)->icsk_bind_hash; spin_lock_bh(&head->lock); if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { __inet_hash(&tcp_hashinfo, sk, 0); @@ -557,25 +563,28 @@ failure: return err; } -static __inline__ int tcp_v4_iif(struct sk_buff *skb) +static inline int inet_iif(const struct sk_buff *skb) { return ((struct rtable *)skb->dst)->rt_iif; } -static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd) +static inline u32 inet_synq_hash(const u32 raddr, const u16 rport, + const u32 rnd, const u16 synq_hsize) { - return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1)); + return jhash_2words(raddr, (u32)rport, rnd) & (synq_hsize - 1); } -static struct request_sock *tcp_v4_search_req(struct tcp_sock *tp, - struct request_sock ***prevp, - __u16 rport, - __u32 raddr, __u32 laddr) +struct request_sock *inet_csk_search_req(const struct sock *sk, + struct request_sock ***prevp, + const __u16 rport, const __u32 raddr, + const __u32 laddr) { - struct listen_sock *lopt = tp->accept_queue.listen_opt; + const struct inet_connection_sock *icsk = inet_csk(sk); + struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; struct request_sock *req, **prev; - for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)]; + for (prev = &lopt->syn_table[inet_synq_hash(raddr, rport, lopt->hash_rnd, + lopt->nr_table_entries)]; (req = *prev) != NULL; prev = &req->dl_next) { const struct inet_request_sock *ireq = inet_rsk(req); @@ -583,7 +592,7 @@ static struct request_sock *tcp_v4_search_req(struct tcp_sock *tp, if (ireq->rmt_port == rport && ireq->rmt_addr == raddr && ireq->loc_addr == laddr && - TCP_INET_FAMILY(req->rsk_ops->family)) { + AF_INET_FAMILY(req->rsk_ops->family)) { BUG_TRAP(!req->sk); *prevp = prev; break; @@ -595,12 +604,13 @@ static struct request_sock *tcp_v4_search_req(struct tcp_sock *tp, static void tcp_v4_synq_add(struct sock *sk, struct request_sock *req) { - struct tcp_sock *tp = 
tcp_sk(sk); - struct listen_sock *lopt = tp->accept_queue.listen_opt; - u32 h = tcp_v4_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd); + struct inet_connection_sock *icsk = inet_csk(sk); + struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; + const u32 h = inet_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, + lopt->hash_rnd, lopt->nr_table_entries); - reqsk_queue_hash_req(&tp->accept_queue, h, req, TCP_TIMEOUT_INIT); - tcp_synq_added(sk); + reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, TCP_TIMEOUT_INIT); + inet_csk_reqsk_queue_added(sk, TCP_TIMEOUT_INIT); } @@ -687,7 +697,7 @@ void tcp_v4_err(struct sk_buff *skb, u32 info) } sk = inet_lookup(&tcp_hashinfo, iph->daddr, th->dest, iph->saddr, - th->source, tcp_v4_iif(skb)); + th->source, inet_iif(skb)); if (!sk) { ICMP_INC_STATS_BH(ICMP_MIB_INERRORS); return; @@ -747,8 +757,8 @@ void tcp_v4_err(struct sk_buff *skb, u32 info) if (sock_owned_by_user(sk)) goto out; - req = tcp_v4_search_req(tp, &prev, th->dest, - iph->daddr, iph->saddr); + req = inet_csk_search_req(sk, &prev, th->dest, + iph->daddr, iph->saddr); if (!req) goto out; @@ -768,7 +778,7 @@ void tcp_v4_err(struct sk_buff *skb, u32 info) * created socket, and POSIX does not want network * errors returned from accept(). */ - tcp_synq_drop(sk, req, prev); + inet_csk_reqsk_queue_drop(sk, req, prev); goto out; case TCP_SYN_SENT: @@ -953,8 +963,8 @@ static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req) req->ts_recent); } -static struct dst_entry* tcp_v4_route_req(struct sock *sk, - struct request_sock *req) +struct dst_entry* inet_csk_route_req(struct sock *sk, + const struct request_sock *req) { struct rtable *rt; const struct inet_request_sock *ireq = inet_rsk(req); @@ -966,7 +976,7 @@ static struct dst_entry* tcp_v4_route_req(struct sock *sk, ireq->rmt_addr), .saddr = ireq->loc_addr, .tos = RT_CONN_FLAGS(sk) } }, - .proto = IPPROTO_TCP, + .proto = sk->sk_protocol, .uli_u = { .ports = { .sport = inet_sk(sk)->sport, .dport = ireq->rmt_port } } }; @@ -996,7 +1006,7 @@ static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req, struct sk_buff * skb; /* First, grab a route. */ - if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL) + if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL) goto out; skb = tcp_make_synack(sk, dst, req); @@ -1098,7 +1108,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) * limitations, they conserve resources and peer is * evidently real one. */ - if (tcp_synq_is_full(sk) && !isn) { + if (inet_csk_reqsk_queue_is_full(sk) && !isn) { #ifdef CONFIG_SYN_COOKIES if (sysctl_tcp_syncookies) { want_cookie = 1; @@ -1112,7 +1122,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) * clogging syn queue with openreqs with exponentially increasing * timeout. */ - if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1) + if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) goto drop; req = reqsk_alloc(&tcp_request_sock_ops); @@ -1169,7 +1179,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) */ if (tmp_opt.saw_tstamp && sysctl_tcp_tw_recycle && - (dst = tcp_v4_route_req(sk, req)) != NULL && + (dst = inet_csk_route_req(sk, req)) != NULL && (peer = rt_get_peer((struct rtable *)dst)) != NULL && peer->v4daddr == saddr) { if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL && @@ -1182,7 +1192,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) } /* Kill the following clause, if you dislike this way. 
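inet_synq_hash() above masks with (synq_hsize - 1) instead of taking a modulus, so lopt->nr_table_entries must be a power of two for every bucket to be reachable and evenly used. A hypothetical sanity helper (synq_hsize_ok is not in the patch) that states the invariant:

	static inline int synq_hsize_ok(unsigned short synq_hsize)
	{
		/* non-zero power of two, so (synq_hsize - 1) is a full mask */
		return synq_hsize != 0 && (synq_hsize & (synq_hsize - 1)) == 0;
	}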
*/ else if (!sysctl_tcp_syncookies && - (sysctl_max_syn_backlog - tcp_synq_len(sk) < + (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) < (sysctl_max_syn_backlog >> 2)) && (!peer || !peer->tcp_ts_stamp) && (!dst || !dst_metric(dst, RTAX_RTT))) { @@ -1240,7 +1250,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, if (sk_acceptq_is_full(sk)) goto exit_overflow; - if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL) + if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL) goto exit; newsk = tcp_create_openreq_child(sk, req, skb); @@ -1257,7 +1267,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newinet->saddr = ireq->loc_addr; newinet->opt = ireq->opt; ireq->opt = NULL; - newinet->mc_index = tcp_v4_iif(skb); + newinet->mc_index = inet_iif(skb); newinet->mc_ttl = skb->nh.iph->ttl; newtp->ext_header_len = 0; if (newinet->opt) @@ -1285,18 +1295,17 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) { struct tcphdr *th = skb->h.th; struct iphdr *iph = skb->nh.iph; - struct tcp_sock *tp = tcp_sk(sk); struct sock *nsk; struct request_sock **prev; /* Find possible connection requests. */ - struct request_sock *req = tcp_v4_search_req(tp, &prev, th->source, - iph->saddr, iph->daddr); + struct request_sock *req = inet_csk_search_req(sk, &prev, th->source, + iph->saddr, iph->daddr); if (req) return tcp_check_req(sk, skb, req, prev); nsk = __inet_lookup_established(&tcp_hashinfo, skb->nh.iph->saddr, th->source, skb->nh.iph->daddr, - ntohs(th->dest), tcp_v4_iif(skb)); + ntohs(th->dest), inet_iif(skb)); if (nsk) { if (nsk->sk_state != TCP_TIME_WAIT) { @@ -1440,7 +1449,7 @@ int tcp_v4_rcv(struct sk_buff *skb) sk = __inet_lookup(&tcp_hashinfo, skb->nh.iph->saddr, th->source, skb->nh.iph->daddr, ntohs(th->dest), - tcp_v4_iif(skb)); + inet_iif(skb)); if (!sk) goto no_tcp_socket; @@ -1507,7 +1516,7 @@ do_time_wait: struct sock *sk2 = inet_lookup_listener(&tcp_hashinfo, skb->nh.iph->daddr, ntohs(th->dest), - tcp_v4_iif(skb)); + inet_iif(skb)); if (sk2) { tcp_tw_deschedule((struct inet_timewait_sock *)sk); inet_twsk_put((struct inet_timewait_sock *)sk); @@ -1619,7 +1628,7 @@ static int tcp_v4_init_sock(struct sock *sk) tcp_init_xmit_timers(sk); tcp_prequeue_init(tp); - tp->rto = TCP_TIMEOUT_INIT; + inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT; tp->mdev = TCP_TIMEOUT_INIT; /* So many TCP implementations out there (incorrectly) count the @@ -1672,7 +1681,7 @@ int tcp_v4_destroy_sock(struct sock *sk) __skb_queue_purge(&tp->ucopy.prequeue); /* Clean up a referenced TCP bind bucket. 
*/ - if (inet_sk(sk)->bind_hash) + if (inet_csk(sk)->icsk_bind_hash) inet_put_port(&tcp_hashinfo, sk); /* @@ -1707,7 +1716,7 @@ static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw) static void *listening_get_next(struct seq_file *seq, void *cur) { - struct tcp_sock *tp; + struct inet_connection_sock *icsk; struct hlist_node *node; struct sock *sk = cur; struct tcp_iter_state* st = seq->private; @@ -1723,7 +1732,7 @@ static void *listening_get_next(struct seq_file *seq, void *cur) if (st->state == TCP_SEQ_STATE_OPENREQ) { struct request_sock *req = cur; - tp = tcp_sk(st->syn_wait_sk); + icsk = inet_csk(st->syn_wait_sk); req = req->dl_next; while (1) { while (req) { @@ -1736,17 +1745,17 @@ static void *listening_get_next(struct seq_file *seq, void *cur) if (++st->sbucket >= TCP_SYNQ_HSIZE) break; get_req: - req = tp->accept_queue.listen_opt->syn_table[st->sbucket]; + req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket]; } sk = sk_next(st->syn_wait_sk); st->state = TCP_SEQ_STATE_LISTENING; - read_unlock_bh(&tp->accept_queue.syn_wait_lock); + read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); } else { - tp = tcp_sk(sk); - read_lock_bh(&tp->accept_queue.syn_wait_lock); - if (reqsk_queue_len(&tp->accept_queue)) + icsk = inet_csk(sk); + read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock); + if (reqsk_queue_len(&icsk->icsk_accept_queue)) goto start_req; - read_unlock_bh(&tp->accept_queue.syn_wait_lock); + read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); sk = sk_next(sk); } get_sk: @@ -1755,9 +1764,9 @@ get_sk: cur = sk; goto out; } - tp = tcp_sk(sk); - read_lock_bh(&tp->accept_queue.syn_wait_lock); - if (reqsk_queue_len(&tp->accept_queue)) { + icsk = inet_csk(sk); + read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock); + if (reqsk_queue_len(&icsk->icsk_accept_queue)) { start_req: st->uid = sock_i_uid(sk); st->syn_wait_sk = sk; @@ -1765,7 +1774,7 @@ start_req: st->sbucket = 0; goto get_req; } - read_unlock_bh(&tp->accept_queue.syn_wait_lock); + read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); } if (++st->bucket < INET_LHTABLE_SIZE) { sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]); @@ -1951,8 +1960,8 @@ static void tcp_seq_stop(struct seq_file *seq, void *v) switch (st->state) { case TCP_SEQ_STATE_OPENREQ: if (v) { - struct tcp_sock *tp = tcp_sk(st->syn_wait_sk); - read_unlock_bh(&tp->accept_queue.syn_wait_lock); + struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk); + read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); } case TCP_SEQ_STATE_LISTENING: if (v != SEQ_START_TOKEN) @@ -2058,18 +2067,19 @@ static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i) int timer_active; unsigned long timer_expires; struct tcp_sock *tp = tcp_sk(sp); + const struct inet_connection_sock *icsk = inet_csk(sp); struct inet_sock *inet = inet_sk(sp); unsigned int dest = inet->daddr; unsigned int src = inet->rcv_saddr; __u16 destp = ntohs(inet->dport); __u16 srcp = ntohs(inet->sport); - if (tp->pending == TCP_TIME_RETRANS) { + if (icsk->icsk_pending == ICSK_TIME_RETRANS) { timer_active = 1; - timer_expires = tp->timeout; - } else if (tp->pending == TCP_TIME_PROBE0) { + timer_expires = icsk->icsk_timeout; + } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { timer_active = 4; - timer_expires = tp->timeout; + timer_expires = icsk->icsk_timeout; } else if (timer_pending(&sp->sk_timer)) { timer_active = 2; timer_expires = sp->sk_timer.expires; @@ -2084,12 +2094,14 @@ static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i) 
tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq, timer_active, jiffies_to_clock_t(timer_expires - jiffies), - tp->retransmits, + icsk->icsk_retransmits, sock_i_uid(sp), tp->probes_out, sock_i_ino(sp), atomic_read(&sp->sk_refcnt), sp, - tp->rto, tp->ack.ato, (tp->ack.quick << 1) | tp->ack.pingpong, + icsk->icsk_rto, + icsk->icsk_ack.ato, + (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong, tp->snd_cwnd, tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh); } @@ -2174,7 +2186,7 @@ struct proto tcp_prot = { .close = tcp_close, .connect = tcp_v4_connect, .disconnect = tcp_disconnect, - .accept = tcp_accept, + .accept = inet_csk_accept, .ioctl = tcp_ioctl, .init = tcp_v4_init_sock, .destroy = tcp_v4_destroy_sock, diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 8b6cd8d8066..56823704eb7 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -271,7 +271,8 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) if (tw != NULL) { struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); - const int rto = (tp->rto << 2) - (tp->rto >> 1); + const struct inet_connection_sock *icsk = inet_csk(sk); + const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1); tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale; tcptw->tw_rcv_nxt = tp->rcv_nxt; @@ -605,10 +606,11 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct inet_request_sock *ireq = inet_rsk(req); struct tcp_request_sock *treq = tcp_rsk(req); struct inet_sock *newinet = inet_sk(newsk); + struct inet_connection_sock *newicsk = inet_csk(newsk); struct tcp_sock *newtp; newsk->sk_state = TCP_SYN_RECV; - newinet->bind_hash = NULL; + newicsk->icsk_bind_hash = NULL; /* Clone the TCP header template */ newinet->dport = ireq->rmt_port; @@ -624,11 +626,11 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, tcp_init_wl(newtp, treq->snt_isn, treq->rcv_isn); - newtp->retransmits = 0; - newtp->backoff = 0; + newicsk->icsk_retransmits = 0; + newicsk->icsk_backoff = 0; newtp->srtt = 0; newtp->mdev = TCP_TIMEOUT_INIT; - newtp->rto = TCP_TIMEOUT_INIT; + newicsk->icsk_rto = TCP_TIMEOUT_INIT; newtp->packets_out = 0; newtp->left_out = 0; @@ -667,11 +669,11 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, newtp->rx_opt.num_sacks = 0; newtp->urg_data = 0; /* Deinitialize accept_queue to trap illegal accesses. 
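The shift expression carried into tcp_time_wait() above computes 3.5 times the RTO without a multiply: (rto << 2) - (rto >> 1) is 4*rto - rto/2. A self-contained check (the helper name tw_rto is hypothetical):

	#include <assert.h>

	static int tw_rto(int rto)
	{
		return (rto << 2) - (rto >> 1);	/* 4*rto - rto/2 == 3.5*rto */
	}

	int main(void)
	{
		assert(tw_rto(200) == 700);	/* 3.5 * 200 */
		assert(tw_rto(1000) == 3500);
		return 0;
	}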
*/ - memset(&newtp->accept_queue, 0, sizeof(newtp->accept_queue)); + memset(&newicsk->icsk_accept_queue, 0, sizeof(newicsk->icsk_accept_queue)); if (sock_flag(newsk, SOCK_KEEPOPEN)) - tcp_reset_keepalive_timer(newsk, - keepalive_time_when(newtp)); + inet_csk_reset_keepalive_timer(newsk, + keepalive_time_when(newtp)); newtp->rx_opt.tstamp_ok = ireq->tstamp_ok; if((newtp->rx_opt.sack_ok = ireq->sack_ok) != 0) { @@ -701,7 +703,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, newtp->tcp_header_len = sizeof(struct tcphdr); } if (skb->len >= TCP_MIN_RCVMSS+newtp->tcp_header_len) - newtp->ack.last_seg_size = skb->len-newtp->tcp_header_len; + newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len; newtp->rx_opt.mss_clamp = req->mss; TCP_ECN_openreq_child(newtp, req); if (newtp->ecn_flags&TCP_ECN_OK) @@ -881,10 +883,10 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, if (child == NULL) goto listen_overflow; - tcp_synq_unlink(tp, req, prev); - tcp_synq_removed(sk, req); + inet_csk_reqsk_queue_unlink(sk, req, prev); + inet_csk_reqsk_queue_removed(sk, req); - tcp_acceptq_queue(sk, req, child); + inet_csk_reqsk_queue_add(sk, req, child); return child; listen_overflow: @@ -898,7 +900,7 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, if (!(flg & TCP_FLAG_RST)) req->rsk_ops->send_reset(skb); - tcp_synq_drop(sk, req, prev); + inet_csk_reqsk_queue_drop(sk, req, prev); return NULL; } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index a4d1eb9a092..6f0a7e30cea 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -105,8 +105,9 @@ static __u16 tcp_advertise_mss(struct sock *sk) /* RFC2861. Reset CWND after idle period longer RTO to "restart window". * This is the first part of cwnd validation mechanism. */ -static void tcp_cwnd_restart(struct tcp_sock *tp, struct dst_entry *dst) +static void tcp_cwnd_restart(struct sock *sk, struct dst_entry *dst) { + struct tcp_sock *tp = tcp_sk(sk); s32 delta = tcp_time_stamp - tp->lsndtime; u32 restart_cwnd = tcp_init_cwnd(tp, dst); u32 cwnd = tp->snd_cwnd; @@ -116,7 +117,7 @@ static void tcp_cwnd_restart(struct tcp_sock *tp, struct dst_entry *dst) tp->snd_ssthresh = tcp_current_ssthresh(tp); restart_cwnd = min(restart_cwnd, cwnd); - while ((delta -= tp->rto) > 0 && cwnd > restart_cwnd) + while ((delta -= inet_csk(sk)->icsk_rto) > 0 && cwnd > restart_cwnd) cwnd >>= 1; tp->snd_cwnd = max(cwnd, restart_cwnd); tp->snd_cwnd_stamp = tcp_time_stamp; @@ -126,26 +127,25 @@ static void tcp_cwnd_restart(struct tcp_sock *tp, struct dst_entry *dst) static inline void tcp_event_data_sent(struct tcp_sock *tp, struct sk_buff *skb, struct sock *sk) { - u32 now = tcp_time_stamp; + struct inet_connection_sock *icsk = inet_csk(sk); + const u32 now = tcp_time_stamp; - if (!tp->packets_out && (s32)(now - tp->lsndtime) > tp->rto) - tcp_cwnd_restart(tp, __sk_dst_get(sk)); + if (!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto) + tcp_cwnd_restart(sk, __sk_dst_get(sk)); tp->lsndtime = now; /* If it is a reply for ato after last received * packet, enter pingpong mode. 
*/ - if ((u32)(now - tp->ack.lrcvtime) < tp->ack.ato) - tp->ack.pingpong = 1; + if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato) + icsk->icsk_ack.pingpong = 1; } static __inline__ void tcp_event_ack_sent(struct sock *sk, unsigned int pkts) { - struct tcp_sock *tp = tcp_sk(sk); - - tcp_dec_quickack_mode(tp, pkts); - tcp_clear_xmit_timer(sk, TCP_TIME_DACK); + tcp_dec_quickack_mode(sk, pkts); + inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); } /* Determine a window scaling and initial window to offer. @@ -696,7 +696,7 @@ static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp) if (tp->packets_out > tp->snd_cwnd_used) tp->snd_cwnd_used = tp->packets_out; - if ((s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= tp->rto) + if ((s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto) tcp_cwnd_application_limited(sk); } } @@ -1147,6 +1147,7 @@ void tcp_push_one(struct sock *sk, unsigned int mss_now) */ u32 __tcp_select_window(struct sock *sk) { + struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); /* MSS for the peer's data. Previous verions used mss_clamp * here. I don't know if the value based on our guesses @@ -1154,7 +1155,7 @@ u32 __tcp_select_window(struct sock *sk) * but may be worse for the performance because of rcv_mss * fluctuations. --SAW 1998/11/1 */ - int mss = tp->ack.rcv_mss; + int mss = icsk->icsk_ack.rcv_mss; int free_space = tcp_space(sk); int full_space = min_t(int, tp->window_clamp, tcp_full_space(sk)); int window; @@ -1163,7 +1164,7 @@ u32 __tcp_select_window(struct sock *sk) mss = full_space; if (free_space < full_space/2) { - tp->ack.quick = 0; + icsk->icsk_ack.quick = 0; if (tcp_memory_pressure) tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U*tp->advmss); @@ -1491,7 +1492,8 @@ void tcp_xmit_retransmit_queue(struct sock *sk) if (skb == skb_peek(&sk->sk_write_queue)) - tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + inet_csk(sk)->icsk_rto); } packet_cnt -= tcp_skb_pcount(skb); @@ -1544,7 +1546,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk) break; if (skb == skb_peek(&sk->sk_write_queue)) - tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto); NET_INC_STATS_BH(LINUX_MIB_TCPFORWARDRETRANS); } @@ -1780,8 +1782,8 @@ static inline void tcp_connect_init(struct sock *sk) tp->rcv_wup = 0; tp->copied_seq = 0; - tp->rto = TCP_TIMEOUT_INIT; - tp->retransmits = 0; + inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT; + inet_csk(sk)->icsk_retransmits = 0; tcp_clear_retrans(tp); } @@ -1824,7 +1826,7 @@ int tcp_connect(struct sock *sk) TCP_INC_STATS(TCP_MIB_ACTIVEOPENS); /* Timer for repeating the SYN until an answer. */ - tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto); return 0; } @@ -1834,20 +1836,21 @@ int tcp_connect(struct sock *sk) */ void tcp_send_delayed_ack(struct sock *sk) { - struct tcp_sock *tp = tcp_sk(sk); - int ato = tp->ack.ato; + struct inet_connection_sock *icsk = inet_csk(sk); + int ato = icsk->icsk_ack.ato; unsigned long timeout; if (ato > TCP_DELACK_MIN) { + const struct tcp_sock *tp = tcp_sk(sk); int max_ato = HZ/2; - if (tp->ack.pingpong || (tp->ack.pending&TCP_ACK_PUSHED)) + if (icsk->icsk_ack.pingpong || (icsk->icsk_ack.pending & ICSK_ACK_PUSHED)) max_ato = TCP_DELACK_MAX; /* Slow path, intersegment interval is "high". */ /* If some rtt estimate is known, use it to bound delayed ack. 
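The pingpong test just above is the delayed-ACK interactivity heuristic: a reply transmitted within one ack timeout (ato) of the last received segment suggests request/response traffic, where ACKs are worth delaying so they can piggyback on data. Restated in isolation (maybe_enter_pingpong is a hypothetical name, not the full tcp_event_data_sent()):

	static void maybe_enter_pingpong(struct inet_connection_sock *icsk, u32 now)
	{
		/* replied within one ato of the last receive: looks interactive */
		if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato)
			icsk->icsk_ack.pingpong = 1;
	}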
- * Do not use tp->rto here, use results of rtt measurements + * Do not use inet_csk(sk)->icsk_rto here, use results of rtt measurements * directly. */ if (tp->srtt) { @@ -1864,21 +1867,22 @@ void tcp_send_delayed_ack(struct sock *sk) timeout = jiffies + ato; /* Use new timeout only if there wasn't a older one earlier. */ - if (tp->ack.pending&TCP_ACK_TIMER) { + if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) { /* If delack timer was blocked or is about to expire, * send ACK now. */ - if (tp->ack.blocked || time_before_eq(tp->ack.timeout, jiffies+(ato>>2))) { + if (icsk->icsk_ack.blocked || + time_before_eq(icsk->icsk_ack.timeout, jiffies + (ato >> 2))) { tcp_send_ack(sk); return; } - if (!time_before(timeout, tp->ack.timeout)) - timeout = tp->ack.timeout; + if (!time_before(timeout, icsk->icsk_ack.timeout)) + timeout = icsk->icsk_ack.timeout; } - tp->ack.pending |= TCP_ACK_SCHED|TCP_ACK_TIMER; - tp->ack.timeout = timeout; - sk_reset_timer(sk, &tp->delack_timer, timeout); + icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER; + icsk->icsk_ack.timeout = timeout; + sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout); } /* This routine sends an ack and also updates the window. */ @@ -1895,9 +1899,9 @@ void tcp_send_ack(struct sock *sk) */ buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); if (buff == NULL) { - tcp_schedule_ack(tp); - tp->ack.ato = TCP_ATO_MIN; - tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MAX); + inet_csk_schedule_ack(sk); + inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN; + inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, TCP_DELACK_MAX); return; } @@ -2011,6 +2015,7 @@ int tcp_write_wakeup(struct sock *sk) */ void tcp_send_probe0(struct sock *sk) { + struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); int err; @@ -2019,16 +2024,16 @@ void tcp_send_probe0(struct sock *sk) if (tp->packets_out || !sk->sk_send_head) { /* Cancel probe timer, if it is not required. */ tp->probes_out = 0; - tp->backoff = 0; + icsk->icsk_backoff = 0; return; } if (err <= 0) { - if (tp->backoff < sysctl_tcp_retries2) - tp->backoff++; + if (icsk->icsk_backoff < sysctl_tcp_retries2) + icsk->icsk_backoff++; tp->probes_out++; - tcp_reset_xmit_timer (sk, TCP_TIME_PROBE0, - min(tp->rto << tp->backoff, TCP_RTO_MAX)); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, + min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX)); } else { /* If packet was not sent due to local congestion, * do not backoff and do not remember probes_out. 
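For reference, the srtt branch opened just above bounds the delayed-ACK timeout by the measured RTT; tp->srtt is fixed point, stored left-shifted by 3. Its body is elided by the hunk boundary here, so the following is only a condensed sketch of the believed behaviour (bound_ato is a hypothetical helper):

	static int bound_ato(int ato, int max_ato, unsigned int srtt)
	{
		if (srtt) {
			const int rtt = srtt >> 3;	/* undo the <<3 fixed point */

			if (rtt < max_ato)
				max_ato = rtt;
		}
		return ato < max_ato ? ato : max_ato;
	}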
@@ -2038,8 +2043,9 @@ void tcp_send_probe0(struct sock *sk) */ if (!tp->probes_out) tp->probes_out=1; - tcp_reset_xmit_timer (sk, TCP_TIME_PROBE0, - min(tp->rto << tp->backoff, TCP_RESOURCE_PROBE_INTERVAL)); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, + min(icsk->icsk_rto << icsk->icsk_backoff, + TCP_RESOURCE_PROBE_INTERVAL)); } } diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 0084227438c..0b71380ee42 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -36,9 +36,9 @@ static void tcp_write_timer(unsigned long); static void tcp_delack_timer(unsigned long); static void tcp_keepalive_timer (unsigned long data); -#ifdef TCP_DEBUG -const char tcp_timer_bug_msg[] = KERN_DEBUG "tcpbug: unknown timer value\n"; -EXPORT_SYMBOL(tcp_timer_bug_msg); +#ifdef INET_CSK_DEBUG +const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n"; +EXPORT_SYMBOL(inet_csk_timer_bug_msg); #endif /* @@ -46,40 +46,45 @@ EXPORT_SYMBOL(tcp_timer_bug_msg); * We may wish use just one timer maintaining a list of expire jiffies * to optimize. */ - -void tcp_init_xmit_timers(struct sock *sk) +void inet_csk_init_xmit_timers(struct sock *sk, + void (*retransmit_handler)(unsigned long), + void (*delack_handler)(unsigned long), + void (*keepalive_handler)(unsigned long)) { - struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); - init_timer(&tp->retransmit_timer); - tp->retransmit_timer.function=&tcp_write_timer; - tp->retransmit_timer.data = (unsigned long) sk; - tp->pending = 0; + init_timer(&icsk->icsk_retransmit_timer); + init_timer(&icsk->icsk_delack_timer); + init_timer(&sk->sk_timer); - init_timer(&tp->delack_timer); - tp->delack_timer.function=&tcp_delack_timer; - tp->delack_timer.data = (unsigned long) sk; - tp->ack.pending = 0; + icsk->icsk_retransmit_timer.function = retransmit_handler; + icsk->icsk_delack_timer.function = delack_handler; + sk->sk_timer.function = keepalive_handler; - init_timer(&sk->sk_timer); - sk->sk_timer.function = &tcp_keepalive_timer; - sk->sk_timer.data = (unsigned long)sk; + icsk->icsk_retransmit_timer.data = + icsk->icsk_delack_timer.data = + sk->sk_timer.data = (unsigned long)sk; + + icsk->icsk_pending = icsk->icsk_ack.pending = 0; } -void tcp_clear_xmit_timers(struct sock *sk) +void inet_csk_clear_xmit_timers(struct sock *sk) { - struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); - tp->pending = 0; - sk_stop_timer(sk, &tp->retransmit_timer); - - tp->ack.pending = 0; - tp->ack.blocked = 0; - sk_stop_timer(sk, &tp->delack_timer); + icsk->icsk_pending = icsk->icsk_ack.pending = icsk->icsk_ack.blocked = 0; + sk_stop_timer(sk, &icsk->icsk_retransmit_timer); + sk_stop_timer(sk, &icsk->icsk_delack_timer); sk_stop_timer(sk, &sk->sk_timer); } +void tcp_init_xmit_timers(struct sock *sk) +{ + inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer, + &tcp_keepalive_timer); +} + static void tcp_write_err(struct sock *sk) { sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT; @@ -155,15 +160,15 @@ static int tcp_orphan_retries(struct sock *sk, int alive) /* A write timeout has occurred. Process the after effects. */ static int tcp_write_timeout(struct sock *sk) { - struct tcp_sock *tp = tcp_sk(sk); + const struct inet_connection_sock *icsk = inet_csk(sk); int retry_until; if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { - if (tp->retransmits) + if (icsk->icsk_retransmits) dst_negative_advice(&sk->sk_dst_cache); - retry_until = tp->syn_retries ? 
: sysctl_tcp_syn_retries; + retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries; } else { - if (tp->retransmits >= sysctl_tcp_retries1) { + if (icsk->icsk_retransmits >= sysctl_tcp_retries1) { /* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires pmtu black hole detection. :-( @@ -189,16 +194,16 @@ static int tcp_write_timeout(struct sock *sk) retry_until = sysctl_tcp_retries2; if (sock_flag(sk, SOCK_DEAD)) { - int alive = (tp->rto < TCP_RTO_MAX); + const int alive = (icsk->icsk_rto < TCP_RTO_MAX); retry_until = tcp_orphan_retries(sk, alive); - if (tcp_out_of_resources(sk, alive || tp->retransmits < retry_until)) + if (tcp_out_of_resources(sk, alive || icsk->icsk_retransmits < retry_until)) return 1; } } - if (tp->retransmits >= retry_until) { + if (icsk->icsk_retransmits >= retry_until) { /* Has it gone just too far? */ tcp_write_err(sk); return 1; @@ -210,26 +215,27 @@ static void tcp_delack_timer(unsigned long data) { struct sock *sk = (struct sock*)data; struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); bh_lock_sock(sk); if (sock_owned_by_user(sk)) { /* Try again later. */ - tp->ack.blocked = 1; + icsk->icsk_ack.blocked = 1; NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOCKED); - sk_reset_timer(sk, &tp->delack_timer, jiffies + TCP_DELACK_MIN); + sk_reset_timer(sk, &icsk->icsk_delack_timer, jiffies + TCP_DELACK_MIN); goto out_unlock; } sk_stream_mem_reclaim(sk); - if (sk->sk_state == TCP_CLOSE || !(tp->ack.pending & TCP_ACK_TIMER)) + if (sk->sk_state == TCP_CLOSE || !(icsk->icsk_ack.pending & ICSK_ACK_TIMER)) goto out; - if (time_after(tp->ack.timeout, jiffies)) { - sk_reset_timer(sk, &tp->delack_timer, tp->ack.timeout); + if (time_after(icsk->icsk_ack.timeout, jiffies)) { + sk_reset_timer(sk, &icsk->icsk_delack_timer, icsk->icsk_ack.timeout); goto out; } - tp->ack.pending &= ~TCP_ACK_TIMER; + icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER; if (!skb_queue_empty(&tp->ucopy.prequeue)) { struct sk_buff *skb; @@ -242,16 +248,16 @@ static void tcp_delack_timer(unsigned long data) tp->ucopy.memory = 0; } - if (tcp_ack_scheduled(tp)) { - if (!tp->ack.pingpong) { + if (inet_csk_ack_scheduled(sk)) { + if (!icsk->icsk_ack.pingpong) { /* Delayed ACK missed: inflate ATO. */ - tp->ack.ato = min(tp->ack.ato << 1, tp->rto); + icsk->icsk_ack.ato = min(icsk->icsk_ack.ato << 1, icsk->icsk_rto); } else { /* Delayed ACK missed: leave pingpong mode and * deflate ATO. 
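A missed delayed ACK adapts ato in both directions: outside pingpong mode the line above doubles ato (capped at the RTO), while the branch that follows leaves pingpong mode and deflates ato back to TCP_ATO_MIN. A compact restatement (delack_missed is a hypothetical name):

	static void delack_missed(struct inet_connection_sock *icsk)
	{
		if (!icsk->icsk_ack.pingpong) {
			/* guessed too short: inflate, but never past the RTO */
			icsk->icsk_ack.ato = min(icsk->icsk_ack.ato << 1,
						 icsk->icsk_rto);
		} else {
			/* flow stopped being interactive: start over small */
			icsk->icsk_ack.pingpong = 0;
			icsk->icsk_ack.ato = TCP_ATO_MIN;
		}
	}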
*/ - tp->ack.pingpong = 0; - tp->ack.ato = TCP_ATO_MIN; + icsk->icsk_ack.pingpong = 0; + icsk->icsk_ack.ato = TCP_ATO_MIN; } tcp_send_ack(sk); NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKS); @@ -294,7 +300,8 @@ static void tcp_probe_timer(struct sock *sk) max_probes = sysctl_tcp_retries2; if (sock_flag(sk, SOCK_DEAD)) { - int alive = ((tp->rto << tp->backoff) < TCP_RTO_MAX); + const struct inet_connection_sock *icsk = inet_csk(sk); + const int alive = ((icsk->icsk_rto << icsk->icsk_backoff) < TCP_RTO_MAX); max_probes = tcp_orphan_retries(sk, alive); @@ -317,6 +324,7 @@ static void tcp_probe_timer(struct sock *sk) static void tcp_retransmit_timer(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); if (!tp->packets_out) goto out; @@ -351,7 +359,7 @@ static void tcp_retransmit_timer(struct sock *sk) if (tcp_write_timeout(sk)) goto out; - if (tp->retransmits == 0) { + if (icsk->icsk_retransmits == 0) { if (tp->ca_state == TCP_CA_Disorder || tp->ca_state == TCP_CA_Recovery) { if (tp->rx_opt.sack_ok) { if (tp->ca_state == TCP_CA_Recovery) @@ -381,10 +389,10 @@ static void tcp_retransmit_timer(struct sock *sk) /* Retransmission failed because of local congestion, * do not backoff. */ - if (!tp->retransmits) - tp->retransmits=1; - tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, - min(tp->rto, TCP_RESOURCE_PROBE_INTERVAL)); + if (!icsk->icsk_retransmits) + icsk->icsk_retransmits = 1; + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + min(icsk->icsk_rto, TCP_RESOURCE_PROBE_INTERVAL)); goto out; } @@ -403,13 +411,13 @@ static void tcp_retransmit_timer(struct sock *sk) * implemented ftp to mars will work nicely. We will have to fix * the 120 second clamps though! */ - tp->backoff++; - tp->retransmits++; out_reset_timer: - tp->rto = min(tp->rto << 1, TCP_RTO_MAX); - tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); - if (tp->retransmits > sysctl_tcp_retries1) + icsk->icsk_backoff++; + icsk->icsk_retransmits++; out_reset_timer: + icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto); + if (icsk->icsk_retransmits > sysctl_tcp_retries1) __sk_dst_reset(sk); out:; @@ -418,32 +426,32 @@ static void tcp_write_timer(unsigned long data) { struct sock *sk = (struct sock*)data; - struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); int event; bh_lock_sock(sk); if (sock_owned_by_user(sk)) { /* Try again later */ - sk_reset_timer(sk, &tp->retransmit_timer, jiffies + (HZ / 20)); + sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + (HZ / 20)); goto out_unlock; } - if (sk->sk_state == TCP_CLOSE || !tp->pending) + if (sk->sk_state == TCP_CLOSE || !icsk->icsk_pending) goto out; - if (time_after(tp->timeout, jiffies)) { - sk_reset_timer(sk, &tp->retransmit_timer, tp->timeout); + if (time_after(icsk->icsk_timeout, jiffies)) { + sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout); goto out; } - event = tp->pending; - tp->pending = 0; + event = icsk->icsk_pending; + icsk->icsk_pending = 0; switch (event) { - case TCP_TIME_RETRANS: + case ICSK_TIME_RETRANS: tcp_retransmit_timer(sk); break; - case TCP_TIME_PROBE0: + case ICSK_TIME_PROBE0: tcp_probe_timer(sk); break; } @@ -463,8 +471,9 @@ out_unlock: static void tcp_synack_timer(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - struct listen_sock *lopt = tp->accept_queue.listen_opt; - int max_retries = tp->syn_retries ?
: sysctl_tcp_synack_retries; + struct inet_connection_sock *icsk = inet_csk(sk); + struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; + int max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries; int thresh = max_retries; unsigned long now = jiffies; struct request_sock **reqp, *req; @@ -526,8 +535,8 @@ static void tcp_synack_timer(struct sock *sk) } /* Drop this request */ - tcp_synq_unlink(tp, req, reqp); - reqsk_queue_removed(&tp->accept_queue, req); + inet_csk_reqsk_queue_unlink(sk, req, reqp); + reqsk_queue_removed(&icsk->icsk_accept_queue, req); reqsk_free(req); continue; } @@ -541,15 +550,15 @@ static void tcp_synack_timer(struct sock *sk) lopt->clock_hand = i; if (lopt->qlen) - tcp_reset_keepalive_timer(sk, TCP_SYNQ_INTERVAL); + inet_csk_reset_keepalive_timer(sk, TCP_SYNQ_INTERVAL); } -void tcp_delete_keepalive_timer (struct sock *sk) +void inet_csk_delete_keepalive_timer(struct sock *sk) { sk_stop_timer(sk, &sk->sk_timer); } -void tcp_reset_keepalive_timer (struct sock *sk, unsigned long len) +void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long len) { sk_reset_timer(sk, &sk->sk_timer, jiffies + len); } @@ -560,9 +569,9 @@ void tcp_set_keepalive(struct sock *sk, int val) return; if (val && !sock_flag(sk, SOCK_KEEPOPEN)) - tcp_reset_keepalive_timer(sk, keepalive_time_when(tcp_sk(sk))); + inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tcp_sk(sk))); else if (!val) - tcp_delete_keepalive_timer(sk); + inet_csk_delete_keepalive_timer(sk); } @@ -576,7 +585,7 @@ static void tcp_keepalive_timer (unsigned long data) bh_lock_sock(sk); if (sock_owned_by_user(sk)) { /* Try again later. */ - tcp_reset_keepalive_timer (sk, HZ/20); + inet_csk_reset_keepalive_timer (sk, HZ/20); goto out; } @@ -587,7 +596,7 @@ static void tcp_keepalive_timer (unsigned long data) if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) { if (tp->linger2 >= 0) { - int tmo = tcp_fin_time(tp) - TCP_TIMEWAIT_LEN; + const int tmo = tcp_fin_time(sk) - TCP_TIMEWAIT_LEN; if (tmo > 0) { tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); @@ -634,7 +643,7 @@ static void tcp_keepalive_timer (unsigned long data) sk_stream_mem_reclaim(sk); resched: - tcp_reset_keepalive_timer (sk, elapsed); + inet_csk_reset_keepalive_timer (sk, elapsed); goto out; death: @@ -645,7 +654,7 @@ out: sock_put(sk); } -EXPORT_SYMBOL(tcp_clear_xmit_timers); -EXPORT_SYMBOL(tcp_delete_keepalive_timer); +EXPORT_SYMBOL(inet_csk_clear_xmit_timers); +EXPORT_SYMBOL(inet_csk_delete_keepalive_timer); EXPORT_SYMBOL(tcp_init_xmit_timers); -EXPORT_SYMBOL(tcp_reset_keepalive_timer); +EXPORT_SYMBOL(inet_csk_reset_keepalive_timer); -- cgit v1.2.3 From 3f421baa4720b708022f8bcc52a61e5cd6f10bf8 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 9 Aug 2005 20:11:08 -0700 Subject: [NET]: Just move the inet_connection_sock function from tcp sources Completing the previous changeset, this also generalises tcp_v4_synq_add, renaming it to inet_csk_reqsk_queue_hash_add, already being used in the DCCP tree, which I plan to merge RSN. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S.
Miller --- net/ipv4/Makefile | 2 +- net/ipv4/inet_connection_sock.c | 401 ++++++++++++++++++++++++++++++++++++++++ net/ipv4/tcp.c | 93 ---------- net/ipv4/tcp_input.c | 10 +- net/ipv4/tcp_ipv4.c | 210 +-------------------- net/ipv4/tcp_output.c | 19 +- net/ipv4/tcp_timer.c | 65 +------ 7 files changed, 427 insertions(+), 373 deletions(-) create mode 100644 net/ipv4/inet_connection_sock.c (limited to 'net/ipv4') diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 6650d18e400..ea0e1d87dc7 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -5,7 +5,7 @@ obj-y := route.o inetpeer.o protocol.o \ ip_input.o ip_fragment.o ip_forward.o ip_options.o \ ip_output.o ip_sockglue.o inet_hashtables.o \ - inet_timewait_sock.o \ + inet_timewait_sock.o inet_connection_sock.o \ tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \ tcp_minisocks.o tcp_cong.o \ datagram.o raw.o udp.o arp.o icmp.o devinet.o af_inet.o igmp.o \ diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c new file mode 100644 index 00000000000..2712400a8bb --- /dev/null +++ b/net/ipv4/inet_connection_sock.c @@ -0,0 +1,401 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Support for INET connection oriented protocols. + * + * Authors: See the TCP sources + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or(at your option) any later version. + */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#ifdef INET_CSK_DEBUG +const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n"; +EXPORT_SYMBOL(inet_csk_timer_bug_msg); +#endif + +/* + * This array holds the first and last local port number. + * For high-usage systems, use sysctl to change this to + * 32768-61000 + */ +int sysctl_local_port_range[2] = { 1024, 4999 }; + +static inline int inet_csk_bind_conflict(struct sock *sk, struct inet_bind_bucket *tb) +{ + const u32 sk_rcv_saddr = inet_rcv_saddr(sk); + struct sock *sk2; + struct hlist_node *node; + int reuse = sk->sk_reuse; + + sk_for_each_bound(sk2, node, &tb->owners) { + if (sk != sk2 && + !inet_v6_ipv6only(sk2) && + (!sk->sk_bound_dev_if || + !sk2->sk_bound_dev_if || + sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) { + if (!reuse || !sk2->sk_reuse || + sk2->sk_state == TCP_LISTEN) { + const u32 sk2_rcv_saddr = inet_rcv_saddr(sk2); + if (!sk2_rcv_saddr || !sk_rcv_saddr || + sk2_rcv_saddr == sk_rcv_saddr) + break; + } + } + } + return node != NULL; +} + +/* Obtain a reference to a local port for the given sock, + * if snum is zero it means select any available local port. 
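The heart of inet_csk_bind_conflict() above is its address-overlap rule: INADDR_ANY (0) overlaps every local address, and two concrete addresses overlap only when equal. Factored into a hypothetical predicate (rcv_saddrs_overlap is not in the patch):

	static inline int rcv_saddrs_overlap(u32 a, u32 b)
	{
		/* 0 is a wildcard bind and overlaps everything */
		return !a || !b || a == b;
	}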
+ */ +int inet_csk_get_port(struct inet_hashinfo *hashinfo, + struct sock *sk, unsigned short snum) +{ + struct inet_bind_hashbucket *head; + struct hlist_node *node; + struct inet_bind_bucket *tb; + int ret; + + local_bh_disable(); + if (!snum) { + int low = sysctl_local_port_range[0]; + int high = sysctl_local_port_range[1]; + int remaining = (high - low) + 1; + int rover; + + spin_lock(&hashinfo->portalloc_lock); + if (hashinfo->port_rover < low) + rover = low; + else + rover = hashinfo->port_rover; + do { + rover++; + if (rover > high) + rover = low; + head = &hashinfo->bhash[inet_bhashfn(rover, hashinfo->bhash_size)]; + spin_lock(&head->lock); + inet_bind_bucket_for_each(tb, node, &head->chain) + if (tb->port == rover) + goto next; + break; + next: + spin_unlock(&head->lock); + } while (--remaining > 0); + hashinfo->port_rover = rover; + spin_unlock(&hashinfo->portalloc_lock); + + /* Exhausted local port range during search? It is not + * possible for us to be holding one of the bind hash + * locks if this test triggers, because if 'remaining' + * drops to zero, we broke out of the do/while loop at + * the top level, not from the 'break;' statement. + */ + ret = 1; + if (remaining <= 0) + goto fail; + + /* OK, here is the one we will use. HEAD is + * non-NULL and we hold it's mutex. + */ + snum = rover; + } else { + head = &hashinfo->bhash[inet_bhashfn(snum, hashinfo->bhash_size)]; + spin_lock(&head->lock); + inet_bind_bucket_for_each(tb, node, &head->chain) + if (tb->port == snum) + goto tb_found; + } + tb = NULL; + goto tb_not_found; +tb_found: + if (!hlist_empty(&tb->owners)) { + if (sk->sk_reuse > 1) + goto success; + if (tb->fastreuse > 0 && + sk->sk_reuse && sk->sk_state != TCP_LISTEN) { + goto success; + } else { + ret = 1; + if (inet_csk_bind_conflict(sk, tb)) + goto fail_unlock; + } + } +tb_not_found: + ret = 1; + if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep, head, snum)) == NULL) + goto fail_unlock; + if (hlist_empty(&tb->owners)) { + if (sk->sk_reuse && sk->sk_state != TCP_LISTEN) + tb->fastreuse = 1; + else + tb->fastreuse = 0; + } else if (tb->fastreuse && + (!sk->sk_reuse || sk->sk_state == TCP_LISTEN)) + tb->fastreuse = 0; +success: + if (!inet_csk(sk)->icsk_bind_hash) + inet_bind_hash(sk, tb, snum); + BUG_TRAP(inet_csk(sk)->icsk_bind_hash == tb); + ret = 0; + +fail_unlock: + spin_unlock(&head->lock); +fail: + local_bh_enable(); + return ret; +} + +EXPORT_SYMBOL_GPL(inet_csk_get_port); + +/* + * Wait for an incoming connection, avoid race conditions. This must be called + * with the socket locked. + */ +static int inet_csk_wait_for_connect(struct sock *sk, long timeo) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + DEFINE_WAIT(wait); + int err; + + /* + * True wake-one mechanism for incoming connections: only + * one process gets woken up, not the 'whole herd'. + * Since we do not 'race & poll' for established sockets + * anymore, the common case will execute the loop only once. + * + * Subtle issue: "add_wait_queue_exclusive()" will be added + * after any current non-exclusive waiters, and we know that + * it will always _stay_ after any new non-exclusive waiters + * because all non-exclusive waiters are added at the + * beginning of the wait-queue. As such, it's ok to "drop" + * our exclusiveness temporarily when we get woken up without + * having to remove and re-insert us on the wait queue. 
+ */ + for (;;) { + prepare_to_wait_exclusive(sk->sk_sleep, &wait, + TASK_INTERRUPTIBLE); + release_sock(sk); + if (reqsk_queue_empty(&icsk->icsk_accept_queue)) + timeo = schedule_timeout(timeo); + lock_sock(sk); + err = 0; + if (!reqsk_queue_empty(&icsk->icsk_accept_queue)) + break; + err = -EINVAL; + if (sk->sk_state != TCP_LISTEN) + break; + err = sock_intr_errno(timeo); + if (signal_pending(current)) + break; + err = -EAGAIN; + if (!timeo) + break; + } + finish_wait(sk->sk_sleep, &wait); + return err; +} + +/* + * This will accept the next outstanding connection. + */ +struct sock *inet_csk_accept(struct sock *sk, int flags, int *err) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct sock *newsk; + int error; + + lock_sock(sk); + + /* We need to make sure that this socket is listening, + * and that it has something pending. + */ + error = -EINVAL; + if (sk->sk_state != TCP_LISTEN) + goto out_err; + + /* Find already established connection */ + if (reqsk_queue_empty(&icsk->icsk_accept_queue)) { + long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); + + /* If this is a non blocking socket don't sleep */ + error = -EAGAIN; + if (!timeo) + goto out_err; + + error = inet_csk_wait_for_connect(sk, timeo); + if (error) + goto out_err; + } + + newsk = reqsk_queue_get_child(&icsk->icsk_accept_queue, sk); + BUG_TRAP(newsk->sk_state != TCP_SYN_RECV); +out: + release_sock(sk); + return newsk; +out_err: + newsk = NULL; + *err = error; + goto out; +} + +EXPORT_SYMBOL(inet_csk_accept); + +/* + * Using different timers for retransmit, delayed acks and probes + * We may wish use just one timer maintaining a list of expire jiffies + * to optimize. + */ +void inet_csk_init_xmit_timers(struct sock *sk, + void (*retransmit_handler)(unsigned long), + void (*delack_handler)(unsigned long), + void (*keepalive_handler)(unsigned long)) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + + init_timer(&icsk->icsk_retransmit_timer); + init_timer(&icsk->icsk_delack_timer); + init_timer(&sk->sk_timer); + + icsk->icsk_retransmit_timer.function = retransmit_handler; + icsk->icsk_delack_timer.function = delack_handler; + sk->sk_timer.function = keepalive_handler; + + icsk->icsk_retransmit_timer.data = + icsk->icsk_delack_timer.data = + sk->sk_timer.data = (unsigned long)sk; + + icsk->icsk_pending = icsk->icsk_ack.pending = 0; +} + +EXPORT_SYMBOL(inet_csk_init_xmit_timers); + +void inet_csk_clear_xmit_timers(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + + icsk->icsk_pending = icsk->icsk_ack.pending = icsk->icsk_ack.blocked = 0; + + sk_stop_timer(sk, &icsk->icsk_retransmit_timer); + sk_stop_timer(sk, &icsk->icsk_delack_timer); + sk_stop_timer(sk, &sk->sk_timer); +} + +EXPORT_SYMBOL(inet_csk_clear_xmit_timers); + +void inet_csk_delete_keepalive_timer(struct sock *sk) +{ + sk_stop_timer(sk, &sk->sk_timer); +} + +EXPORT_SYMBOL(inet_csk_delete_keepalive_timer); + +void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long len) +{ + sk_reset_timer(sk, &sk->sk_timer, jiffies + len); +} + +EXPORT_SYMBOL(inet_csk_reset_keepalive_timer); + +struct dst_entry* inet_csk_route_req(struct sock *sk, + const struct request_sock *req) +{ + struct rtable *rt; + const struct inet_request_sock *ireq = inet_rsk(req); + struct ip_options *opt = inet_rsk(req)->opt; + struct flowi fl = { .oif = sk->sk_bound_dev_if, + .nl_u = { .ip4_u = + { .daddr = ((opt && opt->srr) ? 
+ opt->faddr : + ireq->rmt_addr), + .saddr = ireq->loc_addr, + .tos = RT_CONN_FLAGS(sk) } }, + .proto = sk->sk_protocol, + .uli_u = { .ports = + { .sport = inet_sk(sk)->sport, + .dport = ireq->rmt_port } } }; + + if (ip_route_output_flow(&rt, &fl, sk, 0)) { + IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES); + return NULL; + } + if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) { + ip_rt_put(rt); + IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES); + return NULL; + } + return &rt->u.dst; +} + +EXPORT_SYMBOL_GPL(inet_csk_route_req); + +static inline u32 inet_synq_hash(const u32 raddr, const u16 rport, + const u32 rnd, const u16 synq_hsize) +{ + return jhash_2words(raddr, (u32)rport, rnd) & (synq_hsize - 1); +} + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +#define AF_INET_FAMILY(fam) ((fam) == AF_INET) +#else +#define AF_INET_FAMILY(fam) 1 +#endif + +struct request_sock *inet_csk_search_req(const struct sock *sk, + struct request_sock ***prevp, + const __u16 rport, const __u32 raddr, + const __u32 laddr) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); + struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; + struct request_sock *req, **prev; + + for (prev = &lopt->syn_table[inet_synq_hash(raddr, rport, lopt->hash_rnd, + lopt->nr_table_entries)]; + (req = *prev) != NULL; + prev = &req->dl_next) { + const struct inet_request_sock *ireq = inet_rsk(req); + + if (ireq->rmt_port == rport && + ireq->rmt_addr == raddr && + ireq->loc_addr == laddr && + AF_INET_FAMILY(req->rsk_ops->family)) { + BUG_TRAP(!req->sk); + *prevp = prev; + break; + } + } + + return req; +} + +EXPORT_SYMBOL_GPL(inet_csk_search_req); + +void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, + const unsigned timeout) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; + const u32 h = inet_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, + lopt->hash_rnd, lopt->nr_table_entries); + + reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout); + inet_csk_reqsk_queue_added(sk, timeout); +} + +EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 8177b86570d..581016a6a93 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1804,98 +1804,6 @@ int tcp_disconnect(struct sock *sk, int flags) return err; } -/* - * Wait for an incoming connection, avoid race - * conditions. This must be called with the socket locked. - */ -static int wait_for_connect(struct sock *sk, long timeo) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - DEFINE_WAIT(wait); - int err; - - /* - * True wake-one mechanism for incoming connections: only - * one process gets woken up, not the 'whole herd'. - * Since we do not 'race & poll' for established sockets - * anymore, the common case will execute the loop only once. - * - * Subtle issue: "add_wait_queue_exclusive()" will be added - * after any current non-exclusive waiters, and we know that - * it will always _stay_ after any new non-exclusive waiters - * because all non-exclusive waiters are added at the - * beginning of the wait-queue. As such, it's ok to "drop" - * our exclusiveness temporarily when we get woken up without - * having to remove and re-insert us on the wait queue. 
- */ - for (;;) { - prepare_to_wait_exclusive(sk->sk_sleep, &wait, - TASK_INTERRUPTIBLE); - release_sock(sk); - if (reqsk_queue_empty(&icsk->icsk_accept_queue)) - timeo = schedule_timeout(timeo); - lock_sock(sk); - err = 0; - if (!reqsk_queue_empty(&icsk->icsk_accept_queue)) - break; - err = -EINVAL; - if (sk->sk_state != TCP_LISTEN) - break; - err = sock_intr_errno(timeo); - if (signal_pending(current)) - break; - err = -EAGAIN; - if (!timeo) - break; - } - finish_wait(sk->sk_sleep, &wait); - return err; -} - -/* - * This will accept the next outstanding connection. - */ - -struct sock *inet_csk_accept(struct sock *sk, int flags, int *err) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - struct sock *newsk; - int error; - - lock_sock(sk); - - /* We need to make sure that this socket is listening, - * and that it has something pending. - */ - error = -EINVAL; - if (sk->sk_state != TCP_LISTEN) - goto out_err; - - /* Find already established connection */ - if (reqsk_queue_empty(&icsk->icsk_accept_queue)) { - long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); - - /* If this is a non blocking socket don't sleep */ - error = -EAGAIN; - if (!timeo) - goto out_err; - - error = wait_for_connect(sk, timeo); - if (error) - goto out_err; - } - - newsk = reqsk_queue_get_child(&icsk->icsk_accept_queue, sk); - BUG_TRAP(newsk->sk_state != TCP_SYN_RECV); -out: - release_sock(sk); - return newsk; -out_err: - newsk = NULL; - *err = error; - goto out; -} - /* * Socket option code for TCP. */ @@ -2344,7 +2252,6 @@ void __init tcp_init(void) tcp_register_congestion_control(&tcp_reno); } -EXPORT_SYMBOL(inet_csk_accept); EXPORT_SYMBOL(tcp_close); EXPORT_SYMBOL(tcp_destroy_sock); EXPORT_SYMBOL(tcp_disconnect); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 8a8c5c2d90c..b35badf53aa 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1278,7 +1278,7 @@ static int tcp_check_sack_reneging(struct sock *sk) inet_csk(sk)->icsk_retransmits++; tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue)); inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - inet_csk(sk)->icsk_rto); + inet_csk(sk)->icsk_rto, TCP_RTO_MAX); return 1; } return 0; @@ -1961,7 +1961,7 @@ static inline void tcp_ack_packets_out(struct sock *sk, struct tcp_sock *tp) if (!tp->packets_out) { inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); } else { - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto, TCP_RTO_MAX); } } @@ -2147,7 +2147,8 @@ static void tcp_ack_probe(struct sock *sk) */ } else { inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, - min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX)); + min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX), + TCP_RTO_MAX); } } @@ -3968,7 +3969,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN; tcp_incr_quickack(sk); tcp_enter_quickack_mode(sk); - inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, TCP_DELACK_MAX); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, + TCP_DELACK_MAX, TCP_RTO_MAX); discard: __kfree_skb(skb); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 2cd41265d17..2f605b9e6b6 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -97,138 +97,6 @@ struct inet_hashinfo __cacheline_aligned tcp_hashinfo = { .port_rover = 1024 - 1, }; -/* - * This array holds the first and last local port number. 
- * For high-usage systems, use sysctl to change this to - * 32768-61000 - */ -int sysctl_local_port_range[2] = { 1024, 4999 }; - -static inline int inet_csk_bind_conflict(struct sock *sk, struct inet_bind_bucket *tb) -{ - const u32 sk_rcv_saddr = inet_rcv_saddr(sk); - struct sock *sk2; - struct hlist_node *node; - int reuse = sk->sk_reuse; - - sk_for_each_bound(sk2, node, &tb->owners) { - if (sk != sk2 && - !inet_v6_ipv6only(sk2) && - (!sk->sk_bound_dev_if || - !sk2->sk_bound_dev_if || - sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) { - if (!reuse || !sk2->sk_reuse || - sk2->sk_state == TCP_LISTEN) { - const u32 sk2_rcv_saddr = inet_rcv_saddr(sk2); - if (!sk2_rcv_saddr || !sk_rcv_saddr || - sk2_rcv_saddr == sk_rcv_saddr) - break; - } - } - } - return node != NULL; -} - -/* Obtain a reference to a local port for the given sock, - * if snum is zero it means select any available local port. - */ -int inet_csk_get_port(struct inet_hashinfo *hashinfo, - struct sock *sk, unsigned short snum) -{ - struct inet_bind_hashbucket *head; - struct hlist_node *node; - struct inet_bind_bucket *tb; - int ret; - - local_bh_disable(); - if (!snum) { - int low = sysctl_local_port_range[0]; - int high = sysctl_local_port_range[1]; - int remaining = (high - low) + 1; - int rover; - - spin_lock(&hashinfo->portalloc_lock); - if (hashinfo->port_rover < low) - rover = low; - else - rover = hashinfo->port_rover; - do { - rover++; - if (rover > high) - rover = low; - head = &hashinfo->bhash[inet_bhashfn(rover, hashinfo->bhash_size)]; - spin_lock(&head->lock); - inet_bind_bucket_for_each(tb, node, &head->chain) - if (tb->port == rover) - goto next; - break; - next: - spin_unlock(&head->lock); - } while (--remaining > 0); - hashinfo->port_rover = rover; - spin_unlock(&hashinfo->portalloc_lock); - - /* Exhausted local port range during search? It is not - * possible for us to be holding one of the bind hash - * locks if this test triggers, because if 'remaining' - * drops to zero, we broke out of the do/while loop at - * the top level, not from the 'break;' statement. - */ - ret = 1; - if (unlikely(remaining <= 0)) - goto fail; - - /* OK, here is the one we will use. HEAD is - * non-NULL and we hold it's mutex. 
- */ - snum = rover; - } else { - head = &hashinfo->bhash[inet_bhashfn(snum, hashinfo->bhash_size)]; - spin_lock(&head->lock); - inet_bind_bucket_for_each(tb, node, &head->chain) - if (tb->port == snum) - goto tb_found; - } - tb = NULL; - goto tb_not_found; -tb_found: - if (!hlist_empty(&tb->owners)) { - if (sk->sk_reuse > 1) - goto success; - if (tb->fastreuse > 0 && - sk->sk_reuse && sk->sk_state != TCP_LISTEN) { - goto success; - } else { - ret = 1; - if (inet_csk_bind_conflict(sk, tb)) - goto fail_unlock; - } - } -tb_not_found: - ret = 1; - if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep, head, snum)) == NULL) - goto fail_unlock; - if (hlist_empty(&tb->owners)) { - if (sk->sk_reuse && sk->sk_state != TCP_LISTEN) - tb->fastreuse = 1; - else - tb->fastreuse = 0; - } else if (tb->fastreuse && - (!sk->sk_reuse || sk->sk_state == TCP_LISTEN)) - tb->fastreuse = 0; -success: - if (!inet_csk(sk)->icsk_bind_hash) - inet_bind_hash(sk, tb, snum); - BUG_TRAP(inet_csk(sk)->icsk_bind_hash == tb); - ret = 0; - -fail_unlock: - spin_unlock(&head->lock); -fail: - local_bh_enable(); - return ret; -} - static int tcp_v4_get_port(struct sock *sk, unsigned short snum) { return inet_csk_get_port(&tcp_hashinfo, sk, snum); @@ -568,52 +436,6 @@ static inline int inet_iif(const struct sk_buff *skb) return ((struct rtable *)skb->dst)->rt_iif; } -static inline u32 inet_synq_hash(const u32 raddr, const u16 rport, - const u32 rnd, const u16 synq_hsize) -{ - return jhash_2words(raddr, (u32)rport, rnd) & (synq_hsize - 1); -} - -struct request_sock *inet_csk_search_req(const struct sock *sk, - struct request_sock ***prevp, - const __u16 rport, const __u32 raddr, - const __u32 laddr) -{ - const struct inet_connection_sock *icsk = inet_csk(sk); - struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; - struct request_sock *req, **prev; - - for (prev = &lopt->syn_table[inet_synq_hash(raddr, rport, lopt->hash_rnd, - lopt->nr_table_entries)]; - (req = *prev) != NULL; - prev = &req->dl_next) { - const struct inet_request_sock *ireq = inet_rsk(req); - - if (ireq->rmt_port == rport && - ireq->rmt_addr == raddr && - ireq->loc_addr == laddr && - AF_INET_FAMILY(req->rsk_ops->family)) { - BUG_TRAP(!req->sk); - *prevp = prev; - break; - } - } - - return req; -} - -static void tcp_v4_synq_add(struct sock *sk, struct request_sock *req) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; - const u32 h = inet_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, - lopt->hash_rnd, lopt->nr_table_entries); - - reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, TCP_TIMEOUT_INIT); - inet_csk_reqsk_queue_added(sk, TCP_TIMEOUT_INIT); -} - - /* * This routine does path mtu discovery as defined in RFC1191. */ @@ -963,36 +785,6 @@ static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req) req->ts_recent); } -struct dst_entry* inet_csk_route_req(struct sock *sk, - const struct request_sock *req) -{ - struct rtable *rt; - const struct inet_request_sock *ireq = inet_rsk(req); - struct ip_options *opt = inet_rsk(req)->opt; - struct flowi fl = { .oif = sk->sk_bound_dev_if, - .nl_u = { .ip4_u = - { .daddr = ((opt && opt->srr) ? 
- opt->faddr : - ireq->rmt_addr), - .saddr = ireq->loc_addr, - .tos = RT_CONN_FLAGS(sk) } }, - .proto = sk->sk_protocol, - .uli_u = { .ports = - { .sport = inet_sk(sk)->sport, - .dport = ireq->rmt_port } } }; - - if (ip_route_output_flow(&rt, &fl, sk, 0)) { - IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES); - return NULL; - } - if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) { - ip_rt_put(rt); - IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES); - return NULL; - } - return &rt->u.dst; -} - /* * Send a SYN-ACK after having received an ACK. * This still operates on a request_sock only, not on a big @@ -1222,7 +1014,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) if (want_cookie) { reqsk_free(req); } else { - tcp_v4_synq_add(sk, req); + inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); } return 0; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 6f0a7e30cea..f458eacb5ef 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1493,7 +1493,8 @@ void tcp_xmit_retransmit_queue(struct sock *sk) if (skb == skb_peek(&sk->sk_write_queue)) inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - inet_csk(sk)->icsk_rto); + inet_csk(sk)->icsk_rto, + TCP_RTO_MAX); } packet_cnt -= tcp_skb_pcount(skb); @@ -1546,7 +1547,9 @@ void tcp_xmit_retransmit_queue(struct sock *sk) break; if (skb == skb_peek(&sk->sk_write_queue)) - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + inet_csk(sk)->icsk_rto, + TCP_RTO_MAX); NET_INC_STATS_BH(LINUX_MIB_TCPFORWARDRETRANS); } @@ -1826,7 +1829,8 @@ int tcp_connect(struct sock *sk) TCP_INC_STATS(TCP_MIB_ACTIVEOPENS); /* Timer for repeating the SYN until an answer. */ - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + inet_csk(sk)->icsk_rto, TCP_RTO_MAX); return 0; } @@ -1901,7 +1905,8 @@ void tcp_send_ack(struct sock *sk) if (buff == NULL) { inet_csk_schedule_ack(sk); inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN; - inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, TCP_DELACK_MAX); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, + TCP_DELACK_MAX, TCP_RTO_MAX); return; } @@ -2033,7 +2038,8 @@ void tcp_send_probe0(struct sock *sk) icsk->icsk_backoff++; tp->probes_out++; inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, - min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX)); + min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX), + TCP_RTO_MAX); } else { /* If packet was not sent due to local congestion, * do not backoff and do not remember probes_out. @@ -2045,7 +2051,8 @@ void tcp_send_probe0(struct sock *sk) tp->probes_out=1; inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, min(icsk->icsk_rto << icsk->icsk_backoff, - TCP_RESOURCE_PROBE_INTERVAL)); + TCP_RESOURCE_PROBE_INTERVAL), + TCP_RTO_MAX); } } diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 0b71380ee42..c03930c48f4 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -36,55 +36,14 @@ static void tcp_write_timer(unsigned long); static void tcp_delack_timer(unsigned long); static void tcp_keepalive_timer (unsigned long data); -#ifdef INET_CSK_DEBUG -const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n"; -EXPORT_SYMBOL(inet_csk_timer_bug_msg); -#endif - -/* - * Using different timers for retransmit, delayed acks and probes - * We may wish use just one timer maintaining a list of expire jiffies - * to optimize. 
- */ -void inet_csk_init_xmit_timers(struct sock *sk, - void (*retransmit_handler)(unsigned long), - void (*delack_handler)(unsigned long), - void (*keepalive_handler)(unsigned long)) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - - init_timer(&icsk->icsk_retransmit_timer); - init_timer(&icsk->icsk_delack_timer); - init_timer(&sk->sk_timer); - - icsk->icsk_retransmit_timer.function = retransmit_handler; - icsk->icsk_delack_timer.function = delack_handler; - sk->sk_timer.function = keepalive_handler; - - icsk->icsk_retransmit_timer.data = - icsk->icsk_delack_timer.data = - sk->sk_timer.data = (unsigned long)sk; - - icsk->icsk_pending = icsk->icsk_ack.pending = 0; -} - -void inet_csk_clear_xmit_timers(struct sock *sk) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - - icsk->icsk_pending = icsk->icsk_ack.pending = icsk->icsk_ack.blocked = 0; - - sk_stop_timer(sk, &icsk->icsk_retransmit_timer); - sk_stop_timer(sk, &icsk->icsk_delack_timer); - sk_stop_timer(sk, &sk->sk_timer); -} - void tcp_init_xmit_timers(struct sock *sk) { inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer, &tcp_keepalive_timer); } +EXPORT_SYMBOL(tcp_init_xmit_timers); + static void tcp_write_err(struct sock *sk) { sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT; @@ -392,7 +351,8 @@ static void tcp_retransmit_timer(struct sock *sk) if (!icsk->icsk_retransmits) icsk->icsk_retransmits = 1; inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - min(icsk->icsk_rto, TCP_RESOURCE_PROBE_INTERVAL)); + min(icsk->icsk_rto, TCP_RESOURCE_PROBE_INTERVAL), + TCP_RTO_MAX); goto out; } @@ -416,7 +376,7 @@ static void tcp_retransmit_timer(struct sock *sk) out_reset_timer: icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX); if (icsk->icsk_retransmits > sysctl_tcp_retries1) __sk_dst_reset(sk); @@ -553,16 +513,6 @@ static void tcp_synack_timer(struct sock *sk) inet_csk_reset_keepalive_timer(sk, TCP_SYNQ_INTERVAL); } -void inet_csk_delete_keepalive_timer(struct sock *sk) -{ - sk_stop_timer(sk, &sk->sk_timer); -} - -void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long len) -{ - sk_reset_timer(sk, &sk->sk_timer, jiffies + len); -} - void tcp_set_keepalive(struct sock *sk, int val) { if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) @@ -653,8 +603,3 @@ out: bh_unlock_sock(sk); sock_put(sk); } - -EXPORT_SYMBOL(inet_csk_clear_xmit_timers); -EXPORT_SYMBOL(inet_csk_delete_keepalive_timer); -EXPORT_SYMBOL(tcp_init_xmit_timers); -EXPORT_SYMBOL(inet_csk_reset_keepalive_timer); -- cgit v1.2.3 From 9f1d2604c71498579609b1532fedc5a89276bb00 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 9 Aug 2005 20:11:24 -0700 Subject: [ICSK]: Introduce inet_csk_clone Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. 
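Miller
---

An illustration of the new helper (editor's addition, not part of the
commit): inet_csk_clone() carries the protocol-independent half of
tcp_create_openreq_child(), so that other connection-oriented protocols
can share it; DCCP is the consumer this series is building towards.
Below is a minimal sketch of how such a protocol's openreq-child path
might sit on top of it. The name dccp_create_openreq_child() is
hypothetical, and only inet_csk_clone() and the icsk fields come from
this patch:

	struct sock *dccp_create_openreq_child(struct sock *sk,
					       const struct request_sock *req,
					       const struct sk_buff *skb)
	{
		/*
		 * The child returned by inet_csk_clone() is already in
		 * TCP_SYN_RECV state, has icsk_bind_hash cleared, its
		 * dport copied from the request and its accept_queue
		 * memset() to trap illegal accesses.
		 */
		struct sock *newsk = inet_csk_clone(sk, req, GFP_ATOMIC);

		if (newsk != NULL) {
			struct inet_connection_sock *newicsk = inet_csk(newsk);

			/* Only protocol-private state is left to set up,
			 * e.g. the protocol's initial retransmit timeout.
			 */
			newicsk->icsk_rto = TCP_TIMEOUT_INIT;
		}
		return newsk;
	}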
 net/ipv4/inet_connection_sock.c | 25 +++++++++++++++++++++++++
 net/ipv4/tcp_minisocks.c | 18 +++---------------
 2 files changed, 28 insertions(+), 15 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 2712400a8bb..136ada050b6 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -399,3 +399,28 @@ void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
 }

 EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add);
+
+struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req,
+			    const unsigned int __nocast priority)
+{
+	struct sock *newsk = sk_clone(sk, priority);
+
+	if (newsk != NULL) {
+		struct inet_connection_sock *newicsk = inet_csk(newsk);
+
+		newsk->sk_state = TCP_SYN_RECV;
+		newicsk->icsk_bind_hash = NULL;
+
+		inet_sk(newsk)->dport = inet_rsk(req)->rmt_port;
+		newsk->sk_write_space = sk_stream_write_space;
+
+		newicsk->icsk_retransmits = 0;
+		newicsk->icsk_backoff = 0;
+
+		/* Deinitialize accept_queue to trap illegal accesses. */
+		memset(&newicsk->icsk_accept_queue, 0, sizeof(newicsk->icsk_accept_queue));
+	}
+	return newsk;
+}
+
+EXPORT_SYMBOL_GPL(inet_csk_clone);
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 56823704eb7..4cfbe1d1c92 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -600,22 +600,14 @@ out:
  */
 struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct sk_buff *skb)
 {
-	struct sock *newsk = sk_clone(sk, GFP_ATOMIC);
+	struct sock *newsk = inet_csk_clone(sk, req, GFP_ATOMIC);

 	if (newsk != NULL) {
-		struct inet_request_sock *ireq = inet_rsk(req);
+		const struct inet_request_sock *ireq = inet_rsk(req);
 		struct tcp_request_sock *treq = tcp_rsk(req);
-		struct inet_sock *newinet = inet_sk(newsk);
-		struct inet_connection_sock *newicsk = inet_csk(newsk);
+		struct inet_connection_sock *newicsk = inet_csk(sk);
 		struct tcp_sock *newtp;

-		newsk->sk_state = TCP_SYN_RECV;
-		newicsk->icsk_bind_hash = NULL;
-
-		/* Clone the TCP header template */
-		newinet->dport = ireq->rmt_port;
-		newsk->sk_write_space = sk_stream_write_space;
-
 		/* Now setup tcp_sock */
 		newtp = tcp_sk(newsk);
 		newtp->pred_flags = 0;
@@ -626,8 +618,6 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,

 		tcp_init_wl(newtp, treq->snt_isn, treq->rcv_isn);

-		newicsk->icsk_retransmits = 0;
-		newicsk->icsk_backoff = 0;
 		newtp->srtt = 0;
 		newtp->mdev = TCP_TIMEOUT_INIT;
 		newicsk->icsk_rto = TCP_TIMEOUT_INIT;
@@ -668,8 +658,6 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
 		newtp->probes_out = 0;
 		newtp->rx_opt.num_sacks = 0;
 		newtp->urg_data = 0;
-		/* Deinitialize accept_queue to trap illegal accesses. */
-		memset(&newicsk->icsk_accept_queue, 0, sizeof(newicsk->icsk_accept_queue));

 		if (sock_flag(newsk, SOCK_KEEPOPEN))
 			inet_csk_reset_keepalive_timer(newsk,

-- cgit v1.2.3


From 0a5578cf8e5e045aaa68643c17ce885426697c6b Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo
Date: Tue, 9 Aug 2005 20:11:41 -0700
Subject: [ICSK]: Generalise tcp_listen_{start,stop}

This also moved inet_iif from tcp to inet_hashtables.h, as it is needed
by the inet_lookup callers; perhaps this needs a bit of polishing, but
for now it seems fine.

Signed-off-by: Arnaldo Carvalho de Melo
Signed-off-by: David S.
Miller --- net/ipv4/af_inet.c | 2 +- net/ipv4/tcp.c | 38 +++++++++++++++++++++----------------- net/ipv4/tcp_ipv4.c | 6 +----- 3 files changed, 23 insertions(+), 23 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 7137e6420d6..f691058cf59 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -202,7 +202,7 @@ int inet_listen(struct socket *sock, int backlog) * we can only allow the backlog to be adjusted. */ if (old_state != TCP_LISTEN) { - err = tcp_listen_start(sk); + err = inet_csk_listen_start(sk, TCP_SYNQ_HSIZE); if (err) goto out; } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 581016a6a93..a1f812159ce 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -273,6 +273,8 @@ DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics); atomic_t tcp_orphan_count = ATOMIC_INIT(0); +EXPORT_SYMBOL_GPL(tcp_orphan_count); + int sysctl_tcp_mem[3]; int sysctl_tcp_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 }; int sysctl_tcp_rmem[3] = { 4 * 1024, 87380, 87380 * 2 }; @@ -454,12 +456,11 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) return put_user(answ, (int __user *)arg); } - -int tcp_listen_start(struct sock *sk) +int inet_csk_listen_start(struct sock *sk, const int nr_table_entries) { struct inet_sock *inet = inet_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); - int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, TCP_SYNQ_HSIZE); + int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries); if (rc != 0) return rc; @@ -488,12 +489,13 @@ int tcp_listen_start(struct sock *sk) return -EADDRINUSE; } +EXPORT_SYMBOL_GPL(inet_csk_listen_start); + /* * This routine closes sockets which have been at least partially * opened, but not yet accepted. */ - -static void tcp_listen_stop (struct sock *sk) +static void inet_csk_listen_stop(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct request_sock *acc_req; @@ -524,13 +526,13 @@ static void tcp_listen_stop (struct sock *sk) BUG_TRAP(!sock_owned_by_user(child)); sock_hold(child); - tcp_disconnect(child, O_NONBLOCK); + sk->sk_prot->disconnect(child, O_NONBLOCK); sock_orphan(child); - atomic_inc(&tcp_orphan_count); + atomic_inc(sk->sk_prot->orphan_count); - tcp_destroy_sock(child); + inet_csk_destroy_sock(child); bh_unlock_sock(child); local_bh_enable(); @@ -542,6 +544,8 @@ static void tcp_listen_stop (struct sock *sk) BUG_TRAP(!sk->sk_ack_backlog); } +EXPORT_SYMBOL_GPL(inet_csk_listen_stop); + static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb) { TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; @@ -1561,7 +1565,7 @@ void tcp_shutdown(struct sock *sk, int how) * can assume the socket waitqueue is inactive and nobody will * try to jump onto it. */ -void tcp_destroy_sock(struct sock *sk) +void inet_csk_destroy_sock(struct sock *sk) { BUG_TRAP(sk->sk_state == TCP_CLOSE); BUG_TRAP(sock_flag(sk, SOCK_DEAD)); @@ -1580,7 +1584,7 @@ void tcp_destroy_sock(struct sock *sk) sk_refcnt_debug_release(sk); - atomic_dec(&tcp_orphan_count); + atomic_dec(sk->sk_prot->orphan_count); sock_put(sk); } @@ -1596,7 +1600,7 @@ void tcp_close(struct sock *sk, long timeout) tcp_set_state(sk, TCP_CLOSE); /* Special case. 
*/ - tcp_listen_stop(sk); + inet_csk_listen_stop(sk); goto adjudge_to_death; } @@ -1704,7 +1708,7 @@ adjudge_to_death: if (tmo > TCP_TIMEWAIT_LEN) { inet_csk_reset_keepalive_timer(sk, tcp_fin_time(sk)); } else { - atomic_inc(&tcp_orphan_count); + atomic_inc(sk->sk_prot->orphan_count); tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); goto out; } @@ -1712,7 +1716,7 @@ adjudge_to_death: } if (sk->sk_state != TCP_CLOSE) { sk_stream_mem_reclaim(sk); - if (atomic_read(&tcp_orphan_count) > sysctl_tcp_max_orphans || + if (atomic_read(sk->sk_prot->orphan_count) > sysctl_tcp_max_orphans || (sk->sk_wmem_queued > SOCK_MIN_SNDBUF && atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) { if (net_ratelimit()) @@ -1723,10 +1727,10 @@ adjudge_to_death: NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY); } } - atomic_inc(&tcp_orphan_count); + atomic_inc(sk->sk_prot->orphan_count); if (sk->sk_state == TCP_CLOSE) - tcp_destroy_sock(sk); + inet_csk_destroy_sock(sk); /* Otherwise, socket is reprieved until protocol close. */ out: @@ -1757,7 +1761,7 @@ int tcp_disconnect(struct sock *sk, int flags) /* ABORT function of RFC793 */ if (old_state == TCP_LISTEN) { - tcp_listen_stop(sk); + inet_csk_listen_stop(sk); } else if (tcp_need_reset(old_state) || (tp->snd_nxt != tp->write_seq && (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) { @@ -2253,7 +2257,7 @@ void __init tcp_init(void) } EXPORT_SYMBOL(tcp_close); -EXPORT_SYMBOL(tcp_destroy_sock); +EXPORT_SYMBOL(inet_csk_destroy_sock); EXPORT_SYMBOL(tcp_disconnect); EXPORT_SYMBOL(tcp_getsockopt); EXPORT_SYMBOL(tcp_ioctl); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 2f605b9e6b6..b966102b9f3 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -431,11 +431,6 @@ failure: return err; } -static inline int inet_iif(const struct sk_buff *skb) -{ - return ((struct rtable *)skb->dst)->rt_iif; -} - /* * This routine does path mtu discovery as defined in RFC1191. */ @@ -1993,6 +1988,7 @@ struct proto tcp_prot = { .get_port = tcp_v4_get_port, .enter_memory_pressure = tcp_enter_memory_pressure, .sockets_allocated = &tcp_sockets_allocated, + .orphan_count = &tcp_orphan_count, .memory_allocated = &tcp_memory_allocated, .memory_pressure = &tcp_memory_pressure, .sysctl_mem = sysctl_tcp_mem, -- cgit v1.2.3 From 295f7324ff8d9ea58b4d3ec93b1aaa1d80e048a9 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 9 Aug 2005 20:11:56 -0700 Subject: [ICSK]: Introduce reqsk_queue_prune from code in tcp_synack_timer With this we're very close to getting all of the current TCP refactorings in my dccp-2.6 tree merged, next changeset will export some functions needed by the current DCCP code and then dccp-2.6.git will be born! Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. 
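Miller
---

One detail that is easy to miss in the diff below (editor's addition,
not part of the commit): TCP_DEFER_ACCEPT now lives in the
request_sock_queue as rskq_defer_accept, and it is still stored as a
retransmit count, not in seconds. A self-contained sketch of the
translation that setsockopt() performs, assuming TCP_TIMEOUT_INIT / HZ
== 3 as in this tree; defer_accept_retrans() is an illustrative name,
not a kernel function:

	/*
	 * Smallest retransmit count whose exponential SYN-ACK backoff
	 * (3s, 6s, 12s, ...) covers the requested number of seconds.
	 */
	static int defer_accept_retrans(int seconds)
	{
		const int timeout = 3;	/* TCP_TIMEOUT_INIT / HZ */
		int retrans = 0;

		if (seconds <= 0)
			return 0;
		while (retrans < 32 && seconds > (timeout << retrans))
			retrans++;
		return retrans + 1;
	}

With these numbers, 1..3s maps to 1, 4..6s to 2 and 7..12s to 3;
getsockopt() maps the stored count back with (timeout << (retrans - 1)),
i.e. 1 becomes 3s and 2 becomes 6s.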
 net/ipv4/af_inet.c | 1 +
 net/ipv4/tcp.c | 21 +++++++++++----------
 net/ipv4/tcp_input.c | 11 ++++++++---
 net/ipv4/tcp_minisocks.c | 10 ++++++----
 net/ipv4/tcp_timer.c | 46 ++++++++++++++++++++++++++----------------
 5 files changed, 54 insertions(+), 35 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index f691058cf59..52f5ecc58c4 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -99,6 +99,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index a1f812159ce..a4e9eec4489 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -495,7 +495,7 @@ EXPORT_SYMBOL_GPL(inet_csk_listen_start);
  * This routine closes sockets which have been at least partially
  * opened, but not yet accepted.
  */
-static void inet_csk_listen_stop(struct sock *sk)
+void inet_csk_listen_stop(struct sock *sk)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct request_sock *acc_req;
@@ -1947,15 +1947,15 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
 		break;

 	case TCP_DEFER_ACCEPT:
-		tp->defer_accept = 0;
+		icsk->icsk_accept_queue.rskq_defer_accept = 0;
 		if (val > 0) {
 			/* Translate value in seconds to number of
 			 * retransmits */
-			while (tp->defer_accept < 32 &&
+			while (icsk->icsk_accept_queue.rskq_defer_accept < 32 &&
 			       val > ((TCP_TIMEOUT_INIT / HZ) <<
-				      tp->defer_accept))
-				tp->defer_accept++;
-			tp->defer_accept++;
+				      icsk->icsk_accept_queue.rskq_defer_accept))
+				icsk->icsk_accept_queue.rskq_defer_accept++;
+			icsk->icsk_accept_queue.rskq_defer_accept++;
 		}
 		break;

@@ -2058,6 +2058,7 @@ EXPORT_SYMBOL_GPL(tcp_get_info);
 int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
 		   int __user *optlen)
 {
+	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
 	int val, len;

@@ -2095,7 +2096,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
 		val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
 		break;
 	case TCP_SYNCNT:
-		val = inet_csk(sk)->icsk_syn_retries ? : sysctl_tcp_syn_retries;
+		val = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
 		break;
 	case TCP_LINGER2:
 		val = tp->linger2;
@@ -2103,8 +2104,8 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
 		val = (val ? : sysctl_tcp_fin_timeout) / HZ;
 		break;
 	case TCP_DEFER_ACCEPT:
-		val = !tp->defer_accept ? 0 : ((TCP_TIMEOUT_INIT / HZ) <<
-					       (tp->defer_accept - 1));
+		val = !icsk->icsk_accept_queue.rskq_defer_accept ?
0 : + ((TCP_TIMEOUT_INIT / HZ) << (icsk->icsk_accept_queue.rskq_defer_accept - 1)); break; case TCP_WINDOW_CLAMP: val = tp->window_clamp; @@ -2125,7 +2126,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, return 0; } case TCP_QUICKACK: - val = !inet_csk(sk)->icsk_ack.pingpong; + val = !icsk->icsk_ack.pingpong; break; case TCP_CONGESTION: diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index b35badf53aa..71d456148de 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3831,6 +3831,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, tcp_parse_options(skb, &tp->rx_opt, 0); if (th->ack) { + struct inet_connection_sock *icsk; /* rfc793: * "If the state is SYN-SENT then * first check the ACK bit @@ -3956,7 +3957,11 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, sk_wake_async(sk, 0, POLL_OUT); } - if (sk->sk_write_pending || tp->defer_accept || inet_csk(sk)->icsk_ack.pingpong) { + icsk = inet_csk(sk); + + if (sk->sk_write_pending || + icsk->icsk_accept_queue.rskq_defer_accept || + icsk->icsk_ack.pingpong) { /* Save one ACK. Data will be ready after * several ticks, if write_pending is set. * @@ -3965,8 +3970,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, * to stand against the temptation 8) --ANK */ inet_csk_schedule_ack(sk); - inet_csk(sk)->icsk_ack.lrcvtime = tcp_time_stamp; - inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN; + icsk->icsk_ack.lrcvtime = tcp_time_stamp; + icsk->icsk_ack.ato = TCP_ATO_MIN; tcp_incr_quickack(sk); tcp_enter_quickack_mode(sk); inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 4cfbe1d1c92..2d95afe5b39 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -787,9 +787,10 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, does sequence test, SYN is truncated, and thus we consider it a bare ACK. - If tp->defer_accept, we silently drop this bare ACK. Otherwise, - we create an established connection. Both ends (listening sockets) - accept the new incoming connection and try to talk to each other. 8-) + If icsk->icsk_accept_queue.rskq_defer_accept, we silently drop this + bare ACK. Otherwise, we create an established connection. Both + ends (listening sockets) accept the new incoming connection and try + to talk to each other. 8-) Note: This case is both harmless, and rare. Possibility is about the same as us discovering intelligent life on another plant tomorrow. @@ -856,7 +857,8 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, return NULL; /* If TCP_DEFER_ACCEPT is set, drop bare ACK. 
*/ - if (tp->defer_accept && TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) { + if (inet_csk(sk)->icsk_accept_queue.rskq_defer_accept && + TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) { inet_rsk(req)->acked = 1; return NULL; } diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index c03930c48f4..b614ad4d30c 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -424,16 +424,12 @@ out_unlock: sock_put(sk); } -/* - * Timer for listening sockets - */ - -static void tcp_synack_timer(struct sock *sk) +void reqsk_queue_prune(struct request_sock_queue *queue, struct sock *parent, + const unsigned long interval, const unsigned long timeout, + const unsigned long max_rto, int max_retries) { - struct tcp_sock *tp = tcp_sk(sk); - struct inet_connection_sock *icsk = inet_csk(sk); - struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; - int max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries; + struct inet_connection_sock *icsk = inet_csk(parent); + struct listen_sock *lopt = queue->listen_opt; int thresh = max_retries; unsigned long now = jiffies; struct request_sock **reqp, *req; @@ -470,10 +466,10 @@ static void tcp_synack_timer(struct sock *sk) } } - if (tp->defer_accept) - max_retries = tp->defer_accept; + if (queue->rskq_defer_accept) + max_retries = queue->rskq_defer_accept; - budget = 2*(TCP_SYNQ_HSIZE/(TCP_TIMEOUT_INIT/TCP_SYNQ_INTERVAL)); + budget = 2 * (lopt->nr_table_entries / (timeout / interval)); i = lopt->clock_hand; do { @@ -482,20 +478,19 @@ static void tcp_synack_timer(struct sock *sk) if (time_after_eq(now, req->expires)) { if ((req->retrans < thresh || (inet_rsk(req)->acked && req->retrans < max_retries)) - && !req->rsk_ops->rtx_syn_ack(sk, req, NULL)) { + && !req->rsk_ops->rtx_syn_ack(parent, req, NULL)) { unsigned long timeo; if (req->retrans++ == 0) lopt->qlen_young--; - timeo = min((TCP_TIMEOUT_INIT << req->retrans), - TCP_RTO_MAX); + timeo = min((timeout << req->retrans), max_rto); req->expires = now + timeo; reqp = &req->dl_next; continue; } /* Drop this request */ - inet_csk_reqsk_queue_unlink(sk, req, reqp); + inet_csk_reqsk_queue_unlink(parent, req, reqp); reqsk_queue_removed(&icsk->icsk_accept_queue, req); reqsk_free(req); continue; @@ -503,14 +498,29 @@ static void tcp_synack_timer(struct sock *sk) reqp = &req->dl_next; } - i = (i+1)&(TCP_SYNQ_HSIZE-1); + i = (i + 1) & (lopt->nr_table_entries - 1); } while (--budget > 0); lopt->clock_hand = i; if (lopt->qlen) - inet_csk_reset_keepalive_timer(sk, TCP_SYNQ_INTERVAL); + inet_csk_reset_keepalive_timer(parent, interval); +} + +EXPORT_SYMBOL_GPL(reqsk_queue_prune); + +/* + * Timer for listening sockets + */ + +static void tcp_synack_timer(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + const int max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries; + + reqsk_queue_prune(&icsk->icsk_accept_queue, sk, TCP_SYNQ_INTERVAL, + TCP_TIMEOUT_INIT, TCP_RTO_MAX, max_retries); } void tcp_set_keepalive(struct sock *sk, int val) -- cgit v1.2.3 From d8c97a9451068dd9f7b838a240bb6db894133a5e Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 9 Aug 2005 20:12:12 -0700 Subject: [NET]: Export symbols needed by the current DCCP code Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. 
Miller --- net/ipv4/inet_timewait_sock.c | 2 ++ net/ipv4/ip_output.c | 2 ++ net/ipv4/ip_sockglue.c | 2 -- net/ipv4/route.c | 4 ++++ 4 files changed, 8 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 5cba59b869f..22882d95f64 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -47,6 +47,8 @@ void __inet_twsk_kill(struct inet_timewait_sock *tw, struct inet_hashinfo *hashi inet_twsk_put(tw); } +EXPORT_SYMBOL_GPL(__inet_twsk_kill); + /* * Enter the time wait state. This is called with locally disabled BH. * Essentially we whip up a timewait bucket, copy the relevant info into it diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index dd568b0b706..633945d27ac 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -158,6 +158,8 @@ int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, dst_output); } +EXPORT_SYMBOL_GPL(ip_build_and_send_pkt); + static inline int ip_finish_output2(struct sk_buff *skb) { struct dst_entry *dst = skb->dst; diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index ff4bd067b39..ddb1aedbdc6 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -1090,7 +1090,5 @@ int ip_getsockopt(struct sock *sk, int level, int optname, char __user *optval, EXPORT_SYMBOL(ip_cmsg_recv); -#ifdef CONFIG_IP_SCTP_MODULE EXPORT_SYMBOL(ip_getsockopt); EXPORT_SYMBOL(ip_setsockopt); -#endif diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 3aef0e15460..8c0b14e3bee 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2602,6 +2602,8 @@ int __ip_route_output_key(struct rtable **rp, const struct flowi *flp) return ip_route_output_slow(rp, flp); } +EXPORT_SYMBOL_GPL(__ip_route_output_key); + int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags) { int err; @@ -2620,6 +2622,8 @@ int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, return 0; } +EXPORT_SYMBOL_GPL(ip_route_output_flow); + int ip_route_output_key(struct rtable **rp, struct flowi *flp) { return ip_route_output_flow(rp, flp, NULL, 0); -- cgit v1.2.3 From a019d6fe2b9da68ea4ba6cf3c4e86fc1dbf554c3 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 9 Aug 2005 20:15:09 -0700 Subject: [ICSK]: Move generalised functions from tcp to inet_connection_sock This also improves reqsk_queue_prune and renames it to inet_csk_reqsk_queue_prune, as it deals with both inet_connection_sock and inet_request_sock objects, not just with request_sock ones thus belonging to inet_request_sock. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. 
Miller --- net/ipv4/inet_connection_sock.c | 214 ++++++++++++++++++++++++++++++++++++++++ net/ipv4/tcp.c | 120 ---------------------- net/ipv4/tcp_timer.c | 93 +---------------- 3 files changed, 216 insertions(+), 211 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 136ada050b6..026630a15ea 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -23,6 +23,7 @@ #include #include #include +#include #ifdef INET_CSK_DEBUG const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n"; @@ -398,8 +399,100 @@ void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, inet_csk_reqsk_queue_added(sk, timeout); } +/* Only thing we need from tcp.h */ +extern int sysctl_tcp_synack_retries; + EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add); +void inet_csk_reqsk_queue_prune(struct sock *parent, + const unsigned long interval, + const unsigned long timeout, + const unsigned long max_rto) +{ + struct inet_connection_sock *icsk = inet_csk(parent); + struct request_sock_queue *queue = &icsk->icsk_accept_queue; + struct listen_sock *lopt = queue->listen_opt; + int max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries; + int thresh = max_retries; + unsigned long now = jiffies; + struct request_sock **reqp, *req; + int i, budget; + + if (lopt == NULL || lopt->qlen == 0) + return; + + /* Normally all the openreqs are young and become mature + * (i.e. converted to established socket) for first timeout. + * If synack was not acknowledged for 3 seconds, it means + * one of the following things: synack was lost, ack was lost, + * rtt is high or nobody planned to ack (i.e. synflood). + * When server is a bit loaded, queue is populated with old + * open requests, reducing effective size of queue. + * When server is well loaded, queue size reduces to zero + * after several minutes of work. It is not synflood, + * it is normal operation. The solution is pruning + * too old entries overriding normal timeout, when + * situation becomes dangerous. + * + * Essentially, we reserve half of room for young + * embrions; and abort old ones without pity, if old + * ones are about to clog our table. 
+ */ + if (lopt->qlen>>(lopt->max_qlen_log-1)) { + int young = (lopt->qlen_young<<1); + + while (thresh > 2) { + if (lopt->qlen < young) + break; + thresh--; + young <<= 1; + } + } + + if (queue->rskq_defer_accept) + max_retries = queue->rskq_defer_accept; + + budget = 2 * (lopt->nr_table_entries / (timeout / interval)); + i = lopt->clock_hand; + + do { + reqp=&lopt->syn_table[i]; + while ((req = *reqp) != NULL) { + if (time_after_eq(now, req->expires)) { + if ((req->retrans < thresh || + (inet_rsk(req)->acked && req->retrans < max_retries)) + && !req->rsk_ops->rtx_syn_ack(parent, req, NULL)) { + unsigned long timeo; + + if (req->retrans++ == 0) + lopt->qlen_young--; + timeo = min((timeout << req->retrans), max_rto); + req->expires = now + timeo; + reqp = &req->dl_next; + continue; + } + + /* Drop this request */ + inet_csk_reqsk_queue_unlink(parent, req, reqp); + reqsk_queue_removed(queue, req); + reqsk_free(req); + continue; + } + reqp = &req->dl_next; + } + + i = (i + 1) & (lopt->nr_table_entries - 1); + + } while (--budget > 0); + + lopt->clock_hand = i; + + if (lopt->qlen) + inet_csk_reset_keepalive_timer(parent, interval); +} + +EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_prune); + struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req, const unsigned int __nocast priority) { @@ -424,3 +517,124 @@ struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req, } EXPORT_SYMBOL_GPL(inet_csk_clone); + +/* + * At this point, there should be no process reference to this + * socket, and thus no user references at all. Therefore we + * can assume the socket waitqueue is inactive and nobody will + * try to jump onto it. + */ +void inet_csk_destroy_sock(struct sock *sk) +{ + BUG_TRAP(sk->sk_state == TCP_CLOSE); + BUG_TRAP(sock_flag(sk, SOCK_DEAD)); + + /* It cannot be in hash table! */ + BUG_TRAP(sk_unhashed(sk)); + + /* If it has not 0 inet_sk(sk)->num, it must be bound */ + BUG_TRAP(!inet_sk(sk)->num || inet_csk(sk)->icsk_bind_hash); + + sk->sk_prot->destroy(sk); + + sk_stream_kill_queues(sk); + + xfrm_sk_free_policy(sk); + + sk_refcnt_debug_release(sk); + + atomic_dec(sk->sk_prot->orphan_count); + sock_put(sk); +} + +EXPORT_SYMBOL(inet_csk_destroy_sock); + +int inet_csk_listen_start(struct sock *sk, const int nr_table_entries) +{ + struct inet_sock *inet = inet_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); + int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries); + + if (rc != 0) + return rc; + + sk->sk_max_ack_backlog = 0; + sk->sk_ack_backlog = 0; + inet_csk_delack_init(sk); + + /* There is race window here: we announce ourselves listening, + * but this transition is still not validated by get_port(). + * It is OK, because this socket enters to hash table only + * after validation is complete. + */ + sk->sk_state = TCP_LISTEN; + if (!sk->sk_prot->get_port(sk, inet->num)) { + inet->sport = htons(inet->num); + + sk_dst_reset(sk); + sk->sk_prot->hash(sk); + + return 0; + } + + sk->sk_state = TCP_CLOSE; + __reqsk_queue_destroy(&icsk->icsk_accept_queue); + return -EADDRINUSE; +} + +EXPORT_SYMBOL_GPL(inet_csk_listen_start); + +/* + * This routine closes sockets which have been at least partially + * opened, but not yet accepted. 
+ */ +void inet_csk_listen_stop(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct request_sock *acc_req; + struct request_sock *req; + + inet_csk_delete_keepalive_timer(sk); + + /* make all the listen_opt local to us */ + acc_req = reqsk_queue_yank_acceptq(&icsk->icsk_accept_queue); + + /* Following specs, it would be better either to send FIN + * (and enter FIN-WAIT-1, it is normal close) + * or to send active reset (abort). + * Certainly, it is pretty dangerous while synflood, but it is + * bad justification for our negligence 8) + * To be honest, we are not able to make either + * of the variants now. --ANK + */ + reqsk_queue_destroy(&icsk->icsk_accept_queue); + + while ((req = acc_req) != NULL) { + struct sock *child = req->sk; + + acc_req = req->dl_next; + + local_bh_disable(); + bh_lock_sock(child); + BUG_TRAP(!sock_owned_by_user(child)); + sock_hold(child); + + sk->sk_prot->disconnect(child, O_NONBLOCK); + + sock_orphan(child); + + atomic_inc(sk->sk_prot->orphan_count); + + inet_csk_destroy_sock(child); + + bh_unlock_sock(child); + local_bh_enable(); + sock_put(child); + + sk_acceptq_removed(sk); + __reqsk_free(req); + } + BUG_TRAP(!sk->sk_ack_backlog); +} + +EXPORT_SYMBOL_GPL(inet_csk_listen_stop); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index a4e9eec4489..4bda522d25c 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -456,96 +456,6 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) return put_user(answ, (int __user *)arg); } -int inet_csk_listen_start(struct sock *sk, const int nr_table_entries) -{ - struct inet_sock *inet = inet_sk(sk); - struct inet_connection_sock *icsk = inet_csk(sk); - int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries); - - if (rc != 0) - return rc; - - sk->sk_max_ack_backlog = 0; - sk->sk_ack_backlog = 0; - inet_csk_delack_init(sk); - - /* There is race window here: we announce ourselves listening, - * but this transition is still not validated by get_port(). - * It is OK, because this socket enters to hash table only - * after validation is complete. - */ - sk->sk_state = TCP_LISTEN; - if (!sk->sk_prot->get_port(sk, inet->num)) { - inet->sport = htons(inet->num); - - sk_dst_reset(sk); - sk->sk_prot->hash(sk); - - return 0; - } - - sk->sk_state = TCP_CLOSE; - __reqsk_queue_destroy(&icsk->icsk_accept_queue); - return -EADDRINUSE; -} - -EXPORT_SYMBOL_GPL(inet_csk_listen_start); - -/* - * This routine closes sockets which have been at least partially - * opened, but not yet accepted. - */ -void inet_csk_listen_stop(struct sock *sk) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - struct request_sock *acc_req; - struct request_sock *req; - - inet_csk_delete_keepalive_timer(sk); - - /* make all the listen_opt local to us */ - acc_req = reqsk_queue_yank_acceptq(&icsk->icsk_accept_queue); - - /* Following specs, it would be better either to send FIN - * (and enter FIN-WAIT-1, it is normal close) - * or to send active reset (abort). - * Certainly, it is pretty dangerous while synflood, but it is - * bad justification for our negligence 8) - * To be honest, we are not able to make either - * of the variants now. 
--ANK - */ - reqsk_queue_destroy(&icsk->icsk_accept_queue); - - while ((req = acc_req) != NULL) { - struct sock *child = req->sk; - - acc_req = req->dl_next; - - local_bh_disable(); - bh_lock_sock(child); - BUG_TRAP(!sock_owned_by_user(child)); - sock_hold(child); - - sk->sk_prot->disconnect(child, O_NONBLOCK); - - sock_orphan(child); - - atomic_inc(sk->sk_prot->orphan_count); - - inet_csk_destroy_sock(child); - - bh_unlock_sock(child); - local_bh_enable(); - sock_put(child); - - sk_acceptq_removed(sk); - __reqsk_free(req); - } - BUG_TRAP(!sk->sk_ack_backlog); -} - -EXPORT_SYMBOL_GPL(inet_csk_listen_stop); - static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb) { TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; @@ -1559,35 +1469,6 @@ void tcp_shutdown(struct sock *sk, int how) } } -/* - * At this point, there should be no process reference to this - * socket, and thus no user references at all. Therefore we - * can assume the socket waitqueue is inactive and nobody will - * try to jump onto it. - */ -void inet_csk_destroy_sock(struct sock *sk) -{ - BUG_TRAP(sk->sk_state == TCP_CLOSE); - BUG_TRAP(sock_flag(sk, SOCK_DEAD)); - - /* It cannot be in hash table! */ - BUG_TRAP(sk_unhashed(sk)); - - /* If it has not 0 inet_sk(sk)->num, it must be bound */ - BUG_TRAP(!inet_sk(sk)->num || inet_csk(sk)->icsk_bind_hash); - - sk->sk_prot->destroy(sk); - - sk_stream_kill_queues(sk); - - xfrm_sk_free_policy(sk); - - sk_refcnt_debug_release(sk); - - atomic_dec(sk->sk_prot->orphan_count); - sock_put(sk); -} - void tcp_close(struct sock *sk, long timeout) { struct sk_buff *skb; @@ -2258,7 +2139,6 @@ void __init tcp_init(void) } EXPORT_SYMBOL(tcp_close); -EXPORT_SYMBOL(inet_csk_destroy_sock); EXPORT_SYMBOL(tcp_disconnect); EXPORT_SYMBOL(tcp_getsockopt); EXPORT_SYMBOL(tcp_ioctl); diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index b614ad4d30c..72cec698183 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -424,103 +424,14 @@ out_unlock: sock_put(sk); } -void reqsk_queue_prune(struct request_sock_queue *queue, struct sock *parent, - const unsigned long interval, const unsigned long timeout, - const unsigned long max_rto, int max_retries) -{ - struct inet_connection_sock *icsk = inet_csk(parent); - struct listen_sock *lopt = queue->listen_opt; - int thresh = max_retries; - unsigned long now = jiffies; - struct request_sock **reqp, *req; - int i, budget; - - if (lopt == NULL || lopt->qlen == 0) - return; - - /* Normally all the openreqs are young and become mature - * (i.e. converted to established socket) for first timeout. - * If synack was not acknowledged for 3 seconds, it means - * one of the following things: synack was lost, ack was lost, - * rtt is high or nobody planned to ack (i.e. synflood). - * When server is a bit loaded, queue is populated with old - * open requests, reducing effective size of queue. - * When server is well loaded, queue size reduces to zero - * after several minutes of work. It is not synflood, - * it is normal operation. The solution is pruning - * too old entries overriding normal timeout, when - * situation becomes dangerous. - * - * Essentially, we reserve half of room for young - * embrions; and abort old ones without pity, if old - * ones are about to clog our table. 
- */
-	if (lopt->qlen>>(lopt->max_qlen_log-1)) {
-		int young = (lopt->qlen_young<<1);
-
-		while (thresh > 2) {
-			if (lopt->qlen < young)
-				break;
-			thresh--;
-			young <<= 1;
-		}
-	}
-
-	if (queue->rskq_defer_accept)
-		max_retries = queue->rskq_defer_accept;
-
-	budget = 2 * (lopt->nr_table_entries / (timeout / interval));
-	i = lopt->clock_hand;
-
-	do {
-		reqp=&lopt->syn_table[i];
-		while ((req = *reqp) != NULL) {
-			if (time_after_eq(now, req->expires)) {
-				if ((req->retrans < thresh ||
-				     (inet_rsk(req)->acked && req->retrans < max_retries))
-				    && !req->rsk_ops->rtx_syn_ack(parent, req, NULL)) {
-					unsigned long timeo;
-
-					if (req->retrans++ == 0)
-						lopt->qlen_young--;
-					timeo = min((timeout << req->retrans), max_rto);
-					req->expires = now + timeo;
-					reqp = &req->dl_next;
-					continue;
-				}
-
-				/* Drop this request */
-				inet_csk_reqsk_queue_unlink(parent, req, reqp);
-				reqsk_queue_removed(&icsk->icsk_accept_queue, req);
-				reqsk_free(req);
-				continue;
-			}
-			reqp = &req->dl_next;
-		}
-
-		i = (i + 1) & (lopt->nr_table_entries - 1);
-
-	} while (--budget > 0);
-
-	lopt->clock_hand = i;
-
-	if (lopt->qlen)
-		inet_csk_reset_keepalive_timer(parent, interval);
-}
-
-EXPORT_SYMBOL_GPL(reqsk_queue_prune);
-
 /*
  * Timer for listening sockets
  */
-
 static void tcp_synack_timer(struct sock *sk)
 {
-	struct inet_connection_sock *icsk = inet_csk(sk);
-	const int max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries;
-
-	reqsk_queue_prune(&icsk->icsk_accept_queue, sk, TCP_SYNQ_INTERVAL,
-			  TCP_TIMEOUT_INIT, TCP_RTO_MAX, max_retries);
+	inet_csk_reqsk_queue_prune(sk, TCP_SYNQ_INTERVAL,
+				   TCP_TIMEOUT_INIT, TCP_RTO_MAX);
 }

 void tcp_set_keepalive(struct sock *sk, int val)

-- cgit v1.2.3


From bb97d31f5130d677644d9931ef38613d1164ec94 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo
Date: Tue, 9 Aug 2005 20:19:14 -0700
Subject: [INET]: Make inet_create try to load protocol modules

The request_module() syntax is net-pf-PROTOCOL_FAMILY-PROTOCOL-SOCK_TYPE;
if that fails, net-pf-PROTOCOL_FAMILY-PROTOCOL is tried.

Signed-off-by: Arnaldo Carvalho de Melo
Signed-off-by: David S. Miller
---
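A note on the alias scheme (editor's addition, not part of the commit):
a protocol module makes itself loadable under these names either with
MODULE_ALIAS() in its source or with an alias line in /etc/modprobe.conf.
A sketch using SCTP, the example cited in the code comment below;
whether sctp.ko actually carried these aliases at the time is an
assumption:

	/* In the protocol module's source: */
	#include <linux/module.h>

	MODULE_ALIAS("net-pf-2-proto-132-type-1"); /* PF_INET, IPPROTO_SCTP, SOCK_STREAM */
	MODULE_ALIAS("net-pf-2-proto-132");        /* family/protocol fallback */

The equivalent /etc/modprobe.conf lines would be
'alias net-pf-2-proto-132-type-1 sctp' and 'alias net-pf-2-proto-132 sctp'.

 net/ipv4/af_inet.c | 29 +++++++++++++++++++++++++----
 1 file changed, 25 insertions(+), 4 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 52f5ecc58c4..20f52b5f5de 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -228,12 +228,14 @@ static int inet_create(struct socket *sock, int protocol)
 	struct proto *answer_prot;
 	unsigned char answer_flags;
 	char answer_no_check;
-	int err;
+	int try_loading_module = 0;
+	int err = -ESOCKTNOSUPPORT;

 	sock->state = SS_UNCONNECTED;

 	/* Look for the requested type/protocol pair. */
 	answer = NULL;
+lookup_protocol:
 	rcu_read_lock();
 	list_for_each_rcu(p, &inetsw[sock->type]) {
 		answer = list_entry(p, struct inet_protosw, list);
@@ -254,9 +256,28 @@ static int inet_create(struct socket *sock, int protocol)
 		answer = NULL;
 	}

-	err = -ESOCKTNOSUPPORT;
-	if (!answer)
-		goto out_rcu_unlock;
+	if (unlikely(answer == NULL)) {
+		if (try_loading_module < 2) {
+			rcu_read_unlock();
+			/*
+			 * Be more specific, e.g. net-pf-2-proto-132-type-1
+			 * (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM)
+			 */
+			if (++try_loading_module == 1)
+				request_module("net-pf-%d-proto-%d-type-%d",
+					       PF_INET, protocol, sock->type);
+			/*
+			 * Fall back to generic, e.g.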
net-pf-2-proto-132 + * (net-pf-PF_INET-proto-IPPROTO_SCTP) + */ + else + request_module("net-pf-%d-proto-%d", + PF_INET, protocol); + goto lookup_protocol; + } else + goto out_rcu_unlock; + } + err = -EPERM; if (answer->capability > 0 && !capable(answer->capability)) goto out_rcu_unlock; -- cgit v1.2.3 From 2669d63d20683828f673b606915957f3a070602d Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Tue, 9 Aug 2005 20:19:44 -0700 Subject: [NETFILTER]: move conntrack helper buffers from BSS to kmalloc()ed memory According to DaveM, it is preferrable to have large data structures be allocated dynamically from the module init() function rather than putting them as static global variables into BSS. This patch moves the conntrack helper packet buffers into dynamically allocated memory. Signed-off-by: Harald Welte Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_amanda.c | 18 ++++++++++++++++-- net/ipv4/netfilter/ip_conntrack_ftp.c | 9 +++++++-- net/ipv4/netfilter/ip_conntrack_irc.c | 7 ++++++- 3 files changed, 29 insertions(+), 5 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ip_conntrack_amanda.c b/net/ipv4/netfilter/ip_conntrack_amanda.c index 01e1b58322a..be4c9eb3243 100644 --- a/net/ipv4/netfilter/ip_conntrack_amanda.c +++ b/net/ipv4/netfilter/ip_conntrack_amanda.c @@ -40,7 +40,7 @@ MODULE_PARM_DESC(master_timeout, "timeout for the master connection"); static char *conns[] = { "DATA ", "MESG ", "INDEX " }; /* This is slow, but it's simple. --RR */ -static char amanda_buffer[65536]; +static char *amanda_buffer; static DEFINE_SPINLOCK(amanda_buffer_lock); unsigned int (*ip_nat_amanda_hook)(struct sk_buff **pskb, @@ -153,11 +153,25 @@ static struct ip_conntrack_helper amanda_helper = { static void __exit fini(void) { ip_conntrack_helper_unregister(&amanda_helper); + kfree(amanda_buffer); } static int __init init(void) { - return ip_conntrack_helper_register(&amanda_helper); + int ret; + + amanda_buffer = kmalloc(65536, GFP_KERNEL); + if (!amanda_buffer) + return -ENOMEM; + + ret = ip_conntrack_helper_register(&amanda_helper); + if (ret < 0) { + kfree(amanda_buffer); + return ret; + } + return 0; + + } module_init(init); diff --git a/net/ipv4/netfilter/ip_conntrack_ftp.c b/net/ipv4/netfilter/ip_conntrack_ftp.c index 9658896f899..3a2627db172 100644 --- a/net/ipv4/netfilter/ip_conntrack_ftp.c +++ b/net/ipv4/netfilter/ip_conntrack_ftp.c @@ -25,8 +25,7 @@ MODULE_AUTHOR("Rusty Russell "); MODULE_DESCRIPTION("ftp connection tracking helper"); /* This is slow, but it's simple. --RR */ -static char ftp_buffer[65536]; - +static char *ftp_buffer; static DEFINE_SPINLOCK(ip_ftp_lock); #define MAX_PORTS 8 @@ -461,6 +460,8 @@ static void fini(void) ports[i]); ip_conntrack_helper_unregister(&ftp[i]); } + + kfree(ftp_buffer); } static int __init init(void) @@ -468,6 +469,10 @@ static int __init init(void) int i, ret; char *tmpname; + ftp_buffer = kmalloc(65536, GFP_KERNEL); + if (!ftp_buffer) + return -ENOMEM; + if (ports_c == 0) ports[ports_c++] = FTP_PORT; diff --git a/net/ipv4/netfilter/ip_conntrack_irc.c b/net/ipv4/netfilter/ip_conntrack_irc.c index 4a28f297d50..25438eec21a 100644 --- a/net/ipv4/netfilter/ip_conntrack_irc.c +++ b/net/ipv4/netfilter/ip_conntrack_irc.c @@ -39,7 +39,7 @@ static int ports_c; static int max_dcc_channels = 8; static unsigned int dcc_timeout = 300; /* This is slow, but it's simple. 
--RR */ -static char irc_buffer[65536]; +static char *irc_buffer; static DEFINE_SPINLOCK(irc_buffer_lock); unsigned int (*ip_nat_irc_hook)(struct sk_buff **pskb, @@ -257,6 +257,10 @@ static int __init init(void) printk("ip_conntrack_irc: dcc_timeout must be a positive integer\n"); return -EBUSY; } + + irc_buffer = kmalloc(65536, GFP_KERNEL); + if (!irc_buffer) + return -ENOMEM; /* If no port given, default to standard irc port */ if (ports_c == 0) @@ -304,6 +308,7 @@ static void fini(void) ports[i]); ip_conntrack_helper_unregister(&irc_helpers[i]); } + kfree(irc_buffer); } module_init(init); -- cgit v1.2.3 From 210a9ebef2d1bd32d9e9d81c84d538e237769cdb Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Tue, 9 Aug 2005 20:20:54 -0700 Subject: [NETFILTER]: ip{6}_queue: prevent unregistration race with nfnetlink_queue Since nfnetlink_queue can override ip{6}_queue as queue handlers, we can no longer blindly unregister whoever is registered for PF_INET[6], but only unregister ourselves. Signed-off-by: Harald Welte Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_queue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c index cfc886f382a..629de649f13 100644 --- a/net/ipv4/netfilter/ip_queue.c +++ b/net/ipv4/netfilter/ip_queue.c @@ -692,7 +692,7 @@ init_or_cleanup(int init) return status; cleanup: - nf_unregister_queue_handler(PF_INET); + nf_unregister_queue_handlers(&ipq_enqueue_packet); synchronize_net(); ipq_flush(NF_DROP); -- cgit v1.2.3 From bbd86b9fc469b7e91dc7444e6abb8930811d79cb Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Tue, 9 Aug 2005 20:23:11 -0700 Subject: [NETFILTER]: add /proc/net/netfilter interface to nf_queue This patch adds a /proc/net/netfilter/nf_queue file, similar to the recently-added /proc/net/netfilter/nf_log. It indicates which queue handler is registered to which protocol family. This is useful since there are now multiple queue handlers in the treee (ip[6]_queue, nfnetlink_queue). Signed-off-by: Harald Welte Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_queue.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c index 629de649f13..1c49833e00a 100644 --- a/net/ipv4/netfilter/ip_queue.c +++ b/net/ipv4/netfilter/ip_queue.c @@ -656,6 +656,11 @@ ipq_get_info(char *buffer, char **start, off_t offset, int length) } #endif /* CONFIG_PROC_FS */ +static struct nf_queue_handler nfqh = { + .name = "ip_queue", + .outfn = &ipq_enqueue_packet, +}; + static int init_or_cleanup(int init) { @@ -684,7 +689,7 @@ init_or_cleanup(int init) register_netdevice_notifier(&ipq_dev_notifier); ipq_sysctl_header = register_sysctl_table(ipq_root_table, 0); - status = nf_register_queue_handler(PF_INET, ipq_enqueue_packet, NULL); + status = nf_register_queue_handler(PF_INET, &nfqh); if (status < 0) { printk(KERN_ERR "ip_queue: failed to register queue handler\n"); goto cleanup_sysctl; @@ -692,7 +697,7 @@ init_or_cleanup(int init) return status; cleanup: - nf_unregister_queue_handlers(&ipq_enqueue_packet); + nf_unregister_queue_handlers(&nfqh); synchronize_net(); ipq_flush(NF_DROP); -- cgit v1.2.3 From 7663f18807805f02608457af8e2f59eee5d910fd Mon Sep 17 00:00:00 2001 From: Yasuyuki Kozakai Date: Tue, 9 Aug 2005 20:24:15 -0700 Subject: [NETFILTER]: return ENOMEM when ip_conntrack_alloc() fails. 
This patch fixes a bug where ip_conntrack_alloc() did not return ERR_PTR(-ENOMEM) when it failed to allocate memory from the slab cache. The bug caused packets to erroneously escape being dropped under stress, and skewed the statistics counters ('invalid' was incremented instead of 'drop'). It was introduced during the ctnetlink merge in the net-2.6.14 tree, so no stable or mainline releases are affected. Signed-off-by: Yasuyuki Kozakai Signed-off-by: Harald Welte Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_core.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c index 9261388d5ac..285743bfbed 100644 --- a/net/ipv4/netfilter/ip_conntrack_core.c +++ b/net/ipv4/netfilter/ip_conntrack_core.c @@ -655,7 +655,7 @@ struct ip_conntrack *ip_conntrack_alloc(struct ip_conntrack_tuple *orig, conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC); if (!conntrack) { DEBUGP("Can't allocate conntrack.\n"); - return NULL; + return ERR_PTR(-ENOMEM); } memset(conntrack, 0, sizeof(*conntrack)); @@ -696,8 +696,9 @@ init_conntrack(struct ip_conntrack_tuple *tuple, return NULL; } - if (!(conntrack = ip_conntrack_alloc(tuple, &repl_tuple))) - return NULL; + conntrack = ip_conntrack_alloc(tuple, &repl_tuple); + if (conntrack == NULL || IS_ERR(conntrack)) + return (struct ip_conntrack_tuple_hash *)conntrack; if (!protocol->new(conntrack, skb)) { ip_conntrack_free(conntrack); -- cgit v1.2.3 From 91b9a277fc4d207249e459a455abf804ebb5499d Mon Sep 17 00:00:00 2001 From: Olof Johansson Date: Tue, 9 Aug 2005 20:24:39 -0700 Subject: [IPV4]: FIB Trie cleanups. Below is a patch that cleans up the fib_trie code, supposedly without changing any behaviour: * Whitespace cleanups * Introduce DBG() * BUG_ON() instead of if () { BUG(); } * Remove some of the deep nesting to make the code flow more comprehensible * Some mask operations were simplified Signed-off-by: Olof Johansson Signed-off-by: Robert Olsson Signed-off-by: David S.
Miller --- net/ipv4/fib_trie.c | 1237 ++++++++++++++++++++++++--------------------------- 1 file changed, 592 insertions(+), 645 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 45efd5f4741..6f818cc7efd 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -89,27 +89,27 @@ typedef unsigned int t_key; #define T_TNODE 0 #define T_LEAF 1 #define NODE_TYPE_MASK 0x1UL -#define NODE_PARENT(_node) \ - ((struct tnode *)((_node)->_parent & ~NODE_TYPE_MASK)) -#define NODE_SET_PARENT(_node, _ptr) \ - ((_node)->_parent = (((unsigned long)(_ptr)) | \ - ((_node)->_parent & NODE_TYPE_MASK))) -#define NODE_INIT_PARENT(_node, _type) \ - ((_node)->_parent = (_type)) -#define NODE_TYPE(_node) \ - ((_node)->_parent & NODE_TYPE_MASK) - -#define IS_TNODE(n) (!(n->_parent & T_LEAF)) -#define IS_LEAF(n) (n->_parent & T_LEAF) +#define NODE_PARENT(node) \ + ((struct tnode *)((node)->parent & ~NODE_TYPE_MASK)) +#define NODE_SET_PARENT(node, ptr) \ + ((node)->parent = (((unsigned long)(ptr)) | \ + ((node)->parent & NODE_TYPE_MASK))) +#define NODE_INIT_PARENT(node, type) \ + ((node)->parent = (type)) +#define NODE_TYPE(node) \ + ((node)->parent & NODE_TYPE_MASK) + +#define IS_TNODE(n) (!(n->parent & T_LEAF)) +#define IS_LEAF(n) (n->parent & T_LEAF) struct node { - t_key key; - unsigned long _parent; + t_key key; + unsigned long parent; }; struct leaf { - t_key key; - unsigned long _parent; + t_key key; + unsigned long parent; struct hlist_head list; }; @@ -120,13 +120,13 @@ struct leaf_info { }; struct tnode { - t_key key; - unsigned long _parent; - unsigned short pos:5; /* 2log(KEYLENGTH) bits needed */ - unsigned short bits:5; /* 2log(KEYLENGTH) bits needed */ - unsigned short full_children; /* KEYLENGTH bits needed */ - unsigned short empty_children; /* KEYLENGTH bits needed */ - struct node *child[0]; + t_key key; + unsigned long parent; + unsigned short pos:5; /* 2log(KEYLENGTH) bits needed */ + unsigned short bits:5; /* 2log(KEYLENGTH) bits needed */ + unsigned short full_children; /* KEYLENGTH bits needed */ + unsigned short empty_children; /* KEYLENGTH bits needed */ + struct node *child[0]; }; #ifdef CONFIG_IP_FIB_TRIE_STATS @@ -150,16 +150,18 @@ struct trie_stat { }; struct trie { - struct node *trie; + struct node *trie; #ifdef CONFIG_IP_FIB_TRIE_STATS struct trie_use_stats stats; #endif - int size; + int size; unsigned int revision; }; static int trie_debug = 0; +#define DBG(x...) 
do { if (trie_debug) printk(x); } while (0) + static int tnode_full(struct tnode *tn, struct node *n); static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n); static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull); @@ -171,56 +173,31 @@ static void tnode_free(struct tnode *tn); static void trie_dump_seq(struct seq_file *seq, struct trie *t); extern struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio); extern int fib_detect_death(struct fib_info *fi, int order, - struct fib_info **last_resort, int *last_idx, int *dflt); + struct fib_info **last_resort, int *last_idx, int *dflt); extern void rtmsg_fib(int event, u32 key, struct fib_alias *fa, int z, int tb_id, - struct nlmsghdr *n, struct netlink_skb_parms *req); + struct nlmsghdr *n, struct netlink_skb_parms *req); static kmem_cache_t *fn_alias_kmem; static struct trie *trie_local = NULL, *trie_main = NULL; -static void trie_bug(char *err) -{ - printk("Trie Bug: %s\n", err); - BUG(); -} - static inline struct node *tnode_get_child(struct tnode *tn, int i) { - if (i >= 1<bits) - trie_bug("tnode_get_child"); + BUG_ON(i >= 1 << tn->bits); - return tn->child[i]; + return tn->child[i]; } static inline int tnode_child_length(struct tnode *tn) { - return 1<bits; + return 1 << tn->bits; } -/* - _________________________________________________________________ - | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C | - ---------------------------------------------------------------- - 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 - - _________________________________________________________________ - | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u | - ----------------------------------------------------------------- - 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 - - tp->pos = 7 - tp->bits = 3 - n->pos = 15 - n->bits=4 - KEYLENGTH=32 -*/ - static inline t_key tkey_extract_bits(t_key a, int offset, int bits) { - if (offset < KEYLENGTH) + if (offset < KEYLENGTH) return ((t_key)(a << offset)) >> (KEYLENGTH - bits); - else + else return 0; } @@ -233,8 +210,8 @@ static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b) { if (bits == 0 || offset >= KEYLENGTH) return 1; - bits = bits > KEYLENGTH ? KEYLENGTH : bits; - return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0; + bits = bits > KEYLENGTH ? KEYLENGTH : bits; + return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0; } static inline int tkey_mismatch(t_key a, int offset, t_key b) @@ -249,7 +226,7 @@ static inline int tkey_mismatch(t_key a, int offset, t_key b) return i; } -/* Candiate for fib_semantics */ +/* Candidate for fib_semantics */ static void fn_free_alias(struct fib_alias *fa) { @@ -295,7 +272,7 @@ static void fn_free_alias(struct fib_alias *fa) tp->pos = 7 tp->bits = 3 n->pos = 15 - n->bits=4 + n->bits = 4 First, let's just ignore the bits that come before the parent tp, that is the bits from 0 to (tp->pos-1). 
They are *known* but at this point we do @@ -343,10 +320,13 @@ static struct leaf *leaf_new(void) static struct leaf_info *leaf_info_new(int plen) { struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL); - if (li) { - li->plen = plen; - INIT_LIST_HEAD(&li->falh); - } + + if (!li) + return NULL; + + li->plen = plen; + INIT_LIST_HEAD(&li->falh); + return li; } @@ -373,7 +353,7 @@ static struct tnode *tnode_alloc(unsigned int size) static void __tnode_free(struct tnode *tn) { unsigned int size = sizeof(struct tnode) + - (1<bits) * sizeof(struct node *); + (1 << tn->bits) * sizeof(struct node *); if (size <= PAGE_SIZE) kfree(tn); @@ -387,7 +367,7 @@ static struct tnode* tnode_new(t_key key, int pos, int bits) int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *); struct tnode *tn = tnode_alloc(sz); - if (tn) { + if (tn) { memset(tn, 0, sz); NODE_INIT_PARENT(tn, T_TNODE); tn->pos = pos; @@ -397,29 +377,21 @@ static struct tnode* tnode_new(t_key key, int pos, int bits) tn->empty_children = 1< 0) - printk("AT %p s=%u %u\n", tn, (unsigned int) sizeof(struct tnode), - (unsigned int) (sizeof(struct node) * 1< 0 ) - printk("FL %p \n", tn); - } - else if (IS_TNODE(tn)) { + DBG("FL %p \n", tn); + } else { __tnode_free(tn); - if (trie_debug > 0 ) - printk("FT %p \n", tn); - } - else { - trie_bug("tnode_free\n"); + DBG("FT %p \n", tn); } } @@ -453,7 +425,7 @@ static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int w if (i >= 1<bits) { printk("bits=%d, i=%d\n", tn->bits, i); - trie_bug("tnode_put_child_reorg bits"); + BUG(); } write_lock_bh(&fib_lock); chi = tn->child[i]; @@ -465,15 +437,15 @@ static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int w tn->empty_children--; /* update fullChildren */ - if (wasfull == -1) + if (wasfull == -1) wasfull = tnode_full(tn, chi); isfull = tnode_full(tn, n); if (wasfull && !isfull) tn->full_children--; - else if (!wasfull && isfull) tn->full_children++; + if (n) NODE_SET_PARENT(n, tn); @@ -489,9 +461,8 @@ static struct node *resize(struct trie *t, struct tnode *tn) if (!tn) return NULL; - if (trie_debug) - printk("In tnode_resize %p inflate_threshold=%d threshold=%d\n", - tn, inflate_threshold, halve_threshold); + DBG("In tnode_resize %p inflate_threshold=%d threshold=%d\n", + tn, inflate_threshold, halve_threshold); /* No children */ if (tn->empty_children == tnode_child_length(tn)) { @@ -501,20 +472,21 @@ static struct node *resize(struct trie *t, struct tnode *tn) /* One child */ if (tn->empty_children == tnode_child_length(tn) - 1) for (i = 0; i < tnode_child_length(tn); i++) { + struct node *n; write_lock_bh(&fib_lock); - if (tn->child[i] != NULL) { - - /* compress one level */ - struct node *n = tn->child[i]; - if (n) - NODE_INIT_PARENT(n, NODE_TYPE(n)); - + n = tn->child[i]; + if (!n) { write_unlock_bh(&fib_lock); - tnode_free(tn); - return n; + continue; } + + /* compress one level */ + NODE_INIT_PARENT(n, NODE_TYPE(n)); + write_unlock_bh(&fib_lock); + tnode_free(tn); + return n; } /* * Double as long as the resulting node has a number of @@ -566,16 +538,16 @@ static struct node *resize(struct trie *t, struct tnode *tn) * * expand not_to_be_doubled and to_be_doubled, and shorten: * 100 * (tnode_child_length(tn) - tn->empty_children + - * tn->full_children ) >= inflate_threshold * new_child_length + * tn->full_children) >= inflate_threshold * new_child_length * * expand new_child_length: * 100 * (tnode_child_length(tn) - tn->empty_children + - * tn->full_children ) >= + * 
tn->full_children) >= * inflate_threshold * tnode_child_length(tn) * 2 * * shorten again: * 50 * (tn->full_children + tnode_child_length(tn) - - * tn->empty_children ) >= inflate_threshold * + * tn->empty_children) >= inflate_threshold * * tnode_child_length(tn) * */ @@ -624,20 +596,23 @@ static struct node *resize(struct trie *t, struct tnode *tn) if (tn->empty_children == tnode_child_length(tn) - 1) for (i = 0; i < tnode_child_length(tn); i++) { - - write_lock_bh(&fib_lock); - if (tn->child[i] != NULL) { - /* compress one level */ - struct node *n = tn->child[i]; + struct node *n; - if (n) - NODE_INIT_PARENT(n, NODE_TYPE(n)); + write_lock_bh(&fib_lock); + n = tn->child[i]; + if (!n) { write_unlock_bh(&fib_lock); - tnode_free(tn); - return n; + continue; } + + /* compress one level */ + + NODE_INIT_PARENT(n, NODE_TYPE(n)); + write_unlock_bh(&fib_lock); + tnode_free(tn); + return n; } return (struct node *) tn; @@ -650,8 +625,7 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err) int olen = tnode_child_length(tn); int i; - if (trie_debug) - printk("In inflate\n"); + DBG("In inflate\n"); tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1); @@ -666,8 +640,8 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err) * fails. In case of failure we return the oldnode and inflate * of tnode is ignored. */ - - for(i = 0; i < olen; i++) { + + for (i = 0; i < olen; i++) { struct tnode *inode = (struct tnode *) tnode_get_child(oldtnode, i); if (inode && @@ -675,7 +649,6 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err) inode->pos == oldtnode->pos + oldtnode->bits && inode->bits > 1) { struct tnode *left, *right; - t_key m = TKEY_GET_MASK(inode->pos, 1); left = tnode_new(inode->key&(~m), inode->pos + 1, @@ -685,7 +658,7 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err) *err = -ENOMEM; break; } - + right = tnode_new(inode->key|m, inode->pos + 1, inode->bits - 1); @@ -703,18 +676,20 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err) int size = tnode_child_length(tn); int j; - for(j = 0; j < size; j++) + for (j = 0; j < size; j++) if (tn->child[j]) tnode_free((struct tnode *)tn->child[j]); tnode_free(tn); - + *err = -ENOMEM; return oldtnode; } - for(i = 0; i < olen; i++) { + for (i = 0; i < olen; i++) { struct node *node = tnode_get_child(oldtnode, i); + struct tnode *left, *right; + int size, j; /* An empty child */ if (node == NULL) @@ -740,56 +715,51 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err) put_child(t, tn, 2*i+1, inode->child[1]); tnode_free(inode); + continue; } - /* An internal node with more than two children */ - else { - struct tnode *left, *right; - int size, j; - - /* We will replace this node 'inode' with two new - * ones, 'left' and 'right', each with half of the - * original children. The two new nodes will have - * a position one bit further down the key and this - * means that the "significant" part of their keys - * (see the discussion near the top of this file) - * will differ by one bit, which will be "0" in - * left's key and "1" in right's key. Since we are - * moving the key position by one step, the bit that - * we are moving away from - the bit at position - * (inode->pos) - is the one that will differ between - * left and right. So... we synthesize that bit in the - * two new keys. 
- * The mask 'm' below will be a single "one" bit at - * the position (inode->pos) - */ - - /* Use the old key, but set the new significant - * bit to zero. - */ + /* An internal node with more than two children */ + + /* We will replace this node 'inode' with two new + * ones, 'left' and 'right', each with half of the + * original children. The two new nodes will have + * a position one bit further down the key and this + * means that the "significant" part of their keys + * (see the discussion near the top of this file) + * will differ by one bit, which will be "0" in + * left's key and "1" in right's key. Since we are + * moving the key position by one step, the bit that + * we are moving away from - the bit at position + * (inode->pos) - is the one that will differ between + * left and right. So... we synthesize that bit in the + * two new keys. + * The mask 'm' below will be a single "one" bit at + * the position (inode->pos) + */ - left = (struct tnode *) tnode_get_child(tn, 2*i); - put_child(t, tn, 2*i, NULL); + /* Use the old key, but set the new significant + * bit to zero. + */ - if (!left) - BUG(); + left = (struct tnode *) tnode_get_child(tn, 2*i); + put_child(t, tn, 2*i, NULL); - right = (struct tnode *) tnode_get_child(tn, 2*i+1); - put_child(t, tn, 2*i+1, NULL); + BUG_ON(!left); - if (!right) - BUG(); + right = (struct tnode *) tnode_get_child(tn, 2*i+1); + put_child(t, tn, 2*i+1, NULL); - size = tnode_child_length(left); - for(j = 0; j < size; j++) { - put_child(t, left, j, inode->child[j]); - put_child(t, right, j, inode->child[j + size]); - } - put_child(t, tn, 2*i, resize(t, left)); - put_child(t, tn, 2*i+1, resize(t, right)); + BUG_ON(!right); - tnode_free(inode); + size = tnode_child_length(left); + for (j = 0; j < size; j++) { + put_child(t, left, j, inode->child[j]); + put_child(t, right, j, inode->child[j + size]); } + put_child(t, tn, 2*i, resize(t, left)); + put_child(t, tn, 2*i+1, resize(t, right)); + + tnode_free(inode); } tnode_free(oldtnode); return tn; @@ -802,7 +772,7 @@ static struct tnode *halve(struct trie *t, struct tnode *tn, int *err) int i; int olen = tnode_child_length(tn); - if (trie_debug) printk("In halve\n"); + DBG("In halve\n"); tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1); @@ -818,7 +788,7 @@ static struct tnode *halve(struct trie *t, struct tnode *tn, int *err) * of tnode is ignored. 
*/ - for(i = 0; i < olen; i += 2) { + for (i = 0; i < olen; i += 2) { left = tnode_get_child(oldtnode, i); right = tnode_get_child(oldtnode, i+1); @@ -839,17 +809,19 @@ static struct tnode *halve(struct trie *t, struct tnode *tn, int *err) int size = tnode_child_length(tn); int j; - for(j = 0; j < size; j++) + for (j = 0; j < size; j++) if (tn->child[j]) tnode_free((struct tnode *)tn->child[j]); tnode_free(tn); - + *err = -ENOMEM; return oldtnode; } - for(i = 0; i < olen; i += 2) { + for (i = 0; i < olen; i += 2) { + struct tnode *newBinNode; + left = tnode_get_child(oldtnode, i); right = tnode_get_child(oldtnode, i+1); @@ -858,38 +830,39 @@ static struct tnode *halve(struct trie *t, struct tnode *tn, int *err) if (right == NULL) /* Both are empty */ continue; put_child(t, tn, i/2, right); - } else if (right == NULL) + continue; + } + + if (right == NULL) { put_child(t, tn, i/2, left); + continue; + } /* Two nonempty children */ - else { - struct tnode *newBinNode = - (struct tnode *) tnode_get_child(tn, i/2); - put_child(t, tn, i/2, NULL); + newBinNode = (struct tnode *) tnode_get_child(tn, i/2); + put_child(t, tn, i/2, NULL); - if (!newBinNode) - BUG(); + BUG_ON(!newBinNode); - put_child(t, newBinNode, 0, left); - put_child(t, newBinNode, 1, right); - put_child(t, tn, i/2, resize(t, newBinNode)); - } + put_child(t, newBinNode, 0, left); + put_child(t, newBinNode, 1, right); + put_child(t, tn, i/2, resize(t, newBinNode)); } tnode_free(oldtnode); return tn; } -static void *trie_init(struct trie *t) +static void trie_init(struct trie *t) { - if (t) { - t->size = 0; - t->trie = NULL; - t->revision = 0; + if (!t) + return; + + t->size = 0; + t->trie = NULL; + t->revision = 0; #ifdef CONFIG_IP_FIB_TRIE_STATS - memset(&t->stats, 0, sizeof(struct trie_use_stats)); + memset(&t->stats, 0, sizeof(struct trie_use_stats)); #endif - } - return t; } static struct leaf_info *find_leaf_info(struct hlist_head *head, int plen) @@ -897,39 +870,37 @@ static struct leaf_info *find_leaf_info(struct hlist_head *head, int plen) struct hlist_node *node; struct leaf_info *li; - hlist_for_each_entry(li, node, head, hlist) { + hlist_for_each_entry(li, node, head, hlist) if (li->plen == plen) return li; - } + return NULL; } static inline struct list_head * get_fa_head(struct leaf *l, int plen) { - struct list_head *fa_head = NULL; struct leaf_info *li = find_leaf_info(&l->list, plen); - if (li) - fa_head = &li->falh; + if (!li) + return NULL; - return fa_head; + return &li->falh; } static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new) { struct leaf_info *li = NULL, *last = NULL; - struct hlist_node *node, *tmp; + struct hlist_node *node; write_lock_bh(&fib_lock); - if (hlist_empty(head)) + if (hlist_empty(head)) { hlist_add_head(&new->hlist, head); - else { - hlist_for_each_entry_safe(li, node, tmp, head, hlist) { - + } else { + hlist_for_each_entry(li, node, head, hlist) { if (new->plen > li->plen) break; - + last = li; } if (last) @@ -952,49 +923,47 @@ fib_find_node(struct trie *t, u32 key) while (n != NULL && NODE_TYPE(n) == T_TNODE) { tn = (struct tnode *) n; - + check_tnode(tn); - + if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) { - pos=tn->pos + tn->bits; + pos = tn->pos + tn->bits; n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits)); - } - else + } else break; } /* Case we have found a leaf. 
Compare prefixes */ - if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) { - struct leaf *l = (struct leaf *) n; - return l; - } + if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) + return (struct leaf *)n; + return NULL; } static struct node *trie_rebalance(struct trie *t, struct tnode *tn) { - int i = 0; + int i; int wasfull; t_key cindex, key; struct tnode *tp = NULL; - if (!tn) - BUG(); + BUG_ON(!tn); key = tn->key; i = 0; while (tn != NULL && NODE_PARENT(tn) != NULL) { - if (i > 10) { printk("Rebalance tn=%p \n", tn); - if (tn) printk("tn->parent=%p \n", NODE_PARENT(tn)); - + if (tn) + printk("tn->parent=%p \n", NODE_PARENT(tn)); + printk("Rebalance tp=%p \n", tp); - if (tp) printk("tp->parent=%p \n", NODE_PARENT(tp)); + if (tp) + printk("tp->parent=%p \n", NODE_PARENT(tp)); } - if (i > 12) BUG(); + BUG_ON(i > 12); /* Why is this a bug? -ojn */ i++; tp = NODE_PARENT(tn); @@ -1002,7 +971,7 @@ static struct node *trie_rebalance(struct trie *t, struct tnode *tn) wasfull = tnode_full(tp, tnode_get_child(tp, cindex)); tn = (struct tnode *) resize (t, (struct tnode *)tn); tnode_put_child_reorg((struct tnode *)tp, cindex,(struct node*)tn, wasfull); - + if (!NODE_PARENT(tn)) break; @@ -1050,20 +1019,19 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen) while (n != NULL && NODE_TYPE(n) == T_TNODE) { tn = (struct tnode *) n; - + check_tnode(tn); - + if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) { tp = tn; - pos=tn->pos + tn->bits; + pos = tn->pos + tn->bits; n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits)); if (n && NODE_PARENT(n) != tn) { printk("BUG tn=%p, n->parent=%p\n", tn, NODE_PARENT(n)); BUG(); } - } - else + } else break; } @@ -1073,17 +1041,15 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen) * tp is n's (parent) ----> NULL or TNODE */ - if (tp && IS_LEAF(tp)) - BUG(); - + BUG_ON(tp && IS_LEAF(tp)); /* Case 1: n is a leaf. Compare prefixes */ if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) { - struct leaf *l = ( struct leaf *) n; - + struct leaf *l = (struct leaf *) n; + li = leaf_info_new(plen); - + if (!li) { *err = -ENOMEM; goto err; @@ -1113,35 +1079,31 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen) fa_head = &li->falh; insert_leaf_info(&l->list, li); - /* Case 2: n is NULL, and will just insert a new leaf */ if (t->trie && n == NULL) { + /* Case 2: n is NULL, and will just insert a new leaf */ NODE_SET_PARENT(l, tp); - - if (!tp) - BUG(); - else { - cindex = tkey_extract_bits(key, tp->pos, tp->bits); - put_child(t, (struct tnode *)tp, cindex, (struct node *)l); - } - } - /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */ - else { + BUG_ON(!tp); + + cindex = tkey_extract_bits(key, tp->pos, tp->bits); + put_child(t, (struct tnode *)tp, cindex, (struct node *)l); + } else { + /* Case 3: n is a LEAF or a TNODE and the key doesn't match. 
*/ /* * Add a new tnode here * first tnode need some special handling */ if (tp) - pos=tp->pos+tp->bits; + pos = tp->pos+tp->bits; else - pos=0; + pos = 0; + if (n) { newpos = tkey_mismatch(key, pos, n->key); tn = tnode_new(n->key, newpos, 1); - } - else { + } else { newpos = 0; tn = tnode_new(key, newpos, 1); /* First tnode */ } @@ -1151,32 +1113,32 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen) tnode_free((struct tnode *) l); *err = -ENOMEM; goto err; - } - + } + NODE_SET_PARENT(tn, tp); - missbit=tkey_extract_bits(key, newpos, 1); + missbit = tkey_extract_bits(key, newpos, 1); put_child(t, tn, missbit, (struct node *)l); put_child(t, tn, 1-missbit, n); if (tp) { cindex = tkey_extract_bits(key, tp->pos, tp->bits); put_child(t, (struct tnode *)tp, cindex, (struct node *)tn); - } - else { + } else { t->trie = (struct node*) tn; /* First tnode */ tp = tn; } } - if (tp && tp->pos+tp->bits > 32) { + + if (tp && tp->pos + tp->bits > 32) printk("ERROR tp=%p pos=%d, bits=%d, key=%0x plen=%d\n", tp, tp->pos, tp->bits, key, plen); - } + /* Rebalance the trie */ t->trie = trie_rebalance(t, tp); done: t->revision++; -err:; +err: return fa_head; } @@ -1204,17 +1166,18 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, key = ntohl(key); - if (trie_debug) - printk("Insert table=%d %08x/%d\n", tb->tb_id, key, plen); + DBG("Insert table=%d %08x/%d\n", tb->tb_id, key, plen); - mask = ntohl( inet_make_mask(plen) ); + mask = ntohl(inet_make_mask(plen)); if (key & ~mask) return -EINVAL; key = key & mask; - if ((fi = fib_create_info(r, rta, nlhdr, &err)) == NULL) + fi = fib_create_info(r, rta, nlhdr, &err); + + if (!fi) goto err; l = fib_find_node(t, key); @@ -1236,8 +1199,7 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, * and we need to allocate a new one of those as well. */ - if (fa && - fa->fa_info->fib_priority == fi->fib_priority) { + if (fa && fa->fa_info->fib_priority == fi->fib_priority) { struct fib_alias *fa_orig; err = -EEXIST; @@ -1261,9 +1223,9 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, fib_release_info(fi_drop); if (state & FA_S_ACCESSED) - rt_cache_flush(-1); + rt_cache_flush(-1); - goto succeeded; + goto succeeded; } /* Error if we find a perfect match which * uses the same scope, type, and nexthop @@ -1285,7 +1247,7 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, fa = fa_orig; } err = -ENOENT; - if (!(nlhdr->nlmsg_flags&NLM_F_CREATE)) + if (!(nlhdr->nlmsg_flags & NLM_F_CREATE)) goto out; err = -ENOBUFS; @@ -1298,9 +1260,6 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, new_fa->fa_type = type; new_fa->fa_scope = r->rtm_scope; new_fa->fa_state = 0; -#if 0 - new_fa->dst = NULL; -#endif /* * Insert new entry to the list. */ @@ -1314,8 +1273,7 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, write_lock_bh(&fib_lock); - list_add_tail(&new_fa->fa_list, - (fa ? &fa->fa_list : fa_head)); + list_add_tail(&new_fa->fa_list, (fa ? 
&fa->fa_list : fa_head)); write_unlock_bh(&fib_lock); @@ -1328,7 +1286,7 @@ out_free_new_fa: kmem_cache_free(fn_alias_kmem, new_fa); out: fib_release_info(fi); -err:; +err: return err; } @@ -1342,7 +1300,6 @@ static inline int check_leaf(struct trie *t, struct leaf *l, t_key key, int *pl struct hlist_node *node; hlist_for_each_entry(li, node, hhead, hlist) { - i = li->plen; mask = ntohl(inet_make_mask(i)); if (l->key != (key & mask)) @@ -1370,13 +1327,18 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result struct node *n; struct tnode *pn; int pos, bits; - t_key key=ntohl(flp->fl4_dst); + t_key key = ntohl(flp->fl4_dst); int chopped_off; t_key cindex = 0; int current_prefix_length = KEYLENGTH; + struct tnode *cn; + t_key node_prefix, key_prefix, pref_mismatch; + int mp; + n = t->trie; read_lock(&fib_lock); + if (!n) goto failed; @@ -1393,8 +1355,7 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result pn = (struct tnode *) n; chopped_off = 0; - while (pn) { - + while (pn) { pos = pn->pos; bits = pn->bits; @@ -1410,130 +1371,129 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result goto backtrace; } - if (IS_TNODE(n)) { + if (IS_LEAF(n)) { + if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0) + goto found; + else + goto backtrace; + } + #define HL_OPTIMIZE #ifdef HL_OPTIMIZE - struct tnode *cn = (struct tnode *)n; - t_key node_prefix, key_prefix, pref_mismatch; - int mp; + cn = (struct tnode *)n; - /* - * It's a tnode, and we can do some extra checks here if we - * like, to avoid descending into a dead-end branch. - * This tnode is in the parent's child array at index - * key[p_pos..p_pos+p_bits] but potentially with some bits - * chopped off, so in reality the index may be just a - * subprefix, padded with zero at the end. - * We can also take a look at any skipped bits in this - * tnode - everything up to p_pos is supposed to be ok, - * and the non-chopped bits of the index (se previous - * paragraph) are also guaranteed ok, but the rest is - * considered unknown. - * - * The skipped bits are key[pos+bits..cn->pos]. - */ - - /* If current_prefix_length < pos+bits, we are already doing - * actual prefix matching, which means everything from - * pos+(bits-chopped_off) onward must be zero along some - * branch of this subtree - otherwise there is *no* valid - * prefix present. Here we can only check the skipped - * bits. Remember, since we have already indexed into the - * parent's child array, we know that the bits we chopped of - * *are* zero. - */ + /* + * It's a tnode, and we can do some extra checks here if we + * like, to avoid descending into a dead-end branch. + * This tnode is in the parent's child array at index + * key[p_pos..p_pos+p_bits] but potentially with some bits + * chopped off, so in reality the index may be just a + * subprefix, padded with zero at the end. + * We can also take a look at any skipped bits in this + * tnode - everything up to p_pos is supposed to be ok, + * and the non-chopped bits of the index (se previous + * paragraph) are also guaranteed ok, but the rest is + * considered unknown. + * + * The skipped bits are key[pos+bits..cn->pos]. 
+ */ - /* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */ - - if (current_prefix_length < pos+bits) { - if (tkey_extract_bits(cn->key, current_prefix_length, - cn->pos - current_prefix_length) != 0 || - !(cn->child[0])) - goto backtrace; - } + /* If current_prefix_length < pos+bits, we are already doing + * actual prefix matching, which means everything from + * pos+(bits-chopped_off) onward must be zero along some + * branch of this subtree - otherwise there is *no* valid + * prefix present. Here we can only check the skipped + * bits. Remember, since we have already indexed into the + * parent's child array, we know that the bits we chopped of + * *are* zero. + */ - /* - * If chopped_off=0, the index is fully validated and we - * only need to look at the skipped bits for this, the new, - * tnode. What we actually want to do is to find out if - * these skipped bits match our key perfectly, or if we will - * have to count on finding a matching prefix further down, - * because if we do, we would like to have some way of - * verifying the existence of such a prefix at this point. - */ + /* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */ - /* The only thing we can do at this point is to verify that - * any such matching prefix can indeed be a prefix to our - * key, and if the bits in the node we are inspecting that - * do not match our key are not ZERO, this cannot be true. - * Thus, find out where there is a mismatch (before cn->pos) - * and verify that all the mismatching bits are zero in the - * new tnode's key. - */ + if (current_prefix_length < pos+bits) { + if (tkey_extract_bits(cn->key, current_prefix_length, + cn->pos - current_prefix_length) != 0 || + !(cn->child[0])) + goto backtrace; + } - /* Note: We aren't very concerned about the piece of the key - * that precede pn->pos+pn->bits, since these have already been - * checked. The bits after cn->pos aren't checked since these are - * by definition "unknown" at this point. Thus, what we want to - * see is if we are about to enter the "prefix matching" state, - * and in that case verify that the skipped bits that will prevail - * throughout this subtree are zero, as they have to be if we are - * to find a matching prefix. - */ + /* + * If chopped_off=0, the index is fully validated and we + * only need to look at the skipped bits for this, the new, + * tnode. What we actually want to do is to find out if + * these skipped bits match our key perfectly, or if we will + * have to count on finding a matching prefix further down, + * because if we do, we would like to have some way of + * verifying the existence of such a prefix at this point. + */ - node_prefix = MASK_PFX(cn->key, cn->pos); - key_prefix = MASK_PFX(key, cn->pos); - pref_mismatch = key_prefix^node_prefix; - mp = 0; + /* The only thing we can do at this point is to verify that + * any such matching prefix can indeed be a prefix to our + * key, and if the bits in the node we are inspecting that + * do not match our key are not ZERO, this cannot be true. + * Thus, find out where there is a mismatch (before cn->pos) + * and verify that all the mismatching bits are zero in the + * new tnode's key. + */ - /* In short: If skipped bits in this node do not match the search - * key, enter the "prefix matching" state.directly. 
- */ - if (pref_mismatch) { - while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) { - mp++; - pref_mismatch = pref_mismatch <<1; - } - key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp); - - if (key_prefix != 0) - goto backtrace; - - if (current_prefix_length >= cn->pos) - current_prefix_length=mp; - } -#endif - pn = (struct tnode *)n; /* Descend */ - chopped_off = 0; - continue; + /* Note: We aren't very concerned about the piece of the key + * that precede pn->pos+pn->bits, since these have already been + * checked. The bits after cn->pos aren't checked since these are + * by definition "unknown" at this point. Thus, what we want to + * see is if we are about to enter the "prefix matching" state, + * and in that case verify that the skipped bits that will prevail + * throughout this subtree are zero, as they have to be if we are + * to find a matching prefix. + */ + + node_prefix = MASK_PFX(cn->key, cn->pos); + key_prefix = MASK_PFX(key, cn->pos); + pref_mismatch = key_prefix^node_prefix; + mp = 0; + + /* In short: If skipped bits in this node do not match the search + * key, enter the "prefix matching" state.directly. + */ + if (pref_mismatch) { + while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) { + mp++; + pref_mismatch = pref_mismatch <<1; + } + key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp); + + if (key_prefix != 0) + goto backtrace; + + if (current_prefix_length >= cn->pos) + current_prefix_length = mp; } - if (IS_LEAF(n)) { - if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0) - goto found; - } +#endif + pn = (struct tnode *)n; /* Descend */ + chopped_off = 0; + continue; + backtrace: chopped_off++; /* As zero don't change the child key (cindex) */ - while ((chopped_off <= pn->bits) && !(cindex & (1<<(chopped_off-1)))) { + while ((chopped_off <= pn->bits) && !(cindex & (1<<(chopped_off-1)))) chopped_off++; - } /* Decrease current_... with bits chopped off */ if (current_prefix_length > pn->pos + pn->bits - chopped_off) current_prefix_length = pn->pos + pn->bits - chopped_off; - + /* * Either we do the actual chop off according or if we have * chopped off all bits in this tnode walk up to our parent. */ - if (chopped_off <= pn->bits) + if (chopped_off <= pn->bits) { cindex &= ~(1 << (chopped_off-1)); - else { + } else { if (NODE_PARENT(pn) == NULL) goto failed; - + /* Get Child's index */ cindex = tkey_extract_bits(pn->key, NODE_PARENT(pn)->pos, NODE_PARENT(pn)->bits); pn = NODE_PARENT(pn); @@ -1559,24 +1519,23 @@ static int trie_leaf_remove(struct trie *t, t_key key) struct node *n = t->trie; struct leaf *l; - if (trie_debug) - printk("entering trie_leaf_remove(%p)\n", n); + DBG("entering trie_leaf_remove(%p)\n", n); /* Note that in the case skipped bits, those bits are *not* checked! * When we finish this, we will have NULL or a T_LEAF, and the * T_LEAF may or may not match our key. 
*/ - while (n != NULL && IS_TNODE(n)) { + while (n != NULL && IS_TNODE(n)) { struct tnode *tn = (struct tnode *) n; check_tnode(tn); n = tnode_get_child(tn ,tkey_extract_bits(key, tn->pos, tn->bits)); - if (n && NODE_PARENT(n) != tn) { - printk("BUG tn=%p, n->parent=%p\n", tn, NODE_PARENT(n)); - BUG(); - } - } + if (n && NODE_PARENT(n) != tn) { + printk("BUG tn=%p, n->parent=%p\n", tn, NODE_PARENT(n)); + BUG(); + } + } l = (struct leaf *) n; if (!n || !tkey_equals(l->key, key)) @@ -1597,8 +1556,7 @@ static int trie_leaf_remove(struct trie *t, t_key key) cindex = tkey_extract_bits(key, tp->pos, tp->bits); put_child(t, (struct tnode *)tp, cindex, NULL); t->trie = trie_rebalance(t, tp); - } - else + } else t->trie = NULL; return 1; @@ -1606,7 +1564,7 @@ static int trie_leaf_remove(struct trie *t, t_key key) static int fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, - struct nlmsghdr *nlhdr, struct netlink_skb_parms *req) + struct nlmsghdr *nlhdr, struct netlink_skb_parms *req) { struct trie *t = (struct trie *) tb->tb_data; u32 key, mask; @@ -1615,6 +1573,9 @@ fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, struct fib_alias *fa, *fa_to_delete; struct list_head *fa_head; struct leaf *l; + int kill_li = 0; + struct leaf_info *li; + if (plen > 32) return -EINVAL; @@ -1624,7 +1585,7 @@ fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, memcpy(&key, rta->rta_dst, 4); key = ntohl(key); - mask = ntohl( inet_make_mask(plen) ); + mask = ntohl(inet_make_mask(plen)); if (key & ~mask) return -EINVAL; @@ -1641,8 +1602,7 @@ fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, if (!fa) return -ESRCH; - if (trie_debug) - printk("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t); + DBG("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t); fa_to_delete = NULL; fa_head = fa->fa_list.prev; @@ -1664,39 +1624,36 @@ fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, } } - if (fa_to_delete) { - int kill_li = 0; - struct leaf_info *li; + if (!fa_to_delete) + return -ESRCH; - fa = fa_to_delete; - rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id, nlhdr, req); + fa = fa_to_delete; + rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id, nlhdr, req); + + l = fib_find_node(t, key); + li = find_leaf_info(&l->list, plen); - l = fib_find_node(t, key); - li = find_leaf_info(&l->list, plen); + write_lock_bh(&fib_lock); - write_lock_bh(&fib_lock); + list_del(&fa->fa_list); - list_del(&fa->fa_list); + if (list_empty(fa_head)) { + hlist_del(&li->hlist); + kill_li = 1; + } + write_unlock_bh(&fib_lock); - if (list_empty(fa_head)) { - hlist_del(&li->hlist); - kill_li = 1; - } - write_unlock_bh(&fib_lock); - - if (kill_li) - free_leaf_info(li); + if (kill_li) + free_leaf_info(li); - if (hlist_empty(&l->list)) - trie_leaf_remove(t, key); + if (hlist_empty(&l->list)) + trie_leaf_remove(t, key); - if (fa->fa_state & FA_S_ACCESSED) - rt_cache_flush(-1); + if (fa->fa_state & FA_S_ACCESSED) + rt_cache_flush(-1); - fn_free_alias(fa); - return 0; - } - return -ESRCH; + fn_free_alias(fa); + return 0; } static int trie_flush_list(struct trie *t, struct list_head *head) @@ -1706,9 +1663,8 @@ static int trie_flush_list(struct trie *t, struct list_head *head) list_for_each_entry_safe(fa, fa_node, head, fa_list) { struct fib_info *fi = fa->fa_info; - - if (fi && (fi->fib_flags&RTNH_F_DEAD)) { + if (fi && (fi->fib_flags&RTNH_F_DEAD)) { write_lock_bh(&fib_lock); list_del(&fa->fa_list); 
write_unlock_bh(&fib_lock); @@ -1728,11 +1684,9 @@ static int trie_flush_leaf(struct trie *t, struct leaf *l) struct leaf_info *li = NULL; hlist_for_each_entry_safe(li, node, tmp, lih, hlist) { - found += trie_flush_list(t, &li->falh); if (list_empty(&li->falh)) { - write_lock_bh(&fib_lock); hlist_del(&li->hlist); write_unlock_bh(&fib_lock); @@ -1757,8 +1711,7 @@ static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf) return (struct leaf *) t->trie; p = (struct tnode*) t->trie; /* Start */ - } - else + } else p = (struct tnode *) NODE_PARENT(c); while (p) { @@ -1771,29 +1724,28 @@ static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf) pos = 0; last = 1 << p->bits; - for(idx = pos; idx < last ; idx++) { - if (p->child[idx]) { - - /* Decend if tnode */ - - while (IS_TNODE(p->child[idx])) { - p = (struct tnode*) p->child[idx]; - idx = 0; - - /* Rightmost non-NULL branch */ - if (p && IS_TNODE(p)) - while (p->child[idx] == NULL && idx < (1 << p->bits)) idx++; - - /* Done with this tnode? */ - if (idx >= (1 << p->bits) || p->child[idx] == NULL ) - goto up; - } - return (struct leaf*) p->child[idx]; + for (idx = pos; idx < last ; idx++) { + if (!p->child[idx]) + continue; + + /* Decend if tnode */ + while (IS_TNODE(p->child[idx])) { + p = (struct tnode*) p->child[idx]; + idx = 0; + + /* Rightmost non-NULL branch */ + if (p && IS_TNODE(p)) + while (p->child[idx] == NULL && idx < (1 << p->bits)) idx++; + + /* Done with this tnode? */ + if (idx >= (1 << p->bits) || p->child[idx] == NULL) + goto up; } + return (struct leaf*) p->child[idx]; } up: /* No more children go up one step */ - c = (struct node*) p; + c = (struct node *) p; p = (struct tnode *) NODE_PARENT(p); } return NULL; /* Ready. Root of trie */ @@ -1807,7 +1759,7 @@ static int fn_trie_flush(struct fib_table *tb) t->revision++; - for (h=0; (l = nextleaf(t, l)) != NULL; h++) { + for (h = 0; (l = nextleaf(t, l)) != NULL; h++) { found += trie_flush_leaf(t, l); if (ll && hlist_empty(&ll->list)) @@ -1818,12 +1770,11 @@ static int fn_trie_flush(struct fib_table *tb) if (ll && hlist_empty(&ll->list)) trie_leaf_remove(t, ll->key); - if (trie_debug) - printk("trie_flush found=%d\n", found); + DBG("trie_flush found=%d\n", found); return found; } -static int trie_last_dflt=-1; +static int trie_last_dflt = -1; static void fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res) @@ -1855,18 +1806,18 @@ fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib list_for_each_entry(fa, fa_head, fa_list) { struct fib_info *next_fi = fa->fa_info; - + if (fa->fa_scope != res->scope || fa->fa_type != RTN_UNICAST) continue; - + if (next_fi->fib_priority > res->fi->fib_priority) break; if (!next_fi->fib_nh[0].nh_gw || next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK) continue; fa->fa_state |= FA_S_ACCESSED; - + if (fi == NULL) { if (next_fi != res->fi) break; @@ -1913,9 +1864,9 @@ static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fi int i, s_i; struct fib_alias *fa; - u32 xkey=htonl(key); + u32 xkey = htonl(key); - s_i=cb->args[3]; + s_i = cb->args[3]; i = 0; list_for_each_entry(fa, fah, fa_list) { @@ -1946,10 +1897,10 @@ static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fi fa->fa_info, 0) < 0) { cb->args[3] = i; return -1; - } + } i++; } - cb->args[3]=i; + cb->args[3] = i; return skb->len; } @@ -1959,10 +1910,10 @@ static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, str int h, s_h; struct list_head 
*fa_head; struct leaf *l = NULL; - s_h=cb->args[2]; - for (h=0; (l = nextleaf(t, l)) != NULL; h++) { + s_h = cb->args[2]; + for (h = 0; (l = nextleaf(t, l)) != NULL; h++) { if (h < s_h) continue; if (h > s_h) @@ -1970,7 +1921,7 @@ static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, str sizeof(cb->args) - 3*sizeof(cb->args[0])); fa_head = get_fa_head(l, plen); - + if (!fa_head) continue; @@ -1978,11 +1929,11 @@ static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, str continue; if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb)<0) { - cb->args[2]=h; + cb->args[2] = h; return -1; } } - cb->args[2]=h; + cb->args[2] = h; return skb->len; } @@ -1994,13 +1945,12 @@ static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, struct netlin s_m = cb->args[1]; read_lock(&fib_lock); - for (m=0; m<=32; m++) { - + for (m = 0; m <= 32; m++) { if (m < s_m) continue; if (m > s_m) memset(&cb->args[2], 0, - sizeof(cb->args) - 2*sizeof(cb->args[0])); + sizeof(cb->args) - 2*sizeof(cb->args[0])); if (fn_trie_dump_plen(t, 32-m, tb, skb, cb)<0) { cb->args[1] = m; @@ -2010,7 +1960,7 @@ static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, struct netlin read_unlock(&fib_lock); cb->args[1] = m; return skb->len; - out: +out: read_unlock(&fib_lock); return -1; } @@ -2051,9 +2001,9 @@ struct fib_table * __init fib_hash_init(int id) trie_init(t); if (id == RT_TABLE_LOCAL) - trie_local = t; + trie_local = t; else if (id == RT_TABLE_MAIN) - trie_main = t; + trie_main = t; if (id == RT_TABLE_LOCAL) printk("IPv4 FIB: Using LC-trie version %s\n", VERSION); @@ -2065,7 +2015,8 @@ struct fib_table * __init fib_hash_init(int id) static void putspace_seq(struct seq_file *seq, int n) { - while (n--) seq_printf(seq, " "); + while (n--) + seq_printf(seq, " "); } static void printbin_seq(struct seq_file *seq, unsigned int v, int bits) @@ -2086,29 +2037,22 @@ static void printnode_seq(struct seq_file *seq, int indent, struct node *n, seq_printf(seq, "%d/", cindex); printbin_seq(seq, cindex, bits); seq_printf(seq, ": "); - } - else + } else seq_printf(seq, ": "); seq_printf(seq, "%s:%p ", IS_LEAF(n)?"Leaf":"Internal node", n); - if (IS_LEAF(n)) - seq_printf(seq, "key=%d.%d.%d.%d\n", - n->key >> 24, (n->key >> 16) % 256, (n->key >> 8) % 256, n->key % 256); - else { - int plen = ((struct tnode *)n)->pos; - t_key prf=MASK_PFX(n->key, plen); - seq_printf(seq, "key=%d.%d.%d.%d/%d\n", - prf >> 24, (prf >> 16) % 256, (prf >> 8) % 256, prf % 256, plen); - } if (IS_LEAF(n)) { - struct leaf *l=(struct leaf *)n; + struct leaf *l = (struct leaf *)n; struct fib_alias *fa; int i; - for (i=32; i>=0; i--) - if (find_leaf_info(&l->list, i)) { - + + seq_printf(seq, "key=%d.%d.%d.%d\n", + n->key >> 24, (n->key >> 16) % 256, (n->key >> 8) % 256, n->key % 256); + + for (i = 32; i >= 0; i--) + if (find_leaf_info(&l->list, i)) { struct list_head *fa_head = get_fa_head(l, i); - + if (!fa_head) continue; @@ -2118,17 +2062,16 @@ static void printnode_seq(struct seq_file *seq, int indent, struct node *n, putspace_seq(seq, indent+2); seq_printf(seq, "{/%d...dumping}\n", i); - list_for_each_entry(fa, fa_head, fa_list) { putspace_seq(seq, indent+2); - if (fa->fa_info->fib_nh == NULL) { - seq_printf(seq, "Error _fib_nh=NULL\n"); - continue; - } if (fa->fa_info == NULL) { seq_printf(seq, "Error fa_info=NULL\n"); continue; } + if (fa->fa_info->fib_nh == NULL) { + seq_printf(seq, "Error _fib_nh=NULL\n"); + continue; + } seq_printf(seq, "{type=%d scope=%d TOS=%d}\n", fa->fa_type, @@ -2136,11 
+2079,16 @@ static void printnode_seq(struct seq_file *seq, int indent, struct node *n, fa->fa_tos); } } - } - else if (IS_TNODE(n)) { + } else { struct tnode *tn = (struct tnode *)n; + int plen = ((struct tnode *)n)->pos; + t_key prf = MASK_PFX(n->key, plen); + + seq_printf(seq, "key=%d.%d.%d.%d/%d\n", + prf >> 24, (prf >> 16) % 256, (prf >> 8) % 256, prf % 256, plen); + putspace_seq(seq, indent); seq_printf(seq, "| "); - seq_printf(seq, "{key prefix=%08x/", tn->key&TKEY_GET_MASK(0, tn->pos)); + seq_printf(seq, "{key prefix=%08x/", tn->key & TKEY_GET_MASK(0, tn->pos)); printbin_seq(seq, tkey_extract_bits(tn->key, 0, tn->pos), tn->pos); seq_printf(seq, "}\n"); putspace_seq(seq, indent); seq_printf(seq, "| "); @@ -2155,100 +2103,103 @@ static void printnode_seq(struct seq_file *seq, int indent, struct node *n, static void trie_dump_seq(struct seq_file *seq, struct trie *t) { struct node *n = t->trie; - int cindex=0; - int indent=1; - int pend=0; + int cindex = 0; + int indent = 1; + int pend = 0; int depth = 0; + struct tnode *tn; read_lock(&fib_lock); seq_printf(seq, "------ trie_dump of t=%p ------\n", t); - if (n) { - printnode_seq(seq, indent, n, pend, cindex, 0); - if (IS_TNODE(n)) { - struct tnode *tn = (struct tnode *)n; - pend = tn->pos+tn->bits; - putspace_seq(seq, indent); seq_printf(seq, "\\--\n"); - indent += 3; - depth++; - - while (tn && cindex < (1 << tn->bits)) { - if (tn->child[cindex]) { - - /* Got a child */ - - printnode_seq(seq, indent, tn->child[cindex], pend, cindex, tn->bits); - if (IS_LEAF(tn->child[cindex])) { - cindex++; - - } - else { - /* - * New tnode. Decend one level - */ - - depth++; - n = tn->child[cindex]; - tn = (struct tnode *)n; - pend = tn->pos+tn->bits; - putspace_seq(seq, indent); seq_printf(seq, "\\--\n"); - indent+=3; - cindex=0; - } - } - else - cindex++; + if (!n) { + seq_printf(seq, "------ trie is empty\n"); + + read_unlock(&fib_lock); + return; + } + + printnode_seq(seq, indent, n, pend, cindex, 0); + + if (!IS_TNODE(n)) { + read_unlock(&fib_lock); + return; + } + + tn = (struct tnode *)n; + pend = tn->pos+tn->bits; + putspace_seq(seq, indent); seq_printf(seq, "\\--\n"); + indent += 3; + depth++; + + while (tn && cindex < (1 << tn->bits)) { + if (tn->child[cindex]) { + /* Got a child */ + + printnode_seq(seq, indent, tn->child[cindex], pend, cindex, tn->bits); + if (IS_LEAF(tn->child[cindex])) { + cindex++; + } else { /* - * Test if we are done + * New tnode. 
Decend one level */ - - while (cindex >= (1 << tn->bits)) { - /* - * Move upwards and test for root - * pop off all traversed nodes - */ - - if (NODE_PARENT(tn) == NULL) { - tn = NULL; - n = NULL; - break; - } - else { - cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits); - tn = NODE_PARENT(tn); - cindex++; - n = (struct node *)tn; - pend = tn->pos+tn->bits; - indent-=3; - depth--; - } - } + depth++; + tn = (struct tnode *)tn->child[cindex]; + pend = tn->pos + tn->bits; + putspace_seq(seq, indent); seq_printf(seq, "\\--\n"); + indent += 3; + cindex = 0; } + } else + cindex++; + + /* + * Test if we are done + */ + + while (cindex >= (1 << tn->bits)) { + /* + * Move upwards and test for root + * pop off all traversed nodes + */ + + if (NODE_PARENT(tn) == NULL) { + tn = NULL; + break; + } + + cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits); + cindex++; + tn = NODE_PARENT(tn); + pend = tn->pos + tn->bits; + indent -= 3; + depth--; } - else n = NULL; } - else seq_printf(seq, "------ trie is empty\n"); read_unlock(&fib_lock); } static struct trie_stat *trie_stat_new(void) { - struct trie_stat *s = kmalloc(sizeof(struct trie_stat), GFP_KERNEL); + struct trie_stat *s; int i; - if (s) { - s->totdepth = 0; - s->maxdepth = 0; - s->tnodes = 0; - s->leaves = 0; - s->nullpointers = 0; - - for(i=0; i< MAX_CHILDS; i++) - s->nodesizes[i] = 0; - } + s = kmalloc(sizeof(struct trie_stat), GFP_KERNEL); + if (!s) + return NULL; + + s->totdepth = 0; + s->maxdepth = 0; + s->tnodes = 0; + s->leaves = 0; + s->nullpointers = 0; + + for (i = 0; i < MAX_CHILDS; i++) + s->nodesizes[i] = 0; + return s; } @@ -2257,91 +2208,81 @@ static struct trie_stat *trie_collect_stats(struct trie *t) struct node *n = t->trie; struct trie_stat *s = trie_stat_new(); int cindex = 0; - int indent = 1; int pend = 0; int depth = 0; - read_lock(&fib_lock); + if (!s) + return NULL; + if (!n) + return s; - if (s) { - if (n) { - if (IS_TNODE(n)) { - struct tnode *tn = (struct tnode *)n; - pend = tn->pos+tn->bits; - indent += 3; - s->nodesizes[tn->bits]++; - depth++; + read_lock(&fib_lock); - while (tn && cindex < (1 << tn->bits)) { - if (tn->child[cindex]) { - /* Got a child */ - - if (IS_LEAF(tn->child[cindex])) { - cindex++; - - /* stats */ - if (depth > s->maxdepth) - s->maxdepth = depth; - s->totdepth += depth; - s->leaves++; - } - - else { - /* - * New tnode. Decend one level - */ - - s->tnodes++; - s->nodesizes[tn->bits]++; - depth++; - - n = tn->child[cindex]; - tn = (struct tnode *)n; - pend = tn->pos+tn->bits; - - indent += 3; - cindex = 0; - } - } - else { - cindex++; - s->nullpointers++; - } + if (IS_TNODE(n)) { + struct tnode *tn = (struct tnode *)n; + pend = tn->pos+tn->bits; + s->nodesizes[tn->bits]++; + depth++; + + while (tn && cindex < (1 << tn->bits)) { + if (tn->child[cindex]) { + /* Got a child */ + if (IS_LEAF(tn->child[cindex])) { + cindex++; + + /* stats */ + if (depth > s->maxdepth) + s->maxdepth = depth; + s->totdepth += depth; + s->leaves++; + } else { /* - * Test if we are done + * New tnode. 
Decend one level */ - - while (cindex >= (1 << tn->bits)) { - - /* - * Move upwards and test for root - * pop off all traversed nodes - */ - - - if (NODE_PARENT(tn) == NULL) { - tn = NULL; - n = NULL; - break; - } - else { - cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits); - tn = NODE_PARENT(tn); - cindex++; - n = (struct node *)tn; - pend = tn->pos+tn->bits; - indent -= 3; - depth--; - } - } + + s->tnodes++; + s->nodesizes[tn->bits]++; + depth++; + + n = tn->child[cindex]; + tn = (struct tnode *)n; + pend = tn->pos+tn->bits; + + cindex = 0; } + } else { + cindex++; + s->nullpointers++; } - else n = NULL; + + /* + * Test if we are done + */ + + while (cindex >= (1 << tn->bits)) { + /* + * Move upwards and test for root + * pop off all traversed nodes + */ + + if (NODE_PARENT(tn) == NULL) { + tn = NULL; + n = NULL; + break; + } + + cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits); + tn = NODE_PARENT(tn); + cindex++; + n = (struct node *)tn; + pend = tn->pos+tn->bits; + depth--; + } } } - read_unlock(&fib_lock); + read_unlock(&fib_lock); return s; } @@ -2359,17 +2300,22 @@ static struct fib_alias *fib_triestat_get_next(struct seq_file *seq) static void *fib_triestat_seq_start(struct seq_file *seq, loff_t *pos) { - void *v = NULL; + if (!ip_fib_main_table) + return NULL; - if (ip_fib_main_table) - v = *pos ? fib_triestat_get_next(seq) : SEQ_START_TOKEN; - return v; + if (*pos) + return fib_triestat_get_next(seq); + else + return SEQ_START_TOKEN; } static void *fib_triestat_seq_next(struct seq_file *seq, void *v, loff_t *pos) { ++*pos; - return v == SEQ_START_TOKEN ? fib_triestat_get_first(seq) : fib_triestat_get_next(seq); + if (v == SEQ_START_TOKEN) + return fib_triestat_get_first(seq); + else + return fib_triestat_get_next(seq); } static void fib_triestat_seq_stop(struct seq_file *seq, void *v) @@ -2388,22 +2334,22 @@ static void collect_and_show(struct trie *t, struct seq_file *seq) { int bytes = 0; /* How many bytes are used, a ref is 4 bytes */ int i, max, pointers; - struct trie_stat *stat; + struct trie_stat *stat; int avdepth; stat = trie_collect_stats(t); - bytes=0; + bytes = 0; seq_printf(seq, "trie=%p\n", t); if (stat) { if (stat->leaves) - avdepth=stat->totdepth*100 / stat->leaves; + avdepth = stat->totdepth*100 / stat->leaves; else - avdepth=0; - seq_printf(seq, "Aver depth: %d.%02d\n", avdepth / 100, avdepth % 100 ); + avdepth = 0; + seq_printf(seq, "Aver depth: %d.%02d\n", avdepth / 100, avdepth % 100); seq_printf(seq, "Max depth: %4d\n", stat->maxdepth); - + seq_printf(seq, "Leaves: %d\n", stat->leaves); bytes += sizeof(struct leaf) * stat->leaves; seq_printf(seq, "Internal nodes: %d\n", stat->tnodes); @@ -2455,11 +2401,9 @@ static int fib_triestat_seq_show(struct seq_file *seq, void *v) if (trie_main) collect_and_show(trie_main, seq); - } - else { - snprintf(bf, sizeof(bf), - "*\t%08X\t%08X", 200, 400); - + } else { + snprintf(bf, sizeof(bf), "*\t%08X\t%08X", 200, 400); + seq_printf(seq, "%-127s\n", bf); } return 0; @@ -2520,22 +2464,27 @@ static struct fib_alias *fib_trie_get_next(struct seq_file *seq) static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos) { - void *v = NULL; + if (!ip_fib_main_table) + return NULL; - if (ip_fib_main_table) - v = *pos ? fib_trie_get_next(seq) : SEQ_START_TOKEN; - return v; + if (*pos) + return fib_trie_get_next(seq); + else + return SEQ_START_TOKEN; } static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos) { ++*pos; - return v == SEQ_START_TOKEN ? 
fib_trie_get_first(seq) : fib_trie_get_next(seq); + if (v == SEQ_START_TOKEN) + return fib_trie_get_first(seq); + else + return fib_trie_get_next(seq); + } static void fib_trie_seq_stop(struct seq_file *seq, void *v) { - } /* @@ -2555,9 +2504,7 @@ static int fib_trie_seq_show(struct seq_file *seq, void *v) if (trie_main) trie_dump_seq(seq, trie_main); - } - - else { + } else { snprintf(bf, sizeof(bf), "*\t%08X\t%08X", 200, 400); seq_printf(seq, "%-127s\n", bf); -- cgit v1.2.3 From 2f80b3c8262d0d646812f776db024d88d569a0c1 Mon Sep 17 00:00:00 2001 From: Robert Olsson Date: Tue, 9 Aug 2005 20:25:06 -0700 Subject: [IPV4]: fib_trie: Use ERR_PTR to handle errno return Signed-off-by: Robert Olsson Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/fib_trie.c | 119 +++++++++++++++++++++++++--------------------------- 1 file changed, 57 insertions(+), 62 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 6f818cc7efd..914a4c2aae4 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -167,8 +167,8 @@ static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n); static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull); static int tnode_child_length(struct tnode *tn); static struct node *resize(struct trie *t, struct tnode *tn); -static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err); -static struct tnode *halve(struct trie *t, struct tnode *tn, int *err); +static struct tnode *inflate(struct trie *t, struct tnode *tn); +static struct tnode *halve(struct trie *t, struct tnode *tn); static void tnode_free(struct tnode *tn); static void trie_dump_seq(struct seq_file *seq, struct trie *t); extern struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio); @@ -457,6 +457,7 @@ static struct node *resize(struct trie *t, struct tnode *tn) { int i; int err = 0; + struct tnode *old_tn; if (!tn) return NULL; @@ -559,9 +560,10 @@ static struct node *resize(struct trie *t, struct tnode *tn) 50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >= inflate_threshold * tnode_child_length(tn))) { - tn = inflate(t, tn, &err); - - if (err) { + old_tn = tn; + tn = inflate(t, tn); + if (IS_ERR(tn)) { + tn = old_tn; #ifdef CONFIG_IP_FIB_TRIE_STATS t->stats.resize_node_skipped++; #endif @@ -581,9 +583,10 @@ static struct node *resize(struct trie *t, struct tnode *tn) 100 * (tnode_child_length(tn) - tn->empty_children) < halve_threshold * tnode_child_length(tn)) { - tn = halve(t, tn, &err); - - if (err) { + old_tn = tn; + tn = halve(t, tn); + if (IS_ERR(tn)) { + tn = old_tn; #ifdef CONFIG_IP_FIB_TRIE_STATS t->stats.resize_node_skipped++; #endif @@ -618,7 +621,7 @@ static struct node *resize(struct trie *t, struct tnode *tn) return (struct node *) tn; } -static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err) +static struct tnode *inflate(struct trie *t, struct tnode *tn) { struct tnode *inode; struct tnode *oldtnode = tn; @@ -629,10 +632,8 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err) tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1); - if (!tn) { - *err = -ENOMEM; - return oldtnode; - } + if (!tn) + return ERR_PTR(-ENOMEM); /* * Preallocate and store tnodes before the actual work so we @@ -653,39 +654,22 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err) left = tnode_new(inode->key&(~m), inode->pos + 1, inode->bits - 1); - - if (!left) { - *err = -ENOMEM; - 
break; - } + if (!left) + goto nomem; right = tnode_new(inode->key|m, inode->pos + 1, inode->bits - 1); - if (!right) { - *err = -ENOMEM; - break; - } + if (!right) { + tnode_free(left); + goto nomem; + } put_child(t, tn, 2*i, (struct node *) left); put_child(t, tn, 2*i+1, (struct node *) right); } } - if (*err) { - int size = tnode_child_length(tn); - int j; - - for (j = 0; j < size; j++) - if (tn->child[j]) - tnode_free((struct tnode *)tn->child[j]); - - tnode_free(tn); - - *err = -ENOMEM; - return oldtnode; - } - for (i = 0; i < olen; i++) { struct node *node = tnode_get_child(oldtnode, i); struct tnode *left, *right; @@ -763,9 +747,22 @@ } tnode_free(oldtnode); return tn; +nomem: + { + int size = tnode_child_length(tn); + int j; + + for(j = 0; j < size; j++) + if (tn->child[j]) + tnode_free((struct tnode *)tn->child[j]); + + tnode_free(tn); + + return ERR_PTR(-ENOMEM); + } } -static struct tnode *halve(struct trie *t, struct tnode *tn, int *err) +static struct tnode *halve(struct trie *t, struct tnode *tn) { struct tnode *oldtnode = tn; struct node *left, *right; @@ -776,10 +773,8 @@ tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1); - if (!tn) { - *err = -ENOMEM; - return oldtnode; - } + if (!tn) + return ERR_PTR(-ENOMEM); /* * Preallocate and store tnodes before the actual work so we @@ -794,29 +789,16 @@ /* Two nonempty children */ if (left && right) { - struct tnode *newBinNode = - tnode_new(left->key, tn->pos + tn->bits, 1); - - if (!newBinNode) { - *err = -ENOMEM; - break; - } - put_child(t, tn, i/2, (struct node *)newBinNode); + struct tnode *newn; + + newn = tnode_new(left->key, tn->pos + tn->bits, 1); + + if (!newn) + goto nomem; + + put_child(t, tn, i/2, (struct node *)newn); } - } - if (*err) { - int size = tnode_child_length(tn); - int j; - - for (j = 0; j < size; j++) - if (tn->child[j]) - tnode_free((struct tnode *)tn->child[j]); - - tnode_free(tn); - - *err = -ENOMEM; - return oldtnode; } for (i = 0; i < olen; i += 2) { @@ -850,6 +832,19 @@ } tnode_free(oldtnode); return tn; +nomem: + { + int size = tnode_child_length(tn); + int j; + + for(j = 0; j < size; j++) + if (tn->child[j]) + tnode_free((struct tnode *)tn->child[j]); + + tnode_free(tn); + + return ERR_PTR(-ENOMEM); + } } static void trie_init(struct trie *t) -- cgit v1.2.3 From bb435b8d816582064ee0ddb1e2a6fbca67f34108 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 9 Aug 2005 20:25:39 -0700 Subject: [IPV4]: fib_trie: Use const Use const where possible and get rid of EXTRACT() macro that was never used. Signed-off-by: Stephen Hemminger Signed-off-by: Robert Olsson Signed-off-by: David S.
Miller --- net/ipv4/fib_trie.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 914a4c2aae4..395f64df6f9 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -77,7 +77,6 @@ #undef CONFIG_IP_FIB_TRIE_STATS #define MAX_CHILDS 16384 -#define EXTRACT(p, n, str) ((str)<<(p)>>(32-(n))) #define KEYLENGTH (8*sizeof(t_key)) #define MASK_PFX(k, l) (((l)==0)?0:(k >> (KEYLENGTH-l)) << (KEYLENGTH-l)) #define TKEY_GET_MASK(offset, bits) (((bits)==0)?0:((t_key)(-1) << (KEYLENGTH - bits) >> offset)) @@ -162,10 +161,8 @@ static int trie_debug = 0; #define DBG(x...) do { if (trie_debug) printk(x); } while (0) -static int tnode_full(struct tnode *tn, struct node *n); static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n); static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull); -static int tnode_child_length(struct tnode *tn); static struct node *resize(struct trie *t, struct tnode *tn); static struct tnode *inflate(struct trie *t, struct tnode *tn); static struct tnode *halve(struct trie *t, struct tnode *tn); @@ -188,7 +185,7 @@ static inline struct node *tnode_get_child(struct tnode *tn, int i) return tn->child[i]; } -static inline int tnode_child_length(struct tnode *tn) +static inline int tnode_child_length(const struct tnode *tn) { return 1 << tn->bits; } @@ -400,7 +397,7 @@ static void tnode_free(struct tnode *tn) * and no bits are skipped. See discussion in dyntree paper p. 6 */ -static inline int tnode_full(struct tnode *tn, struct node *n) +static inline int tnode_full(const struct tnode *tn, const struct node *n) { if (n == NULL || IS_LEAF(n)) return 0; -- cgit v1.2.3 From 1d3de414eb20d937d82c5219fd13ee4cedc499cb Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Tue, 9 Aug 2005 20:26:55 -0700 Subject: [NETFILTER]: New iptables DCCP protocol header match Using this new iptables DCCP protocol header match, it is possible to create simplistic stateless packet filtering rules for DCCP. It permits matching of port numbers, packet type and options. Signed-off-by: Harald Welte Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- net/ipv4/netfilter/Kconfig | 11 +++ net/ipv4/netfilter/Makefile | 1 + net/ipv4/netfilter/ipt_dccp.c | 176 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 188 insertions(+) create mode 100644 net/ipv4/netfilter/ipt_dccp.c (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 2fa26a41fa4..9f5e1d769b5 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -354,6 +354,17 @@ config IP_NF_MATCH_SCTP If you want to compile it as a module, say M here and read . If unsure, say `N'. +config IP_NF_MATCH_DCCP + tristate 'DCCP protocol match support' + depends on IP_NF_IPTABLES + help + With this option enabled, you will be able to use the iptables + `dccp' match in order to match on DCCP source/destination ports + and DCCP flags. + + If you want to compile it as a module, say M here and read + . If unsure, say `N'. 
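(A usage sketch, assuming the companion userspace iptables extension and the option names it ships -- none of which are part of this kernel diff: a rule like `iptables -A INPUT -p dccp -m dccp --dport 33 --dccp-types REQUEST -j DROP` would drop incoming DCCP connection requests to port 33, and `--dccp-option <N>` selects the option-number test implemented by dccp_find_option() in the new module below.)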
+ config IP_NF_MATCH_COMMENT tristate 'comment match support' depends on IP_NF_IPTABLES diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index c2ae663b723..58aa7c616e1 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -42,6 +42,7 @@ obj-$(CONFIG_IP_NF_MATCH_HELPER) += ipt_helper.o obj-$(CONFIG_IP_NF_MATCH_LIMIT) += ipt_limit.o obj-$(CONFIG_IP_NF_MATCH_HASHLIMIT) += ipt_hashlimit.o obj-$(CONFIG_IP_NF_MATCH_SCTP) += ipt_sctp.o +obj-$(CONFIG_IP_NF_MATCH_DCCP) += ipt_dccp.o obj-$(CONFIG_IP_NF_MATCH_MARK) += ipt_mark.o obj-$(CONFIG_IP_NF_MATCH_MAC) += ipt_mac.o obj-$(CONFIG_IP_NF_MATCH_IPRANGE) += ipt_iprange.o diff --git a/net/ipv4/netfilter/ipt_dccp.c b/net/ipv4/netfilter/ipt_dccp.c new file mode 100644 index 00000000000..ad3278bba6c --- /dev/null +++ b/net/ipv4/netfilter/ipt_dccp.c @@ -0,0 +1,176 @@ +/* + * iptables module for DCCP protocol header matching + * + * (C) 2005 by Harald Welte + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include + +#include +#include + +#define DCCHECK(cond, option, flag, invflag) (!((flag) & (option)) \ + || (!!((invflag) & (option)) ^ (cond))) + +static unsigned char *dccp_optbuf; +static DEFINE_SPINLOCK(dccp_buflock); + +static inline int +dccp_find_option(u_int8_t option, + const struct sk_buff *skb, + const struct dccp_hdr *dh, + int *hotdrop) +{ + /* tcp.doff is only 4 bits, ie. max 15 * 4 bytes */ + unsigned char *op; + unsigned int optoff = __dccp_hdr_len(dh); + unsigned int optlen = dh->dccph_doff*4 - __dccp_hdr_len(dh); + unsigned int i; + + if (dh->dccph_doff * 4 < __dccp_hdr_len(dh)) { + *hotdrop = 1; + return 0; + } + + if (!optlen) + return 0; + + spin_lock_bh(&dccp_buflock); + op = skb_header_pointer(skb, + skb->nh.iph->ihl*4 + optoff, + optlen, dccp_optbuf); + if (op == NULL) { + /* If we don't have the whole header, drop packet. 
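skb_header_pointer() either points straight into the linear skb data or copies the requested range into the buffer supplied by the caller -- here the module-global dccp_optbuf, which is why this lookup runs with dccp_buflock held. The loop below then walks DCCP's type-length encoded option area: a type byte below 2 is a single-byte option, anything else carries its total length in the following byte, with the ?:1 fallback guarding against a bogus zero length.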
*/ + spin_unlock_bh(&dccp_buflock); + *hotdrop = 1; + return 0; + } + + for (i = 0; i < optlen; ) { + if (op[i] == option) { + spin_unlock_bh(&dccp_buflock); + return 1; + } + + if (op[i] < 2) + i++; + else + i += op[i+1]?:1; + } + + spin_unlock_bh(&dccp_buflock); + return 0; +} + + +static inline int +match_types(const struct dccp_hdr *dh, u_int16_t typemask) +{ + return (typemask & (1 << dh->dccph_type)); +} + +static inline int +match_option(u_int8_t option, const struct sk_buff *skb, + const struct dccp_hdr *dh, int *hotdrop) +{ + return dccp_find_option(option, skb, dh, hotdrop); +} + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + int *hotdrop) +{ + const struct ipt_dccp_info *info = + (const struct ipt_dccp_info *)matchinfo; + struct dccp_hdr _dh, *dh; + + if (offset) + return 0; + + dh = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_dh), &_dh); + if (dh == NULL) { + *hotdrop = 1; + return 0; + } + + return DCCHECK(((ntohs(dh->dccph_sport) >= info->spts[0]) + && (ntohs(dh->dccph_sport) <= info->spts[1])), + IPT_DCCP_SRC_PORTS, info->flags, info->invflags) + && DCCHECK(((ntohs(dh->dccph_dport) >= info->dpts[0]) + && (ntohs(dh->dccph_dport) <= info->dpts[1])), + IPT_DCCP_DEST_PORTS, info->flags, info->invflags) + && DCCHECK(match_types(dh, info->typemask), + IPT_DCCP_TYPE, info->flags, info->invflags) + && DCCHECK(match_option(info->option, skb, dh, hotdrop), + IPT_DCCP_OPTION, info->flags, info->invflags); +} + +static int +checkentry(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + const struct ipt_dccp_info *info; + + info = (const struct ipt_dccp_info *)matchinfo; + + return ip->proto == IPPROTO_DCCP + && !(ip->invflags & IPT_INV_PROTO) + && matchsize == IPT_ALIGN(sizeof(struct ipt_dccp_info)) + && !(info->flags & ~IPT_DCCP_VALID_FLAGS) + && !(info->invflags & ~IPT_DCCP_VALID_FLAGS) + && !(info->invflags & ~info->flags); +} + +static struct ipt_match dccp_match = +{ + .name = "dccp", + .match = &match, + .checkentry = &checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + int ret; + + /* doff is 8 bits, so the maximum option size is (4*256). Don't put + * this in BSS since DaveM is worried about locked TLB's for kernel + * BSS. */ + dccp_optbuf = kmalloc(256 * 4, GFP_KERNEL); + if (!dccp_optbuf) + return -ENOMEM; + ret = ipt_register_match(&dccp_match); + if (ret) + kfree(dccp_optbuf); + + return ret; +} + +static void __exit fini(void) +{ + ipt_unregister_match(&dccp_match); + kfree(dccp_optbuf); +} + +module_init(init); +module_exit(fini); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Harald Welte "); +MODULE_DESCRIPTION("Match for DCCP protocol packets"); + -- cgit v1.2.3 From 295ff7edb8f72b77d524759266f7524deae379b3 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 9 Aug 2005 20:44:40 -0700 Subject: [TIMEWAIT]: Introduce inet_timewait_death_row That groups all of the tables and variables associated with the TCP timewait scheduling/recycling/killing code, which can now be isolated from the TCP specific code and used by other transport protocols, such as DCCP. The next changeset will move this code to net/ipv4/inet_timewait_sock.c Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S.
Miller --- net/ipv4/proc.c | 2 +- net/ipv4/sysctl_net_ipv4.c | 4 +- net/ipv4/tcp.c | 4 +- net/ipv4/tcp_ipv4.c | 11 +- net/ipv4/tcp_minisocks.c | 256 +++++++++++++++++++++++---------------------- 5 files changed, 141 insertions(+), 136 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 912bbcc7f41..3eadbb27187 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -65,7 +65,7 @@ static int sockstat_seq_show(struct seq_file *seq, void *v) socket_seq_show(seq); seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %d\n", fold_prot_inuse(&tcp_prot), atomic_read(&tcp_orphan_count), - tcp_tw_count, atomic_read(&tcp_sockets_allocated), + tcp_death_row.tw_count, atomic_read(&tcp_sockets_allocated), atomic_read(&tcp_memory_allocated)); seq_printf(seq, "UDP: inuse %d\n", fold_prot_inuse(&udp_prot)); seq_printf(seq, "RAW: inuse %d\n", fold_prot_inuse(&raw_prot)); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index e3289453241..ce47a345ecc 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -259,7 +259,7 @@ ctl_table ipv4_table[] = { { .ctl_name = NET_TCP_MAX_TW_BUCKETS, .procname = "tcp_max_tw_buckets", - .data = &sysctl_tcp_max_tw_buckets, + .data = &tcp_death_row.sysctl_max_tw_buckets, .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec @@ -363,7 +363,7 @@ ctl_table ipv4_table[] = { { .ctl_name = NET_TCP_TW_RECYCLE, .procname = "tcp_tw_recycle", - .data = &sysctl_tcp_tw_recycle, + .data = &tcp_death_row.sysctl_tw_recycle, .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 4bda522d25c..0eed64a1991 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2109,12 +2109,12 @@ void __init tcp_init(void) if (order >= 4) { sysctl_local_port_range[0] = 32768; sysctl_local_port_range[1] = 61000; - sysctl_tcp_max_tw_buckets = 180000; + tcp_death_row.sysctl_max_tw_buckets = 180000; sysctl_tcp_max_orphans = 4096 << (order - 4); sysctl_max_syn_backlog = 1024; } else if (order < 3) { sysctl_local_port_range[0] = 1024 * (3 - order); - sysctl_tcp_max_tw_buckets >>= (3 - order); + tcp_death_row.sysctl_max_tw_buckets >>= (3 - order); sysctl_tcp_max_orphans >>= (3 - order); sysctl_max_syn_backlog = 128; } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index b966102b9f3..83f72346274 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -199,7 +199,7 @@ unique: NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); } else if (tw) { /* Silly. Should hash-dance instead... */ - tcp_tw_deschedule(tw); + inet_twsk_deschedule(tw, &tcp_death_row); NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); inet_twsk_put(tw); @@ -291,7 +291,7 @@ ok: spin_unlock(&head->lock); if (tw) { - tcp_tw_deschedule(tw); + inet_twsk_deschedule(tw, &tcp_death_row);; inet_twsk_put(tw); } @@ -366,7 +366,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) tp->write_seq = 0; } - if (sysctl_tcp_tw_recycle && + if (tcp_death_row.sysctl_tw_recycle && !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) { struct inet_peer *peer = rt_get_peer(rt); @@ -965,7 +965,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) * are made in the function processing timewait state. 
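The timestamp cached in the destination's inet_peer entry is what makes recycling safe here: with tcp_death_row.sysctl_tw_recycle set, PAWS can reject late duplicates from the old incarnation, so the TIME-WAIT interval may be shortened without confusing segments across connections.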
*/ if (tmp_opt.saw_tstamp && - sysctl_tcp_tw_recycle && + tcp_death_row.sysctl_tw_recycle && (dst = inet_csk_route_req(sk, req)) != NULL && (peer = rt_get_peer((struct rtable *)dst)) != NULL && peer->v4daddr == saddr) { @@ -1305,7 +1305,8 @@ do_time_wait: ntohs(th->dest), inet_iif(skb)); if (sk2) { - tcp_tw_deschedule((struct inet_timewait_sock *)sk); + inet_twsk_deschedule((struct inet_timewait_sock *)sk, + &tcp_death_row); inet_twsk_put((struct inet_timewait_sock *)sk); sk = sk2; goto process; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 2d95afe5b39..81b9a52c50c 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -35,13 +35,37 @@ #define SYNC_INIT 1 #endif -int sysctl_tcp_tw_recycle; -int sysctl_tcp_max_tw_buckets = NR_FILE*2; +/* New-style handling of TIME_WAIT sockets. */ + +static void inet_twdr_hangman(unsigned long data); +static void inet_twdr_twkill_work(void *data); +static void inet_twdr_twcal_tick(unsigned long data); int sysctl_tcp_syncookies = SYNC_INIT; int sysctl_tcp_abort_on_overflow; -static void tcp_tw_schedule(struct inet_timewait_sock *tw, int timeo); +struct inet_timewait_death_row tcp_death_row = { + .sysctl_max_tw_buckets = NR_FILE * 2, + .period = TCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS, + .death_lock = SPIN_LOCK_UNLOCKED, + .hashinfo = &tcp_hashinfo, + .tw_timer = TIMER_INITIALIZER(inet_twdr_hangman, 0, + (unsigned long)&tcp_death_row), + .twkill_work = __WORK_INITIALIZER(tcp_death_row.twkill_work, + inet_twdr_twkill_work, + &tcp_death_row), +/* Short-time timewait calendar */ + + .twcal_hand = -1, + .twcal_timer = TIMER_INITIALIZER(inet_twdr_twcal_tick, 0, + (unsigned long)&tcp_death_row), +}; + +EXPORT_SYMBOL_GPL(tcp_death_row); + +static void inet_twsk_schedule(struct inet_timewait_sock *tw, + struct inet_timewait_death_row *twdr, + const int timeo); static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) { @@ -52,10 +76,6 @@ static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) return (seq == e_win && seq == end_seq); } -/* New-style handling of TIME_WAIT sockets. */ - -int tcp_tw_count; - /* * * Main purpose of TIME-WAIT state is to close connection gracefully, * when one of ends sits in LAST-ACK or CLOSING retransmitting FIN @@ -132,7 +152,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, if (!th->fin || TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1) { kill_with_rst: - tcp_tw_deschedule(tw); + inet_twsk_deschedule(tw, &tcp_death_row); inet_twsk_put(tw); return TCP_TW_RST; } @@ -151,11 +171,11 @@ kill_with_rst: * do not undertsnad recycling in any case, it not * a big problem in practice. --ANK */ if (tw->tw_family == AF_INET && - sysctl_tcp_tw_recycle && tcptw->tw_ts_recent_stamp && + tcp_death_row.sysctl_tw_recycle && tcptw->tw_ts_recent_stamp && tcp_v4_tw_remember_stamp(tw)) - tcp_tw_schedule(tw, tw->tw_timeout); + inet_twsk_schedule(tw, &tcp_death_row, tw->tw_timeout); else - tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN); + inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN); return TCP_TW_ACK; } @@ -188,12 +208,12 @@ kill_with_rst: */ if (sysctl_tcp_rfc1337 == 0) { kill: - tcp_tw_deschedule(tw); + inet_twsk_deschedule(tw, &tcp_death_row); inet_twsk_put(tw); return TCP_TW_SUCCESS; } } - tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN); + inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN); if (tmp_opt.saw_tstamp) { tcptw->tw_ts_recent = tmp_opt.rcv_tsval; @@ -243,7 +263,7 @@ kill: * Do not reschedule in the last case. 
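Note that the conversion in this hunk and the ones that follow is purely mechanical: every tcp_tw_schedule(tw, timeo) becomes inet_twsk_schedule(tw, &tcp_death_row, timeo), passing the death row explicitly so the scheduler no longer reaches for TCP-private globals.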
*/ if (paws_reject || th->ack) - tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN); + inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN); /* Send ACK. Note, we do not put the bucket, * it will be released by caller. @@ -263,10 +283,10 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) const struct tcp_sock *tp = tcp_sk(sk); int recycle_ok = 0; - if (sysctl_tcp_tw_recycle && tp->rx_opt.ts_recent_stamp) + if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp) recycle_ok = tp->af_specific->remember_stamp(sk); - if (tcp_tw_count < sysctl_tcp_max_tw_buckets) + if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets) tw = inet_twsk_alloc(sk, state); if (tw != NULL) { @@ -306,7 +326,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) timeo = TCP_TIMEWAIT_LEN; } - tcp_tw_schedule(tw, timeo); + inet_twsk_schedule(tw, &tcp_death_row, timeo); inet_twsk_put(tw); } else { /* Sorry, if we're out of memory, just CLOSE this @@ -321,26 +341,9 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) tcp_done(sk); } -/* Kill off TIME_WAIT sockets once their lifetime has expired. */ -static int tcp_tw_death_row_slot; - -static void tcp_twkill(unsigned long); - -/* TIME_WAIT reaping mechanism. */ -#define TCP_TWKILL_SLOTS 8 /* Please keep this a power of 2. */ -#define TCP_TWKILL_PERIOD (TCP_TIMEWAIT_LEN/TCP_TWKILL_SLOTS) - -#define TCP_TWKILL_QUOTA 100 - -static struct hlist_head tcp_tw_death_row[TCP_TWKILL_SLOTS]; -static DEFINE_SPINLOCK(tw_death_lock); -static struct timer_list tcp_tw_timer = TIMER_INITIALIZER(tcp_twkill, 0, 0); -static void twkill_work(void *); -static DECLARE_WORK(tcp_twkill_work, twkill_work, NULL); -static u32 twkill_thread_slots; - /* Returns non-zero if quota exceeded. */ -static int tcp_do_twkill_work(int slot, unsigned int quota) +static int inet_twdr_do_twkill_work(struct inet_timewait_death_row *twdr, + const int slot) { struct inet_timewait_sock *tw; struct hlist_node *node; @@ -356,19 +359,19 @@ static int tcp_do_twkill_work(int slot, unsigned int quota) killed = 0; ret = 0; rescan: - inet_twsk_for_each_inmate(tw, node, &tcp_tw_death_row[slot]) { + inet_twsk_for_each_inmate(tw, node, &twdr->cells[slot]) { __inet_twsk_del_dead_node(tw); - spin_unlock(&tw_death_lock); - __inet_twsk_kill(tw, &tcp_hashinfo); + spin_unlock(&twdr->death_lock); + __inet_twsk_kill(tw, twdr->hashinfo); inet_twsk_put(tw); killed++; - spin_lock(&tw_death_lock); - if (killed > quota) { + spin_lock(&twdr->death_lock); + if (killed > INET_TWDR_TWKILL_QUOTA) { ret = 1; break; } - /* While we dropped tw_death_lock, another cpu may have + /* While we dropped twdr->death_lock, another cpu may have * killed off the next TW bucket in the list, therefore * do a fresh re-read of the hlist head node with the * lock reacquired. 
We still use the hlist traversal @@ -377,67 +380,68 @@ rescan: goto rescan; } - tcp_tw_count -= killed; + twdr->tw_count -= killed; NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITED, killed); return ret; } -static void tcp_twkill(unsigned long dummy) +static void inet_twdr_hangman(unsigned long data) { - int need_timer, ret; + struct inet_timewait_death_row *twdr; + int unsigned need_timer; - spin_lock(&tw_death_lock); + twdr = (struct inet_timewait_death_row *)data; + spin_lock(&twdr->death_lock); - if (tcp_tw_count == 0) + if (twdr->tw_count == 0) goto out; need_timer = 0; - ret = tcp_do_twkill_work(tcp_tw_death_row_slot, TCP_TWKILL_QUOTA); - if (ret) { - twkill_thread_slots |= (1 << tcp_tw_death_row_slot); + if (inet_twdr_do_twkill_work(twdr, twdr->slot)) { + twdr->thread_slots |= (1 << twdr->slot); mb(); - schedule_work(&tcp_twkill_work); + schedule_work(&twdr->twkill_work); need_timer = 1; } else { /* We purged the entire slot, anything left? */ - if (tcp_tw_count) + if (twdr->tw_count) need_timer = 1; } - tcp_tw_death_row_slot = - ((tcp_tw_death_row_slot + 1) & (TCP_TWKILL_SLOTS - 1)); + twdr->slot = ((twdr->slot + 1) & (INET_TWDR_TWKILL_SLOTS - 1)); if (need_timer) - mod_timer(&tcp_tw_timer, jiffies + TCP_TWKILL_PERIOD); + mod_timer(&twdr->tw_timer, jiffies + twdr->period); out: - spin_unlock(&tw_death_lock); + spin_unlock(&twdr->death_lock); } extern void twkill_slots_invalid(void); -static void twkill_work(void *dummy) +static void inet_twdr_twkill_work(void *data) { + struct inet_timewait_death_row *twdr = data; int i; - if ((TCP_TWKILL_SLOTS - 1) > (sizeof(twkill_thread_slots) * 8)) + if ((INET_TWDR_TWKILL_SLOTS - 1) > (sizeof(twdr->thread_slots) * 8)) twkill_slots_invalid(); - while (twkill_thread_slots) { - spin_lock_bh(&tw_death_lock); - for (i = 0; i < TCP_TWKILL_SLOTS; i++) { - if (!(twkill_thread_slots & (1 << i))) + while (twdr->thread_slots) { + spin_lock_bh(&twdr->death_lock); + for (i = 0; i < INET_TWDR_TWKILL_SLOTS; i++) { + if (!(twdr->thread_slots & (1 << i))) continue; - while (tcp_do_twkill_work(i, TCP_TWKILL_QUOTA) != 0) { + while (inet_twdr_do_twkill_work(twdr, i) != 0) { if (need_resched()) { - spin_unlock_bh(&tw_death_lock); + spin_unlock_bh(&twdr->death_lock); schedule(); - spin_lock_bh(&tw_death_lock); + spin_lock_bh(&twdr->death_lock); } } - twkill_thread_slots &= ~(1 << i); + twdr->thread_slots &= ~(1 << i); } - spin_unlock_bh(&tw_death_lock); + spin_unlock_bh(&twdr->death_lock); } } @@ -446,28 +450,22 @@ static void twkill_work(void *dummy) */ /* This is for handling early-kills of TIME_WAIT sockets. 
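An early kill takes death_lock, unlinks the bucket from whatever slot it occupies, deletes the row timer once the row is empty, and finally unhashes the socket via __inet_twsk_kill() -- exactly the body of inet_twsk_deschedule() below.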
*/ -void tcp_tw_deschedule(struct inet_timewait_sock *tw) +void inet_twsk_deschedule(struct inet_timewait_sock *tw, + struct inet_timewait_death_row *twdr) { - spin_lock(&tw_death_lock); + spin_lock(&twdr->death_lock); if (inet_twsk_del_dead_node(tw)) { inet_twsk_put(tw); - if (--tcp_tw_count == 0) - del_timer(&tcp_tw_timer); + if (--twdr->tw_count == 0) + del_timer(&twdr->tw_timer); } - spin_unlock(&tw_death_lock); - __inet_twsk_kill(tw, &tcp_hashinfo); + spin_unlock(&twdr->death_lock); + __inet_twsk_kill(tw, twdr->hashinfo); } -/* Short-time timewait calendar */ - -static int tcp_twcal_hand = -1; -static int tcp_twcal_jiffie; -static void tcp_twcal_tick(unsigned long); -static struct timer_list tcp_twcal_timer = - TIMER_INITIALIZER(tcp_twcal_tick, 0, 0); -static struct hlist_head tcp_twcal_row[TCP_TW_RECYCLE_SLOTS]; - -static void tcp_tw_schedule(struct inet_timewait_sock *tw, const int timeo) +static void inet_twsk_schedule(struct inet_timewait_sock *tw, + struct inet_timewait_death_row *twdr, + const int timeo) { struct hlist_head *list; int slot; @@ -496,100 +494,106 @@ static void tcp_tw_schedule(struct inet_timewait_sock *tw, const int timeo) * is greater than TS tick!) and detect old duplicates with help * of PAWS. */ - slot = (timeo + (1<> TCP_TW_RECYCLE_TICK; + slot = (timeo + (1 << INET_TWDR_RECYCLE_TICK) - 1) >> INET_TWDR_RECYCLE_TICK; - spin_lock(&tw_death_lock); + spin_lock(&twdr->death_lock); /* Unlink it, if it was scheduled */ if (inet_twsk_del_dead_node(tw)) - tcp_tw_count--; + twdr->tw_count--; else atomic_inc(&tw->tw_refcnt); - if (slot >= TCP_TW_RECYCLE_SLOTS) { + if (slot >= INET_TWDR_RECYCLE_SLOTS) { /* Schedule to slow timer */ if (timeo >= TCP_TIMEWAIT_LEN) { - slot = TCP_TWKILL_SLOTS-1; + slot = INET_TWDR_TWKILL_SLOTS - 1; } else { - slot = (timeo + TCP_TWKILL_PERIOD-1) / TCP_TWKILL_PERIOD; - if (slot >= TCP_TWKILL_SLOTS) - slot = TCP_TWKILL_SLOTS-1; + slot = (timeo + twdr->period - 1) / twdr->period; + if (slot >= INET_TWDR_TWKILL_SLOTS) + slot = INET_TWDR_TWKILL_SLOTS - 1; } tw->tw_ttd = jiffies + timeo; - slot = (tcp_tw_death_row_slot + slot) & (TCP_TWKILL_SLOTS - 1); - list = &tcp_tw_death_row[slot]; + slot = (twdr->slot + slot) & (INET_TWDR_TWKILL_SLOTS - 1); + list = &twdr->cells[slot]; } else { - tw->tw_ttd = jiffies + (slot << TCP_TW_RECYCLE_TICK); - - if (tcp_twcal_hand < 0) { - tcp_twcal_hand = 0; - tcp_twcal_jiffie = jiffies; - tcp_twcal_timer.expires = tcp_twcal_jiffie + (slot<tw_ttd = jiffies + (slot << INET_TWDR_RECYCLE_TICK); + + if (twdr->twcal_hand < 0) { + twdr->twcal_hand = 0; + twdr->twcal_jiffie = jiffies; + twdr->twcal_timer.expires = twdr->twcal_jiffie + + (slot << INET_TWDR_RECYCLE_TICK); + add_timer(&twdr->twcal_timer); } else { - if (time_after(tcp_twcal_timer.expires, jiffies + (slot<twcal_timer.expires, + jiffies + (slot << INET_TWDR_RECYCLE_TICK))) + mod_timer(&twdr->twcal_timer, + jiffies + (slot << INET_TWDR_RECYCLE_TICK)); + slot = (twdr->twcal_hand + slot) & (INET_TWDR_RECYCLE_SLOTS - 1); } - list = &tcp_twcal_row[slot]; + list = &twdr->twcal_row[slot]; } hlist_add_head(&tw->tw_death_node, list); - if (tcp_tw_count++ == 0) - mod_timer(&tcp_tw_timer, jiffies+TCP_TWKILL_PERIOD); - spin_unlock(&tw_death_lock); + if (twdr->tw_count++ == 0) + mod_timer(&twdr->tw_timer, jiffies + twdr->period); + spin_unlock(&twdr->death_lock); } -void tcp_twcal_tick(unsigned long dummy) +void inet_twdr_twcal_tick(unsigned long data) { + struct inet_timewait_death_row *twdr; int n, slot; unsigned long j; unsigned long now = jiffies; int killed = 0; int 
adv = 0; - spin_lock(&tw_death_lock); - if (tcp_twcal_hand < 0) + twdr = (struct inet_timewait_death_row *)data; + + spin_lock(&twdr->death_lock); + if (twdr->twcal_hand < 0) goto out; - slot = tcp_twcal_hand; - j = tcp_twcal_jiffie; + slot = twdr->twcal_hand; + j = twdr->twcal_jiffie; - for (n=0; ntwcal_row[slot]) { __inet_twsk_del_dead_node(tw); - __inet_twsk_kill(tw, &tcp_hashinfo); + __inet_twsk_kill(tw, twdr->hashinfo); inet_twsk_put(tw); killed++; } } else { if (!adv) { adv = 1; - tcp_twcal_jiffie = j; - tcp_twcal_hand = slot; + twdr->twcal_jiffie = j; + twdr->twcal_hand = slot; } - if (!hlist_empty(&tcp_twcal_row[slot])) { - mod_timer(&tcp_twcal_timer, j); + if (!hlist_empty(&twdr->twcal_row[slot])) { + mod_timer(&twdr->twcal_timer, j); goto out; } } - j += (1<twcal_hand = -1; out: - if ((tcp_tw_count -= killed) == 0) - del_timer(&tcp_tw_timer); + if ((twdr->tw_count -= killed) == 0) + del_timer(&twdr->tw_timer); NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITKILLED, killed); - spin_unlock(&tw_death_lock); + spin_unlock(&twdr->death_lock); } /* This is not only more efficient than what we used to do, it eliminates @@ -929,4 +933,4 @@ EXPORT_SYMBOL(tcp_check_req); EXPORT_SYMBOL(tcp_child_process); EXPORT_SYMBOL(tcp_create_openreq_child); EXPORT_SYMBOL(tcp_timewait_state_process); -EXPORT_SYMBOL(tcp_tw_deschedule); +EXPORT_SYMBOL(inet_twsk_deschedule); -- cgit v1.2.3 From 696ab2d3bffc746fb8cf3712f066d42b9886aeed Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 9 Aug 2005 20:45:03 -0700 Subject: [TIMEWAIT]: Move inet_timewait_death_row routines to net/ipv4/inet_timewait_sock.c Also export the ones that will be used in the next changeset, when DCCP uses this infrastructure. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- net/ipv4/inet_timewait_sock.c | 270 ++++++++++++++++++++++++++++++++++++++++ net/ipv4/tcp_minisocks.c | 281 ++---------------------------------------- 2 files changed, 280 insertions(+), 271 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 22882d95f64..4d1502a4985 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -12,6 +12,7 @@ #include #include +#include /* Must be called with locally disabled BHs. */ void __inet_twsk_kill(struct inet_timewait_sock *tw, struct inet_hashinfo *hashinfo) @@ -85,6 +86,8 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, write_unlock(&ehead->lock); } +EXPORT_SYMBOL_GPL(__inet_twsk_hashdance); + struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int state) { struct inet_timewait_sock *tw = kmem_cache_alloc(sk->sk_prot_creator->twsk_slab, @@ -112,3 +115,270 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int stat return tw; } + +EXPORT_SYMBOL_GPL(inet_twsk_alloc); + +/* Returns non-zero if quota exceeded. */ +static int inet_twdr_do_twkill_work(struct inet_timewait_death_row *twdr, + const int slot) +{ + struct inet_timewait_sock *tw; + struct hlist_node *node; + unsigned int killed; + int ret; + + /* NOTE: compare this to previous version where lock + * was released after detaching chain. It was racy, + * because tw buckets are scheduled in not serialized context + * in 2.3 (with netfilter), and with softnet it is common, because + * soft irqs are not sequenced. 
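The INET_TWDR_TWKILL_QUOTA check below bounds how many buckets a single timer run may reap; when the quota trips, inet_twdr_hangman() records the slot in the thread_slots bitmap and defers the rest of the work to the twkill workqueue.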
+ */ + killed = 0; + ret = 0; +rescan: + inet_twsk_for_each_inmate(tw, node, &twdr->cells[slot]) { + __inet_twsk_del_dead_node(tw); + spin_unlock(&twdr->death_lock); + __inet_twsk_kill(tw, twdr->hashinfo); + inet_twsk_put(tw); + killed++; + spin_lock(&twdr->death_lock); + if (killed > INET_TWDR_TWKILL_QUOTA) { + ret = 1; + break; + } + + /* While we dropped twdr->death_lock, another cpu may have + * killed off the next TW bucket in the list, therefore + * do a fresh re-read of the hlist head node with the + * lock reacquired. We still use the hlist traversal + * macro in order to get the prefetches. + */ + goto rescan; + } + + twdr->tw_count -= killed; + NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITED, killed); + + return ret; +} + +void inet_twdr_hangman(unsigned long data) +{ + struct inet_timewait_death_row *twdr; + int unsigned need_timer; + + twdr = (struct inet_timewait_death_row *)data; + spin_lock(&twdr->death_lock); + + if (twdr->tw_count == 0) + goto out; + + need_timer = 0; + if (inet_twdr_do_twkill_work(twdr, twdr->slot)) { + twdr->thread_slots |= (1 << twdr->slot); + mb(); + schedule_work(&twdr->twkill_work); + need_timer = 1; + } else { + /* We purged the entire slot, anything left? */ + if (twdr->tw_count) + need_timer = 1; + } + twdr->slot = ((twdr->slot + 1) & (INET_TWDR_TWKILL_SLOTS - 1)); + if (need_timer) + mod_timer(&twdr->tw_timer, jiffies + twdr->period); +out: + spin_unlock(&twdr->death_lock); +} + +EXPORT_SYMBOL_GPL(inet_twdr_hangman); + +extern void twkill_slots_invalid(void); + +void inet_twdr_twkill_work(void *data) +{ + struct inet_timewait_death_row *twdr = data; + int i; + + if ((INET_TWDR_TWKILL_SLOTS - 1) > (sizeof(twdr->thread_slots) * 8)) + twkill_slots_invalid(); + + while (twdr->thread_slots) { + spin_lock_bh(&twdr->death_lock); + for (i = 0; i < INET_TWDR_TWKILL_SLOTS; i++) { + if (!(twdr->thread_slots & (1 << i))) + continue; + + while (inet_twdr_do_twkill_work(twdr, i) != 0) { + if (need_resched()) { + spin_unlock_bh(&twdr->death_lock); + schedule(); + spin_lock_bh(&twdr->death_lock); + } + } + + twdr->thread_slots &= ~(1 << i); + } + spin_unlock_bh(&twdr->death_lock); + } +} + +EXPORT_SYMBOL_GPL(inet_twdr_twkill_work); + +/* These are always called from BH context. See callers in + * tcp_input.c to verify this. + */ + +/* This is for handling early-kills of TIME_WAIT sockets. */ +void inet_twsk_deschedule(struct inet_timewait_sock *tw, + struct inet_timewait_death_row *twdr) +{ + spin_lock(&twdr->death_lock); + if (inet_twsk_del_dead_node(tw)) { + inet_twsk_put(tw); + if (--twdr->tw_count == 0) + del_timer(&twdr->tw_timer); + } + spin_unlock(&twdr->death_lock); + __inet_twsk_kill(tw, twdr->hashinfo); +} + +EXPORT_SYMBOL(inet_twsk_deschedule); + +void inet_twsk_schedule(struct inet_timewait_sock *tw, + struct inet_timewait_death_row *twdr, + const int timeo, const int timewait_len) +{ + struct hlist_head *list; + int slot; + + /* timeout := RTO * 3.5 + * + * 3.5 = 1+2+0.5 to wait for two retransmits. + * + * RATIONALE: if FIN arrived and we entered TIME-WAIT state, + * our ACK acking that FIN can be lost. If N subsequent retransmitted + * FINs (or previous seqments) are lost (probability of such event + * is p^(N+1), where p is probability to lose single packet and + * time to detect the loss is about RTO*(2^N - 1) with exponential + * backoff). Normal timewait length is calculated so, that we + * waited at least for one retransmitted FIN (maximal RTO is 120sec). + * [ BTW Linux. 
following BSD, violates this requirement waiting + * only for 60sec, we should wait at least for 240 secs. + * Well, 240 consumes too much of resources 8) + * ] + * This interval is not reduced to catch old duplicate and + * responces to our wandering segments living for two MSLs. + * However, if we use PAWS to detect + * old duplicates, we can reduce the interval to bounds required + * by RTO, rather than MSL. So, if peer understands PAWS, we + * kill tw bucket after 3.5*RTO (it is important that this number + * is greater than TS tick!) and detect old duplicates with help + * of PAWS. + */ + slot = (timeo + (1 << INET_TWDR_RECYCLE_TICK) - 1) >> INET_TWDR_RECYCLE_TICK; + + spin_lock(&twdr->death_lock); + + /* Unlink it, if it was scheduled */ + if (inet_twsk_del_dead_node(tw)) + twdr->tw_count--; + else + atomic_inc(&tw->tw_refcnt); + + if (slot >= INET_TWDR_RECYCLE_SLOTS) { + /* Schedule to slow timer */ + if (timeo >= timewait_len) { + slot = INET_TWDR_TWKILL_SLOTS - 1; + } else { + slot = (timeo + twdr->period - 1) / twdr->period; + if (slot >= INET_TWDR_TWKILL_SLOTS) + slot = INET_TWDR_TWKILL_SLOTS - 1; + } + tw->tw_ttd = jiffies + timeo; + slot = (twdr->slot + slot) & (INET_TWDR_TWKILL_SLOTS - 1); + list = &twdr->cells[slot]; + } else { + tw->tw_ttd = jiffies + (slot << INET_TWDR_RECYCLE_TICK); + + if (twdr->twcal_hand < 0) { + twdr->twcal_hand = 0; + twdr->twcal_jiffie = jiffies; + twdr->twcal_timer.expires = twdr->twcal_jiffie + + (slot << INET_TWDR_RECYCLE_TICK); + add_timer(&twdr->twcal_timer); + } else { + if (time_after(twdr->twcal_timer.expires, + jiffies + (slot << INET_TWDR_RECYCLE_TICK))) + mod_timer(&twdr->twcal_timer, + jiffies + (slot << INET_TWDR_RECYCLE_TICK)); + slot = (twdr->twcal_hand + slot) & (INET_TWDR_RECYCLE_SLOTS - 1); + } + list = &twdr->twcal_row[slot]; + } + + hlist_add_head(&tw->tw_death_node, list); + + if (twdr->tw_count++ == 0) + mod_timer(&twdr->tw_timer, jiffies + twdr->period); + spin_unlock(&twdr->death_lock); +} + +EXPORT_SYMBOL_GPL(inet_twsk_schedule); + +void inet_twdr_twcal_tick(unsigned long data) +{ + struct inet_timewait_death_row *twdr; + int n, slot; + unsigned long j; + unsigned long now = jiffies; + int killed = 0; + int adv = 0; + + twdr = (struct inet_timewait_death_row *)data; + + spin_lock(&twdr->death_lock); + if (twdr->twcal_hand < 0) + goto out; + + slot = twdr->twcal_hand; + j = twdr->twcal_jiffie; + + for (n = 0; n < INET_TWDR_RECYCLE_SLOTS; n++) { + if (time_before_eq(j, now)) { + struct hlist_node *node, *safe; + struct inet_timewait_sock *tw; + + inet_twsk_for_each_inmate_safe(tw, node, safe, + &twdr->twcal_row[slot]) { + __inet_twsk_del_dead_node(tw); + __inet_twsk_kill(tw, twdr->hashinfo); + inet_twsk_put(tw); + killed++; + } + } else { + if (!adv) { + adv = 1; + twdr->twcal_jiffie = j; + twdr->twcal_hand = slot; + } + + if (!hlist_empty(&twdr->twcal_row[slot])) { + mod_timer(&twdr->twcal_timer, j); + goto out; + } + } + j += 1 << INET_TWDR_RECYCLE_TICK; + slot = (slot + 1) & (INET_TWDR_RECYCLE_SLOTS - 1); + } + twdr->twcal_hand = -1; + +out: + if ((twdr->tw_count -= killed) == 0) + del_timer(&twdr->tw_timer); + NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITKILLED, killed); + spin_unlock(&twdr->death_lock); +} + +EXPORT_SYMBOL_GPL(inet_twdr_twcal_tick); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 81b9a52c50c..dc085233d51 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -35,12 +35,6 @@ #define SYNC_INIT 1 #endif -/* New-style handling of TIME_WAIT sockets. 
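Everything removed below reappears essentially verbatim in net/ipv4/inet_timewait_sock.c; the one functional tweak is that inet_twsk_schedule() now takes the protocol's timewait length as an explicit timewait_len argument instead of hard-coding TCP_TIMEWAIT_LEN.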
*/ - -static void inet_twdr_hangman(unsigned long data); -static void inet_twdr_twkill_work(void *data); -static void inet_twdr_twcal_tick(unsigned long data); - int sysctl_tcp_syncookies = SYNC_INIT; int sysctl_tcp_abort_on_overflow; @@ -63,10 +57,6 @@ struct inet_timewait_death_row tcp_death_row = { EXPORT_SYMBOL_GPL(tcp_death_row); -static void inet_twsk_schedule(struct inet_timewait_sock *tw, - struct inet_timewait_death_row *twdr, - const int timeo); - static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) { if (seq == s_win) @@ -173,9 +163,11 @@ kill_with_rst: if (tw->tw_family == AF_INET && tcp_death_row.sysctl_tw_recycle && tcptw->tw_ts_recent_stamp && tcp_v4_tw_remember_stamp(tw)) - inet_twsk_schedule(tw, &tcp_death_row, tw->tw_timeout); + inet_twsk_schedule(tw, &tcp_death_row, tw->tw_timeout, + TCP_TIMEWAIT_LEN); else - inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN); + inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN, + TCP_TIMEWAIT_LEN); return TCP_TW_ACK; } @@ -213,7 +205,8 @@ kill: return TCP_TW_SUCCESS; } } - inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN); + inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN, + TCP_TIMEWAIT_LEN); if (tmp_opt.saw_tstamp) { tcptw->tw_ts_recent = tmp_opt.rcv_tsval; @@ -263,7 +256,8 @@ kill: * Do not reschedule in the last case. */ if (paws_reject || th->ack) - inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN); + inet_twsk_schedule(tw, &tcp_death_row, TCP_TIMEWAIT_LEN, + TCP_TIMEWAIT_LEN); /* Send ACK. Note, we do not put the bucket, * it will be released by caller. @@ -326,7 +320,8 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) timeo = TCP_TIMEWAIT_LEN; } - inet_twsk_schedule(tw, &tcp_death_row, timeo); + inet_twsk_schedule(tw, &tcp_death_row, timeo, + TCP_TIMEWAIT_LEN); inet_twsk_put(tw); } else { /* Sorry, if we're out of memory, just CLOSE this @@ -341,261 +336,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) tcp_done(sk); } -/* Returns non-zero if quota exceeded. */ -static int inet_twdr_do_twkill_work(struct inet_timewait_death_row *twdr, - const int slot) -{ - struct inet_timewait_sock *tw; - struct hlist_node *node; - unsigned int killed; - int ret; - - /* NOTE: compare this to previous version where lock - * was released after detaching chain. It was racy, - * because tw buckets are scheduled in not serialized context - * in 2.3 (with netfilter), and with softnet it is common, because - * soft irqs are not sequenced. - */ - killed = 0; - ret = 0; -rescan: - inet_twsk_for_each_inmate(tw, node, &twdr->cells[slot]) { - __inet_twsk_del_dead_node(tw); - spin_unlock(&twdr->death_lock); - __inet_twsk_kill(tw, twdr->hashinfo); - inet_twsk_put(tw); - killed++; - spin_lock(&twdr->death_lock); - if (killed > INET_TWDR_TWKILL_QUOTA) { - ret = 1; - break; - } - - /* While we dropped twdr->death_lock, another cpu may have - * killed off the next TW bucket in the list, therefore - * do a fresh re-read of the hlist head node with the - * lock reacquired. We still use the hlist traversal - * macro in order to get the prefetches. 
- */ - goto rescan; - } - - twdr->tw_count -= killed; - NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITED, killed); - - return ret; -} - -static void inet_twdr_hangman(unsigned long data) -{ - struct inet_timewait_death_row *twdr; - int unsigned need_timer; - - twdr = (struct inet_timewait_death_row *)data; - spin_lock(&twdr->death_lock); - - if (twdr->tw_count == 0) - goto out; - - need_timer = 0; - if (inet_twdr_do_twkill_work(twdr, twdr->slot)) { - twdr->thread_slots |= (1 << twdr->slot); - mb(); - schedule_work(&twdr->twkill_work); - need_timer = 1; - } else { - /* We purged the entire slot, anything left? */ - if (twdr->tw_count) - need_timer = 1; - } - twdr->slot = ((twdr->slot + 1) & (INET_TWDR_TWKILL_SLOTS - 1)); - if (need_timer) - mod_timer(&twdr->tw_timer, jiffies + twdr->period); -out: - spin_unlock(&twdr->death_lock); -} - -extern void twkill_slots_invalid(void); - -static void inet_twdr_twkill_work(void *data) -{ - struct inet_timewait_death_row *twdr = data; - int i; - - if ((INET_TWDR_TWKILL_SLOTS - 1) > (sizeof(twdr->thread_slots) * 8)) - twkill_slots_invalid(); - - while (twdr->thread_slots) { - spin_lock_bh(&twdr->death_lock); - for (i = 0; i < INET_TWDR_TWKILL_SLOTS; i++) { - if (!(twdr->thread_slots & (1 << i))) - continue; - - while (inet_twdr_do_twkill_work(twdr, i) != 0) { - if (need_resched()) { - spin_unlock_bh(&twdr->death_lock); - schedule(); - spin_lock_bh(&twdr->death_lock); - } - } - - twdr->thread_slots &= ~(1 << i); - } - spin_unlock_bh(&twdr->death_lock); - } -} - -/* These are always called from BH context. See callers in - * tcp_input.c to verify this. - */ - -/* This is for handling early-kills of TIME_WAIT sockets. */ -void inet_twsk_deschedule(struct inet_timewait_sock *tw, - struct inet_timewait_death_row *twdr) -{ - spin_lock(&twdr->death_lock); - if (inet_twsk_del_dead_node(tw)) { - inet_twsk_put(tw); - if (--twdr->tw_count == 0) - del_timer(&twdr->tw_timer); - } - spin_unlock(&twdr->death_lock); - __inet_twsk_kill(tw, twdr->hashinfo); -} - -static void inet_twsk_schedule(struct inet_timewait_sock *tw, - struct inet_timewait_death_row *twdr, - const int timeo) -{ - struct hlist_head *list; - int slot; - - /* timeout := RTO * 3.5 - * - * 3.5 = 1+2+0.5 to wait for two retransmits. - * - * RATIONALE: if FIN arrived and we entered TIME-WAIT state, - * our ACK acking that FIN can be lost. If N subsequent retransmitted - * FINs (or previous seqments) are lost (probability of such event - * is p^(N+1), where p is probability to lose single packet and - * time to detect the loss is about RTO*(2^N - 1) with exponential - * backoff). Normal timewait length is calculated so, that we - * waited at least for one retransmitted FIN (maximal RTO is 120sec). - * [ BTW Linux. following BSD, violates this requirement waiting - * only for 60sec, we should wait at least for 240 secs. - * Well, 240 consumes too much of resources 8) - * ] - * This interval is not reduced to catch old duplicate and - * responces to our wandering segments living for two MSLs. - * However, if we use PAWS to detect - * old duplicates, we can reduce the interval to bounds required - * by RTO, rather than MSL. So, if peer understands PAWS, we - * kill tw bucket after 3.5*RTO (it is important that this number - * is greater than TS tick!) and detect old duplicates with help - * of PAWS. 
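In short, the line below computes slot as the ceiling of timeo / 2^INET_TWDR_RECYCLE_TICK: short timeouts get per-tick precision in the recycle calendar, while anything at or beyond INET_TWDR_RECYCLE_SLOTS ticks falls through to the coarse twkill wheel, capped at INET_TWDR_TWKILL_SLOTS - 1.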
- */ - slot = (timeo + (1 << INET_TWDR_RECYCLE_TICK) - 1) >> INET_TWDR_RECYCLE_TICK; - - spin_lock(&twdr->death_lock); - - /* Unlink it, if it was scheduled */ - if (inet_twsk_del_dead_node(tw)) - twdr->tw_count--; - else - atomic_inc(&tw->tw_refcnt); - - if (slot >= INET_TWDR_RECYCLE_SLOTS) { - /* Schedule to slow timer */ - if (timeo >= TCP_TIMEWAIT_LEN) { - slot = INET_TWDR_TWKILL_SLOTS - 1; - } else { - slot = (timeo + twdr->period - 1) / twdr->period; - if (slot >= INET_TWDR_TWKILL_SLOTS) - slot = INET_TWDR_TWKILL_SLOTS - 1; - } - tw->tw_ttd = jiffies + timeo; - slot = (twdr->slot + slot) & (INET_TWDR_TWKILL_SLOTS - 1); - list = &twdr->cells[slot]; - } else { - tw->tw_ttd = jiffies + (slot << INET_TWDR_RECYCLE_TICK); - - if (twdr->twcal_hand < 0) { - twdr->twcal_hand = 0; - twdr->twcal_jiffie = jiffies; - twdr->twcal_timer.expires = twdr->twcal_jiffie + - (slot << INET_TWDR_RECYCLE_TICK); - add_timer(&twdr->twcal_timer); - } else { - if (time_after(twdr->twcal_timer.expires, - jiffies + (slot << INET_TWDR_RECYCLE_TICK))) - mod_timer(&twdr->twcal_timer, - jiffies + (slot << INET_TWDR_RECYCLE_TICK)); - slot = (twdr->twcal_hand + slot) & (INET_TWDR_RECYCLE_SLOTS - 1); - } - list = &twdr->twcal_row[slot]; - } - - hlist_add_head(&tw->tw_death_node, list); - - if (twdr->tw_count++ == 0) - mod_timer(&twdr->tw_timer, jiffies + twdr->period); - spin_unlock(&twdr->death_lock); -} - -void inet_twdr_twcal_tick(unsigned long data) -{ - struct inet_timewait_death_row *twdr; - int n, slot; - unsigned long j; - unsigned long now = jiffies; - int killed = 0; - int adv = 0; - - twdr = (struct inet_timewait_death_row *)data; - - spin_lock(&twdr->death_lock); - if (twdr->twcal_hand < 0) - goto out; - - slot = twdr->twcal_hand; - j = twdr->twcal_jiffie; - - for (n = 0; n < INET_TWDR_RECYCLE_SLOTS; n++) { - if (time_before_eq(j, now)) { - struct hlist_node *node, *safe; - struct inet_timewait_sock *tw; - - inet_twsk_for_each_inmate_safe(tw, node, safe, - &twdr->twcal_row[slot]) { - __inet_twsk_del_dead_node(tw); - __inet_twsk_kill(tw, twdr->hashinfo); - inet_twsk_put(tw); - killed++; - } - } else { - if (!adv) { - adv = 1; - twdr->twcal_jiffie = j; - twdr->twcal_hand = slot; - } - - if (!hlist_empty(&twdr->twcal_row[slot])) { - mod_timer(&twdr->twcal_timer, j); - goto out; - } - } - j += 1 << INET_TWDR_RECYCLE_TICK; - slot = (slot + 1) & (INET_TWDR_RECYCLE_SLOTS - 1); - } - twdr->twcal_hand = -1; - -out: - if ((twdr->tw_count -= killed) == 0) - del_timer(&twdr->tw_timer); - NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITKILLED, killed); - spin_unlock(&twdr->death_lock); -} - /* This is not only more efficient than what we used to do, it eliminates * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM * @@ -933,4 +673,3 @@ EXPORT_SYMBOL(tcp_check_req); EXPORT_SYMBOL(tcp_child_process); EXPORT_SYMBOL(tcp_create_openreq_child); EXPORT_SYMBOL(tcp_timewait_state_process); -EXPORT_SYMBOL(inet_twsk_deschedule); -- cgit v1.2.3 From 64ce207306debd7157f47282be94770407bec01c Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Tue, 9 Aug 2005 20:50:53 -0700 Subject: [NET]: Make NETDEBUG pure printk wrappers Signed-off-by: Patrick McHardy Signed-off-by: David S. 
Miller --- net/ipv4/esp4.c | 12 ++++++------ net/ipv4/icmp.c | 12 +++++------- net/ipv4/igmp.c | 2 +- net/ipv4/ip_fragment.c | 6 +++--- net/ipv4/ip_output.c | 2 +- net/ipv4/ipcomp.c | 4 ++-- net/ipv4/tcp_ipv4.c | 11 +++++------ net/ipv4/udp.c | 32 ++++++++++++++++---------------- 8 files changed, 39 insertions(+), 42 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index ba57446d5d1..b31ffc5053d 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -331,8 +331,8 @@ static void esp4_err(struct sk_buff *skb, u32 info) x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, esph->spi, IPPROTO_ESP, AF_INET); if (!x) return; - NETDEBUG(printk(KERN_DEBUG "pmtu discovery on SA ESP/%08x/%08x\n", - ntohl(esph->spi), ntohl(iph->daddr))); + NETDEBUG(KERN_DEBUG "pmtu discovery on SA ESP/%08x/%08x\n", + ntohl(esph->spi), ntohl(iph->daddr)); xfrm_state_put(x); } @@ -395,10 +395,10 @@ static int esp_init_state(struct xfrm_state *x) if (aalg_desc->uinfo.auth.icv_fullbits/8 != crypto_tfm_alg_digestsize(esp->auth.tfm)) { - NETDEBUG(printk(KERN_INFO "ESP: %s digestsize %u != %hu\n", - x->aalg->alg_name, - crypto_tfm_alg_digestsize(esp->auth.tfm), - aalg_desc->uinfo.auth.icv_fullbits/8)); + NETDEBUG(KERN_INFO "ESP: %s digestsize %u != %hu\n", + x->aalg->alg_name, + crypto_tfm_alg_digestsize(esp->auth.tfm), + aalg_desc->uinfo.auth.icv_fullbits/8); goto error; } diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index badfc584997..25f66b750fd 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -627,11 +627,10 @@ static void icmp_unreach(struct sk_buff *skb) break; case ICMP_FRAG_NEEDED: if (ipv4_config.no_pmtu_disc) { - LIMIT_NETDEBUG( - printk(KERN_INFO "ICMP: %u.%u.%u.%u: " + LIMIT_NETDEBUG(KERN_INFO "ICMP: %u.%u.%u.%u: " "fragmentation needed " "and DF set.\n", - NIPQUAD(iph->daddr))); + NIPQUAD(iph->daddr)); } else { info = ip_rt_frag_needed(iph, ntohs(icmph->un.frag.mtu)); @@ -640,10 +639,9 @@ static void icmp_unreach(struct sk_buff *skb) } break; case ICMP_SR_FAILED: - LIMIT_NETDEBUG( - printk(KERN_INFO "ICMP: %u.%u.%u.%u: Source " + LIMIT_NETDEBUG(KERN_INFO "ICMP: %u.%u.%u.%u: Source " "Route Failed.\n", - NIPQUAD(iph->daddr))); + NIPQUAD(iph->daddr)); break; default: break; @@ -936,7 +934,7 @@ int icmp_rcv(struct sk_buff *skb) case CHECKSUM_HW: if (!(u16)csum_fold(skb->csum)) break; - LIMIT_NETDEBUG(printk(KERN_DEBUG "icmp v4 hw csum failure\n")); + LIMIT_NETDEBUG(KERN_DEBUG "icmp v4 hw csum failure\n"); case CHECKSUM_NONE: if ((u16)csum_fold(skb_checksum(skb, 0, skb->len, 0))) goto error; diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 5088f90835a..44607f4767b 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -904,7 +904,7 @@ int igmp_rcv(struct sk_buff *skb) case IGMP_MTRACE_RESP: break; default: - NETDEBUG(printk(KERN_DEBUG "New IGMP type=%d, why we do not know about it?\n", ih->type)); + NETDEBUG(KERN_DEBUG "New IGMP type=%d, why we do not know about it?\n", ih->type); } in_dev_put(in_dev); kfree_skb(skb); diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index eb377ae1530..1ac64c0c5b3 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -377,7 +377,7 @@ static struct ipq *ip_frag_create(unsigned hash, struct iphdr *iph, u32 user) return ip_frag_intern(hash, qp); out_nomem: - LIMIT_NETDEBUG(printk(KERN_ERR "ip_frag_create: no memory left !\n")); + LIMIT_NETDEBUG(KERN_ERR "ip_frag_create: no memory left !\n"); return NULL; } @@ -625,8 +625,8 @@ static struct sk_buff *ip_frag_reasm(struct ipq *qp, struct net_device *dev) return head; 
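/* For reference, the wrappers these converted call sites rely on are now
 * plain printk passthroughs -- a sketch of the assumed shape (the real
 * definitions live outside this diff; LIMIT_NETDEBUG adds
 * net_ratelimit()-based rate limiting):
 *
 *	#define NETDEBUG(fmt, args...)	printk(fmt, ##args)
 *	#define LIMIT_NETDEBUG(fmt, args...) \
 *		do { if (net_ratelimit()) printk(fmt, ##args); } while (0)
 */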
out_nomem: - LIMIT_NETDEBUG(printk(KERN_ERR "IP: queue_glue: no memory for gluing " - "queue %p\n", qp)); + LIMIT_NETDEBUG(KERN_ERR "IP: queue_glue: no memory for gluing " + "queue %p\n", qp); goto out_fail; out_oversize: if (net_ratelimit()) diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 633945d27ac..19f24f778dc 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -573,7 +573,7 @@ slow_path: */ if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) { - NETDEBUG(printk(KERN_INFO "IP: frag: no memory for new fragment!\n")); + NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n"); err = -ENOMEM; goto fail; } diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c index 7ded6e60f43..dcb7ee6c485 100644 --- a/net/ipv4/ipcomp.c +++ b/net/ipv4/ipcomp.c @@ -214,8 +214,8 @@ static void ipcomp4_err(struct sk_buff *skb, u32 info) spi, IPPROTO_COMP, AF_INET); if (!x) return; - NETDEBUG(printk(KERN_DEBUG "pmtu discovery on SA IPCOMP/%08x/%u.%u.%u.%u\n", - spi, NIPQUAD(iph->daddr))); + NETDEBUG(KERN_DEBUG "pmtu discovery on SA IPCOMP/%08x/%u.%u.%u.%u\n", + spi, NIPQUAD(iph->daddr)); xfrm_state_put(x); } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 83f72346274..32a0ebc589d 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -990,11 +990,10 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) * to destinations, already remembered * to the moment of synflood. */ - LIMIT_NETDEBUG(printk(KERN_DEBUG "TCP: drop open " - "request from %u.%u." - "%u.%u/%u\n", - NIPQUAD(saddr), - ntohs(skb->h.th->source))); + LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open " + "request from %u.%u.%u.%u/%u\n", + NIPQUAD(saddr), + ntohs(skb->h.th->source)); dst_release(dst); goto drop_and_free; } @@ -1118,7 +1117,7 @@ static int tcp_v4_checksum_init(struct sk_buff *skb) skb->nh.iph->daddr, skb->csum)) return 0; - LIMIT_NETDEBUG(printk(KERN_DEBUG "hw tcp v4 csum failed\n")); + LIMIT_NETDEBUG(KERN_DEBUG "hw tcp v4 csum failed\n"); skb->ip_summed = CHECKSUM_NONE; } if (skb->len <= 76) { diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index a8135e1f528..3a5bbbe7dd8 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -629,7 +629,7 @@ back_from_confirm: /* ... which is an evident application bug. 
--ANK */ release_sock(sk); - LIMIT_NETDEBUG(printk(KERN_DEBUG "udp cork app bug 2\n")); + LIMIT_NETDEBUG(KERN_DEBUG "udp cork app bug 2\n"); err = -EINVAL; goto out; } @@ -694,7 +694,7 @@ static int udp_sendpage(struct sock *sk, struct page *page, int offset, if (unlikely(!up->pending)) { release_sock(sk); - LIMIT_NETDEBUG(printk(KERN_DEBUG "udp cork app bug 3\n")); + LIMIT_NETDEBUG(KERN_DEBUG "udp cork app bug 3\n"); return -EINVAL; } @@ -1103,7 +1103,7 @@ static int udp_checksum_init(struct sk_buff *skb, struct udphdr *uh, skb->ip_summed = CHECKSUM_UNNECESSARY; if (!udp_check(uh, ulen, saddr, daddr, skb->csum)) return 0; - LIMIT_NETDEBUG(printk(KERN_DEBUG "udp v4 hw csum failure.\n")); + LIMIT_NETDEBUG(KERN_DEBUG "udp v4 hw csum failure.\n"); skb->ip_summed = CHECKSUM_NONE; } if (skb->ip_summed != CHECKSUM_UNNECESSARY) @@ -1182,13 +1182,13 @@ int udp_rcv(struct sk_buff *skb) return(0); short_packet: - LIMIT_NETDEBUG(printk(KERN_DEBUG "UDP: short packet: From %u.%u.%u.%u:%u %d/%d to %u.%u.%u.%u:%u\n", - NIPQUAD(saddr), - ntohs(uh->source), - ulen, - len, - NIPQUAD(daddr), - ntohs(uh->dest))); + LIMIT_NETDEBUG(KERN_DEBUG "UDP: short packet: From %u.%u.%u.%u:%u %d/%d to %u.%u.%u.%u:%u\n", + NIPQUAD(saddr), + ntohs(uh->source), + ulen, + len, + NIPQUAD(daddr), + ntohs(uh->dest)); no_header: UDP_INC_STATS_BH(UDP_MIB_INERRORS); kfree_skb(skb); @@ -1199,12 +1199,12 @@ csum_error: * RFC1122: OK. Discards the bad packet silently (as far as * the network is concerned, anyway) as per 4.1.3.4 (MUST). */ - LIMIT_NETDEBUG(printk(KERN_DEBUG "UDP: bad checksum. From %d.%d.%d.%d:%d to %d.%d.%d.%d:%d ulen %d\n", - NIPQUAD(saddr), - ntohs(uh->source), - NIPQUAD(daddr), - ntohs(uh->dest), - ulen)); + LIMIT_NETDEBUG(KERN_DEBUG "UDP: bad checksum. From %d.%d.%d.%d:%d to %d.%d.%d.%d:%d ulen %d\n", + NIPQUAD(saddr), + ntohs(uh->source), + NIPQUAD(daddr), + ntohs(uh->dest), + ulen); drop: UDP_INC_STATS_BH(UDP_MIB_INERRORS); kfree_skb(skb); -- cgit v1.2.3 From 6687e988d9aeaccad6774e6a8304f681f3ec0a03 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 10 Aug 2005 04:03:31 -0300 Subject: [ICSK]: Move TCP congestion avoidance members to icsk This changeset basically moves tcp_sk()->{ca_ops,ca_state,etc} to inet_csk(), minimal renaming/moving done in this changeset to ease review. Most of it is just changes of struct tcp_sock * to struct sock * parameters. With this we move to a state closer to two interesting goals: 1. Generalisation of net/ipv4/tcp_diag.c, becoming inet_diag.c, being used for any INET transport protocol that has struct inet_hashinfo and are derived from struct inet_connection_sock. Keeps the userspace API, that will just not display DCCP sockets, while newer versions of tools can support DCCP. 2. INET generic transport pluggable Congestion Avoidance infrastructure, using the current TCP CA infrastructure with DCCP. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. 
Miller --- net/ipv4/inet_connection_sock.c | 3 +- net/ipv4/tcp.c | 12 +-- net/ipv4/tcp_bic.c | 46 +++++---- net/ipv4/tcp_cong.c | 44 ++++---- net/ipv4/tcp_diag.c | 16 +-- net/ipv4/tcp_highspeed.c | 17 +-- net/ipv4/tcp_htcp.c | 53 ++++++---- net/ipv4/tcp_hybla.c | 31 +++--- net/ipv4/tcp_input.c | 223 ++++++++++++++++++++++------------------ net/ipv4/tcp_ipv4.c | 9 +- net/ipv4/tcp_minisocks.c | 5 +- net/ipv4/tcp_output.c | 36 ++++--- net/ipv4/tcp_scalable.c | 6 +- net/ipv4/tcp_timer.c | 26 ++--- net/ipv4/tcp_vegas.c | 44 ++++---- net/ipv4/tcp_westwood.c | 58 ++++++----- 16 files changed, 348 insertions(+), 281 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 026630a15ea..fe3c6d3d0c9 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -508,7 +508,8 @@ struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req, newsk->sk_write_space = sk_stream_write_space; newicsk->icsk_retransmits = 0; - newicsk->icsk_backoff = 0; + newicsk->icsk_backoff = 0; + newicsk->icsk_probes_out = 0; /* Deinitialize accept_queue to trap illegal accesses. */ memset(&newicsk->icsk_accept_queue, 0, sizeof(newicsk->icsk_accept_queue)); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 0eed64a1991..02848e72e9c 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1671,11 +1671,11 @@ int tcp_disconnect(struct sock *sk, int flags) tp->write_seq = 1; icsk->icsk_backoff = 0; tp->snd_cwnd = 2; - tp->probes_out = 0; + icsk->icsk_probes_out = 0; tp->packets_out = 0; tp->snd_ssthresh = 0x7fffffff; tp->snd_cwnd_cnt = 0; - tcp_set_ca_state(tp, TCP_CA_Open); + tcp_set_ca_state(sk, TCP_CA_Open); tcp_clear_retrans(tp); inet_csk_delack_init(sk); sk->sk_send_head = NULL; @@ -1718,7 +1718,7 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, name[val] = 0; lock_sock(sk); - err = tcp_set_congestion_control(tp, name); + err = tcp_set_congestion_control(sk, name); release_sock(sk); return err; } @@ -1886,9 +1886,9 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) memset(info, 0, sizeof(*info)); info->tcpi_state = sk->sk_state; - info->tcpi_ca_state = tp->ca_state; + info->tcpi_ca_state = icsk->icsk_ca_state; info->tcpi_retransmits = icsk->icsk_retransmits; - info->tcpi_probes = tp->probes_out; + info->tcpi_probes = icsk->icsk_probes_out; info->tcpi_backoff = icsk->icsk_backoff; if (tp->rx_opt.tstamp_ok) @@ -2016,7 +2016,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, len = min_t(unsigned int, len, TCP_CA_NAME_MAX); if (put_user(len, optlen)) return -EFAULT; - if (copy_to_user(optval, tp->ca_ops->name, len)) + if (copy_to_user(optval, icsk->icsk_ca_ops->name, len)) return -EFAULT; return 0; default: diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c index ec38d45d664..b940346de4e 100644 --- a/net/ipv4/tcp_bic.c +++ b/net/ipv4/tcp_bic.c @@ -86,11 +86,11 @@ static inline void bictcp_reset(struct bictcp *ca) ca->delayed_ack = 2 << ACK_RATIO_SHIFT; } -static void bictcp_init(struct tcp_sock *tp) +static void bictcp_init(struct sock *sk) { - bictcp_reset(tcp_ca(tp)); + bictcp_reset(inet_csk_ca(sk)); if (initial_ssthresh) - tp->snd_ssthresh = initial_ssthresh; + tcp_sk(sk)->snd_ssthresh = initial_ssthresh; } /* @@ -156,9 +156,10 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) /* Detect low utilization in congestion avoidance */ -static inline void bictcp_low_utilization(struct tcp_sock *tp, int flag) +static inline void 
bictcp_low_utilization(struct sock *sk, int flag) { - struct bictcp *ca = tcp_ca(tp); + const struct tcp_sock *tp = tcp_sk(sk); + struct bictcp *ca = inet_csk_ca(sk); u32 dist, delay; /* No time stamp */ @@ -208,12 +209,13 @@ static inline void bictcp_low_utilization(struct tcp_sock *tp, int flag) } -static void bictcp_cong_avoid(struct tcp_sock *tp, u32 ack, +static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 seq_rtt, u32 in_flight, int data_acked) { - struct bictcp *ca = tcp_ca(tp); + struct tcp_sock *tp = tcp_sk(sk); + struct bictcp *ca = inet_csk_ca(sk); - bictcp_low_utilization(tp, data_acked); + bictcp_low_utilization(sk, data_acked); if (in_flight < tp->snd_cwnd) return; @@ -242,9 +244,10 @@ static void bictcp_cong_avoid(struct tcp_sock *tp, u32 ack, * behave like Reno until low_window is reached, * then increase congestion window slowly */ -static u32 bictcp_recalc_ssthresh(struct tcp_sock *tp) +static u32 bictcp_recalc_ssthresh(struct sock *sk) { - struct bictcp *ca = tcp_ca(tp); + const struct tcp_sock *tp = tcp_sk(sk); + struct bictcp *ca = inet_csk_ca(sk); ca->epoch_start = 0; /* end of epoch */ @@ -269,31 +272,34 @@ static u32 bictcp_recalc_ssthresh(struct tcp_sock *tp) return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U); } -static u32 bictcp_undo_cwnd(struct tcp_sock *tp) +static u32 bictcp_undo_cwnd(struct sock *sk) { - struct bictcp *ca = tcp_ca(tp); - + const struct tcp_sock *tp = tcp_sk(sk); + const struct bictcp *ca = inet_csk_ca(sk); return max(tp->snd_cwnd, ca->last_max_cwnd); } -static u32 bictcp_min_cwnd(struct tcp_sock *tp) +static u32 bictcp_min_cwnd(struct sock *sk) { + const struct tcp_sock *tp = tcp_sk(sk); return tp->snd_ssthresh; } -static void bictcp_state(struct tcp_sock *tp, u8 new_state) +static void bictcp_state(struct sock *sk, u8 new_state) { if (new_state == TCP_CA_Loss) - bictcp_reset(tcp_ca(tp)); + bictcp_reset(inet_csk_ca(sk)); } /* Track delayed acknowledgement ratio using sliding window * ratio = (15*ratio + sample) / 16 */ -static void bictcp_acked(struct tcp_sock *tp, u32 cnt) +static void bictcp_acked(struct sock *sk, u32 cnt) { - if (cnt > 0 && tp->ca_state == TCP_CA_Open) { - struct bictcp *ca = tcp_ca(tp); + const struct inet_connection_sock *icsk = inet_csk(sk); + + if (cnt > 0 && icsk->icsk_ca_state == TCP_CA_Open) { + struct bictcp *ca = inet_csk_ca(sk); cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT; ca->delayed_ack += cnt; } @@ -314,7 +320,7 @@ static struct tcp_congestion_ops bictcp = { static int __init bictcp_register(void) { - BUG_ON(sizeof(struct bictcp) > TCP_CA_PRIV_SIZE); + BUG_ON(sizeof(struct bictcp) > ICSK_CA_PRIV_SIZE); return tcp_register_congestion_control(&bictcp); } diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 4970d10a778..bbf2d6624e8 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -73,33 +73,36 @@ void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca) EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control); /* Assign choice of congestion control. 
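The conversion pattern itself is worth seeing in isolation: tcp_ca(tp) becomes inet_csk_ca(sk), which simply points at the icsk_ca_priv[] scratch array embedded in inet_connection_sock, and every hook now takes the struct sock * needed to reach both the ops and the private state. Below is a stripped-down userspace model of that layout; the members are simplified and the toy_* names are invented for illustration (the real definitions live in include/net/inet_connection_sock.h):

    #include <stdio.h>
    #include <stdint.h>

    struct ca_ops;

    #define ICSK_CA_PRIV_SIZE (16 * sizeof(uint32_t))

    struct inet_connection_sock {
        const struct ca_ops *icsk_ca_ops;
        uint32_t icsk_ca_priv[ICSK_CA_PRIV_SIZE / sizeof(uint32_t)];
    };

    /* In the kernel inet_csk() is a cast, since the icsk starts with the
     * sock itself; containment is close enough for a model. */
    struct sock {
        struct inet_connection_sock icsk;
        uint32_t snd_cwnd;
    };

    static struct inet_connection_sock *inet_csk(struct sock *sk)
    {
        return &sk->icsk;
    }

    static void *inet_csk_ca(struct sock *sk)
    {
        return inet_csk(sk)->icsk_ca_priv;
    }

    struct ca_ops {
        void (*init)(struct sock *sk);
        uint32_t (*ssthresh)(struct sock *sk);
    };

    /* A module's per-connection state must fit in ICSK_CA_PRIV_SIZE. */
    struct toy_ca { uint32_t epoch_start; };

    static void toy_init(struct sock *sk)
    {
        struct toy_ca *ca = inet_csk_ca(sk);
        ca->epoch_start = 1;
    }

    static uint32_t toy_ssthresh(struct sock *sk)
    {
        uint32_t cwnd = sk->snd_cwnd;
        return cwnd > 4 ? cwnd / 2 : 2;     /* halve, floor of 2 */
    }

    static const struct ca_ops toy_ops = { toy_init, toy_ssthresh };

    int main(void)
    {
        struct sock sk = { .snd_cwnd = 20 };
        sk.icsk.icsk_ca_ops = &toy_ops;
        sk.icsk.icsk_ca_ops->init(&sk);
        printf("ssthresh = %u\n", sk.icsk.icsk_ca_ops->ssthresh(&sk));
        return 0;
    }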
*/ -void tcp_init_congestion_control(struct tcp_sock *tp) +void tcp_init_congestion_control(struct sock *sk) { + struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_congestion_ops *ca; - if (tp->ca_ops != &tcp_init_congestion_ops) + if (icsk->icsk_ca_ops != &tcp_init_congestion_ops) return; rcu_read_lock(); list_for_each_entry_rcu(ca, &tcp_cong_list, list) { if (try_module_get(ca->owner)) { - tp->ca_ops = ca; + icsk->icsk_ca_ops = ca; break; } } rcu_read_unlock(); - if (tp->ca_ops->init) - tp->ca_ops->init(tp); + if (icsk->icsk_ca_ops->init) + icsk->icsk_ca_ops->init(sk); } /* Manage refcounts on socket close. */ -void tcp_cleanup_congestion_control(struct tcp_sock *tp) +void tcp_cleanup_congestion_control(struct sock *sk) { - if (tp->ca_ops->release) - tp->ca_ops->release(tp); - module_put(tp->ca_ops->owner); + struct inet_connection_sock *icsk = inet_csk(sk); + + if (icsk->icsk_ca_ops->release) + icsk->icsk_ca_ops->release(sk); + module_put(icsk->icsk_ca_ops->owner); } /* Used by sysctl to change default congestion control */ @@ -143,14 +146,15 @@ void tcp_get_default_congestion_control(char *name) } /* Change congestion control for socket */ -int tcp_set_congestion_control(struct tcp_sock *tp, const char *name) +int tcp_set_congestion_control(struct sock *sk, const char *name) { + struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_congestion_ops *ca; int err = 0; rcu_read_lock(); ca = tcp_ca_find(name); - if (ca == tp->ca_ops) + if (ca == icsk->icsk_ca_ops) goto out; if (!ca) @@ -160,10 +164,10 @@ int tcp_set_congestion_control(struct tcp_sock *tp, const char *name) err = -EBUSY; else { - tcp_cleanup_congestion_control(tp); - tp->ca_ops = ca; - if (tp->ca_ops->init) - tp->ca_ops->init(tp); + tcp_cleanup_congestion_control(sk); + icsk->icsk_ca_ops = ca; + if (icsk->icsk_ca_ops->init) + icsk->icsk_ca_ops->init(sk); } out: rcu_read_unlock(); @@ -177,9 +181,11 @@ int tcp_set_congestion_control(struct tcp_sock *tp, const char *name) /* This is Jacobson's slow start and congestion avoidance. * SIGCOMM '88, p. 328. */ -void tcp_reno_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, u32 in_flight, +void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 rtt, u32 in_flight, int flag) { + struct tcp_sock *tp = tcp_sk(sk); + if (in_flight < tp->snd_cwnd) return; @@ -202,15 +208,17 @@ void tcp_reno_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, u32 in_flight, EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid); /* Slow start threshold is half the congestion window (min 2) */ -u32 tcp_reno_ssthresh(struct tcp_sock *tp) +u32 tcp_reno_ssthresh(struct sock *sk) { + const struct tcp_sock *tp = tcp_sk(sk); return max(tp->snd_cwnd >> 1U, 2U); } EXPORT_SYMBOL_GPL(tcp_reno_ssthresh); /* Lower bound on congestion window. 
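tcp_reno_cong_avoid keeps its two classic regimes, only the entry point changed: below ssthresh the window grows by one segment per ACK (doubling each RTT), and above it snd_cwnd_cnt spreads one segment of growth over a full window of ACKs, i.e. one per RTT. A self-contained sketch of just that arithmetic, with made-up starting values:

    #include <stdio.h>
    #include <stdint.h>

    struct reno { uint32_t snd_cwnd, snd_ssthresh, snd_cwnd_cnt; };

    /* One ACK's worth of slow start / congestion avoidance. */
    static void reno_on_ack(struct reno *tp)
    {
        if (tp->snd_cwnd < tp->snd_ssthresh) {
            tp->snd_cwnd++;                 /* slow start: +1 per ACK */
        } else if (++tp->snd_cwnd_cnt >= tp->snd_cwnd) {
            tp->snd_cwnd++;                 /* avoidance: +1 per cwnd ACKs */
            tp->snd_cwnd_cnt = 0;
        }
    }

    int main(void)
    {
        struct reno tp = { .snd_cwnd = 2, .snd_ssthresh = 8 };

        for (int ack = 1; ack <= 100; ack++) {
            reno_on_ack(&tp);
            if (ack % 25 == 0)
                printf("after %3d acks: cwnd=%u\n", ack, tp.snd_cwnd);
        }
        return 0;
    }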
*/ -u32 tcp_reno_min_cwnd(struct tcp_sock *tp) +u32 tcp_reno_min_cwnd(struct sock *sk) { + const struct tcp_sock *tp = tcp_sk(sk); return tp->snd_ssthresh/2; } EXPORT_SYMBOL_GPL(tcp_reno_min_cwnd); diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index 5f4c74f45e8..4288ecfec9a 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -66,10 +66,10 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, if (ext & (1<<(TCPDIAG_INFO-1))) info = TCPDIAG_PUT(skb, TCPDIAG_INFO, sizeof(*info)); - if (ext & (1<<(TCPDIAG_CONG-1))) { - size_t len = strlen(tp->ca_ops->name); + if ((ext & (1 << (TCPDIAG_CONG - 1))) && icsk->icsk_ca_ops) { + size_t len = strlen(icsk->icsk_ca_ops->name); strcpy(TCPDIAG_PUT(skb, TCPDIAG_CONG, len+1), - tp->ca_ops->name); + icsk->icsk_ca_ops->name); } } r->tcpdiag_family = sk->sk_family; @@ -136,18 +136,17 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, r->tcpdiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout); } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { r->tcpdiag_timer = 4; - r->tcpdiag_retrans = tp->probes_out; + r->tcpdiag_retrans = icsk->icsk_probes_out; r->tcpdiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout); } else if (timer_pending(&sk->sk_timer)) { r->tcpdiag_timer = 2; - r->tcpdiag_retrans = tp->probes_out; + r->tcpdiag_retrans = icsk->icsk_probes_out; r->tcpdiag_expires = EXPIRES_IN_MS(sk->sk_timer.expires); } else { r->tcpdiag_timer = 0; r->tcpdiag_expires = 0; } #undef EXPIRES_IN_MS - r->tcpdiag_rqueue = tp->rcv_nxt - tp->copied_seq; r->tcpdiag_wqueue = tp->write_seq - tp->snd_una; r->tcpdiag_uid = sock_i_uid(sk); @@ -163,8 +162,9 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, if (info) tcp_get_info(sk, info); - if (sk->sk_state < TCP_TIME_WAIT && tp->ca_ops->get_info) - tp->ca_ops->get_info(tp, ext, skb); + if (sk->sk_state < TCP_TIME_WAIT && + icsk->icsk_ca_ops && icsk->icsk_ca_ops->get_info) + icsk->icsk_ca_ops->get_info(sk, ext, skb); nlh->nlmsg_len = skb->tail - b; return skb->len; diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c index 36c51f8136b..6acc04bde08 100644 --- a/net/ipv4/tcp_highspeed.c +++ b/net/ipv4/tcp_highspeed.c @@ -98,9 +98,10 @@ struct hstcp { u32 ai; }; -static void hstcp_init(struct tcp_sock *tp) +static void hstcp_init(struct sock *sk) { - struct hstcp *ca = tcp_ca(tp); + struct tcp_sock *tp = tcp_sk(sk); + struct hstcp *ca = inet_csk_ca(sk); ca->ai = 0; @@ -109,10 +110,11 @@ static void hstcp_init(struct tcp_sock *tp) tp->snd_cwnd_clamp = min_t(u32, tp->snd_cwnd_clamp, 0xffffffff/128); } -static void hstcp_cong_avoid(struct tcp_sock *tp, u32 adk, u32 rtt, +static void hstcp_cong_avoid(struct sock *sk, u32 adk, u32 rtt, u32 in_flight, int good) { - struct hstcp *ca = tcp_ca(tp); + struct tcp_sock *tp = tcp_sk(sk); + struct hstcp *ca = inet_csk_ca(sk); if (in_flight < tp->snd_cwnd) return; @@ -143,9 +145,10 @@ static void hstcp_cong_avoid(struct tcp_sock *tp, u32 adk, u32 rtt, } } -static u32 hstcp_ssthresh(struct tcp_sock *tp) +static u32 hstcp_ssthresh(struct sock *sk) { - struct hstcp *ca = tcp_ca(tp); + const struct tcp_sock *tp = tcp_sk(sk); + const struct hstcp *ca = inet_csk_ca(sk); /* Do multiplicative decrease */ return max(tp->snd_cwnd - ((tp->snd_cwnd * hstcp_aimd_vals[ca->ai].md) >> 8), 2U); @@ -164,7 +167,7 @@ static struct tcp_congestion_ops tcp_highspeed = { static int __init hstcp_register(void) { - BUG_ON(sizeof(struct hstcp) > TCP_CA_PRIV_SIZE); + BUG_ON(sizeof(struct hstcp) > ICSK_CA_PRIV_SIZE); return 
tcp_register_congestion_control(&tcp_highspeed); } diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c index 40168275acf..e47b37984e9 100644 --- a/net/ipv4/tcp_htcp.c +++ b/net/ipv4/tcp_htcp.c @@ -55,18 +55,21 @@ static inline void htcp_reset(struct htcp *ca) ca->snd_cwnd_cnt2 = 0; } -static u32 htcp_cwnd_undo(struct tcp_sock *tp) +static u32 htcp_cwnd_undo(struct sock *sk) { - struct htcp *ca = tcp_ca(tp); + const struct tcp_sock *tp = tcp_sk(sk); + struct htcp *ca = inet_csk_ca(sk); ca->ccount = ca->undo_ccount; ca->maxRTT = ca->undo_maxRTT; ca->old_maxB = ca->undo_old_maxB; return max(tp->snd_cwnd, (tp->snd_ssthresh<<7)/ca->beta); } -static inline void measure_rtt(struct tcp_sock *tp) +static inline void measure_rtt(struct sock *sk) { - struct htcp *ca = tcp_ca(tp); + const struct inet_connection_sock *icsk = inet_csk(sk); + const struct tcp_sock *tp = tcp_sk(sk); + struct htcp *ca = inet_csk_ca(sk); u32 srtt = tp->srtt>>3; /* keep track of minimum RTT seen so far, minRTT is zero at first */ @@ -74,7 +77,7 @@ static inline void measure_rtt(struct tcp_sock *tp) ca->minRTT = srtt; /* max RTT */ - if (tp->ca_state == TCP_CA_Open && tp->snd_ssthresh < 0xFFFF && ca->ccount > 3) { + if (icsk->icsk_ca_state == TCP_CA_Open && tp->snd_ssthresh < 0xFFFF && ca->ccount > 3) { if (ca->maxRTT < ca->minRTT) ca->maxRTT = ca->minRTT; if (ca->maxRTT < srtt && srtt <= ca->maxRTT+HZ/50) @@ -82,13 +85,16 @@ static inline void measure_rtt(struct tcp_sock *tp) } } -static void measure_achieved_throughput(struct tcp_sock *tp, u32 pkts_acked) +static void measure_achieved_throughput(struct sock *sk, u32 pkts_acked) { - struct htcp *ca = tcp_ca(tp); + const struct inet_connection_sock *icsk = inet_csk(sk); + const struct tcp_sock *tp = tcp_sk(sk); + struct htcp *ca = inet_csk_ca(sk); u32 now = tcp_time_stamp; /* achieved throughput calculations */ - if (tp->ca_state != TCP_CA_Open && tp->ca_state != TCP_CA_Disorder) { + if (icsk->icsk_ca_state != TCP_CA_Open && + icsk->icsk_ca_state != TCP_CA_Disorder) { ca->packetcount = 0; ca->lasttime = now; return; @@ -173,9 +179,9 @@ static inline void htcp_alpha_update(struct htcp *ca) * that point do we really have a real sense of maxRTT (the queues en route * were getting just too full now). 
*/ -static void htcp_param_update(struct tcp_sock *tp) +static void htcp_param_update(struct sock *sk) { - struct htcp *ca = tcp_ca(tp); + struct htcp *ca = inet_csk_ca(sk); u32 minRTT = ca->minRTT; u32 maxRTT = ca->maxRTT; @@ -187,17 +193,19 @@ static void htcp_param_update(struct tcp_sock *tp) ca->maxRTT = minRTT + ((maxRTT-minRTT)*95)/100; } -static u32 htcp_recalc_ssthresh(struct tcp_sock *tp) +static u32 htcp_recalc_ssthresh(struct sock *sk) { - struct htcp *ca = tcp_ca(tp); - htcp_param_update(tp); + const struct tcp_sock *tp = tcp_sk(sk); + const struct htcp *ca = inet_csk_ca(sk); + htcp_param_update(sk); return max((tp->snd_cwnd * ca->beta) >> 7, 2U); } -static void htcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, +static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, u32 in_flight, int data_acked) { - struct htcp *ca = tcp_ca(tp); + struct tcp_sock *tp = tcp_sk(sk); + struct htcp *ca = inet_csk_ca(sk); if (in_flight < tp->snd_cwnd) return; @@ -207,7 +215,7 @@ static void htcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, if (tp->snd_cwnd < tp->snd_cwnd_clamp) tp->snd_cwnd++; } else { - measure_rtt(tp); + measure_rtt(sk); /* keep track of number of round-trip times since last backoff event */ if (ca->snd_cwnd_cnt2++ > tp->snd_cwnd) { @@ -229,28 +237,29 @@ static void htcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, } /* Lower bound on congestion window. */ -static u32 htcp_min_cwnd(struct tcp_sock *tp) +static u32 htcp_min_cwnd(struct sock *sk) { + const struct tcp_sock *tp = tcp_sk(sk); return tp->snd_ssthresh; } -static void htcp_init(struct tcp_sock *tp) +static void htcp_init(struct sock *sk) { - struct htcp *ca = tcp_ca(tp); + struct htcp *ca = inet_csk_ca(sk); memset(ca, 0, sizeof(struct htcp)); ca->alpha = ALPHA_BASE; ca->beta = BETA_MIN; } -static void htcp_state(struct tcp_sock *tp, u8 new_state) +static void htcp_state(struct sock *sk, u8 new_state) { switch (new_state) { case TCP_CA_CWR: case TCP_CA_Recovery: case TCP_CA_Loss: - htcp_reset(tcp_ca(tp)); + htcp_reset(inet_csk_ca(sk)); break; } } @@ -269,7 +278,7 @@ static struct tcp_congestion_ops htcp = { static int __init htcp_register(void) { - BUG_ON(sizeof(struct htcp) > TCP_CA_PRIV_SIZE); + BUG_ON(sizeof(struct htcp) > ICSK_CA_PRIV_SIZE); BUILD_BUG_ON(BETA_MIN >= BETA_MAX); if (!use_bandwidth_switch) htcp.pkts_acked = NULL; diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c index 13a66342c30..77add63623d 100644 --- a/net/ipv4/tcp_hybla.c +++ b/net/ipv4/tcp_hybla.c @@ -33,19 +33,20 @@ MODULE_PARM_DESC(rtt0, "reference rout trip time (ms)"); /* This is called to refresh values for hybla parameters */ -static inline void hybla_recalc_param (struct tcp_sock *tp) +static inline void hybla_recalc_param (struct sock *sk) { - struct hybla *ca = tcp_ca(tp); + struct hybla *ca = inet_csk_ca(sk); - ca->rho_3ls = max_t(u32, tp->srtt / msecs_to_jiffies(rtt0), 8); + ca->rho_3ls = max_t(u32, tcp_sk(sk)->srtt / msecs_to_jiffies(rtt0), 8); ca->rho = ca->rho_3ls >> 3; ca->rho2_7ls = (ca->rho_3ls * ca->rho_3ls) << 1; ca->rho2 = ca->rho2_7ls >>7; } -static void hybla_init(struct tcp_sock *tp) +static void hybla_init(struct sock *sk) { - struct hybla *ca = tcp_ca(tp); + struct tcp_sock *tp = tcp_sk(sk); + struct hybla *ca = inet_csk_ca(sk); ca->rho = 0; ca->rho2 = 0; @@ -57,17 +58,16 @@ static void hybla_init(struct tcp_sock *tp) tp->snd_cwnd_clamp = 65535; /* 1st Rho measurement based on initial srtt */ - hybla_recalc_param(tp); + hybla_recalc_param(sk); /* set minimum rtt as this is the 1st ever seen */ 
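The rho computation in hybla_recalc_param above is 3-bit fixed point: srtt is stored left-shifted by 3, so dividing it by rtt0 in jiffies yields rho<<3 directly, floored at 8 (i.e. 1.0). As a worked example, with HZ=1000 and the module's default rtt0 of 25 ms, a 250 ms srtt gives rho = 10 and rho^2 = 100. A quick userspace check of that math (assumed HZ and sample values, not kernel code):

    #include <stdio.h>
    #include <stdint.h>

    #define HZ   1000                   /* assumed jiffy rate */
    #define RTT0 25                     /* module default, ms */

    int main(void)
    {
        uint32_t rtt_ms = 250;
        uint32_t srtt = (rtt_ms * HZ / 1000) << 3;  /* kernel keeps srtt << 3 */
        uint32_t rtt0_j = RTT0 * HZ / 1000;         /* msecs_to_jiffies(25) */

        uint32_t rho_3ls = srtt / rtt0_j;           /* rho in <<3 fixed point */
        if (rho_3ls < 8)
            rho_3ls = 8;                            /* never below 1.0 */
        uint32_t rho = rho_3ls >> 3;
        uint32_t rho2_7ls = (rho_3ls * rho_3ls) << 1;   /* rho^2 in <<7 */

        printf("rho=%u rho^2=%u\n", rho, rho2_7ls >> 7);    /* 10, 100 */
        return 0;
    }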
ca->minrtt = tp->srtt; tp->snd_cwnd = ca->rho; } -static void hybla_state(struct tcp_sock *tp, u8 ca_state) +static void hybla_state(struct sock *sk, u8 ca_state) { - struct hybla *ca = tcp_ca(tp); - + struct hybla *ca = inet_csk_ca(sk); ca->hybla_en = (ca_state == TCP_CA_Open); } @@ -86,27 +86,28 @@ static inline u32 hybla_fraction(u32 odds) * o Give cwnd a new value based on the model proposed * o remember increments <1 */ -static void hybla_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, +static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 rtt, u32 in_flight, int flag) { - struct hybla *ca = tcp_ca(tp); + struct tcp_sock *tp = tcp_sk(sk); + struct hybla *ca = inet_csk_ca(sk); u32 increment, odd, rho_fractions; int is_slowstart = 0; /* Recalculate rho only if this srtt is the lowest */ if (tp->srtt < ca->minrtt){ - hybla_recalc_param(tp); + hybla_recalc_param(sk); ca->minrtt = tp->srtt; } if (!ca->hybla_en) - return tcp_reno_cong_avoid(tp, ack, rtt, in_flight, flag); + return tcp_reno_cong_avoid(sk, ack, rtt, in_flight, flag); if (in_flight < tp->snd_cwnd) return; if (ca->rho == 0) - hybla_recalc_param(tp); + hybla_recalc_param(sk); rho_fractions = ca->rho_3ls - (ca->rho << 3); @@ -170,7 +171,7 @@ static struct tcp_congestion_ops tcp_hybla = { static int __init hybla_register(void) { - BUG_ON(sizeof(struct hybla) > TCP_CA_PRIV_SIZE); + BUG_ON(sizeof(struct hybla) > ICSK_CA_PRIV_SIZE); return tcp_register_congestion_control(&tcp_hybla); } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 71d456148de..fdd9547fb78 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -325,11 +325,12 @@ static void tcp_init_buffer_space(struct sock *sk) /* 5. Recalculate window clamp after socket hit its memory bounds. */ static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp) { + struct inet_connection_sock *icsk = inet_csk(sk); struct sk_buff *skb; unsigned int app_win = tp->rcv_nxt - tp->copied_seq; int ofo_win = 0; - inet_csk(sk)->icsk_ack.quick = 0; + icsk->icsk_ack.quick = 0; skb_queue_walk(&tp->out_of_order_queue, skb) { ofo_win += skb->len; @@ -350,8 +351,8 @@ static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp) app_win += ofo_win; if (atomic_read(&sk->sk_rmem_alloc) >= 2 * sk->sk_rcvbuf) app_win >>= 1; - if (app_win > inet_csk(sk)->icsk_ack.rcv_mss) - app_win -= inet_csk(sk)->icsk_ack.rcv_mss; + if (app_win > icsk->icsk_ack.rcv_mss) + app_win -= icsk->icsk_ack.rcv_mss; app_win = max(app_win, 2U*tp->advmss); if (!ofo_win) @@ -549,8 +550,10 @@ static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_ * To save cycles in the RFC 1323 implementation it was better to break * it up into three procedures. -- erics */ -static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt, u32 *usrtt) +static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt, u32 *usrtt) { + struct tcp_sock *tp = tcp_sk(sk); + const struct inet_connection_sock *icsk = inet_csk(sk); long m = mrtt; /* RTT */ /* The following amusing code comes from Jacobson's @@ -610,8 +613,8 @@ static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt, u32 *usrtt) tp->rtt_seq = tp->snd_nxt; } - if (tp->ca_ops->rtt_sample) - tp->ca_ops->rtt_sample(tp, *usrtt); + if (icsk->icsk_ca_ops->rtt_sample) + icsk->icsk_ca_ops->rtt_sample(sk, *usrtt); } /* Calculate rto without backoff. 
This is the second half of Van Jacobson's @@ -663,9 +666,10 @@ void tcp_update_metrics(struct sock *sk) dst_confirm(dst); if (dst && (dst->flags&DST_HOST)) { + const struct inet_connection_sock *icsk = inet_csk(sk); int m; - if (inet_csk(sk)->icsk_backoff || !tp->srtt) { + if (icsk->icsk_backoff || !tp->srtt) { /* This session failed to estimate rtt. Why? * Probably, no packets returned in time. * Reset our results. @@ -714,7 +718,7 @@ void tcp_update_metrics(struct sock *sk) tp->snd_cwnd > dst_metric(dst, RTAX_CWND)) dst->metrics[RTAX_CWND-1] = tp->snd_cwnd; } else if (tp->snd_cwnd > tp->snd_ssthresh && - tp->ca_state == TCP_CA_Open) { + icsk->icsk_ca_state == TCP_CA_Open) { /* Cong. avoidance phase, cwnd is reliable. */ if (!dst_metric_locked(dst, RTAX_SSTHRESH)) dst->metrics[RTAX_SSTHRESH-1] = @@ -828,8 +832,10 @@ reset: } } -static void tcp_update_reordering(struct tcp_sock *tp, int metric, int ts) +static void tcp_update_reordering(struct sock *sk, const int metric, + const int ts) { + struct tcp_sock *tp = tcp_sk(sk); if (metric > tp->reordering) { tp->reordering = min(TCP_MAX_REORDERING, metric); @@ -844,7 +850,7 @@ static void tcp_update_reordering(struct tcp_sock *tp, int metric, int ts) NET_INC_STATS_BH(LINUX_MIB_TCPSACKREORDER); #if FASTRETRANS_DEBUG > 1 printk(KERN_DEBUG "Disorder%d %d %u f%u s%u rr%d\n", - tp->rx_opt.sack_ok, tp->ca_state, + tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state, tp->reordering, tp->fackets_out, tp->sacked_out, @@ -906,6 +912,7 @@ static void tcp_update_reordering(struct tcp_sock *tp, int metric, int ts) static int tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_una) { + const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); unsigned char *ptr = ack_skb->h.raw + TCP_SKB_CB(ack_skb)->sacked; struct tcp_sack_block *sp = (struct tcp_sack_block *)(ptr+2); @@ -1071,7 +1078,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ * we have to account for reordering! Ugly, * but should help. 
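tcp_rtt_estimator, converted a few hunks up, is Jacobson's estimator in fixed point: srtt is kept <<3 and mdev <<2, so the classic 1/8 and 1/4 gains reduce to shifts. A simplified standalone rendition, showing how a single outlier sample moves srtt a little and mdev a lot (sample values invented):

    #include <stdio.h>
    #include <stdlib.h>

    struct rtt { long srtt; long mdev; };  /* srtt << 3, mdev << 2 */

    static void rtt_sample(struct rtt *r, long m /* measured rtt, jiffies */)
    {
        if (!r->srtt) {                     /* first measurement */
            r->srtt = m << 3;
            r->mdev = m << 1;               /* start mdev at rtt/2 */
            return;
        }
        long err = m - (r->srtt >> 3);
        r->srtt += err;                     /* srtt = 7/8 srtt + 1/8 m */
        r->mdev += labs(err) - (r->mdev >> 2);  /* mdev = 3/4 mdev + 1/4 |err| */
    }

    int main(void)
    {
        struct rtt r = { 0, 0 };
        long samples[] = { 100, 100, 180, 100, 100 };

        for (unsigned i = 0; i < sizeof(samples) / sizeof(*samples); i++) {
            rtt_sample(&r, samples[i]);
            printf("rtt=%ld srtt=%ld mdev=%ld\n",
                   samples[i], r.srtt >> 3, r.mdev >> 2);
        }
        return 0;
    }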
*/ - if (lost_retrans && tp->ca_state == TCP_CA_Recovery) { + if (lost_retrans && icsk->icsk_ca_state == TCP_CA_Recovery) { struct sk_buff *skb; sk_stream_for_retrans_queue(skb, sk) { @@ -1100,8 +1107,8 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ tp->left_out = tp->sacked_out + tp->lost_out; - if ((reord < tp->fackets_out) && tp->ca_state != TCP_CA_Loss) - tcp_update_reordering(tp, ((tp->fackets_out + 1) - reord), 0); + if ((reord < tp->fackets_out) && icsk->icsk_ca_state != TCP_CA_Loss) + tcp_update_reordering(sk, ((tp->fackets_out + 1) - reord), 0); #if FASTRETRANS_DEBUG > 0 BUG_TRAP((int)tp->sacked_out >= 0); @@ -1118,17 +1125,18 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ */ void tcp_enter_frto(struct sock *sk) { + const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; tp->frto_counter = 1; - if (tp->ca_state <= TCP_CA_Disorder || + if (icsk->icsk_ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq || - (tp->ca_state == TCP_CA_Loss && !inet_csk(sk)->icsk_retransmits)) { - tp->prior_ssthresh = tcp_current_ssthresh(tp); - tp->snd_ssthresh = tp->ca_ops->ssthresh(tp); - tcp_ca_event(tp, CA_EVENT_FRTO); + (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) { + tp->prior_ssthresh = tcp_current_ssthresh(sk); + tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); + tcp_ca_event(sk, CA_EVENT_FRTO); } /* Have to clear retransmission markers here to keep the bookkeeping @@ -1145,7 +1153,7 @@ void tcp_enter_frto(struct sock *sk) } tcp_sync_left_out(tp); - tcp_set_ca_state(tp, TCP_CA_Open); + tcp_set_ca_state(sk, TCP_CA_Open); tp->frto_highmark = tp->snd_nxt; } @@ -1191,7 +1199,7 @@ static void tcp_enter_frto_loss(struct sock *sk) tp->reordering = min_t(unsigned int, tp->reordering, sysctl_tcp_reordering); - tcp_set_ca_state(tp, TCP_CA_Loss); + tcp_set_ca_state(sk, TCP_CA_Loss); tp->high_seq = tp->frto_highmark; TCP_ECN_queue_cwr(tp); } @@ -1215,16 +1223,17 @@ void tcp_clear_retrans(struct tcp_sock *tp) */ void tcp_enter_loss(struct sock *sk, int how) { + const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; int cnt = 0; /* Reduce ssthresh if it has not yet been made inside this window. 
*/ - if (tp->ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq || - (tp->ca_state == TCP_CA_Loss && !inet_csk(sk)->icsk_retransmits)) { - tp->prior_ssthresh = tcp_current_ssthresh(tp); - tp->snd_ssthresh = tp->ca_ops->ssthresh(tp); - tcp_ca_event(tp, CA_EVENT_LOSS); + if (icsk->icsk_ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq || + (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) { + tp->prior_ssthresh = tcp_current_ssthresh(sk); + tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); + tcp_ca_event(sk, CA_EVENT_LOSS); } tp->snd_cwnd = 1; tp->snd_cwnd_cnt = 0; @@ -1255,7 +1264,7 @@ void tcp_enter_loss(struct sock *sk, int how) tp->reordering = min_t(unsigned int, tp->reordering, sysctl_tcp_reordering); - tcp_set_ca_state(tp, TCP_CA_Loss); + tcp_set_ca_state(sk, TCP_CA_Loss); tp->high_seq = tp->snd_nxt; TCP_ECN_queue_cwr(tp); } @@ -1272,13 +1281,14 @@ static int tcp_check_sack_reneging(struct sock *sk) */ if ((skb = skb_peek(&sk->sk_write_queue)) != NULL && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) { + struct inet_connection_sock *icsk = inet_csk(sk); NET_INC_STATS_BH(LINUX_MIB_TCPSACKRENEGING); tcp_enter_loss(sk, 1); - inet_csk(sk)->icsk_retransmits++; + icsk->icsk_retransmits++; tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue)); inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - inet_csk(sk)->icsk_rto, TCP_RTO_MAX); + icsk->icsk_rto, TCP_RTO_MAX); return 1; } return 0; @@ -1431,8 +1441,9 @@ static int tcp_time_to_recover(struct sock *sk, struct tcp_sock *tp) * in assumption of absent reordering, interpret this as reordering. * The only another reason could be bug in receiver TCP. */ -static void tcp_check_reno_reordering(struct tcp_sock *tp, int addend) +static void tcp_check_reno_reordering(struct sock *sk, const int addend) { + struct tcp_sock *tp = tcp_sk(sk); u32 holes; holes = max(tp->lost_out, 1U); @@ -1440,16 +1451,17 @@ static void tcp_check_reno_reordering(struct tcp_sock *tp, int addend) if ((tp->sacked_out + holes) > tp->packets_out) { tp->sacked_out = tp->packets_out - holes; - tcp_update_reordering(tp, tp->packets_out+addend, 0); + tcp_update_reordering(sk, tp->packets_out + addend, 0); } } /* Emulate SACKs for SACKless connection: account for a new dupack. */ -static void tcp_add_reno_sack(struct tcp_sock *tp) +static void tcp_add_reno_sack(struct sock *sk) { + struct tcp_sock *tp = tcp_sk(sk); tp->sacked_out++; - tcp_check_reno_reordering(tp, 0); + tcp_check_reno_reordering(sk, 0); tcp_sync_left_out(tp); } @@ -1464,7 +1476,7 @@ static void tcp_remove_reno_sacks(struct sock *sk, struct tcp_sock *tp, int acke else tp->sacked_out -= acked-1; } - tcp_check_reno_reordering(tp, acked); + tcp_check_reno_reordering(sk, acked); tcp_sync_left_out(tp); } @@ -1538,14 +1550,16 @@ static inline void tcp_moderate_cwnd(struct tcp_sock *tp) } /* Decrease cwnd each second ack. */ -static void tcp_cwnd_down(struct tcp_sock *tp) +static void tcp_cwnd_down(struct sock *sk) { + const struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); int decr = tp->snd_cwnd_cnt + 1; tp->snd_cwnd_cnt = decr&1; decr >>= 1; - if (decr && tp->snd_cwnd > tp->ca_ops->min_cwnd(tp)) + if (decr && tp->snd_cwnd > icsk->icsk_ca_ops->min_cwnd(sk)) tp->snd_cwnd -= decr; tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1); @@ -1579,11 +1593,15 @@ static void DBGUNDO(struct sock *sk, struct tcp_sock *tp, const char *msg) #define DBGUNDO(x...) 
do { } while (0) #endif -static void tcp_undo_cwr(struct tcp_sock *tp, int undo) +static void tcp_undo_cwr(struct sock *sk, const int undo) { + struct tcp_sock *tp = tcp_sk(sk); + if (tp->prior_ssthresh) { - if (tp->ca_ops->undo_cwnd) - tp->snd_cwnd = tp->ca_ops->undo_cwnd(tp); + const struct inet_connection_sock *icsk = inet_csk(sk); + + if (icsk->icsk_ca_ops->undo_cwnd) + tp->snd_cwnd = icsk->icsk_ca_ops->undo_cwnd(sk); else tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh<<1); @@ -1611,9 +1629,9 @@ static int tcp_try_undo_recovery(struct sock *sk, struct tcp_sock *tp) /* Happy end! We did not retransmit anything * or our original transmission succeeded. */ - DBGUNDO(sk, tp, tp->ca_state == TCP_CA_Loss ? "loss" : "retrans"); - tcp_undo_cwr(tp, 1); - if (tp->ca_state == TCP_CA_Loss) + DBGUNDO(sk, tp, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? "loss" : "retrans"); + tcp_undo_cwr(sk, 1); + if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss) NET_INC_STATS_BH(LINUX_MIB_TCPLOSSUNDO); else NET_INC_STATS_BH(LINUX_MIB_TCPFULLUNDO); @@ -1626,7 +1644,7 @@ static int tcp_try_undo_recovery(struct sock *sk, struct tcp_sock *tp) tcp_moderate_cwnd(tp); return 1; } - tcp_set_ca_state(tp, TCP_CA_Open); + tcp_set_ca_state(sk, TCP_CA_Open); return 0; } @@ -1635,7 +1653,7 @@ static void tcp_try_undo_dsack(struct sock *sk, struct tcp_sock *tp) { if (tp->undo_marker && !tp->undo_retrans) { DBGUNDO(sk, tp, "D-SACK"); - tcp_undo_cwr(tp, 1); + tcp_undo_cwr(sk, 1); tp->undo_marker = 0; NET_INC_STATS_BH(LINUX_MIB_TCPDSACKUNDO); } @@ -1656,10 +1674,10 @@ static int tcp_try_undo_partial(struct sock *sk, struct tcp_sock *tp, if (tp->retrans_out == 0) tp->retrans_stamp = 0; - tcp_update_reordering(tp, tcp_fackets_out(tp)+acked, 1); + tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1); DBGUNDO(sk, tp, "Hoe"); - tcp_undo_cwr(tp, 0); + tcp_undo_cwr(sk, 0); NET_INC_STATS_BH(LINUX_MIB_TCPPARTIALUNDO); /* So... Do not make Hoe's retransmit yet. 
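tcp_undo_cwr above now reaches a module's undo_cwnd hook through icsk_ca_ops; when the module provides none, the generic fallback reinflates cwnd to at least twice ssthresh and restores the ssthresh that was saved before the (now known to be spurious) reduction. In numbers, a sketch of that fallback path under assumed values:

    #include <stdio.h>
    #include <stdint.h>

    static uint32_t max_u32(uint32_t a, uint32_t b) { return a > b ? a : b; }

    int main(void)
    {
        uint32_t snd_cwnd = 5;          /* after a spurious reduction */
        uint32_t snd_ssthresh = 5;
        uint32_t prior_ssthresh = 20;   /* saved before the reduction */

        /* Generic undo: reinflate cwnd, restore the saved ssthresh. */
        snd_cwnd = max_u32(snd_cwnd, snd_ssthresh << 1);
        snd_ssthresh = prior_ssthresh;

        printf("cwnd=%u ssthresh=%u\n", snd_cwnd, snd_ssthresh); /* 10, 20 */
        return 0;
    }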
@@ -1682,22 +1700,23 @@ static int tcp_try_undo_loss(struct sock *sk, struct tcp_sock *tp) DBGUNDO(sk, tp, "partial loss"); tp->lost_out = 0; tp->left_out = tp->sacked_out; - tcp_undo_cwr(tp, 1); + tcp_undo_cwr(sk, 1); NET_INC_STATS_BH(LINUX_MIB_TCPLOSSUNDO); inet_csk(sk)->icsk_retransmits = 0; tp->undo_marker = 0; if (!IsReno(tp)) - tcp_set_ca_state(tp, TCP_CA_Open); + tcp_set_ca_state(sk, TCP_CA_Open); return 1; } return 0; } -static inline void tcp_complete_cwr(struct tcp_sock *tp) +static inline void tcp_complete_cwr(struct sock *sk) { + struct tcp_sock *tp = tcp_sk(sk); tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); tp->snd_cwnd_stamp = tcp_time_stamp; - tcp_ca_event(tp, CA_EVENT_COMPLETE_CWR); + tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR); } static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag) @@ -1708,21 +1727,21 @@ static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag) tp->retrans_stamp = 0; if (flag&FLAG_ECE) - tcp_enter_cwr(tp); + tcp_enter_cwr(sk); - if (tp->ca_state != TCP_CA_CWR) { + if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) { int state = TCP_CA_Open; if (tp->left_out || tp->retrans_out || tp->undo_marker) state = TCP_CA_Disorder; - if (tp->ca_state != state) { - tcp_set_ca_state(tp, state); + if (inet_csk(sk)->icsk_ca_state != state) { + tcp_set_ca_state(sk, state); tp->high_seq = tp->snd_nxt; } tcp_moderate_cwnd(tp); } else { - tcp_cwnd_down(tp); + tcp_cwnd_down(sk); } } @@ -1741,6 +1760,7 @@ static void tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, int prior_packets, int flag) { + struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); int is_dupack = (tp->snd_una == prior_snd_una && !(flag&FLAG_NOT_DUP)); @@ -1764,7 +1784,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, /* C. Process data loss notification, provided it is valid. */ if ((flag&FLAG_DATA_LOST) && before(tp->snd_una, tp->high_seq) && - tp->ca_state != TCP_CA_Open && + icsk->icsk_ca_state != TCP_CA_Open && tp->fackets_out > tp->reordering) { tcp_mark_head_lost(sk, tp, tp->fackets_out-tp->reordering, tp->high_seq); NET_INC_STATS_BH(LINUX_MIB_TCPLOSS); @@ -1775,14 +1795,14 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, /* E. Check state exit conditions. State can be terminated * when high_seq is ACKed. */ - if (tp->ca_state == TCP_CA_Open) { + if (icsk->icsk_ca_state == TCP_CA_Open) { if (!sysctl_tcp_frto) BUG_TRAP(tp->retrans_out == 0); tp->retrans_stamp = 0; } else if (!before(tp->snd_una, tp->high_seq)) { - switch (tp->ca_state) { + switch (icsk->icsk_ca_state) { case TCP_CA_Loss: - inet_csk(sk)->icsk_retransmits = 0; + icsk->icsk_retransmits = 0; if (tcp_try_undo_recovery(sk, tp)) return; break; @@ -1791,8 +1811,8 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, /* CWR is to be held something *above* high_seq * is ACKed for CWR bit to reach receiver. */ if (tp->snd_una != tp->high_seq) { - tcp_complete_cwr(tp); - tcp_set_ca_state(tp, TCP_CA_Open); + tcp_complete_cwr(sk); + tcp_set_ca_state(sk, TCP_CA_Open); } break; @@ -1803,7 +1823,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, * catching for all duplicate ACKs. 
*/ IsReno(tp) || tp->snd_una != tp->high_seq) { tp->undo_marker = 0; - tcp_set_ca_state(tp, TCP_CA_Open); + tcp_set_ca_state(sk, TCP_CA_Open); } break; @@ -1812,17 +1832,17 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, tcp_reset_reno_sack(tp); if (tcp_try_undo_recovery(sk, tp)) return; - tcp_complete_cwr(tp); + tcp_complete_cwr(sk); break; } } /* F. Process state. */ - switch (tp->ca_state) { + switch (icsk->icsk_ca_state) { case TCP_CA_Recovery: if (prior_snd_una == tp->snd_una) { if (IsReno(tp) && is_dupack) - tcp_add_reno_sack(tp); + tcp_add_reno_sack(sk); } else { int acked = prior_packets - tp->packets_out; if (IsReno(tp)) @@ -1832,13 +1852,13 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, break; case TCP_CA_Loss: if (flag&FLAG_DATA_ACKED) - inet_csk(sk)->icsk_retransmits = 0; + icsk->icsk_retransmits = 0; if (!tcp_try_undo_loss(sk, tp)) { tcp_moderate_cwnd(tp); tcp_xmit_retransmit_queue(sk); return; } - if (tp->ca_state != TCP_CA_Open) + if (icsk->icsk_ca_state != TCP_CA_Open) return; /* Loss is undone; fall through to processing in Open state. */ default: @@ -1846,10 +1866,10 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, if (tp->snd_una != prior_snd_una) tcp_reset_reno_sack(tp); if (is_dupack) - tcp_add_reno_sack(tp); + tcp_add_reno_sack(sk); } - if (tp->ca_state == TCP_CA_Disorder) + if (icsk->icsk_ca_state == TCP_CA_Disorder) tcp_try_undo_dsack(sk, tp); if (!tcp_time_to_recover(sk, tp)) { @@ -1869,20 +1889,20 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, tp->undo_marker = tp->snd_una; tp->undo_retrans = tp->retrans_out; - if (tp->ca_state < TCP_CA_CWR) { + if (icsk->icsk_ca_state < TCP_CA_CWR) { if (!(flag&FLAG_ECE)) - tp->prior_ssthresh = tcp_current_ssthresh(tp); - tp->snd_ssthresh = tp->ca_ops->ssthresh(tp); + tp->prior_ssthresh = tcp_current_ssthresh(sk); + tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); TCP_ECN_queue_cwr(tp); } tp->snd_cwnd_cnt = 0; - tcp_set_ca_state(tp, TCP_CA_Recovery); + tcp_set_ca_state(sk, TCP_CA_Recovery); } if (is_dupack || tcp_head_timedout(sk, tp)) tcp_update_scoreboard(sk, tp); - tcp_cwnd_down(tp); + tcp_cwnd_down(sk); tcp_xmit_retransmit_queue(sk); } @@ -1908,7 +1928,7 @@ static void tcp_ack_saw_tstamp(struct sock *sk, u32 *usrtt, int flag) */ struct tcp_sock *tp = tcp_sk(sk); const __u32 seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr; - tcp_rtt_estimator(tp, seq_rtt, usrtt); + tcp_rtt_estimator(sk, seq_rtt, usrtt); tcp_set_rto(sk); inet_csk(sk)->icsk_backoff = 0; tcp_bound_rto(sk); @@ -1928,7 +1948,7 @@ static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, u32 *usrtt, int flag if (flag & FLAG_RETRANS_DATA_ACKED) return; - tcp_rtt_estimator(tcp_sk(sk), seq_rtt, usrtt); + tcp_rtt_estimator(sk, seq_rtt, usrtt); tcp_set_rto(sk); inet_csk(sk)->icsk_backoff = 0; tcp_bound_rto(sk); @@ -1945,11 +1965,12 @@ static inline void tcp_ack_update_rtt(struct sock *sk, const int flag, tcp_ack_no_tstamp(sk, seq_rtt, usrtt, flag); } -static inline void tcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, +static inline void tcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, u32 in_flight, int good) { - tp->ca_ops->cong_avoid(tp, ack, rtt, in_flight, good); - tp->snd_cwnd_stamp = tcp_time_stamp; + const struct inet_connection_sock *icsk = inet_csk(sk); + icsk->icsk_ca_ops->cong_avoid(sk, ack, rtt, in_flight, good); + tcp_sk(sk)->snd_cwnd_stamp = tcp_time_stamp; } /* Restart timer after forward progress on connection. 
@@ -2098,11 +2119,12 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt } if (acked&FLAG_ACKED) { + const struct inet_connection_sock *icsk = inet_csk(sk); tcp_ack_update_rtt(sk, acked, seq_rtt, seq_usrtt); tcp_ack_packets_out(sk, tp); - if (tp->ca_ops->pkts_acked) - tp->ca_ops->pkts_acked(tp, pkts_acked); + if (icsk->icsk_ca_ops->pkts_acked) + icsk->icsk_ca_ops->pkts_acked(sk, pkts_acked); } #if FASTRETRANS_DEBUG > 0 @@ -2110,19 +2132,20 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt BUG_TRAP((int)tp->lost_out >= 0); BUG_TRAP((int)tp->retrans_out >= 0); if (!tp->packets_out && tp->rx_opt.sack_ok) { + const struct inet_connection_sock *icsk = inet_csk(sk); if (tp->lost_out) { printk(KERN_DEBUG "Leak l=%u %d\n", - tp->lost_out, tp->ca_state); + tp->lost_out, icsk->icsk_ca_state); tp->lost_out = 0; } if (tp->sacked_out) { printk(KERN_DEBUG "Leak s=%u %d\n", - tp->sacked_out, tp->ca_state); + tp->sacked_out, icsk->icsk_ca_state); tp->sacked_out = 0; } if (tp->retrans_out) { printk(KERN_DEBUG "Leak r=%u %d\n", - tp->retrans_out, tp->ca_state); + tp->retrans_out, icsk->icsk_ca_state); tp->retrans_out = 0; } } @@ -2152,16 +2175,17 @@ static void tcp_ack_probe(struct sock *sk) } } -static inline int tcp_ack_is_dubious(struct tcp_sock *tp, int flag) +static inline int tcp_ack_is_dubious(const struct sock *sk, const int flag) { return (!(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) || - tp->ca_state != TCP_CA_Open); + inet_csk(sk)->icsk_ca_state != TCP_CA_Open); } -static inline int tcp_may_raise_cwnd(struct tcp_sock *tp, int flag) +static inline int tcp_may_raise_cwnd(const struct sock *sk, const int flag) { + const struct tcp_sock *tp = tcp_sk(sk); return (!(flag & FLAG_ECE) || tp->snd_cwnd < tp->snd_ssthresh) && - !((1<<tp->ca_state)&(TCPF_CA_Recovery|TCPF_CA_CWR)); + !((1 << inet_csk(sk)->icsk_ca_state) & (TCPF_CA_Recovery | TCPF_CA_CWR)); } /* Check that window update is acceptable. @@ -2251,6 +2275,7 @@ static void tcp_process_frto(struct sock *sk, u32 prior_snd_una) /* This routine deals with incoming acks, but not outgoing ones. */ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) { + struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); u32 prior_snd_una = tp->snd_una; u32 ack_seq = TCP_SKB_CB(skb)->seq; @@ -2278,7 +2303,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) tp->snd_una = ack; flag |= FLAG_WIN_UPDATE; - tcp_ca_event(tp, CA_EVENT_FAST_ACK); + tcp_ca_event(sk, CA_EVENT_FAST_ACK); NET_INC_STATS_BH(LINUX_MIB_TCPHPACKS); } else { @@ -2295,7 +2320,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) if (TCP_ECN_rcv_ecn_echo(tp, skb->h.th)) flag |= FLAG_ECE; - tcp_ca_event(tp, CA_EVENT_SLOW_ACK); + tcp_ca_event(sk, CA_EVENT_SLOW_ACK); } /* We passed data and got it acked, remove any soft error @@ -2311,19 +2336,19 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) /* See if we can take anything off of the retransmit queue. */ flag |= tcp_clean_rtx_queue(sk, &seq_rtt, - tp->ca_ops->rtt_sample ? &seq_usrtt : NULL); + icsk->icsk_ca_ops->rtt_sample ? &seq_usrtt : NULL); if (tp->frto_counter) tcp_process_frto(sk, prior_snd_una); - if (tcp_ack_is_dubious(tp, flag)) { /* Advanve CWND, if state allows this. 
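The tcp_may_raise_cwnd() test above folds the question "is a window reduction in progress?" into one branch-free expression: each TCPF_CA_* flag is 1 shifted by the matching TCP_CA_* state, so shifting 1 by the current state and masking is a constant-time set-membership check. Standalone, with the enum values mirroring the kernel's ordering:

    #include <stdio.h>

    enum tcp_ca_state {
        TCP_CA_Open, TCP_CA_Disorder, TCP_CA_CWR,
        TCP_CA_Recovery, TCP_CA_Loss
    };

    #define TCPF_CA_CWR      (1 << TCP_CA_CWR)
    #define TCPF_CA_Recovery (1 << TCP_CA_Recovery)

    /* Nonzero iff the state is in the {Recovery, CWR} set. */
    static int window_reduction_in_progress(enum tcp_ca_state s)
    {
        return (1 << s) & (TCPF_CA_Recovery | TCPF_CA_CWR);
    }

    int main(void)
    {
        for (enum tcp_ca_state s = TCP_CA_Open; s <= TCP_CA_Loss; s++)
            printf("state %d: reducing=%d\n", s,
                   !!window_reduction_in_progress(s));
        return 0;
    }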
*/ - if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(tp, flag)) - tcp_cong_avoid(tp, ack, seq_rtt, prior_in_flight, 0); + if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(sk, flag)) + tcp_cong_avoid(sk, ack, seq_rtt, prior_in_flight, 0); tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag); } else { if ((flag & FLAG_DATA_ACKED)) - tcp_cong_avoid(tp, ack, seq_rtt, prior_in_flight, 1); + tcp_cong_avoid(sk, ack, seq_rtt, prior_in_flight, 1); } if ((flag & FLAG_FORWARD_PROGRESS) || !(flag&FLAG_NOT_DUP)) @@ -2332,7 +2357,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) return 1; no_queue: - tp->probes_out = 0; + icsk->icsk_probes_out = 0; /* If this ack opens up a zero window, clear backoff. It was * being used to time the probes, and is probably far higher than @@ -3301,12 +3326,12 @@ void tcp_cwnd_application_limited(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - if (tp->ca_state == TCP_CA_Open && + if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open && sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { /* Limited by application or receiver window. */ u32 win_used = max(tp->snd_cwnd_used, 2U); if (win_used < tp->snd_cwnd) { - tp->snd_ssthresh = tcp_current_ssthresh(tp); + tp->snd_ssthresh = tcp_current_ssthresh(sk); tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1; } tp->snd_cwnd_used = 0; @@ -3935,7 +3960,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, tcp_init_metrics(sk); - tcp_init_congestion_control(tp); + tcp_init_congestion_control(sk); /* Prevent spurious tcp_cwnd_restart() on first data * packet. @@ -4212,7 +4237,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, tcp_init_metrics(sk); - tcp_init_congestion_control(tp); + tcp_init_congestion_control(sk); /* Prevent spurious tcp_cwnd_restart() on * first data packet. diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 32a0ebc589d..97bbf595230 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1409,13 +1409,14 @@ struct tcp_func ipv4_specific = { */ static int tcp_v4_init_sock(struct sock *sk) { + struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); skb_queue_head_init(&tp->out_of_order_queue); tcp_init_xmit_timers(sk); tcp_prequeue_init(tp); - inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT; + icsk->icsk_rto = TCP_TIMEOUT_INIT; tp->mdev = TCP_TIMEOUT_INIT; /* So many TCP implementations out there (incorrectly) count the @@ -1433,7 +1434,7 @@ static int tcp_v4_init_sock(struct sock *sk) tp->mss_cache = 536; tp->reordering = sysctl_tcp_reordering; - tp->ca_ops = &tcp_init_congestion_ops; + icsk->icsk_ca_ops = &tcp_init_congestion_ops; sk->sk_state = TCP_CLOSE; @@ -1456,7 +1457,7 @@ int tcp_v4_destroy_sock(struct sock *sk) tcp_clear_xmit_timers(sk); - tcp_cleanup_congestion_control(tp); + tcp_cleanup_congestion_control(sk); /* Cleanup up the write buffer. 
*/ sk_stream_writequeue_purge(sk); @@ -1883,7 +1884,7 @@ static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i) jiffies_to_clock_t(timer_expires - jiffies), icsk->icsk_retransmits, sock_i_uid(sp), - tp->probes_out, + icsk->icsk_probes_out, sock_i_ino(sp), atomic_read(&sp->sk_refcnt), sp, icsk->icsk_rto, diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index dc085233d51..a88db28b0af 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -384,9 +384,9 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, newtp->frto_counter = 0; newtp->frto_highmark = 0; - newtp->ca_ops = &tcp_reno; + newicsk->icsk_ca_ops = &tcp_reno; - tcp_set_ca_state(newtp, TCP_CA_Open); + tcp_set_ca_state(newsk, TCP_CA_Open); tcp_init_xmit_timers(newsk); skb_queue_head_init(&newtp->out_of_order_queue); newtp->rcv_wup = treq->rcv_isn + 1; @@ -399,7 +399,6 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, newtp->rx_opt.dsack = 0; newtp->rx_opt.eff_sacks = 0; - newtp->probes_out = 0; newtp->rx_opt.num_sacks = 0; newtp->urg_data = 0; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index f458eacb5ef..267b0fcbfc9 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -112,9 +112,9 @@ static void tcp_cwnd_restart(struct sock *sk, struct dst_entry *dst) u32 restart_cwnd = tcp_init_cwnd(tp, dst); u32 cwnd = tp->snd_cwnd; - tcp_ca_event(tp, CA_EVENT_CWND_RESTART); + tcp_ca_event(sk, CA_EVENT_CWND_RESTART); - tp->snd_ssthresh = tcp_current_ssthresh(tp); + tp->snd_ssthresh = tcp_current_ssthresh(sk); restart_cwnd = min(restart_cwnd, cwnd); while ((delta -= inet_csk(sk)->icsk_rto) > 0 && cwnd > restart_cwnd) @@ -265,6 +265,7 @@ static __inline__ u16 tcp_select_window(struct sock *sk) static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) { if (skb != NULL) { + const struct inet_connection_sock *icsk = inet_csk(sk); struct inet_sock *inet = inet_sk(sk); struct tcp_sock *tp = tcp_sk(sk); struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); @@ -280,7 +281,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) #define SYSCTL_FLAG_SACK 0x4 /* If congestion control is doing timestamping */ - if (tp->ca_ops->rtt_sample) + if (icsk->icsk_ca_ops->rtt_sample) do_gettimeofday(&skb->stamp); sysctl_flags = 0; @@ -308,7 +309,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) } if (tcp_packets_in_flight(tp) == 0) - tcp_ca_event(tp, CA_EVENT_TX_START); + tcp_ca_event(sk, CA_EVENT_TX_START); th = (struct tcphdr *) skb_push(skb, tcp_header_size); skb->h.th = th; @@ -366,7 +367,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) if (err <= 0) return err; - tcp_enter_cwr(tp); + tcp_enter_cwr(sk); /* NET_XMIT_CN is special. It does not guarantee, * that this packet is lost. 
It tells that device @@ -905,12 +906,13 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, */ static int tcp_tso_should_defer(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb) { + const struct inet_connection_sock *icsk = inet_csk(sk); u32 send_win, cong_win, limit, in_flight; if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) return 0; - if (tp->ca_state != TCP_CA_Open) + if (icsk->icsk_ca_state != TCP_CA_Open) return 0; in_flight = tcp_packets_in_flight(tp); @@ -1287,6 +1289,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m */ void tcp_simple_retransmit(struct sock *sk) { + const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; unsigned int mss = tcp_current_mss(sk, 0); @@ -1317,12 +1320,12 @@ void tcp_simple_retransmit(struct sock *sk) * in network, but units changed and effective * cwnd/ssthresh really reduced now. */ - if (tp->ca_state != TCP_CA_Loss) { + if (icsk->icsk_ca_state != TCP_CA_Loss) { tp->high_seq = tp->snd_nxt; - tp->snd_ssthresh = tcp_current_ssthresh(tp); + tp->snd_ssthresh = tcp_current_ssthresh(sk); tp->prior_ssthresh = 0; tp->undo_marker = 0; - tcp_set_ca_state(tp, TCP_CA_Loss); + tcp_set_ca_state(sk, TCP_CA_Loss); } tcp_xmit_retransmit_queue(sk); } @@ -1462,6 +1465,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) */ void tcp_xmit_retransmit_queue(struct sock *sk) { + const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; int packet_cnt = tp->lost_out; @@ -1485,7 +1489,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk) if (!(sacked&(TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) { if (tcp_retransmit_skb(sk, skb)) return; - if (tp->ca_state != TCP_CA_Loss) + if (icsk->icsk_ca_state != TCP_CA_Loss) NET_INC_STATS_BH(LINUX_MIB_TCPFASTRETRANS); else NET_INC_STATS_BH(LINUX_MIB_TCPSLOWSTARTRETRANS); @@ -1507,7 +1511,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk) /* OK, demanded retransmission is finished. */ /* Forward retransmissions are possible only during Recovery. */ - if (tp->ca_state != TCP_CA_Recovery) + if (icsk->icsk_ca_state != TCP_CA_Recovery) return; /* No forward retransmissions in Reno are possible. */ @@ -2028,7 +2032,7 @@ void tcp_send_probe0(struct sock *sk) if (tp->packets_out || !sk->sk_send_head) { /* Cancel probe timer, if it is not required. */ - tp->probes_out = 0; + icsk->icsk_probes_out = 0; icsk->icsk_backoff = 0; return; } @@ -2036,19 +2040,19 @@ void tcp_send_probe0(struct sock *sk) if (err <= 0) { if (icsk->icsk_backoff < sysctl_tcp_retries2) icsk->icsk_backoff++; - tp->probes_out++; + icsk->icsk_probes_out++; inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX), TCP_RTO_MAX); } else { /* If packet was not sent due to local congestion, - * do not backoff and do not remember probes_out. + * do not backoff and do not remember icsk_probes_out. * Let local senders to fight for local resources. * * Use accumulated backoff yet. 
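The zero-window probe logic above keeps its clamped exponential backoff: every unanswered probe doubles the wait, but never past TCP_RTO_MAX, while a send that failed only because of local congestion reuses the accumulated backoff against the much shorter TCP_RESOURCE_PROBE_INTERVAL. The schedule that produces, assuming HZ=1000 and the usual constants:

    #include <stdio.h>

    #define HZ 1000                             /* assumed */
    #define TCP_RTO_MAX (120 * HZ)
    #define TCP_RESOURCE_PROBE_INTERVAL (HZ / 2)

    static unsigned long min_ul(unsigned long a, unsigned long b)
    {
        return a < b ? a : b;
    }

    int main(void)
    {
        unsigned long icsk_rto = 3 * HZ;        /* TCP_TIMEOUT_INIT */

        for (int backoff = 0; backoff <= 8; backoff++)
            printf("backoff %d: next probe in %lu ms"
                   " (local congestion: %lu ms)\n",
                   backoff,
                   min_ul(icsk_rto << backoff, TCP_RTO_MAX),
                   min_ul(icsk_rto << backoff,
                          TCP_RESOURCE_PROBE_INTERVAL));
        return 0;
    }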
*/ - if (!tp->probes_out) - tp->probes_out=1; + if (!icsk->icsk_probes_out) + icsk->icsk_probes_out = 1; inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RESOURCE_PROBE_INTERVAL), diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c index 70e108e15c7..327770bf552 100644 --- a/net/ipv4/tcp_scalable.c +++ b/net/ipv4/tcp_scalable.c @@ -16,9 +16,10 @@ #define TCP_SCALABLE_AI_CNT 50U #define TCP_SCALABLE_MD_SCALE 3 -static void tcp_scalable_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, +static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 rtt, u32 in_flight, int flag) { + struct tcp_sock *tp = tcp_sk(sk); if (in_flight < tp->snd_cwnd) return; @@ -35,8 +36,9 @@ static void tcp_scalable_cong_avoid(struct tcp_sock *tp, u32 ack, u32 rtt, tp->snd_cwnd_stamp = tcp_time_stamp; } -static u32 tcp_scalable_ssthresh(struct tcp_sock *tp) +static u32 tcp_scalable_ssthresh(struct sock *sk) { + const struct tcp_sock *tp = tcp_sk(sk); return max(tp->snd_cwnd - (tp->snd_cwnd>>TCP_SCALABLE_MD_SCALE), 2U); } diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 72cec698183..415ee47ac1c 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -233,11 +233,12 @@ out_unlock: static void tcp_probe_timer(struct sock *sk) { + struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); int max_probes; if (tp->packets_out || !sk->sk_send_head) { - tp->probes_out = 0; + icsk->icsk_probes_out = 0; return; } @@ -248,7 +249,7 @@ static void tcp_probe_timer(struct sock *sk) * FIXME: We ought not to do it, Solaris 2.5 actually has fixing * this behaviour in Solaris down as a bug fix. [AC] * - * Let me to explain. probes_out is zeroed by incoming ACKs + * Let me to explain. icsk_probes_out is zeroed by incoming ACKs * even if they advertise zero window. Hence, connection is killed only * if we received no ACKs for normal connection timeout. It is not killed * only because window stays zero for some time, window may be zero @@ -259,16 +260,15 @@ static void tcp_probe_timer(struct sock *sk) max_probes = sysctl_tcp_retries2; if (sock_flag(sk, SOCK_DEAD)) { - const struct inet_connection_sock *icsk = inet_csk(sk); const int alive = ((icsk->icsk_rto << icsk->icsk_backoff) < TCP_RTO_MAX); max_probes = tcp_orphan_retries(sk, alive); - if (tcp_out_of_resources(sk, alive || tp->probes_out <= max_probes)) + if (tcp_out_of_resources(sk, alive || icsk->icsk_probes_out <= max_probes)) return; } - if (tp->probes_out > max_probes) { + if (icsk->icsk_probes_out > max_probes) { tcp_write_err(sk); } else { /* Only send another probe if we didn't close things up. 
*/ @@ -319,19 +319,20 @@ static void tcp_retransmit_timer(struct sock *sk) goto out; if (icsk->icsk_retransmits == 0) { - if (tp->ca_state == TCP_CA_Disorder || tp->ca_state == TCP_CA_Recovery) { + if (icsk->icsk_ca_state == TCP_CA_Disorder || + icsk->icsk_ca_state == TCP_CA_Recovery) { if (tp->rx_opt.sack_ok) { - if (tp->ca_state == TCP_CA_Recovery) + if (icsk->icsk_ca_state == TCP_CA_Recovery) NET_INC_STATS_BH(LINUX_MIB_TCPSACKRECOVERYFAIL); else NET_INC_STATS_BH(LINUX_MIB_TCPSACKFAILURES); } else { - if (tp->ca_state == TCP_CA_Recovery) + if (icsk->icsk_ca_state == TCP_CA_Recovery) NET_INC_STATS_BH(LINUX_MIB_TCPRENORECOVERYFAIL); else NET_INC_STATS_BH(LINUX_MIB_TCPRENOFAILURES); } - } else if (tp->ca_state == TCP_CA_Loss) { + } else if (icsk->icsk_ca_state == TCP_CA_Loss) { NET_INC_STATS_BH(LINUX_MIB_TCPLOSSFAILURES); } else { NET_INC_STATS_BH(LINUX_MIB_TCPTIMEOUTS); @@ -449,6 +450,7 @@ void tcp_set_keepalive(struct sock *sk, int val) static void tcp_keepalive_timer (unsigned long data) { struct sock *sk = (struct sock *) data; + struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); __u32 elapsed; @@ -490,14 +492,14 @@ static void tcp_keepalive_timer (unsigned long data) elapsed = tcp_time_stamp - tp->rcv_tstamp; if (elapsed >= keepalive_time_when(tp)) { - if ((!tp->keepalive_probes && tp->probes_out >= sysctl_tcp_keepalive_probes) || - (tp->keepalive_probes && tp->probes_out >= tp->keepalive_probes)) { + if ((!tp->keepalive_probes && icsk->icsk_probes_out >= sysctl_tcp_keepalive_probes) || + (tp->keepalive_probes && icsk->icsk_probes_out >= tp->keepalive_probes)) { tcp_send_active_reset(sk, GFP_ATOMIC); tcp_write_err(sk); goto out; } if (tcp_write_wakeup(sk) <= 0) { - tp->probes_out++; + icsk->icsk_probes_out++; elapsed = keepalive_intvl_when(tp); } else { /* If keepalive was lost due to local congestion, diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index 9bd443db519..054de24efee 100644 --- a/net/ipv4/tcp_vegas.c +++ b/net/ipv4/tcp_vegas.c @@ -82,9 +82,10 @@ struct vegas { * Instead we must wait until the completion of an RTT during * which we actually receive ACKs. */ -static inline void vegas_enable(struct tcp_sock *tp) +static inline void vegas_enable(struct sock *sk) { - struct vegas *vegas = tcp_ca(tp); + const struct tcp_sock *tp = tcp_sk(sk); + struct vegas *vegas = inet_csk_ca(sk); /* Begin taking Vegas samples next time we send something. */ vegas->doing_vegas_now = 1; @@ -97,19 +98,19 @@ static inline void vegas_enable(struct tcp_sock *tp) } /* Stop taking Vegas samples for now. */ -static inline void vegas_disable(struct tcp_sock *tp) +static inline void vegas_disable(struct sock *sk) { - struct vegas *vegas = tcp_ca(tp); + struct vegas *vegas = inet_csk_ca(sk); vegas->doing_vegas_now = 0; } -static void tcp_vegas_init(struct tcp_sock *tp) +static void tcp_vegas_init(struct sock *sk) { - struct vegas *vegas = tcp_ca(tp); + struct vegas *vegas = inet_csk_ca(sk); vegas->baseRTT = 0x7fffffff; - vegas_enable(tp); + vegas_enable(sk); } /* Do RTT sampling needed for Vegas. 
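The Vegas sampler, continued just below, needs only two running minima: baseRTT, the smallest RTT ever seen (taken as the propagation delay), and minRTT, the smallest within the current window of ACKs; vrtt = usrtt + 1 keeps zero out of both filters. A compact model with invented microsecond samples:

    #include <stdio.h>
    #include <stdint.h>

    struct vegas {
        uint32_t baseRTT;   /* min over connection lifetime */
        uint32_t minRTT;    /* min over the current RTT */
        uint32_t cntRTT;    /* samples this RTT */
    };

    static void vegas_rtt_calc(struct vegas *v, uint32_t usrtt)
    {
        uint32_t vrtt = usrtt + 1;  /* never allow zero */

        if (vrtt < v->baseRTT)
            v->baseRTT = vrtt;
        if (vrtt < v->minRTT)
            v->minRTT = vrtt;
        v->cntRTT++;
    }

    int main(void)
    {
        struct vegas v = { 0x7fffffff, 0x7fffffff, 0 };
        uint32_t rtts_us[] = { 30000, 28000, 45000, 29000 };

        for (unsigned i = 0; i < sizeof(rtts_us) / sizeof(*rtts_us); i++)
            vegas_rtt_calc(&v, rtts_us[i]);

        printf("baseRTT=%u us over %u samples\n", v.baseRTT, v.cntRTT);
        return 0;
    }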
@@ -120,9 +121,9 @@ static void tcp_vegas_init(struct tcp_sock *tp) * o min-filter RTT samples from a much longer window (forever for now) * to find the propagation delay (baseRTT) */ -static void tcp_vegas_rtt_calc(struct tcp_sock *tp, u32 usrtt) +static void tcp_vegas_rtt_calc(struct sock *sk, u32 usrtt) { - struct vegas *vegas = tcp_ca(tp); + struct vegas *vegas = inet_csk_ca(sk); u32 vrtt = usrtt + 1; /* Never allow zero rtt or baseRTT */ /* Filter to find propagation delay: */ @@ -136,13 +137,13 @@ static void tcp_vegas_rtt_calc(struct tcp_sock *tp, u32 usrtt) vegas->cntRTT++; } -static void tcp_vegas_state(struct tcp_sock *tp, u8 ca_state) +static void tcp_vegas_state(struct sock *sk, u8 ca_state) { if (ca_state == TCP_CA_Open) - vegas_enable(tp); + vegas_enable(sk); else - vegas_disable(tp); + vegas_disable(sk); } /* @@ -154,20 +155,21 @@ static void tcp_vegas_state(struct tcp_sock *tp, u8 ca_state) * packets, _then_ we can make Vegas calculations * again. */ -static void tcp_vegas_cwnd_event(struct tcp_sock *tp, enum tcp_ca_event event) +static void tcp_vegas_cwnd_event(struct sock *sk, enum tcp_ca_event event) { if (event == CA_EVENT_CWND_RESTART || event == CA_EVENT_TX_START) - tcp_vegas_init(tp); + tcp_vegas_init(sk); } -static void tcp_vegas_cong_avoid(struct tcp_sock *tp, u32 ack, +static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 seq_rtt, u32 in_flight, int flag) { - struct vegas *vegas = tcp_ca(tp); + struct tcp_sock *tp = tcp_sk(sk); + struct vegas *vegas = inet_csk_ca(sk); if (!vegas->doing_vegas_now) - return tcp_reno_cong_avoid(tp, ack, seq_rtt, in_flight, flag); + return tcp_reno_cong_avoid(sk, ack, seq_rtt, in_flight, flag); /* The key players are v_beg_snd_una and v_beg_snd_nxt. * @@ -219,7 +221,7 @@ static void tcp_vegas_cong_avoid(struct tcp_sock *tp, u32 ack, * but that's not too awful, since we're taking the min, * rather than averaging. */ - tcp_vegas_rtt_calc(tp, seq_rtt*1000); + tcp_vegas_rtt_calc(sk, seq_rtt * 1000); /* We do the Vegas calculations only if we got enough RTT * samples that we can be reasonably sure that we got @@ -359,10 +361,10 @@ static void tcp_vegas_cong_avoid(struct tcp_sock *tp, u32 ack, } /* Extract info for Tcp socket info provided via netlink. */ -static void tcp_vegas_get_info(struct tcp_sock *tp, u32 ext, +static void tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb) { - const struct vegas *ca = tcp_ca(tp); + const struct vegas *ca = inet_csk_ca(sk); if (ext & (1<<(TCPDIAG_VEGASINFO-1))) { struct tcpvegas_info *info; @@ -393,7 +395,7 @@ static struct tcp_congestion_ops tcp_vegas = { static int __init tcp_vegas_register(void) { - BUG_ON(sizeof(struct vegas) > TCP_CA_PRIV_SIZE); + BUG_ON(sizeof(struct vegas) > ICSK_CA_PRIV_SIZE); tcp_register_congestion_control(&tcp_vegas); return 0; } diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c index ef827242c94..d8a5a2b92e3 100644 --- a/net/ipv4/tcp_westwood.c +++ b/net/ipv4/tcp_westwood.c @@ -40,9 +40,9 @@ struct westwood { * way as soon as possible. It will reasonably happen within the first * RTT period of the connection lifetime. 
*/ -static void tcp_westwood_init(struct tcp_sock *tp) +static void tcp_westwood_init(struct sock *sk) { - struct westwood *w = tcp_ca(tp); + struct westwood *w = inet_csk_ca(sk); w->bk = 0; w->bw_ns_est = 0; @@ -51,7 +51,7 @@ static void tcp_westwood_init(struct tcp_sock *tp) w->cumul_ack = 0; w->rtt_min = w->rtt = TCP_WESTWOOD_INIT_RTT; w->rtt_win_sx = tcp_time_stamp; - w->snd_una = tp->snd_una; + w->snd_una = tcp_sk(sk)->snd_una; } /* @@ -74,11 +74,11 @@ static inline void westwood_filter(struct westwood *w, u32 delta) * Called after processing group of packets. * but all westwood needs is the last sample of srtt. */ -static void tcp_westwood_pkts_acked(struct tcp_sock *tp, u32 cnt) +static void tcp_westwood_pkts_acked(struct sock *sk, u32 cnt) { - struct westwood *w = tcp_ca(tp); + struct westwood *w = inet_csk_ca(sk); if (cnt > 0) - w->rtt = tp->srtt >> 3; + w->rtt = tcp_sk(sk)->srtt >> 3; } /* @@ -86,9 +86,9 @@ static void tcp_westwood_pkts_acked(struct tcp_sock *tp, u32 cnt) * It updates RTT evaluation window if it is the right moment to do * it. If so it calls filter for evaluating bandwidth. */ -static void westwood_update_window(struct tcp_sock *tp) +static void westwood_update_window(struct sock *sk) { - struct westwood *w = tcp_ca(tp); + struct westwood *w = inet_csk_ca(sk); s32 delta = tcp_time_stamp - w->rtt_win_sx; /* @@ -114,11 +114,12 @@ static void westwood_update_window(struct tcp_sock *tp) * header prediction is successful. In such case in fact update is * straight forward and doesn't need any particular care. */ -static inline void westwood_fast_bw(struct tcp_sock *tp) +static inline void westwood_fast_bw(struct sock *sk) { - struct westwood *w = tcp_ca(tp); + const struct tcp_sock *tp = tcp_sk(sk); + struct westwood *w = inet_csk_ca(sk); - westwood_update_window(tp); + westwood_update_window(sk); w->bk += tp->snd_una - w->snd_una; w->snd_una = tp->snd_una; @@ -130,9 +131,10 @@ static inline void westwood_fast_bw(struct tcp_sock *tp) * This function evaluates cumul_ack for evaluating bk in case of * delayed or partial acks. */ -static inline u32 westwood_acked_count(struct tcp_sock *tp) +static inline u32 westwood_acked_count(struct sock *sk) { - struct westwood *w = tcp_ca(tp); + const struct tcp_sock *tp = tcp_sk(sk); + struct westwood *w = inet_csk_ca(sk); w->cumul_ack = tp->snd_una - w->snd_una; @@ -160,9 +162,10 @@ static inline u32 westwood_acked_count(struct tcp_sock *tp) return w->cumul_ack; } -static inline u32 westwood_bw_rttmin(const struct tcp_sock *tp) +static inline u32 westwood_bw_rttmin(const struct sock *sk) { - struct westwood *w = tcp_ca(tp); + const struct tcp_sock *tp = tcp_sk(sk); + const struct westwood *w = inet_csk_ca(sk); return max_t(u32, (w->bw_est * w->rtt_min) / tp->mss_cache, 2); } @@ -172,31 +175,32 @@ static inline u32 westwood_bw_rttmin(const struct tcp_sock *tp) * in packets we use mss_cache). Rttmin is guaranteed to be >= 2 * so avoids ever returning 0. 
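westwood_bw_rttmin above is the whole idea of Westwood's faster recovery in one expression: bandwidth estimate times minimum RTT is the pipe size in bytes, dividing by the MSS converts that to segments, and the floor of 2 matches Reno's lower bound. With assumed numbers (bw_est in bytes per jiffy and HZ=1000, so 1500 bytes/jiffy is roughly 12 Mbit/s):

    #include <stdio.h>
    #include <stdint.h>

    static uint32_t westwood_bw_rttmin(uint32_t bw_est, uint32_t rtt_min,
                                       uint32_t mss_cache)
    {
        uint32_t cwnd = (bw_est * rtt_min) / mss_cache; /* pipe in segments */
        return cwnd > 2 ? cwnd : 2;                     /* floor of 2 */
    }

    int main(void)
    {
        /* 1500 bytes/jiffy at HZ=1000; rtt_min of 50 jiffies is 50 ms. */
        printf("cwnd=%u segments\n", westwood_bw_rttmin(1500, 50, 1460));
        return 0;
    }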
*/ -static u32 tcp_westwood_cwnd_min(struct tcp_sock *tp) +static u32 tcp_westwood_cwnd_min(struct sock *sk) { - return westwood_bw_rttmin(tp); + return westwood_bw_rttmin(sk); } -static void tcp_westwood_event(struct tcp_sock *tp, enum tcp_ca_event event) +static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event) { - struct westwood *w = tcp_ca(tp); + struct tcp_sock *tp = tcp_sk(sk); + struct westwood *w = inet_csk_ca(sk); switch(event) { case CA_EVENT_FAST_ACK: - westwood_fast_bw(tp); + westwood_fast_bw(sk); break; case CA_EVENT_COMPLETE_CWR: - tp->snd_cwnd = tp->snd_ssthresh = westwood_bw_rttmin(tp); + tp->snd_cwnd = tp->snd_ssthresh = westwood_bw_rttmin(sk); break; case CA_EVENT_FRTO: - tp->snd_ssthresh = westwood_bw_rttmin(tp); + tp->snd_ssthresh = westwood_bw_rttmin(sk); break; case CA_EVENT_SLOW_ACK: - westwood_update_window(tp); - w->bk += westwood_acked_count(tp); + westwood_update_window(sk); + w->bk += westwood_acked_count(sk); w->rtt_min = min(w->rtt, w->rtt_min); break; @@ -208,10 +212,10 @@ static void tcp_westwood_event(struct tcp_sock *tp, enum tcp_ca_event event) /* Extract info for Tcp socket info provided via netlink. */ -static void tcp_westwood_info(struct tcp_sock *tp, u32 ext, +static void tcp_westwood_info(struct sock *sk, u32 ext, struct sk_buff *skb) { - const struct westwood *ca = tcp_ca(tp); + const struct westwood *ca = inet_csk_ca(sk); if (ext & (1<<(TCPDIAG_VEGASINFO-1))) { struct rtattr *rta; struct tcpvegas_info *info; @@ -242,7 +246,7 @@ static struct tcp_congestion_ops tcp_westwood = { static int __init tcp_westwood_register(void) { - BUG_ON(sizeof(struct westwood) > TCP_CA_PRIV_SIZE); + BUG_ON(sizeof(struct westwood) > ICSK_CA_PRIV_SIZE); return tcp_register_congestion_control(&tcp_westwood); } -- cgit v1.2.3 From 540722ffc3a0d7e11d97a13e1ce6f3bc23b061c1 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 10 Aug 2005 05:54:28 -0300 Subject: [TCPDIAG]: Implement cheapest way of supporting DCCPDIAG_GETSOCK With ugly ifdefs, etc, but this actually: 1. keeps the existing ABI, i.e. no need to recompile the iproute2 utilities if not interested in DCCP. 2. provides all the tcp_diag functionality in DCCP, with just a small patch that makes iproute2 support DCCP. Of course I'll get this cleaned up in time, but for now I think it's OK to be this way to quickly get this functionality. iproute2-ss050808 patch at: http://vger.kernel.org/~acme/iproute2-ss050808.dccp.patch Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S.
Miller --- net/ipv4/tcp_diag.c | 86 +++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 61 insertions(+), 25 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index 4288ecfec9a..f5fc84aaa9b 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -45,11 +45,15 @@ static struct sock *tcpnl; #define TCPDIAG_PUT(skb, attrtype, attrlen) \ RTA_DATA(__RTA_PUT(skb, attrtype, attrlen)) +#if defined(CONFIG_IP_DCCP) || defined(CONFIG_IP_DCCP_MODULE) +extern struct inet_hashinfo dccp_hashinfo; +#endif + static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, - int ext, u32 pid, u32 seq, u16 nlmsg_flags) + int ext, u32 pid, u32 seq, u16 nlmsg_flags, + const struct nlmsghdr *unlh) { const struct inet_sock *inet = inet_sk(sk); - struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); struct tcpdiagmsg *r; struct nlmsghdr *nlh; @@ -57,7 +61,7 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, struct tcpdiag_meminfo *minfo = NULL; unsigned char *b = skb->tail; - nlh = NLMSG_PUT(skb, pid, seq, TCPDIAG_GETSOCK, sizeof(*r)); + nlh = NLMSG_PUT(skb, pid, seq, unlh->nlmsg_type, sizeof(*r)); nlh->nlmsg_flags = nlmsg_flags; r = NLMSG_DATA(nlh); if (sk->sk_state != TCP_TIME_WAIT) { @@ -147,8 +151,20 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, r->tcpdiag_expires = 0; } #undef EXPIRES_IN_MS - r->tcpdiag_rqueue = tp->rcv_nxt - tp->copied_seq; - r->tcpdiag_wqueue = tp->write_seq - tp->snd_una; + /* + * Ahem... for now we'll have some knowledge about TCP -acme + * But this is just one of two small exceptions, both in this + * function, so lets close our eyes for some 15 lines or so... 8) + * -acme + */ + if (sk->sk_protocol == IPPROTO_TCP) { + const struct tcp_sock *tp = tcp_sk(sk); + + r->tcpdiag_rqueue = tp->rcv_nxt - tp->copied_seq; + r->tcpdiag_wqueue = tp->write_seq - tp->snd_una; + } else + r->tcpdiag_rqueue = r->tcpdiag_wqueue = 0; + r->tcpdiag_uid = sock_i_uid(sk); r->tcpdiag_inode = sock_i_ino(sk); @@ -159,8 +175,13 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, minfo->tcpdiag_tmem = atomic_read(&sk->sk_wmem_alloc); } - if (info) - tcp_get_info(sk, info); + /* Ahem... 
for now we'll have some knowledge about TCP -acme */ + if (info) { + if (sk->sk_protocol == IPPROTO_TCP) + tcp_get_info(sk, info); + else + memset(info, 0, sizeof(*info)); + } if (sk->sk_state < TCP_TIME_WAIT && icsk->icsk_ca_ops && icsk->icsk_ca_ops->get_info) @@ -194,9 +215,13 @@ static int tcpdiag_get_exact(struct sk_buff *in_skb, const struct nlmsghdr *nlh) struct sock *sk; struct tcpdiagreq *req = NLMSG_DATA(nlh); struct sk_buff *rep; - + struct inet_hashinfo *hashinfo = &tcp_hashinfo; +#if defined(CONFIG_IP_DCCP) || defined(CONFIG_IP_DCCP_MODULE) + if (nlh->nlmsg_type == DCCPDIAG_GETSOCK) + hashinfo = &dccp_hashinfo; +#endif if (req->tcpdiag_family == AF_INET) { - sk = inet_lookup(&tcp_hashinfo, req->id.tcpdiag_dst[0], + sk = inet_lookup(hashinfo, req->id.tcpdiag_dst[0], req->id.tcpdiag_dport, req->id.tcpdiag_src[0], req->id.tcpdiag_sport, req->id.tcpdiag_if); } @@ -230,7 +255,7 @@ static int tcpdiag_get_exact(struct sk_buff *in_skb, const struct nlmsghdr *nlh) if (tcpdiag_fill(rep, sk, req->tcpdiag_ext, NETLINK_CB(in_skb).pid, - nlh->nlmsg_seq, 0) <= 0) + nlh->nlmsg_seq, 0, nlh) <= 0) BUG(); err = netlink_unicast(tcpnl, rep, NETLINK_CB(in_skb).pid, MSG_DONTWAIT); @@ -436,12 +461,13 @@ static int tcpdiag_dump_sock(struct sk_buff *skb, struct sock *sk, } return tcpdiag_fill(skb, sk, r->tcpdiag_ext, NETLINK_CB(cb->skb).pid, - cb->nlh->nlmsg_seq, NLM_F_MULTI); + cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh); } static int tcpdiag_fill_req(struct sk_buff *skb, struct sock *sk, struct request_sock *req, - u32 pid, u32 seq) + u32 pid, u32 seq, + const struct nlmsghdr *unlh) { const struct inet_request_sock *ireq = inet_rsk(req); struct inet_sock *inet = inet_sk(sk); @@ -450,7 +476,7 @@ static int tcpdiag_fill_req(struct sk_buff *skb, struct sock *sk, struct nlmsghdr *nlh; long tmo; - nlh = NLMSG_PUT(skb, pid, seq, TCPDIAG_GETSOCK, sizeof(*r)); + nlh = NLMSG_PUT(skb, pid, seq, unlh->nlmsg_type, sizeof(*r)); nlh->nlmsg_flags = NLM_F_MULTI; r = NLMSG_DATA(nlh); @@ -526,7 +552,7 @@ static int tcpdiag_dump_reqs(struct sk_buff *skb, struct sock *sk, entry.userlocks = sk->sk_userlocks; } - for (j = s_j; j < TCP_SYNQ_HSIZE; j++) { + for (j = s_j; j < lopt->nr_table_entries; j++) { struct request_sock *req, *head = lopt->syn_table[j]; reqnum = 0; @@ -561,7 +587,7 @@ static int tcpdiag_dump_reqs(struct sk_buff *skb, struct sock *sk, err = tcpdiag_fill_req(skb, sk, req, NETLINK_CB(cb->skb).pid, - cb->nlh->nlmsg_seq); + cb->nlh->nlmsg_seq, cb->nlh); if (err < 0) { cb->args[3] = j + 1; cb->args[4] = reqnum; @@ -583,20 +609,26 @@ static int tcpdiag_dump(struct sk_buff *skb, struct netlink_callback *cb) int i, num; int s_i, s_num; struct tcpdiagreq *r = NLMSG_DATA(cb->nlh); + struct inet_hashinfo *hashinfo; s_i = cb->args[1]; s_num = num = cb->args[2]; - + hashinfo = &tcp_hashinfo; +#if defined(CONFIG_IP_DCCP) || defined(CONFIG_IP_DCCP_MODULE) + if (cb->nlh->nlmsg_type == DCCPDIAG_GETSOCK) + hashinfo = &dccp_hashinfo; +#endif if (cb->args[0] == 0) { if (!(r->tcpdiag_states&(TCPF_LISTEN|TCPF_SYN_RECV))) goto skip_listen_ht; - inet_listen_lock(&tcp_hashinfo); + + inet_listen_lock(hashinfo); for (i = s_i; i < INET_LHTABLE_SIZE; i++) { struct sock *sk; struct hlist_node *node; num = 0; - sk_for_each(sk, node, &tcp_hashinfo.listening_hash[i]) { + sk_for_each(sk, node, &hashinfo->listening_hash[i]) { struct inet_sock *inet = inet_sk(sk); if (num < s_num) { @@ -614,7 +646,7 @@ static int tcpdiag_dump(struct sk_buff *skb, struct netlink_callback *cb) goto syn_recv; if (tcpdiag_dump_sock(skb, sk, cb) < 0) { - 
inet_listen_unlock(&tcp_hashinfo); + inet_listen_unlock(hashinfo); goto done; } @@ -623,7 +655,7 @@ syn_recv: goto next_listen; if (tcpdiag_dump_reqs(skb, sk, cb) < 0) { - inet_listen_unlock(&tcp_hashinfo); + inet_listen_unlock(hashinfo); goto done; } @@ -637,7 +669,7 @@ next_listen: cb->args[3] = 0; cb->args[4] = 0; } - inet_listen_unlock(&tcp_hashinfo); + inet_listen_unlock(hashinfo); skip_listen_ht: cb->args[0] = 1; s_i = num = s_num = 0; @@ -646,8 +678,8 @@ skip_listen_ht: if (!(r->tcpdiag_states&~(TCPF_LISTEN|TCPF_SYN_RECV))) return skb->len; - for (i = s_i; i < tcp_hashinfo.ehash_size; i++) { - struct inet_ehash_bucket *head = &tcp_hashinfo.ehash[i]; + for (i = s_i; i < hashinfo->ehash_size; i++) { + struct inet_ehash_bucket *head = &hashinfo->ehash[i]; struct sock *sk; struct hlist_node *node; @@ -679,7 +711,7 @@ next_normal: if (r->tcpdiag_states&TCPF_TIME_WAIT) { sk_for_each(sk, node, - &tcp_hashinfo.ehash[i + tcp_hashinfo.ehash_size].chain) { + &hashinfo->ehash[i + hashinfo->ehash_size].chain) { struct inet_sock *inet = inet_sk(sk); if (num < s_num) @@ -719,7 +751,11 @@ tcpdiag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (!(nlh->nlmsg_flags&NLM_F_REQUEST)) return 0; - if (nlh->nlmsg_type != TCPDIAG_GETSOCK) + if (nlh->nlmsg_type != TCPDIAG_GETSOCK +#if defined(CONFIG_IP_DCCP) || defined(CONFIG_IP_DCCP_MODULE) + && nlh->nlmsg_type != DCCPDIAG_GETSOCK +#endif + ) goto err_inval; if (NLMSG_LENGTH(sizeof(struct tcpdiagreq)) > skb->len) -- cgit v1.2.3 From e41aac41e3856c87fee52c5b8bca71705d15449b Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 11 Aug 2005 14:37:16 -0700 Subject: [TCPDIAG]: Introduce CONFIG_IP_TCPDIAG_DCCP Similar to CONFIG_IP_TCPDIAG_IPV6 Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- net/ipv4/Kconfig | 9 ++++++--- net/ipv4/tcp_diag.c | 8 ++++---- 2 files changed, 10 insertions(+), 7 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 0b3d9f1d806..c844954c1ad 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -419,15 +419,18 @@ config IP_TCPDIAG ---help--- Support for TCP socket monitoring interface used by native Linux tools such as ss. ss is included in iproute2, currently downloadable - at . If you want IPv6 support - and have selected IPv6 as a module, you need to build this as a - module too. + at . If you want IPv6 or DCCP + support and have selected IPv6 or DCCP as a module, you need to build + this as a module too. If unsure, say Y. 
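
The ABI claim in the DCCPDIAG_GETSOCK changeset above comes down to this: request and reply formats are unchanged and only the nlmsg_type selects the hash table, so an unpatched ss keeps working for TCP and a patched one differs by a single constant. A minimal userspace sketch of the dump request, not part of these patches, assuming DCCPDIAG_GETSOCK and struct tcpdiagreq are exported by the same kernel headers ss already uses:

#include <string.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/tcp_diag.h>	/* assumed home of struct tcpdiagreq and DCCPDIAG_GETSOCK */

static int dccp_diag_dump_request(void)
{
	struct sockaddr_nl kernel = { .nl_family = AF_NETLINK };
	struct {
		struct nlmsghdr nlh;
		struct tcpdiagreq req;
	} msg;
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_TCPDIAG);

	if (fd < 0)
		return -1;

	memset(&msg, 0, sizeof(msg));
	msg.nlh.nlmsg_len = NLMSG_LENGTH(sizeof(msg.req));
	msg.nlh.nlmsg_type = DCCPDIAG_GETSOCK;	/* the only delta from a TCP dump */
	msg.nlh.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
	msg.req.tcpdiag_family = AF_INET;
	msg.req.tcpdiag_states = ~0U;		/* all socket states */

	return sendto(fd, &msg, msg.nlh.nlmsg_len, 0,
		      (struct sockaddr *)&kernel, sizeof(kernel));
}

Replies come back as the usual multipart stream of struct tcpdiagmsg records, which is why the iproute2 side needs only the small patch linked above.
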
config IP_TCPDIAG_IPV6 def_bool (IP_TCPDIAG=y && IPV6=y) || (IP_TCPDIAG=m && IPV6) +config IP_TCPDIAG_DCCP + def_bool (IP_TCPDIAG=y && IP_DCCP=y) || (IP_TCPDIAG=m && IP_DCCP) + config TCP_CONG_ADVANCED bool "TCP: advanced congestion control" ---help--- diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index f5fc84aaa9b..8bf495c698f 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -45,7 +45,7 @@ static struct sock *tcpnl; #define TCPDIAG_PUT(skb, attrtype, attrlen) \ RTA_DATA(__RTA_PUT(skb, attrtype, attrlen)) -#if defined(CONFIG_IP_DCCP) || defined(CONFIG_IP_DCCP_MODULE) +#ifdef CONFIG_IP_TCPDIAG_DCCP extern struct inet_hashinfo dccp_hashinfo; #endif @@ -216,7 +216,7 @@ static int tcpdiag_get_exact(struct sk_buff *in_skb, const struct nlmsghdr *nlh) struct tcpdiagreq *req = NLMSG_DATA(nlh); struct sk_buff *rep; struct inet_hashinfo *hashinfo = &tcp_hashinfo; -#if defined(CONFIG_IP_DCCP) || defined(CONFIG_IP_DCCP_MODULE) +#ifdef CONFIG_IP_TCPDIAG_DCCP if (nlh->nlmsg_type == DCCPDIAG_GETSOCK) hashinfo = &dccp_hashinfo; #endif @@ -614,7 +614,7 @@ static int tcpdiag_dump(struct sk_buff *skb, struct netlink_callback *cb) s_i = cb->args[1]; s_num = num = cb->args[2]; hashinfo = &tcp_hashinfo; -#if defined(CONFIG_IP_DCCP) || defined(CONFIG_IP_DCCP_MODULE) +#ifdef CONFIG_IP_TCPDIAG_DCCP if (cb->nlh->nlmsg_type == DCCPDIAG_GETSOCK) hashinfo = &dccp_hashinfo; #endif @@ -752,7 +752,7 @@ tcpdiag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) return 0; if (nlh->nlmsg_type != TCPDIAG_GETSOCK -#if defined(CONFIG_IP_DCCP) || defined(CONFIG_IP_DCCP_MODULE) +#ifdef CONFIG_IP_TCPDIAG_DCCP && nlh->nlmsg_type != DCCPDIAG_GETSOCK #endif ) -- cgit v1.2.3 From 505cbfc577f3fa778005e2800b869eca25727d5f Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 12 Aug 2005 09:19:38 -0300 Subject: [IPV6]: Generalise the tcp_v6_lookup routines In the same way as was done with the v4 counterparts, this will be moved to inet6_hashtables.c. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- net/ipv4/Kconfig | 3 --- net/ipv4/tcp_diag.c | 40 +++++++++++++++++----------------------- 2 files changed, 17 insertions(+), 26 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index c844954c1ad..a79b4f9c10c 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -425,9 +425,6 @@ config IP_TCPDIAG If unsure, say Y. 
-config IP_TCPDIAG_IPV6 - def_bool (IP_TCPDIAG=y && IPV6=y) || (IP_TCPDIAG=m && IPV6) - config IP_TCPDIAG_DCCP def_bool (IP_TCPDIAG=y && IP_DCCP=y) || (IP_TCPDIAG=m && IP_DCCP) diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index 8bf495c698f..b812191b2f5 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -24,6 +24,10 @@ #include #include #include +#include +#include +#include +#include #include #include @@ -102,7 +106,7 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, r->tcpdiag_wqueue = 0; r->tcpdiag_uid = 0; r->tcpdiag_inode = 0; -#ifdef CONFIG_IP_TCPDIAG_IPV6 +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) if (r->tcpdiag_family == AF_INET6) { const struct tcp6_timewait_sock *tcp6tw = tcp6_twsk(sk); @@ -121,7 +125,7 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, r->id.tcpdiag_src[0] = inet->rcv_saddr; r->id.tcpdiag_dst[0] = inet->daddr; -#ifdef CONFIG_IP_TCPDIAG_IPV6 +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) if (r->tcpdiag_family == AF_INET6) { struct ipv6_pinfo *np = inet6_sk(sk); @@ -196,19 +200,6 @@ nlmsg_failure: return -1; } -#ifdef CONFIG_IP_TCPDIAG_IPV6 -extern struct sock *tcp_v6_lookup(struct in6_addr *saddr, u16 sport, - struct in6_addr *daddr, u16 dport, - int dif); -#else -static inline struct sock *tcp_v6_lookup(struct in6_addr *saddr, u16 sport, - struct in6_addr *daddr, u16 dport, - int dif) -{ - return NULL; -} -#endif - static int tcpdiag_get_exact(struct sk_buff *in_skb, const struct nlmsghdr *nlh) { int err; @@ -225,11 +216,14 @@ static int tcpdiag_get_exact(struct sk_buff *in_skb, const struct nlmsghdr *nlh) req->id.tcpdiag_dport, req->id.tcpdiag_src[0], req->id.tcpdiag_sport, req->id.tcpdiag_if); } -#ifdef CONFIG_IP_TCPDIAG_IPV6 +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) else if (req->tcpdiag_family == AF_INET6) { - sk = tcp_v6_lookup((struct in6_addr*)req->id.tcpdiag_dst, req->id.tcpdiag_dport, - (struct in6_addr*)req->id.tcpdiag_src, req->id.tcpdiag_sport, - req->id.tcpdiag_if); + sk = inet6_lookup(hashinfo, + (struct in6_addr*)req->id.tcpdiag_dst, + req->id.tcpdiag_dport, + (struct in6_addr*)req->id.tcpdiag_src, + req->id.tcpdiag_sport, + req->id.tcpdiag_if); } #endif else { @@ -440,7 +434,7 @@ static int tcpdiag_dump_sock(struct sk_buff *skb, struct sock *sk, struct inet_sock *inet = inet_sk(sk); entry.family = sk->sk_family; -#ifdef CONFIG_IP_TCPDIAG_IPV6 +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) if (entry.family == AF_INET6) { struct ipv6_pinfo *np = inet6_sk(sk); @@ -502,7 +496,7 @@ static int tcpdiag_fill_req(struct sk_buff *skb, struct sock *sk, r->tcpdiag_wqueue = 0; r->tcpdiag_uid = sock_i_uid(sk); r->tcpdiag_inode = 0; -#ifdef CONFIG_IP_TCPDIAG_IPV6 +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) if (r->tcpdiag_family == AF_INET6) { ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_src, &tcp6_rsk(req)->loc_addr); @@ -567,13 +561,13 @@ static int tcpdiag_dump_reqs(struct sk_buff *skb, struct sock *sk, if (bc) { entry.saddr = -#ifdef CONFIG_IP_TCPDIAG_IPV6 +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) (entry.family == AF_INET6) ? tcp6_rsk(req)->loc_addr.s6_addr32 : #endif &ireq->loc_addr; entry.daddr = -#ifdef CONFIG_IP_TCPDIAG_IPV6 +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) (entry.family == AF_INET6) ? 
tcp6_rsk(req)->rmt_addr.s6_addr32 : #endif -- cgit v1.2.3 From 5324a040ccc708998e61ea93e669b81312f0ae11 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 12 Aug 2005 09:26:18 -0300 Subject: [INET6_HASHTABLES]: Move inet6_lookup functions to net/ipv6/inet6_hashtables.c Doing this we allow tcp_diag to support IPV6 even if tcp_diag is compiled statically and IPV6 is compiled as a module, removing the previous restriction while not building any IPV6 code if it is not selected. Now to work on the tcpdiag_register infrastructure and then to rename the whole thing to inetdiag, reflecting its by then completely generic nature. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- net/ipv4/Kconfig | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index a79b4f9c10c..960c02faf44 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -419,9 +419,7 @@ config IP_TCPDIAG ---help--- Support for TCP socket monitoring interface used by native Linux tools such as ss. ss is included in iproute2, currently downloadable - at . If you want IPv6 or DCCP - support and have selected IPv6 or DCCP as a module, you need to build - this as a module too. + at . If unsure, say Y. -- cgit v1.2.3 From 4f5736c4c7cf6f9bd8db82b712cfdd51c87e06b9 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 12 Aug 2005 09:27:49 -0300 Subject: [TCPDIAG]: Introduce inet_diag_{register,unregister} Next changeset will rename tcp_diag to inet_diag and move the tcp_diag code out of it and into a new tcp_diag.c, similar to the net/dccp/diag.c introduced in this changeset, completing the transition to a generic inet_diag infrastructure. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- net/ipv4/Kconfig | 3 -- net/ipv4/tcp_diag.c | 153 +++++++++++++++++++++++++++++++++++++--------------- 2 files changed, 111 insertions(+), 45 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 960c02faf44..1e6db2a896b 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -423,9 +423,6 @@ config IP_TCPDIAG If unsure, say Y. 
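
The net/dccp/diag.c mentioned above is, under this infrastructure, little more than a table registration. A rough sketch of such a module, written against the handler fields this changeset introduces and with the tcp_diag_handler shown further down as the template; the DCCP specifics here are assumptions rather than the actual file, with queue accounting stubbed to zero just like the TCP-specific else branches this changeset removes from tcpdiag_fill():

/* Hypothetical protocol diag module; the DCCP details are assumptions. */
static void dccp_diag_get_info(struct sock *sk, struct tcpdiagmsg *r,
			       void *_info)
{
	/* No DCCP queue accounting yet, mirroring the removed else branch. */
	r->tcpdiag_rqueue = r->tcpdiag_wqueue = 0;
	if (_info != NULL)
		memset(_info, 0, sizeof(struct tcp_info));
}

static struct inet_diag_handler dccp_diag_handler = {
	.idiag_hashinfo	 = &dccp_hashinfo,
	.idiag_get_info	 = dccp_diag_get_info,
	.idiag_type	 = DCCPDIAG_GETSOCK,
	.idiag_info_size = sizeof(struct tcp_info),
};

static int __init dccp_diag_init(void)
{
	return inet_diag_register(&dccp_diag_handler);
}

static void __exit dccp_diag_fini(void)
{
	inet_diag_unregister(&dccp_diag_handler);
}

module_init(dccp_diag_init);
module_exit(dccp_diag_fini);
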
-config IP_TCPDIAG_DCCP - def_bool (IP_TCPDIAG=y && IP_DCCP=y) || (IP_TCPDIAG=m && IP_DCCP) - config TCP_CONG_ADVANCED bool "TCP: advanced congestion control" ---help--- diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index b812191b2f5..b13b71cb9ce 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -34,6 +34,8 @@ #include +static const struct inet_diag_handler **inet_diag_table; + struct tcpdiag_entry { u32 *saddr; @@ -61,18 +63,24 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, const struct inet_connection_sock *icsk = inet_csk(sk); struct tcpdiagmsg *r; struct nlmsghdr *nlh; - struct tcp_info *info = NULL; + void *info = NULL; struct tcpdiag_meminfo *minfo = NULL; unsigned char *b = skb->tail; + const struct inet_diag_handler *handler; + + handler = inet_diag_table[unlh->nlmsg_type]; + BUG_ON(handler == NULL); nlh = NLMSG_PUT(skb, pid, seq, unlh->nlmsg_type, sizeof(*r)); nlh->nlmsg_flags = nlmsg_flags; + r = NLMSG_DATA(nlh); if (sk->sk_state != TCP_TIME_WAIT) { if (ext & (1<<(TCPDIAG_MEMINFO-1))) minfo = TCPDIAG_PUT(skb, TCPDIAG_MEMINFO, sizeof(*minfo)); if (ext & (1<<(TCPDIAG_INFO-1))) - info = TCPDIAG_PUT(skb, TCPDIAG_INFO, sizeof(*info)); + info = TCPDIAG_PUT(skb, TCPDIAG_INFO, + handler->idiag_info_size); if ((ext & (1 << (TCPDIAG_CONG - 1))) && icsk->icsk_ca_ops) { size_t len = strlen(icsk->icsk_ca_ops->name); @@ -155,19 +163,6 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, r->tcpdiag_expires = 0; } #undef EXPIRES_IN_MS - /* - * Ahem... for now we'll have some knowledge about TCP -acme - * But this is just one of two small exceptions, both in this - * function, so lets close our eyes for some 15 lines or so... 8) - * -acme - */ - if (sk->sk_protocol == IPPROTO_TCP) { - const struct tcp_sock *tp = tcp_sk(sk); - - r->tcpdiag_rqueue = tp->rcv_nxt - tp->copied_seq; - r->tcpdiag_wqueue = tp->write_seq - tp->snd_una; - } else - r->tcpdiag_rqueue = r->tcpdiag_wqueue = 0; r->tcpdiag_uid = sock_i_uid(sk); r->tcpdiag_inode = sock_i_ino(sk); @@ -179,13 +174,7 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, minfo->tcpdiag_tmem = atomic_read(&sk->sk_wmem_alloc); } - /* Ahem... 
for now we'll have some knowledge about TCP -acme */ - if (info) { - if (sk->sk_protocol == IPPROTO_TCP) - tcp_get_info(sk, info); - else - memset(info, 0, sizeof(*info)); - } + handler->idiag_get_info(sk, r, info); if (sk->sk_state < TCP_TIME_WAIT && icsk->icsk_ca_ops && icsk->icsk_ca_ops->get_info) @@ -206,11 +195,13 @@ static int tcpdiag_get_exact(struct sk_buff *in_skb, const struct nlmsghdr *nlh) struct sock *sk; struct tcpdiagreq *req = NLMSG_DATA(nlh); struct sk_buff *rep; - struct inet_hashinfo *hashinfo = &tcp_hashinfo; -#ifdef CONFIG_IP_TCPDIAG_DCCP - if (nlh->nlmsg_type == DCCPDIAG_GETSOCK) - hashinfo = &dccp_hashinfo; -#endif + struct inet_hashinfo *hashinfo; + const struct inet_diag_handler *handler; + + handler = inet_diag_table[nlh->nlmsg_type]; + BUG_ON(handler == NULL); + hashinfo = handler->idiag_hashinfo; + if (req->tcpdiag_family == AF_INET) { sk = inet_lookup(hashinfo, req->id.tcpdiag_dst[0], req->id.tcpdiag_dport, req->id.tcpdiag_src[0], @@ -241,9 +232,10 @@ static int tcpdiag_get_exact(struct sk_buff *in_skb, const struct nlmsghdr *nlh) goto out; err = -ENOMEM; - rep = alloc_skb(NLMSG_SPACE(sizeof(struct tcpdiagmsg)+ - sizeof(struct tcpdiag_meminfo)+ - sizeof(struct tcp_info)+64), GFP_KERNEL); + rep = alloc_skb(NLMSG_SPACE((sizeof(struct tcpdiagmsg) + + sizeof(struct tcpdiag_meminfo) + + handler->idiag_info_size + 64)), + GFP_KERNEL); if (!rep) goto out; @@ -603,15 +595,16 @@ static int tcpdiag_dump(struct sk_buff *skb, struct netlink_callback *cb) int i, num; int s_i, s_num; struct tcpdiagreq *r = NLMSG_DATA(cb->nlh); + const struct inet_diag_handler *handler; struct inet_hashinfo *hashinfo; + handler = inet_diag_table[cb->nlh->nlmsg_type]; + BUG_ON(handler == NULL); + hashinfo = handler->idiag_hashinfo; + s_i = cb->args[1]; s_num = num = cb->args[2]; - hashinfo = &tcp_hashinfo; -#ifdef CONFIG_IP_TCPDIAG_DCCP - if (cb->nlh->nlmsg_type == DCCPDIAG_GETSOCK) - hashinfo = &dccp_hashinfo; -#endif + if (cb->args[0] == 0) { if (!(r->tcpdiag_states&(TCPF_LISTEN|TCPF_SYN_RECV))) goto skip_listen_ht; @@ -745,13 +738,12 @@ tcpdiag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (!(nlh->nlmsg_flags&NLM_F_REQUEST)) return 0; - if (nlh->nlmsg_type != TCPDIAG_GETSOCK -#ifdef CONFIG_IP_TCPDIAG_DCCP - && nlh->nlmsg_type != DCCPDIAG_GETSOCK -#endif - ) + if (nlh->nlmsg_type >= INET_DIAG_GETSOCK_MAX) goto err_inval; + if (inet_diag_table[nlh->nlmsg_type] == NULL) + return -ENOENT; + if (NLMSG_LENGTH(sizeof(struct tcpdiagreq)) > skb->len) goto err_inval; @@ -803,18 +795,95 @@ static void tcpdiag_rcv(struct sock *sk, int len) } } +static void tcp_diag_get_info(struct sock *sk, struct tcpdiagmsg *r, + void *_info) +{ + const struct tcp_sock *tp = tcp_sk(sk); + struct tcp_info *info = _info; + + r->tcpdiag_rqueue = tp->rcv_nxt - tp->copied_seq; + r->tcpdiag_wqueue = tp->write_seq - tp->snd_una; + if (info != NULL) + tcp_get_info(sk, info); +} + +static struct inet_diag_handler tcp_diag_handler = { + .idiag_hashinfo = &tcp_hashinfo, + .idiag_get_info = tcp_diag_get_info, + .idiag_type = TCPDIAG_GETSOCK, + .idiag_info_size = sizeof(struct tcp_info), +}; + +static DEFINE_SPINLOCK(inet_diag_register_lock); + +int inet_diag_register(const struct inet_diag_handler *h) +{ + const __u16 type = h->idiag_type; + int err = -EINVAL; + + if (type >= INET_DIAG_GETSOCK_MAX) + goto out; + + spin_lock(&inet_diag_register_lock); + err = -EEXIST; + if (inet_diag_table[type] == NULL) { + inet_diag_table[type] = h; + err = 0; + } + spin_unlock(&inet_diag_register_lock); +out: + return err; +} 
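/*
 * Editorial note, not part of the patch: inet_diag_table[] is written
 * only under inet_diag_register_lock, while tcpdiag_rcv_msg() reads it
 * with no lock at all.  The synchronize_rcu() in inet_diag_unregister()
 * below is presumably what lets requests that have already looked up a
 * handler drain before the caller tears that handler down.
 */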
+EXPORT_SYMBOL_GPL(inet_diag_register); + +void inet_diag_unregister(const struct inet_diag_handler *h) +{ + const __u16 type = h->idiag_type; + + if (type >= INET_DIAG_GETSOCK_MAX) + return; + + spin_lock(&inet_diag_register_lock); + inet_diag_table[type] = NULL; + spin_unlock(&inet_diag_register_lock); + + synchronize_rcu(); +} +EXPORT_SYMBOL_GPL(inet_diag_unregister); + static int __init tcpdiag_init(void) { + const int inet_diag_table_size = (INET_DIAG_GETSOCK_MAX * + sizeof(struct inet_diag_handler *)); + int err = -ENOMEM; + + inet_diag_table = kmalloc(inet_diag_table_size, GFP_KERNEL); + if (!inet_diag_table) + goto out; + + memset(inet_diag_table, 0, inet_diag_table_size); + tcpnl = netlink_kernel_create(NETLINK_TCPDIAG, tcpdiag_rcv, THIS_MODULE); if (tcpnl == NULL) - return -ENOMEM; - return 0; + goto out_free_table; + + err = inet_diag_register(&tcp_diag_handler); + if (err) + goto out_sock_release; +out: + return err; +out_sock_release: + sock_release(tcpnl->sk_socket); +out_free_table: + kfree(inet_diag_table); + goto out; } static void __exit tcpdiag_exit(void) { sock_release(tcpnl->sk_socket); + kfree(inet_diag_table); } module_init(tcpdiag_init); -- cgit v1.2.3 From 73c1f4a033675f168df7e98bbeeafca3c644b8a6 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 12 Aug 2005 12:51:49 -0300 Subject: [TCPDIAG]: Just rename everything to inet_diag Next changeset will rename tcp_diag.[ch] to inet_diag.[ch]. I'm taking this longer route so as to ease review, making clear the changes made all along the way. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- net/ipv4/Kconfig | 10 +- net/ipv4/Makefile | 2 +- net/ipv4/tcp_diag.c | 391 ++++++++++++++++++++++++------------------------ net/ipv4/tcp_vegas.c | 4 +- net/ipv4/tcp_westwood.c | 4 +- 5 files changed, 206 insertions(+), 205 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 1e6db2a896b..019e88d8f29 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -413,13 +413,13 @@ config INET_TUNNEL If unsure, say Y. -config IP_TCPDIAG - tristate "IP: TCP socket monitoring interface" +config IP_INET_DIAG + tristate "IP: INET socket monitoring interface" default y ---help--- - Support for TCP socket monitoring interface used by native Linux - tools such as ss. ss is included in iproute2, currently downloadable - at . + Support for INET (TCP, DCCP, etc) socket monitoring interface used by + native Linux tools such as ss. ss is included in iproute2, currently + downloadable at . If unsure, say Y. diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index ea0e1d87dc7..9b1c894039a 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -30,7 +30,7 @@ obj-$(CONFIG_IP_ROUTE_MULTIPATH_WRANDOM) += multipath_wrandom.o obj-$(CONFIG_IP_ROUTE_MULTIPATH_DRR) += multipath_drr.o obj-$(CONFIG_NETFILTER) += netfilter/ obj-$(CONFIG_IP_VS) += ipvs/ -obj-$(CONFIG_IP_TCPDIAG) += tcp_diag.o +obj-$(CONFIG_IP_INET_DIAG) += tcp_diag.o obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index b13b71cb9ce..24abe82e23a 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -1,7 +1,7 @@ /* - * tcp_diag.c Module for monitoring TCP sockets.
* - * Version: $Id: tcp_diag.c,v 1.3 2002/02/01 22:01:04 davem Exp $ + * Version: $Id: inet_diag.c,v 1.3 2002/02/01 22:01:04 davem Exp $ * * Authors: Alexey Kuznetsov, * @@ -36,8 +36,7 @@ static const struct inet_diag_handler **inet_diag_table; -struct tcpdiag_entry -{ +struct inet_diag_entry { u32 *saddr; u32 *daddr; u16 sport; @@ -46,25 +45,21 @@ struct tcpdiag_entry u16 userlocks; }; -static struct sock *tcpnl; +static struct sock *idiagnl; -#define TCPDIAG_PUT(skb, attrtype, attrlen) \ +#define INET_DIAG_PUT(skb, attrtype, attrlen) \ RTA_DATA(__RTA_PUT(skb, attrtype, attrlen)) -#ifdef CONFIG_IP_TCPDIAG_DCCP -extern struct inet_hashinfo dccp_hashinfo; -#endif - -static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, +static int inet_diag_fill(struct sk_buff *skb, struct sock *sk, int ext, u32 pid, u32 seq, u16 nlmsg_flags, const struct nlmsghdr *unlh) { const struct inet_sock *inet = inet_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); - struct tcpdiagmsg *r; + struct inet_diag_msg *r; struct nlmsghdr *nlh; void *info = NULL; - struct tcpdiag_meminfo *minfo = NULL; + struct inet_diag_meminfo *minfo = NULL; unsigned char *b = skb->tail; const struct inet_diag_handler *handler; @@ -76,51 +71,52 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, r = NLMSG_DATA(nlh); if (sk->sk_state != TCP_TIME_WAIT) { - if (ext & (1<<(TCPDIAG_MEMINFO-1))) - minfo = TCPDIAG_PUT(skb, TCPDIAG_MEMINFO, sizeof(*minfo)); - if (ext & (1<<(TCPDIAG_INFO-1))) - info = TCPDIAG_PUT(skb, TCPDIAG_INFO, + if (ext & (1 << (INET_DIAG_MEMINFO - 1))) + minfo = INET_DIAG_PUT(skb, INET_DIAG_MEMINFO, + sizeof(*minfo)); + if (ext & (1 << (INET_DIAG_INFO - 1))) + info = INET_DIAG_PUT(skb, INET_DIAG_INFO, handler->idiag_info_size); - if ((ext & (1 << (TCPDIAG_CONG - 1))) && icsk->icsk_ca_ops) { + if ((ext & (1 << (INET_DIAG_CONG - 1))) && icsk->icsk_ca_ops) { size_t len = strlen(icsk->icsk_ca_ops->name); - strcpy(TCPDIAG_PUT(skb, TCPDIAG_CONG, len+1), + strcpy(INET_DIAG_PUT(skb, INET_DIAG_CONG, len + 1), icsk->icsk_ca_ops->name); } } - r->tcpdiag_family = sk->sk_family; - r->tcpdiag_state = sk->sk_state; - r->tcpdiag_timer = 0; - r->tcpdiag_retrans = 0; + r->idiag_family = sk->sk_family; + r->idiag_state = sk->sk_state; + r->idiag_timer = 0; + r->idiag_retrans = 0; - r->id.tcpdiag_if = sk->sk_bound_dev_if; - r->id.tcpdiag_cookie[0] = (u32)(unsigned long)sk; - r->id.tcpdiag_cookie[1] = (u32)(((unsigned long)sk >> 31) >> 1); + r->id.idiag_if = sk->sk_bound_dev_if; + r->id.idiag_cookie[0] = (u32)(unsigned long)sk; + r->id.idiag_cookie[1] = (u32)(((unsigned long)sk >> 31) >> 1); - if (r->tcpdiag_state == TCP_TIME_WAIT) { + if (r->idiag_state == TCP_TIME_WAIT) { const struct inet_timewait_sock *tw = inet_twsk(sk); long tmo = tw->tw_ttd - jiffies; if (tmo < 0) tmo = 0; - r->id.tcpdiag_sport = tw->tw_sport; - r->id.tcpdiag_dport = tw->tw_dport; - r->id.tcpdiag_src[0] = tw->tw_rcv_saddr; - r->id.tcpdiag_dst[0] = tw->tw_daddr; - r->tcpdiag_state = tw->tw_substate; - r->tcpdiag_timer = 3; - r->tcpdiag_expires = (tmo*1000+HZ-1)/HZ; - r->tcpdiag_rqueue = 0; - r->tcpdiag_wqueue = 0; - r->tcpdiag_uid = 0; - r->tcpdiag_inode = 0; + r->id.idiag_sport = tw->tw_sport; + r->id.idiag_dport = tw->tw_dport; + r->id.idiag_src[0] = tw->tw_rcv_saddr; + r->id.idiag_dst[0] = tw->tw_daddr; + r->idiag_state = tw->tw_substate; + r->idiag_timer = 3; + r->idiag_expires = (tmo * 1000 + HZ - 1) / HZ; + r->idiag_rqueue = 0; + r->idiag_wqueue = 0; + r->idiag_uid = 0; + r->idiag_inode = 0; #if defined(CONFIG_IPV6) || defined 
(CONFIG_IPV6_MODULE) - if (r->tcpdiag_family == AF_INET6) { + if (r->idiag_family == AF_INET6) { const struct tcp6_timewait_sock *tcp6tw = tcp6_twsk(sk); - ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_src, + ipv6_addr_copy((struct in6_addr *)r->id.idiag_src, &tcp6tw->tw_v6_rcv_saddr); - ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_dst, + ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst, &tcp6tw->tw_v6_daddr); } #endif @@ -128,18 +124,18 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, return skb->len; } - r->id.tcpdiag_sport = inet->sport; - r->id.tcpdiag_dport = inet->dport; - r->id.tcpdiag_src[0] = inet->rcv_saddr; - r->id.tcpdiag_dst[0] = inet->daddr; + r->id.idiag_sport = inet->sport; + r->id.idiag_dport = inet->dport; + r->id.idiag_src[0] = inet->rcv_saddr; + r->id.idiag_dst[0] = inet->daddr; #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) - if (r->tcpdiag_family == AF_INET6) { + if (r->idiag_family == AF_INET6) { struct ipv6_pinfo *np = inet6_sk(sk); - ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_src, + ipv6_addr_copy((struct in6_addr *)r->id.idiag_src, &np->rcv_saddr); - ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_dst, + ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst, &np->daddr); } #endif @@ -147,31 +143,31 @@ static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, #define EXPIRES_IN_MS(tmo) ((tmo-jiffies)*1000+HZ-1)/HZ if (icsk->icsk_pending == ICSK_TIME_RETRANS) { - r->tcpdiag_timer = 1; - r->tcpdiag_retrans = icsk->icsk_retransmits; - r->tcpdiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout); + r->idiag_timer = 1; + r->idiag_retrans = icsk->icsk_retransmits; + r->idiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout); } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { - r->tcpdiag_timer = 4; - r->tcpdiag_retrans = icsk->icsk_probes_out; - r->tcpdiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout); + r->idiag_timer = 4; + r->idiag_retrans = icsk->icsk_probes_out; + r->idiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout); } else if (timer_pending(&sk->sk_timer)) { - r->tcpdiag_timer = 2; - r->tcpdiag_retrans = icsk->icsk_probes_out; - r->tcpdiag_expires = EXPIRES_IN_MS(sk->sk_timer.expires); + r->idiag_timer = 2; + r->idiag_retrans = icsk->icsk_probes_out; + r->idiag_expires = EXPIRES_IN_MS(sk->sk_timer.expires); } else { - r->tcpdiag_timer = 0; - r->tcpdiag_expires = 0; + r->idiag_timer = 0; + r->idiag_expires = 0; } #undef EXPIRES_IN_MS - r->tcpdiag_uid = sock_i_uid(sk); - r->tcpdiag_inode = sock_i_ino(sk); + r->idiag_uid = sock_i_uid(sk); + r->idiag_inode = sock_i_ino(sk); if (minfo) { - minfo->tcpdiag_rmem = atomic_read(&sk->sk_rmem_alloc); - minfo->tcpdiag_wmem = sk->sk_wmem_queued; - minfo->tcpdiag_fmem = sk->sk_forward_alloc; - minfo->tcpdiag_tmem = atomic_read(&sk->sk_wmem_alloc); + minfo->idiag_rmem = atomic_read(&sk->sk_rmem_alloc); + minfo->idiag_wmem = sk->sk_wmem_queued; + minfo->idiag_fmem = sk->sk_forward_alloc; + minfo->idiag_tmem = atomic_read(&sk->sk_wmem_alloc); } handler->idiag_get_info(sk, r, info); @@ -189,11 +185,11 @@ nlmsg_failure: return -1; } -static int tcpdiag_get_exact(struct sk_buff *in_skb, const struct nlmsghdr *nlh) +static int inet_diag_get_exact(struct sk_buff *in_skb, const struct nlmsghdr *nlh) { int err; struct sock *sk; - struct tcpdiagreq *req = NLMSG_DATA(nlh); + struct inet_diag_req *req = NLMSG_DATA(nlh); struct sk_buff *rep; struct inet_hashinfo *hashinfo; const struct inet_diag_handler *handler; @@ -202,19 +198,19 @@ static int tcpdiag_get_exact(struct sk_buff *in_skb, const struct nlmsghdr *nlh) 
BUG_ON(handler == NULL); hashinfo = handler->idiag_hashinfo; - if (req->tcpdiag_family == AF_INET) { - sk = inet_lookup(hashinfo, req->id.tcpdiag_dst[0], - req->id.tcpdiag_dport, req->id.tcpdiag_src[0], - req->id.tcpdiag_sport, req->id.tcpdiag_if); + if (req->idiag_family == AF_INET) { + sk = inet_lookup(hashinfo, req->id.idiag_dst[0], + req->id.idiag_dport, req->id.idiag_src[0], + req->id.idiag_sport, req->id.idiag_if); } #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) - else if (req->tcpdiag_family == AF_INET6) { + else if (req->idiag_family == AF_INET6) { sk = inet6_lookup(hashinfo, - (struct in6_addr*)req->id.tcpdiag_dst, - req->id.tcpdiag_dport, - (struct in6_addr*)req->id.tcpdiag_src, - req->id.tcpdiag_sport, - req->id.tcpdiag_if); + (struct in6_addr *)req->id.idiag_dst, + req->id.idiag_dport, + (struct in6_addr *)req->id.idiag_src, + req->id.idiag_sport, + req->id.idiag_if); } #endif else { @@ -225,26 +221,27 @@ static int tcpdiag_get_exact(struct sk_buff *in_skb, const struct nlmsghdr *nlh) return -ENOENT; err = -ESTALE; - if ((req->id.tcpdiag_cookie[0] != TCPDIAG_NOCOOKIE || - req->id.tcpdiag_cookie[1] != TCPDIAG_NOCOOKIE) && - ((u32)(unsigned long)sk != req->id.tcpdiag_cookie[0] || - (u32)((((unsigned long)sk) >> 31) >> 1) != req->id.tcpdiag_cookie[1])) + if ((req->id.idiag_cookie[0] != INET_DIAG_NOCOOKIE || + req->id.idiag_cookie[1] != INET_DIAG_NOCOOKIE) && + ((u32)(unsigned long)sk != req->id.idiag_cookie[0] || + (u32)((((unsigned long)sk) >> 31) >> 1) != req->id.idiag_cookie[1])) goto out; err = -ENOMEM; - rep = alloc_skb(NLMSG_SPACE((sizeof(struct tcpdiagmsg) + - sizeof(struct tcpdiag_meminfo) + + rep = alloc_skb(NLMSG_SPACE((sizeof(struct inet_diag_msg) + + sizeof(struct inet_diag_meminfo) + handler->idiag_info_size + 64)), GFP_KERNEL); if (!rep) goto out; - if (tcpdiag_fill(rep, sk, req->tcpdiag_ext, + if (inet_diag_fill(rep, sk, req->idiag_ext, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, 0, nlh) <= 0) BUG(); - err = netlink_unicast(tcpnl, rep, NETLINK_CB(in_skb).pid, MSG_DONTWAIT); + err = netlink_unicast(idiagnl, rep, NETLINK_CB(in_skb).pid, + MSG_DONTWAIT); if (err > 0) err = 0; @@ -285,42 +282,42 @@ static int bitstring_match(const u32 *a1, const u32 *a2, int bits) } -static int tcpdiag_bc_run(const void *bc, int len, - const struct tcpdiag_entry *entry) +static int inet_diag_bc_run(const void *bc, int len, + const struct inet_diag_entry *entry) { while (len > 0) { int yes = 1; - const struct tcpdiag_bc_op *op = bc; + const struct inet_diag_bc_op *op = bc; switch (op->code) { - case TCPDIAG_BC_NOP: + case INET_DIAG_BC_NOP: break; - case TCPDIAG_BC_JMP: + case INET_DIAG_BC_JMP: yes = 0; break; - case TCPDIAG_BC_S_GE: + case INET_DIAG_BC_S_GE: yes = entry->sport >= op[1].no; break; - case TCPDIAG_BC_S_LE: + case INET_DIAG_BC_S_LE: yes = entry->dport <= op[1].no; break; - case TCPDIAG_BC_D_GE: + case INET_DIAG_BC_D_GE: yes = entry->dport >= op[1].no; break; - case TCPDIAG_BC_D_LE: + case INET_DIAG_BC_D_LE: yes = entry->dport <= op[1].no; break; - case TCPDIAG_BC_AUTO: + case INET_DIAG_BC_AUTO: yes = !(entry->userlocks & SOCK_BINDPORT_LOCK); break; - case TCPDIAG_BC_S_COND: - case TCPDIAG_BC_D_COND: + case INET_DIAG_BC_S_COND: + case INET_DIAG_BC_D_COND: { - struct tcpdiag_hostcond *cond = (struct tcpdiag_hostcond*)(op+1); + struct inet_diag_hostcond *cond = (struct inet_diag_hostcond*)(op+1); u32 *addr; if (cond->port != -1 && - cond->port != (op->code == TCPDIAG_BC_S_COND ? + cond->port != (op->code == INET_DIAG_BC_S_COND ? 
entry->sport : entry->dport)) { yes = 0; break; @@ -329,7 +326,7 @@ static int tcpdiag_bc_run(const void *bc, int len, if (cond->prefix_len == 0) break; - if (op->code == TCPDIAG_BC_S_COND) + if (op->code == INET_DIAG_BC_S_COND) addr = entry->saddr; else addr = entry->daddr; @@ -362,7 +359,7 @@ static int tcpdiag_bc_run(const void *bc, int len, static int valid_cc(const void *bc, int len, int cc) { while (len >= 0) { - const struct tcpdiag_bc_op *op = bc; + const struct inet_diag_bc_op *op = bc; if (cc > len) return 0; @@ -376,33 +373,33 @@ static int valid_cc(const void *bc, int len, int cc) return 0; } -static int tcpdiag_bc_audit(const void *bytecode, int bytecode_len) +static int inet_diag_bc_audit(const void *bytecode, int bytecode_len) { const unsigned char *bc = bytecode; int len = bytecode_len; while (len > 0) { - struct tcpdiag_bc_op *op = (struct tcpdiag_bc_op*)bc; + struct inet_diag_bc_op *op = (struct inet_diag_bc_op*)bc; //printk("BC: %d %d %d {%d} / %d\n", op->code, op->yes, op->no, op[1].no, len); switch (op->code) { - case TCPDIAG_BC_AUTO: - case TCPDIAG_BC_S_COND: - case TCPDIAG_BC_D_COND: - case TCPDIAG_BC_S_GE: - case TCPDIAG_BC_S_LE: - case TCPDIAG_BC_D_GE: - case TCPDIAG_BC_D_LE: + case INET_DIAG_BC_AUTO: + case INET_DIAG_BC_S_COND: + case INET_DIAG_BC_D_COND: + case INET_DIAG_BC_S_GE: + case INET_DIAG_BC_S_LE: + case INET_DIAG_BC_D_GE: + case INET_DIAG_BC_D_LE: if (op->yes < 4 || op->yes > len+4) return -EINVAL; - case TCPDIAG_BC_JMP: + case INET_DIAG_BC_JMP: if (op->no < 4 || op->no > len+4) return -EINVAL; if (op->no < len && !valid_cc(bytecode, bytecode_len, len-op->no)) return -EINVAL; break; - case TCPDIAG_BC_NOP: + case INET_DIAG_BC_NOP: if (op->yes < 4 || op->yes > len+4) return -EINVAL; break; @@ -415,13 +412,13 @@ static int tcpdiag_bc_audit(const void *bytecode, int bytecode_len) return len == 0 ? 
0 : -EINVAL; } -static int tcpdiag_dump_sock(struct sk_buff *skb, struct sock *sk, +static int inet_diag_dump_sock(struct sk_buff *skb, struct sock *sk, struct netlink_callback *cb) { - struct tcpdiagreq *r = NLMSG_DATA(cb->nlh); + struct inet_diag_req *r = NLMSG_DATA(cb->nlh); if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) { - struct tcpdiag_entry entry; + struct inet_diag_entry entry; struct rtattr *bc = (struct rtattr *)(r + 1); struct inet_sock *inet = inet_sk(sk); @@ -442,15 +439,15 @@ static int tcpdiag_dump_sock(struct sk_buff *skb, struct sock *sk, entry.dport = ntohs(inet->dport); entry.userlocks = sk->sk_userlocks; - if (!tcpdiag_bc_run(RTA_DATA(bc), RTA_PAYLOAD(bc), &entry)) + if (!inet_diag_bc_run(RTA_DATA(bc), RTA_PAYLOAD(bc), &entry)) return 0; } - return tcpdiag_fill(skb, sk, r->tcpdiag_ext, NETLINK_CB(cb->skb).pid, + return inet_diag_fill(skb, sk, r->idiag_ext, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh); } -static int tcpdiag_fill_req(struct sk_buff *skb, struct sock *sk, +static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk, struct request_sock *req, u32 pid, u32 seq, const struct nlmsghdr *unlh) @@ -458,7 +455,7 @@ static int tcpdiag_fill_req(struct sk_buff *skb, struct sock *sk, const struct inet_request_sock *ireq = inet_rsk(req); struct inet_sock *inet = inet_sk(sk); unsigned char *b = skb->tail; - struct tcpdiagmsg *r; + struct inet_diag_msg *r; struct nlmsghdr *nlh; long tmo; @@ -466,33 +463,33 @@ static int tcpdiag_fill_req(struct sk_buff *skb, struct sock *sk, nlh->nlmsg_flags = NLM_F_MULTI; r = NLMSG_DATA(nlh); - r->tcpdiag_family = sk->sk_family; - r->tcpdiag_state = TCP_SYN_RECV; - r->tcpdiag_timer = 1; - r->tcpdiag_retrans = req->retrans; + r->idiag_family = sk->sk_family; + r->idiag_state = TCP_SYN_RECV; + r->idiag_timer = 1; + r->idiag_retrans = req->retrans; - r->id.tcpdiag_if = sk->sk_bound_dev_if; - r->id.tcpdiag_cookie[0] = (u32)(unsigned long)req; - r->id.tcpdiag_cookie[1] = (u32)(((unsigned long)req >> 31) >> 1); + r->id.idiag_if = sk->sk_bound_dev_if; + r->id.idiag_cookie[0] = (u32)(unsigned long)req; + r->id.idiag_cookie[1] = (u32)(((unsigned long)req >> 31) >> 1); tmo = req->expires - jiffies; if (tmo < 0) tmo = 0; - r->id.tcpdiag_sport = inet->sport; - r->id.tcpdiag_dport = ireq->rmt_port; - r->id.tcpdiag_src[0] = ireq->loc_addr; - r->id.tcpdiag_dst[0] = ireq->rmt_addr; - r->tcpdiag_expires = jiffies_to_msecs(tmo), - r->tcpdiag_rqueue = 0; - r->tcpdiag_wqueue = 0; - r->tcpdiag_uid = sock_i_uid(sk); - r->tcpdiag_inode = 0; + r->id.idiag_sport = inet->sport; + r->id.idiag_dport = ireq->rmt_port; + r->id.idiag_src[0] = ireq->loc_addr; + r->id.idiag_dst[0] = ireq->rmt_addr; + r->idiag_expires = jiffies_to_msecs(tmo); + r->idiag_rqueue = 0; + r->idiag_wqueue = 0; + r->idiag_uid = sock_i_uid(sk); + r->idiag_inode = 0; #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) - if (r->tcpdiag_family == AF_INET6) { - ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_src, + if (r->idiag_family == AF_INET6) { + ipv6_addr_copy((struct in6_addr *)r->id.idiag_src, &tcp6_rsk(req)->loc_addr); - ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_dst, + ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst, &tcp6_rsk(req)->rmt_addr); } #endif @@ -505,11 +502,11 @@ nlmsg_failure: return -1; } -static int tcpdiag_dump_reqs(struct sk_buff *skb, struct sock *sk, +static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk, struct netlink_callback *cb) { - struct tcpdiag_entry entry; - struct tcpdiagreq *r = NLMSG_DATA(cb->nlh); 
+ struct inet_diag_entry entry; + struct inet_diag_req *r = NLMSG_DATA(cb->nlh); struct inet_connection_sock *icsk = inet_csk(sk); struct listen_sock *lopt; struct rtattr *bc = NULL; @@ -547,8 +544,8 @@ static int tcpdiag_dump_reqs(struct sk_buff *skb, struct sock *sk, if (reqnum < s_reqnum) continue; - if (r->id.tcpdiag_dport != ireq->rmt_port && - r->id.tcpdiag_dport) + if (r->id.idiag_dport != ireq->rmt_port && + r->id.idiag_dport) continue; if (bc) { @@ -566,12 +563,12 @@ static int tcpdiag_dump_reqs(struct sk_buff *skb, struct sock *sk, &ireq->rmt_addr; entry.dport = ntohs(ireq->rmt_port); - if (!tcpdiag_bc_run(RTA_DATA(bc), + if (!inet_diag_bc_run(RTA_DATA(bc), RTA_PAYLOAD(bc), &entry)) continue; } - err = tcpdiag_fill_req(skb, sk, req, + err = inet_diag_fill_req(skb, sk, req, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, cb->nlh); if (err < 0) { @@ -590,11 +587,11 @@ out: return err; } -static int tcpdiag_dump(struct sk_buff *skb, struct netlink_callback *cb) +static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) { int i, num; int s_i, s_num; - struct tcpdiagreq *r = NLMSG_DATA(cb->nlh); + struct inet_diag_req *r = NLMSG_DATA(cb->nlh); const struct inet_diag_handler *handler; struct inet_hashinfo *hashinfo; @@ -606,7 +603,7 @@ static int tcpdiag_dump(struct sk_buff *skb, struct netlink_callback *cb) s_num = num = cb->args[2]; if (cb->args[0] == 0) { - if (!(r->tcpdiag_states&(TCPF_LISTEN|TCPF_SYN_RECV))) + if (!(r->idiag_states & (TCPF_LISTEN | TCPF_SYN_RECV))) goto skip_listen_ht; inet_listen_lock(hashinfo); @@ -623,25 +620,25 @@ static int tcpdiag_dump(struct sk_buff *skb, struct netlink_callback *cb) continue; } - if (r->id.tcpdiag_sport != inet->sport && - r->id.tcpdiag_sport) + if (r->id.idiag_sport != inet->sport && + r->id.idiag_sport) goto next_listen; - if (!(r->tcpdiag_states&TCPF_LISTEN) || - r->id.tcpdiag_dport || + if (!(r->idiag_states & TCPF_LISTEN) || + r->id.idiag_dport || cb->args[3] > 0) goto syn_recv; - if (tcpdiag_dump_sock(skb, sk, cb) < 0) { + if (inet_diag_dump_sock(skb, sk, cb) < 0) { inet_listen_unlock(hashinfo); goto done; } syn_recv: - if (!(r->tcpdiag_states&TCPF_SYN_RECV)) + if (!(r->idiag_states & TCPF_SYN_RECV)) goto next_listen; - if (tcpdiag_dump_reqs(skb, sk, cb) < 0) { + if (inet_diag_dump_reqs(skb, sk, cb) < 0) { inet_listen_unlock(hashinfo); goto done; } @@ -662,7 +659,7 @@ skip_listen_ht: s_i = num = s_num = 0; } - if (!(r->tcpdiag_states&~(TCPF_LISTEN|TCPF_SYN_RECV))) + if (!(r->idiag_states & ~(TCPF_LISTEN | TCPF_SYN_RECV))) return skb->len; for (i = s_i; i < hashinfo->ehash_size; i++) { @@ -681,14 +678,14 @@ skip_listen_ht: if (num < s_num) goto next_normal; - if (!(r->tcpdiag_states & (1 << sk->sk_state))) + if (!(r->idiag_states & (1 << sk->sk_state))) goto next_normal; - if (r->id.tcpdiag_sport != inet->sport && - r->id.tcpdiag_sport) + if (r->id.idiag_sport != inet->sport && + r->id.idiag_sport) goto next_normal; - if (r->id.tcpdiag_dport != inet->dport && r->id.tcpdiag_dport) + if (r->id.idiag_dport != inet->dport && r->id.idiag_dport) goto next_normal; - if (tcpdiag_dump_sock(skb, sk, cb) < 0) { + if (inet_diag_dump_sock(skb, sk, cb) < 0) { read_unlock_bh(&head->lock); goto done; } @@ -696,20 +693,20 @@ next_normal: ++num; } - if (r->tcpdiag_states&TCPF_TIME_WAIT) { + if (r->idiag_states & TCPF_TIME_WAIT) { sk_for_each(sk, node, &hashinfo->ehash[i + hashinfo->ehash_size].chain) { struct inet_sock *inet = inet_sk(sk); if (num < s_num) goto next_dying; - if (r->id.tcpdiag_sport != inet->sport && - 
r->id.tcpdiag_sport) + if (r->id.idiag_sport != inet->sport && + r->id.idiag_sport) goto next_dying; - if (r->id.tcpdiag_dport != inet->dport && - r->id.tcpdiag_dport) + if (r->id.idiag_dport != inet->dport && + r->id.idiag_dport) goto next_dying; - if (tcpdiag_dump_sock(skb, sk, cb) < 0) { + if (inet_diag_dump_sock(skb, sk, cb) < 0) { read_unlock_bh(&head->lock); goto done; } @@ -726,14 +723,14 @@ done: return skb->len; } -static int tcpdiag_dump_done(struct netlink_callback *cb) +static int inet_diag_dump_done(struct netlink_callback *cb) { return 0; } static __inline__ int -tcpdiag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) +inet_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) { if (!(nlh->nlmsg_flags&NLM_F_REQUEST)) return 0; @@ -744,24 +741,28 @@ tcpdiag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (inet_diag_table[nlh->nlmsg_type] == NULL) return -ENOENT; - if (NLMSG_LENGTH(sizeof(struct tcpdiagreq)) > skb->len) + if (NLMSG_LENGTH(sizeof(struct inet_diag_req)) > skb->len) goto err_inval; if (nlh->nlmsg_flags&NLM_F_DUMP) { - if (nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(struct tcpdiagreq))) { - struct rtattr *rta = (struct rtattr*)(NLMSG_DATA(nlh) + sizeof(struct tcpdiagreq)); - if (rta->rta_type != TCPDIAG_REQ_BYTECODE || + if (nlh->nlmsg_len > + (4 + NLMSG_SPACE(sizeof(struct inet_diag_req)))) { + struct rtattr *rta = (void *)(NLMSG_DATA(nlh) + + sizeof(struct inet_diag_req)); + if (rta->rta_type != INET_DIAG_REQ_BYTECODE || rta->rta_len < 8 || - rta->rta_len > nlh->nlmsg_len - NLMSG_SPACE(sizeof(struct tcpdiagreq))) + rta->rta_len > + (nlh->nlmsg_len - + NLMSG_SPACE(sizeof(struct inet_diag_req)))) goto err_inval; - if (tcpdiag_bc_audit(RTA_DATA(rta), RTA_PAYLOAD(rta))) + if (inet_diag_bc_audit(RTA_DATA(rta), RTA_PAYLOAD(rta))) goto err_inval; } - return netlink_dump_start(tcpnl, skb, nlh, - tcpdiag_dump, - tcpdiag_dump_done); + return netlink_dump_start(idiagnl, skb, nlh, + inet_diag_dump, + inet_diag_dump_done); } else { - return tcpdiag_get_exact(skb, nlh); + return inet_diag_get_exact(skb, nlh); } err_inval: @@ -769,7 +770,7 @@ err_inval: } -static inline void tcpdiag_rcv_skb(struct sk_buff *skb) +static inline void inet_diag_rcv_skb(struct sk_buff *skb) { int err; struct nlmsghdr * nlh; @@ -778,31 +779,31 @@ static inline void tcpdiag_rcv_skb(struct sk_buff *skb) nlh = (struct nlmsghdr *)skb->data; if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len) return; - err = tcpdiag_rcv_msg(skb, nlh); + err = inet_diag_rcv_msg(skb, nlh); if (err || nlh->nlmsg_flags & NLM_F_ACK) netlink_ack(skb, nlh, err); } } -static void tcpdiag_rcv(struct sock *sk, int len) +static void inet_diag_rcv(struct sock *sk, int len) { struct sk_buff *skb; unsigned int qlen = skb_queue_len(&sk->sk_receive_queue); while (qlen-- && (skb = skb_dequeue(&sk->sk_receive_queue))) { - tcpdiag_rcv_skb(skb); + inet_diag_rcv_skb(skb); kfree_skb(skb); } } -static void tcp_diag_get_info(struct sock *sk, struct tcpdiagmsg *r, +static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, void *_info) { const struct tcp_sock *tp = tcp_sk(sk); struct tcp_info *info = _info; - r->tcpdiag_rqueue = tp->rcv_nxt - tp->copied_seq; - r->tcpdiag_wqueue = tp->write_seq - tp->snd_una; + r->idiag_rqueue = tp->rcv_nxt - tp->copied_seq; + r->idiag_wqueue = tp->write_seq - tp->snd_una; if (info != NULL) tcp_get_info(sk, info); } @@ -851,7 +852,7 @@ void inet_diag_unregister(const struct inet_diag_handler *h) } EXPORT_SYMBOL_GPL(inet_diag_unregister); -static int __init tcpdiag_init(void) +static 
int __init inet_diag_init(void) { const int inet_diag_table_size = (INET_DIAG_GETSOCK_MAX * sizeof(struct inet_diag_handler *)); @@ -863,9 +864,9 @@ static int __init tcpdiag_init(void) memset(inet_diag_table, 0, inet_diag_table_size); - tcpnl = netlink_kernel_create(NETLINK_TCPDIAG, tcpdiag_rcv, - THIS_MODULE); - if (tcpnl == NULL) + idiagnl = netlink_kernel_create(NETLINK_INET_DIAG, inet_diag_rcv, + THIS_MODULE); + if (idiagnl == NULL) goto out_free_table; err = inet_diag_register(&tcp_diag_handler); @@ -874,18 +875,18 @@ static int __init tcpdiag_init(void) out: return err; out_sock_release: - sock_release(tcpnl->sk_socket); + sock_release(idiagnl->sk_socket); out_free_table: kfree(inet_diag_table); goto out; } -static void __exit tcpdiag_exit(void) +static void __exit inet_diag_exit(void) { - sock_release(tcpnl->sk_socket); + sock_release(idiagnl->sk_socket); kfree(inet_diag_table); } -module_init(tcpdiag_init); -module_exit(tcpdiag_exit); +module_init(inet_diag_init); +module_exit(inet_diag_exit); MODULE_LICENSE("GPL"); diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index 054de24efee..8cef9dc11fb 100644 --- a/net/ipv4/tcp_vegas.c +++ b/net/ipv4/tcp_vegas.c @@ -365,10 +365,10 @@ static void tcp_vegas_get_info(struct sock *sk, u32 ext, struct sk_buff *skb) { const struct vegas *ca = inet_csk_ca(sk); - if (ext & (1<<(TCPDIAG_VEGASINFO-1))) { + if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) { struct tcpvegas_info *info; - info = RTA_DATA(__RTA_PUT(skb, TCPDIAG_VEGASINFO, + info = RTA_DATA(__RTA_PUT(skb, INET_DIAG_VEGASINFO, sizeof(*info))); info->tcpv_enabled = ca->doing_vegas_now; diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c index d8a5a2b92e3..39510031787 100644 --- a/net/ipv4/tcp_westwood.c +++ b/net/ipv4/tcp_westwood.c @@ -216,11 +216,11 @@ static void tcp_westwood_info(struct sock *sk, u32 ext, struct sk_buff *skb) { const struct westwood *ca = inet_csk_ca(sk); - if (ext & (1<<(TCPDIAG_VEGASINFO-1))) { + if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) { struct rtattr *rta; struct tcpvegas_info *info; - rta = __RTA_PUT(skb, TCPDIAG_VEGASINFO, sizeof(*info)); + rta = __RTA_PUT(skb, INET_DIAG_VEGASINFO, sizeof(*info)); info = RTA_DATA(rta); info->tcpv_enabled = 1; info->tcpv_rttcnt = 0; -- cgit v1.2.3 From a8c2190ee7da1a1dc68ff1a6b5f03feb61e523a5 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 12 Aug 2005 12:56:38 -0300 Subject: [INET_DIAG]: Rename tcp_diag.[ch] to inet_diag.[ch] Next changeset will introduce net/ipv4/tcp_diag.c, moving the code that was put transitionally in inet_diag.c. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S.
Miller --- net/ipv4/Makefile | 2 +- net/ipv4/inet_diag.c | 893 ++++++++++++++++++++++++++++++++++++++++++++++++ net/ipv4/tcp_diag.c | 892 ----------------------------------------------- net/ipv4/tcp_vegas.c | 2 +- net/ipv4/tcp_westwood.c | 2 +- 5 files changed, 896 insertions(+), 895 deletions(-) create mode 100644 net/ipv4/inet_diag.c delete mode 100644 net/ipv4/tcp_diag.c (limited to 'net/ipv4') diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 9b1c894039a..fe5accbb56b 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -30,7 +30,7 @@ obj-$(CONFIG_IP_ROUTE_MULTIPATH_WRANDOM) += multipath_wrandom.o obj-$(CONFIG_IP_ROUTE_MULTIPATH_DRR) += multipath_drr.o obj-$(CONFIG_NETFILTER) += netfilter/ obj-$(CONFIG_IP_VS) += ipvs/ -obj-$(CONFIG_IP_INET_DIAG) += tcp_diag.o +obj-$(CONFIG_IP_INET_DIAG) += inet_diag.o obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c new file mode 100644 index 00000000000..3bd510941da --- /dev/null +++ b/net/ipv4/inet_diag.c @@ -0,0 +1,893 @@ +/* + * inet_diag.c Module for monitoring INET transport protocols sockets. + * + * Version: $Id: inet_diag.c,v 1.3 2002/02/01 22:01:04 davem Exp $ + * + * Authors: Alexey Kuznetsov, + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +static const struct inet_diag_handler **inet_diag_table; + +struct inet_diag_entry { + u32 *saddr; + u32 *daddr; + u16 sport; + u16 dport; + u16 family; + u16 userlocks; +}; + +static struct sock *idiagnl; + +#define INET_DIAG_PUT(skb, attrtype, attrlen) \ + RTA_DATA(__RTA_PUT(skb, attrtype, attrlen)) + +static int inet_diag_fill(struct sk_buff *skb, struct sock *sk, + int ext, u32 pid, u32 seq, u16 nlmsg_flags, + const struct nlmsghdr *unlh) +{ + const struct inet_sock *inet = inet_sk(sk); + const struct inet_connection_sock *icsk = inet_csk(sk); + struct inet_diag_msg *r; + struct nlmsghdr *nlh; + void *info = NULL; + struct inet_diag_meminfo *minfo = NULL; + unsigned char *b = skb->tail; + const struct inet_diag_handler *handler; + + handler = inet_diag_table[unlh->nlmsg_type]; + BUG_ON(handler == NULL); + + nlh = NLMSG_PUT(skb, pid, seq, unlh->nlmsg_type, sizeof(*r)); + nlh->nlmsg_flags = nlmsg_flags; + + r = NLMSG_DATA(nlh); + if (sk->sk_state != TCP_TIME_WAIT) { + if (ext & (1 << (INET_DIAG_MEMINFO - 1))) + minfo = INET_DIAG_PUT(skb, INET_DIAG_MEMINFO, + sizeof(*minfo)); + if (ext & (1 << (INET_DIAG_INFO - 1))) + info = INET_DIAG_PUT(skb, INET_DIAG_INFO, + handler->idiag_info_size); + + if ((ext & (1 << (INET_DIAG_CONG - 1))) && icsk->icsk_ca_ops) { + size_t len = strlen(icsk->icsk_ca_ops->name); + strcpy(INET_DIAG_PUT(skb, INET_DIAG_CONG, len + 1), + icsk->icsk_ca_ops->name); + } + } + r->idiag_family = sk->sk_family; + r->idiag_state = sk->sk_state; + r->idiag_timer = 0; + r->idiag_retrans = 0; + + r->id.idiag_if = sk->sk_bound_dev_if; + r->id.idiag_cookie[0] = (u32)(unsigned long)sk; + r->id.idiag_cookie[1] = (u32)(((unsigned long)sk >> 31) >> 1); + + if (r->idiag_state == TCP_TIME_WAIT) { + const struct inet_timewait_sock 
*tw = inet_twsk(sk); + long tmo = tw->tw_ttd - jiffies; + if (tmo < 0) + tmo = 0; + + r->id.idiag_sport = tw->tw_sport; + r->id.idiag_dport = tw->tw_dport; + r->id.idiag_src[0] = tw->tw_rcv_saddr; + r->id.idiag_dst[0] = tw->tw_daddr; + r->idiag_state = tw->tw_substate; + r->idiag_timer = 3; + r->idiag_expires = (tmo * 1000 + HZ - 1) / HZ; + r->idiag_rqueue = 0; + r->idiag_wqueue = 0; + r->idiag_uid = 0; + r->idiag_inode = 0; +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) + if (r->idiag_family == AF_INET6) { + const struct tcp6_timewait_sock *tcp6tw = tcp6_twsk(sk); + + ipv6_addr_copy((struct in6_addr *)r->id.idiag_src, + &tcp6tw->tw_v6_rcv_saddr); + ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst, + &tcp6tw->tw_v6_daddr); + } +#endif + nlh->nlmsg_len = skb->tail - b; + return skb->len; + } + + r->id.idiag_sport = inet->sport; + r->id.idiag_dport = inet->dport; + r->id.idiag_src[0] = inet->rcv_saddr; + r->id.idiag_dst[0] = inet->daddr; + +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) + if (r->idiag_family == AF_INET6) { + struct ipv6_pinfo *np = inet6_sk(sk); + + ipv6_addr_copy((struct in6_addr *)r->id.idiag_src, + &np->rcv_saddr); + ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst, + &np->daddr); + } +#endif + +#define EXPIRES_IN_MS(tmo) ((tmo - jiffies) * 1000 + HZ - 1) / HZ + + if (icsk->icsk_pending == ICSK_TIME_RETRANS) { + r->idiag_timer = 1; + r->idiag_retrans = icsk->icsk_retransmits; + r->idiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout); + } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { + r->idiag_timer = 4; + r->idiag_retrans = icsk->icsk_probes_out; + r->idiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout); + } else if (timer_pending(&sk->sk_timer)) { + r->idiag_timer = 2; + r->idiag_retrans = icsk->icsk_probes_out; + r->idiag_expires = EXPIRES_IN_MS(sk->sk_timer.expires); + } else { + r->idiag_timer = 0; + r->idiag_expires = 0; + } +#undef EXPIRES_IN_MS + + r->idiag_uid = sock_i_uid(sk); + r->idiag_inode = sock_i_ino(sk); + + if (minfo) { + minfo->idiag_rmem = atomic_read(&sk->sk_rmem_alloc); + minfo->idiag_wmem = sk->sk_wmem_queued; + minfo->idiag_fmem = sk->sk_forward_alloc; + minfo->idiag_tmem = atomic_read(&sk->sk_wmem_alloc); + } + + handler->idiag_get_info(sk, r, info); + + if (sk->sk_state < TCP_TIME_WAIT && + icsk->icsk_ca_ops && icsk->icsk_ca_ops->get_info) + icsk->icsk_ca_ops->get_info(sk, ext, skb); + + nlh->nlmsg_len = skb->tail - b; + return skb->len; + +rtattr_failure: +nlmsg_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static int inet_diag_get_exact(struct sk_buff *in_skb, const struct nlmsghdr *nlh) +{ + int err; + struct sock *sk; + struct inet_diag_req *req = NLMSG_DATA(nlh); + struct sk_buff *rep; + struct inet_hashinfo *hashinfo; + const struct inet_diag_handler *handler; + + handler = inet_diag_table[nlh->nlmsg_type]; + BUG_ON(handler == NULL); + hashinfo = handler->idiag_hashinfo; + + if (req->idiag_family == AF_INET) { + sk = inet_lookup(hashinfo, req->id.idiag_dst[0], + req->id.idiag_dport, req->id.idiag_src[0], + req->id.idiag_sport, req->id.idiag_if); + } +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) + else if (req->idiag_family == AF_INET6) { + sk = inet6_lookup(hashinfo, + (struct in6_addr *)req->id.idiag_dst, + req->id.idiag_dport, + (struct in6_addr *)req->id.idiag_src, + req->id.idiag_sport, + req->id.idiag_if); + } +#endif + else { + return -EINVAL; + } + + if (sk == NULL) + return -ENOENT; + + err = -ESTALE; + if ((req->id.idiag_cookie[0] != INET_DIAG_NOCOOKIE || + req->id.idiag_cookie[1] != 
INET_DIAG_NOCOOKIE) && + ((u32)(unsigned long)sk != req->id.idiag_cookie[0] || + (u32)((((unsigned long)sk) >> 31) >> 1) != req->id.idiag_cookie[1])) + goto out; + + err = -ENOMEM; + rep = alloc_skb(NLMSG_SPACE((sizeof(struct inet_diag_msg) + + sizeof(struct inet_diag_meminfo) + + handler->idiag_info_size + 64)), + GFP_KERNEL); + if (!rep) + goto out; + + if (inet_diag_fill(rep, sk, req->idiag_ext, + NETLINK_CB(in_skb).pid, + nlh->nlmsg_seq, 0, nlh) <= 0) + BUG(); + + err = netlink_unicast(idiagnl, rep, NETLINK_CB(in_skb).pid, + MSG_DONTWAIT); + if (err > 0) + err = 0; + +out: + if (sk) { + if (sk->sk_state == TCP_TIME_WAIT) + inet_twsk_put((struct inet_timewait_sock *)sk); + else + sock_put(sk); + } + return err; +} + +static int bitstring_match(const u32 *a1, const u32 *a2, int bits) +{ + int words = bits >> 5; + + bits &= 0x1f; + + if (words) { + if (memcmp(a1, a2, words << 2)) + return 0; + } + if (bits) { + __u32 w1, w2; + __u32 mask; + + w1 = a1[words]; + w2 = a2[words]; + + mask = htonl((0xffffffff) << (32 - bits)); + + if ((w1 ^ w2) & mask) + return 0; + } + + return 1; +} + + +static int inet_diag_bc_run(const void *bc, int len, + const struct inet_diag_entry *entry) +{ + while (len > 0) { + int yes = 1; + const struct inet_diag_bc_op *op = bc; + + switch (op->code) { + case INET_DIAG_BC_NOP: + break; + case INET_DIAG_BC_JMP: + yes = 0; + break; + case INET_DIAG_BC_S_GE: + yes = entry->sport >= op[1].no; + break; + case INET_DIAG_BC_S_LE: + yes = entry->dport <= op[1].no; + break; + case INET_DIAG_BC_D_GE: + yes = entry->dport >= op[1].no; + break; + case INET_DIAG_BC_D_LE: + yes = entry->dport <= op[1].no; + break; + case INET_DIAG_BC_AUTO: + yes = !(entry->userlocks & SOCK_BINDPORT_LOCK); + break; + case INET_DIAG_BC_S_COND: + case INET_DIAG_BC_D_COND: { + struct inet_diag_hostcond *cond; + u32 *addr; + + cond = (struct inet_diag_hostcond *)(op + 1); + if (cond->port != -1 && + cond->port != (op->code == INET_DIAG_BC_S_COND ? 
+ entry->sport : entry->dport)) { + yes = 0; + break; + } + + if (cond->prefix_len == 0) + break; + + if (op->code == INET_DIAG_BC_S_COND) + addr = entry->saddr; + else + addr = entry->daddr; + + if (bitstring_match(addr, cond->addr, cond->prefix_len)) + break; + if (entry->family == AF_INET6 && + cond->family == AF_INET) { + if (addr[0] == 0 && addr[1] == 0 && + addr[2] == htonl(0xffff) && + bitstring_match(addr + 3, cond->addr, + cond->prefix_len)) + break; + } + yes = 0; + break; + } + } + + if (yes) { + len -= op->yes; + bc += op->yes; + } else { + len -= op->no; + bc += op->no; + } + } + return (len == 0); +} + +static int valid_cc(const void *bc, int len, int cc) +{ + while (len >= 0) { + const struct inet_diag_bc_op *op = bc; + + if (cc > len) + return 0; + if (cc == len) + return 1; + if (op->yes < 4) + return 0; + len -= op->yes; + bc += op->yes; + } + return 0; +} + +static int inet_diag_bc_audit(const void *bytecode, int bytecode_len) +{ + const unsigned char *bc = bytecode; + int len = bytecode_len; + + while (len > 0) { + struct inet_diag_bc_op *op = (struct inet_diag_bc_op *)bc; + +//printk("BC: %d %d %d {%d} / %d\n", op->code, op->yes, op->no, op[1].no, len); + switch (op->code) { + case INET_DIAG_BC_AUTO: + case INET_DIAG_BC_S_COND: + case INET_DIAG_BC_D_COND: + case INET_DIAG_BC_S_GE: + case INET_DIAG_BC_S_LE: + case INET_DIAG_BC_D_GE: + case INET_DIAG_BC_D_LE: + if (op->yes < 4 || op->yes > len + 4) + return -EINVAL; + case INET_DIAG_BC_JMP: + if (op->no < 4 || op->no > len + 4) + return -EINVAL; + if (op->no < len && + !valid_cc(bytecode, bytecode_len, len - op->no)) + return -EINVAL; + break; + case INET_DIAG_BC_NOP: + if (op->yes < 4 || op->yes > len + 4) + return -EINVAL; + break; + default: + return -EINVAL; + } + bc += op->yes; + len -= op->yes; + } + return len == 0 ? 
0 : -EINVAL; +} + +static int inet_diag_dump_sock(struct sk_buff *skb, struct sock *sk, + struct netlink_callback *cb) +{ + struct inet_diag_req *r = NLMSG_DATA(cb->nlh); + + if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) { + struct inet_diag_entry entry; + struct rtattr *bc = (struct rtattr *)(r + 1); + struct inet_sock *inet = inet_sk(sk); + + entry.family = sk->sk_family; +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) + if (entry.family == AF_INET6) { + struct ipv6_pinfo *np = inet6_sk(sk); + + entry.saddr = np->rcv_saddr.s6_addr32; + entry.daddr = np->daddr.s6_addr32; + } else +#endif + { + entry.saddr = &inet->rcv_saddr; + entry.daddr = &inet->daddr; + } + entry.sport = inet->num; + entry.dport = ntohs(inet->dport); + entry.userlocks = sk->sk_userlocks; + + if (!inet_diag_bc_run(RTA_DATA(bc), RTA_PAYLOAD(bc), &entry)) + return 0; + } + + return inet_diag_fill(skb, sk, r->idiag_ext, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh); +} + +static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk, + struct request_sock *req, + u32 pid, u32 seq, + const struct nlmsghdr *unlh) +{ + const struct inet_request_sock *ireq = inet_rsk(req); + struct inet_sock *inet = inet_sk(sk); + unsigned char *b = skb->tail; + struct inet_diag_msg *r; + struct nlmsghdr *nlh; + long tmo; + + nlh = NLMSG_PUT(skb, pid, seq, unlh->nlmsg_type, sizeof(*r)); + nlh->nlmsg_flags = NLM_F_MULTI; + r = NLMSG_DATA(nlh); + + r->idiag_family = sk->sk_family; + r->idiag_state = TCP_SYN_RECV; + r->idiag_timer = 1; + r->idiag_retrans = req->retrans; + + r->id.idiag_if = sk->sk_bound_dev_if; + r->id.idiag_cookie[0] = (u32)(unsigned long)req; + r->id.idiag_cookie[1] = (u32)(((unsigned long)req >> 31) >> 1); + + tmo = req->expires - jiffies; + if (tmo < 0) + tmo = 0; + + r->id.idiag_sport = inet->sport; + r->id.idiag_dport = ireq->rmt_port; + r->id.idiag_src[0] = ireq->loc_addr; + r->id.idiag_dst[0] = ireq->rmt_addr; + r->idiag_expires = jiffies_to_msecs(tmo); + r->idiag_rqueue = 0; + r->idiag_wqueue = 0; + r->idiag_uid = sock_i_uid(sk); + r->idiag_inode = 0; +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) + if (r->idiag_family == AF_INET6) { + ipv6_addr_copy((struct in6_addr *)r->id.idiag_src, + &tcp6_rsk(req)->loc_addr); + ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst, + &tcp6_rsk(req)->rmt_addr); + } +#endif + nlh->nlmsg_len = skb->tail - b; + + return skb->len; + +nlmsg_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk, + struct netlink_callback *cb) +{ + struct inet_diag_entry entry; + struct inet_diag_req *r = NLMSG_DATA(cb->nlh); + struct inet_connection_sock *icsk = inet_csk(sk); + struct listen_sock *lopt; + struct rtattr *bc = NULL; + struct inet_sock *inet = inet_sk(sk); + int j, s_j; + int reqnum, s_reqnum; + int err = 0; + + s_j = cb->args[3]; + s_reqnum = cb->args[4]; + + if (s_j > 0) + s_j--; + + entry.family = sk->sk_family; + + read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock); + + lopt = icsk->icsk_accept_queue.listen_opt; + if (!lopt || !lopt->qlen) + goto out; + + if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) { + bc = (struct rtattr *)(r + 1); + entry.sport = inet->num; + entry.userlocks = sk->sk_userlocks; + } + + for (j = s_j; j < lopt->nr_table_entries; j++) { + struct request_sock *req, *head = lopt->syn_table[j]; + + reqnum = 0; + for (req = head; req; reqnum++, req = req->dl_next) { + struct inet_request_sock *ireq = inet_rsk(req); + + if (reqnum < 
s_reqnum) + continue; + if (r->id.idiag_dport != ireq->rmt_port && + r->id.idiag_dport) + continue; + + if (bc) { + entry.saddr = +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) + (entry.family == AF_INET6) ? + tcp6_rsk(req)->loc_addr.s6_addr32 : +#endif + &ireq->loc_addr; + entry.daddr = +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) + (entry.family == AF_INET6) ? + tcp6_rsk(req)->rmt_addr.s6_addr32 : +#endif + &ireq->rmt_addr; + entry.dport = ntohs(ireq->rmt_port); + + if (!inet_diag_bc_run(RTA_DATA(bc), + RTA_PAYLOAD(bc), &entry)) + continue; + } + + err = inet_diag_fill_req(skb, sk, req, + NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, cb->nlh); + if (err < 0) { + cb->args[3] = j + 1; + cb->args[4] = reqnum; + goto out; + } + } + + s_reqnum = 0; + } + +out: + read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); + + return err; +} + +static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + int i, num; + int s_i, s_num; + struct inet_diag_req *r = NLMSG_DATA(cb->nlh); + const struct inet_diag_handler *handler; + struct inet_hashinfo *hashinfo; + + handler = inet_diag_table[cb->nlh->nlmsg_type]; + BUG_ON(handler == NULL); + hashinfo = handler->idiag_hashinfo; + + s_i = cb->args[1]; + s_num = num = cb->args[2]; + + if (cb->args[0] == 0) { + if (!(r->idiag_states & (TCPF_LISTEN | TCPF_SYN_RECV))) + goto skip_listen_ht; + + inet_listen_lock(hashinfo); + for (i = s_i; i < INET_LHTABLE_SIZE; i++) { + struct sock *sk; + struct hlist_node *node; + + num = 0; + sk_for_each(sk, node, &hashinfo->listening_hash[i]) { + struct inet_sock *inet = inet_sk(sk); + + if (num < s_num) { + num++; + continue; + } + + if (r->id.idiag_sport != inet->sport && + r->id.idiag_sport) + goto next_listen; + + if (!(r->idiag_states & TCPF_LISTEN) || + r->id.idiag_dport || + cb->args[3] > 0) + goto syn_recv; + + if (inet_diag_dump_sock(skb, sk, cb) < 0) { + inet_listen_unlock(hashinfo); + goto done; + } + +syn_recv: + if (!(r->idiag_states & TCPF_SYN_RECV)) + goto next_listen; + + if (inet_diag_dump_reqs(skb, sk, cb) < 0) { + inet_listen_unlock(hashinfo); + goto done; + } + +next_listen: + cb->args[3] = 0; + cb->args[4] = 0; + ++num; + } + + s_num = 0; + cb->args[3] = 0; + cb->args[4] = 0; + } + inet_listen_unlock(hashinfo); +skip_listen_ht: + cb->args[0] = 1; + s_i = num = s_num = 0; + } + + if (!(r->idiag_states & ~(TCPF_LISTEN | TCPF_SYN_RECV))) + return skb->len; + + for (i = s_i; i < hashinfo->ehash_size; i++) { + struct inet_ehash_bucket *head = &hashinfo->ehash[i]; + struct sock *sk; + struct hlist_node *node; + + if (i > s_i) + s_num = 0; + + read_lock_bh(&head->lock); + + num = 0; + sk_for_each(sk, node, &head->chain) { + struct inet_sock *inet = inet_sk(sk); + + if (num < s_num) + goto next_normal; + if (!(r->idiag_states & (1 << sk->sk_state))) + goto next_normal; + if (r->id.idiag_sport != inet->sport && + r->id.idiag_sport) + goto next_normal; + if (r->id.idiag_dport != inet->dport && r->id.idiag_dport) + goto next_normal; + if (inet_diag_dump_sock(skb, sk, cb) < 0) { + read_unlock_bh(&head->lock); + goto done; + } +next_normal: + ++num; + } + + if (r->idiag_states & TCPF_TIME_WAIT) { + sk_for_each(sk, node, + &hashinfo->ehash[i + hashinfo->ehash_size].chain) { + struct inet_sock *inet = inet_sk(sk); + + if (num < s_num) + goto next_dying; + if (r->id.idiag_sport != inet->sport && + r->id.idiag_sport) + goto next_dying; + if (r->id.idiag_dport != inet->dport && + r->id.idiag_dport) + goto next_dying; + if (inet_diag_dump_sock(skb, sk, cb) < 0) { + 
read_unlock_bh(&head->lock); + goto done; + } +next_dying: + ++num; + } + } + read_unlock_bh(&head->lock); + } + +done: + cb->args[1] = i; + cb->args[2] = num; + return skb->len; +} + +static int inet_diag_dump_done(struct netlink_callback *cb) +{ + return 0; +} + + +static __inline__ int +inet_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) +{ + if (!(nlh->nlmsg_flags&NLM_F_REQUEST)) + return 0; + + if (nlh->nlmsg_type >= INET_DIAG_GETSOCK_MAX) + goto err_inval; + + if (inet_diag_table[nlh->nlmsg_type] == NULL) + return -ENOENT; + + if (NLMSG_LENGTH(sizeof(struct inet_diag_req)) > skb->len) + goto err_inval; + + if (nlh->nlmsg_flags&NLM_F_DUMP) { + if (nlh->nlmsg_len > + (4 + NLMSG_SPACE(sizeof(struct inet_diag_req)))) { + struct rtattr *rta = (void *)(NLMSG_DATA(nlh) + + sizeof(struct inet_diag_req)); + if (rta->rta_type != INET_DIAG_REQ_BYTECODE || + rta->rta_len < 8 || + rta->rta_len > + (nlh->nlmsg_len - + NLMSG_SPACE(sizeof(struct inet_diag_req)))) + goto err_inval; + if (inet_diag_bc_audit(RTA_DATA(rta), RTA_PAYLOAD(rta))) + goto err_inval; + } + return netlink_dump_start(idiagnl, skb, nlh, + inet_diag_dump, + inet_diag_dump_done); + } else { + return inet_diag_get_exact(skb, nlh); + } + +err_inval: + return -EINVAL; +} + + +static inline void inet_diag_rcv_skb(struct sk_buff *skb) +{ + int err; + struct nlmsghdr * nlh; + + if (skb->len >= NLMSG_SPACE(0)) { + nlh = (struct nlmsghdr *)skb->data; + if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len) + return; + err = inet_diag_rcv_msg(skb, nlh); + if (err || nlh->nlmsg_flags & NLM_F_ACK) + netlink_ack(skb, nlh, err); + } +} + +static void inet_diag_rcv(struct sock *sk, int len) +{ + struct sk_buff *skb; + unsigned int qlen = skb_queue_len(&sk->sk_receive_queue); + + while (qlen-- && (skb = skb_dequeue(&sk->sk_receive_queue))) { + inet_diag_rcv_skb(skb); + kfree_skb(skb); + } +} + +static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, + void *_info) +{ + const struct tcp_sock *tp = tcp_sk(sk); + struct tcp_info *info = _info; + + r->idiag_rqueue = tp->rcv_nxt - tp->copied_seq; + r->idiag_wqueue = tp->write_seq - tp->snd_una; + if (info != NULL) + tcp_get_info(sk, info); +} + +static struct inet_diag_handler tcp_diag_handler = { + .idiag_hashinfo = &tcp_hashinfo, + .idiag_get_info = tcp_diag_get_info, + .idiag_type = TCPDIAG_GETSOCK, + .idiag_info_size = sizeof(struct tcp_info), +}; + +static DEFINE_SPINLOCK(inet_diag_register_lock); + +int inet_diag_register(const struct inet_diag_handler *h) +{ + const __u16 type = h->idiag_type; + int err = -EINVAL; + + if (type >= INET_DIAG_GETSOCK_MAX) + goto out; + + spin_lock(&inet_diag_register_lock); + err = -EEXIST; + if (inet_diag_table[type] == NULL) { + inet_diag_table[type] = h; + err = 0; + } + spin_unlock(&inet_diag_register_lock); +out: + return err; +} +EXPORT_SYMBOL_GPL(inet_diag_register); + +void inet_diag_unregister(const struct inet_diag_handler *h) +{ + const __u16 type = h->idiag_type; + + if (type >= INET_DIAG_GETSOCK_MAX) + return; + + spin_lock(&inet_diag_register_lock); + inet_diag_table[type] = NULL; + spin_unlock(&inet_diag_register_lock); + + synchronize_rcu(); +} +EXPORT_SYMBOL_GPL(inet_diag_unregister); + +static int __init inet_diag_init(void) +{ + const int inet_diag_table_size = (INET_DIAG_GETSOCK_MAX * + sizeof(struct inet_diag_handler *)); + int err = -ENOMEM; + + inet_diag_table = kmalloc(inet_diag_table_size, GFP_KERNEL); + if (!inet_diag_table) + goto out; + + memset(inet_diag_table, 0, inet_diag_table_size); + + idiagnl = 
netlink_kernel_create(NETLINK_INET_DIAG, inet_diag_rcv, + THIS_MODULE); + if (idiagnl == NULL) + goto out_free_table; + + err = inet_diag_register(&tcp_diag_handler); + if (err) + goto out_sock_release; +out: + return err; +out_sock_release: + sock_release(idiagnl->sk_socket); +out_free_table: + kfree(inet_diag_table); + goto out; +} + +static void __exit inet_diag_exit(void) +{ + sock_release(idiagnl->sk_socket); + kfree(inet_diag_table); +} + +module_init(inet_diag_init); +module_exit(inet_diag_exit); +MODULE_LICENSE("GPL"); diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c deleted file mode 100644 index 24abe82e23a..00000000000 --- a/net/ipv4/tcp_diag.c +++ /dev/null @@ -1,892 +0,0 @@ -/* - * inet_diag.c Module for monitoring INET transport protocols sockets. - * - * Version: $Id: inet_diag.c,v 1.3 2002/02/01 22:01:04 davem Exp $ - * - * Authors: Alexey Kuznetsov, - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include - -static const struct inet_diag_handler **inet_diag_table; - -struct inet_diag_entry { - u32 *saddr; - u32 *daddr; - u16 sport; - u16 dport; - u16 family; - u16 userlocks; -}; - -static struct sock *idiagnl; - -#define INET_DIAG_PUT(skb, attrtype, attrlen) \ - RTA_DATA(__RTA_PUT(skb, attrtype, attrlen)) - -static int inet_diag_fill(struct sk_buff *skb, struct sock *sk, - int ext, u32 pid, u32 seq, u16 nlmsg_flags, - const struct nlmsghdr *unlh) -{ - const struct inet_sock *inet = inet_sk(sk); - const struct inet_connection_sock *icsk = inet_csk(sk); - struct inet_diag_msg *r; - struct nlmsghdr *nlh; - void *info = NULL; - struct inet_diag_meminfo *minfo = NULL; - unsigned char *b = skb->tail; - const struct inet_diag_handler *handler; - - handler = inet_diag_table[unlh->nlmsg_type]; - BUG_ON(handler == NULL); - - nlh = NLMSG_PUT(skb, pid, seq, unlh->nlmsg_type, sizeof(*r)); - nlh->nlmsg_flags = nlmsg_flags; - - r = NLMSG_DATA(nlh); - if (sk->sk_state != TCP_TIME_WAIT) { - if (ext & (1 << (INET_DIAG_MEMINFO - 1))) - minfo = INET_DIAG_PUT(skb, INET_DIAG_MEMINFO, - sizeof(*minfo)); - if (ext & (1 << (INET_DIAG_INFO - 1))) - info = INET_DIAG_PUT(skb, INET_DIAG_INFO, - handler->idiag_info_size); - - if ((ext & (1 << (INET_DIAG_CONG - 1))) && icsk->icsk_ca_ops) { - size_t len = strlen(icsk->icsk_ca_ops->name); - strcpy(INET_DIAG_PUT(skb, INET_DIAG_CONG, len + 1), - icsk->icsk_ca_ops->name); - } - } - r->idiag_family = sk->sk_family; - r->idiag_state = sk->sk_state; - r->idiag_timer = 0; - r->idiag_retrans = 0; - - r->id.idiag_if = sk->sk_bound_dev_if; - r->id.idiag_cookie[0] = (u32)(unsigned long)sk; - r->id.idiag_cookie[1] = (u32)(((unsigned long)sk >> 31) >> 1); - - if (r->idiag_state == TCP_TIME_WAIT) { - const struct inet_timewait_sock *tw = inet_twsk(sk); - long tmo = tw->tw_ttd - jiffies; - if (tmo < 0) - tmo = 0; - - r->id.idiag_sport = tw->tw_sport; - r->id.idiag_dport = tw->tw_dport; - r->id.idiag_src[0] = tw->tw_rcv_saddr; - r->id.idiag_dst[0] = tw->tw_daddr; - r->idiag_state = tw->tw_substate; - r->idiag_timer = 3; - r->idiag_expires = (tmo * 1000 + HZ - 1) / HZ; - r->idiag_rqueue = 0; - r->idiag_wqueue = 0; - r->idiag_uid = 0; - r->idiag_inode = 0; -#if 
defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) - if (r->idiag_family == AF_INET6) { - const struct tcp6_timewait_sock *tcp6tw = tcp6_twsk(sk); - - ipv6_addr_copy((struct in6_addr *)r->id.idiag_src, - &tcp6tw->tw_v6_rcv_saddr); - ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst, - &tcp6tw->tw_v6_daddr); - } -#endif - nlh->nlmsg_len = skb->tail - b; - return skb->len; - } - - r->id.idiag_sport = inet->sport; - r->id.idiag_dport = inet->dport; - r->id.idiag_src[0] = inet->rcv_saddr; - r->id.idiag_dst[0] = inet->daddr; - -#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) - if (r->idiag_family == AF_INET6) { - struct ipv6_pinfo *np = inet6_sk(sk); - - ipv6_addr_copy((struct in6_addr *)r->id.idiag_src, - &np->rcv_saddr); - ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst, - &np->daddr); - } -#endif - -#define EXPIRES_IN_MS(tmo) ((tmo-jiffies)*1000+HZ-1)/HZ - - if (icsk->icsk_pending == ICSK_TIME_RETRANS) { - r->idiag_timer = 1; - r->idiag_retrans = icsk->icsk_retransmits; - r->idiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout); - } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { - r->idiag_timer = 4; - r->idiag_retrans = icsk->icsk_probes_out; - r->idiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout); - } else if (timer_pending(&sk->sk_timer)) { - r->idiag_timer = 2; - r->idiag_retrans = icsk->icsk_probes_out; - r->idiag_expires = EXPIRES_IN_MS(sk->sk_timer.expires); - } else { - r->idiag_timer = 0; - r->idiag_expires = 0; - } -#undef EXPIRES_IN_MS - - r->idiag_uid = sock_i_uid(sk); - r->idiag_inode = sock_i_ino(sk); - - if (minfo) { - minfo->idiag_rmem = atomic_read(&sk->sk_rmem_alloc); - minfo->idiag_wmem = sk->sk_wmem_queued; - minfo->idiag_fmem = sk->sk_forward_alloc; - minfo->idiag_tmem = atomic_read(&sk->sk_wmem_alloc); - } - - handler->idiag_get_info(sk, r, info); - - if (sk->sk_state < TCP_TIME_WAIT && - icsk->icsk_ca_ops && icsk->icsk_ca_ops->get_info) - icsk->icsk_ca_ops->get_info(sk, ext, skb); - - nlh->nlmsg_len = skb->tail - b; - return skb->len; - -rtattr_failure: -nlmsg_failure: - skb_trim(skb, b - skb->data); - return -1; -} - -static int inet_diag_get_exact(struct sk_buff *in_skb, const struct nlmsghdr *nlh) -{ - int err; - struct sock *sk; - struct inet_diag_req *req = NLMSG_DATA(nlh); - struct sk_buff *rep; - struct inet_hashinfo *hashinfo; - const struct inet_diag_handler *handler; - - handler = inet_diag_table[nlh->nlmsg_type]; - BUG_ON(handler == NULL); - hashinfo = handler->idiag_hashinfo; - - if (req->idiag_family == AF_INET) { - sk = inet_lookup(hashinfo, req->id.idiag_dst[0], - req->id.idiag_dport, req->id.idiag_src[0], - req->id.idiag_sport, req->id.idiag_if); - } -#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) - else if (req->idiag_family == AF_INET6) { - sk = inet6_lookup(hashinfo, - (struct in6_addr *)req->id.idiag_dst, - req->id.idiag_dport, - (struct in6_addr *)req->id.idiag_src, - req->id.idiag_sport, - req->id.idiag_if); - } -#endif - else { - return -EINVAL; - } - - if (sk == NULL) - return -ENOENT; - - err = -ESTALE; - if ((req->id.idiag_cookie[0] != INET_DIAG_NOCOOKIE || - req->id.idiag_cookie[1] != INET_DIAG_NOCOOKIE) && - ((u32)(unsigned long)sk != req->id.idiag_cookie[0] || - (u32)((((unsigned long)sk) >> 31) >> 1) != req->id.idiag_cookie[1])) - goto out; - - err = -ENOMEM; - rep = alloc_skb(NLMSG_SPACE((sizeof(struct inet_diag_msg) + - sizeof(struct inet_diag_meminfo) + - handler->idiag_info_size + 64)), - GFP_KERNEL); - if (!rep) - goto out; - - if (inet_diag_fill(rep, sk, req->idiag_ext, - NETLINK_CB(in_skb).pid, - 
nlh->nlmsg_seq, 0, nlh) <= 0) - BUG(); - - err = netlink_unicast(idiagnl, rep, NETLINK_CB(in_skb).pid, - MSG_DONTWAIT); - if (err > 0) - err = 0; - -out: - if (sk) { - if (sk->sk_state == TCP_TIME_WAIT) - inet_twsk_put((struct inet_timewait_sock *)sk); - else - sock_put(sk); - } - return err; -} - -static int bitstring_match(const u32 *a1, const u32 *a2, int bits) -{ - int words = bits >> 5; - - bits &= 0x1f; - - if (words) { - if (memcmp(a1, a2, words << 2)) - return 0; - } - if (bits) { - __u32 w1, w2; - __u32 mask; - - w1 = a1[words]; - w2 = a2[words]; - - mask = htonl((0xffffffff) << (32 - bits)); - - if ((w1 ^ w2) & mask) - return 0; - } - - return 1; -} - - -static int inet_diag_bc_run(const void *bc, int len, - const struct inet_diag_entry *entry) -{ - while (len > 0) { - int yes = 1; - const struct inet_diag_bc_op *op = bc; - - switch (op->code) { - case INET_DIAG_BC_NOP: - break; - case INET_DIAG_BC_JMP: - yes = 0; - break; - case INET_DIAG_BC_S_GE: - yes = entry->sport >= op[1].no; - break; - case INET_DIAG_BC_S_LE: - yes = entry->dport <= op[1].no; - break; - case INET_DIAG_BC_D_GE: - yes = entry->dport >= op[1].no; - break; - case INET_DIAG_BC_D_LE: - yes = entry->dport <= op[1].no; - break; - case INET_DIAG_BC_AUTO: - yes = !(entry->userlocks & SOCK_BINDPORT_LOCK); - break; - case INET_DIAG_BC_S_COND: - case INET_DIAG_BC_D_COND: - { - struct inet_diag_hostcond *cond = (struct inet_diag_hostcond*)(op+1); - u32 *addr; - - if (cond->port != -1 && - cond->port != (op->code == INET_DIAG_BC_S_COND ? - entry->sport : entry->dport)) { - yes = 0; - break; - } - - if (cond->prefix_len == 0) - break; - - if (op->code == INET_DIAG_BC_S_COND) - addr = entry->saddr; - else - addr = entry->daddr; - - if (bitstring_match(addr, cond->addr, cond->prefix_len)) - break; - if (entry->family == AF_INET6 && - cond->family == AF_INET) { - if (addr[0] == 0 && addr[1] == 0 && - addr[2] == htonl(0xffff) && - bitstring_match(addr+3, cond->addr, cond->prefix_len)) - break; - } - yes = 0; - break; - } - } - - if (yes) { - len -= op->yes; - bc += op->yes; - } else { - len -= op->no; - bc += op->no; - } - } - return (len == 0); -} - -static int valid_cc(const void *bc, int len, int cc) -{ - while (len >= 0) { - const struct inet_diag_bc_op *op = bc; - - if (cc > len) - return 0; - if (cc == len) - return 1; - if (op->yes < 4) - return 0; - len -= op->yes; - bc += op->yes; - } - return 0; -} - -static int inet_diag_bc_audit(const void *bytecode, int bytecode_len) -{ - const unsigned char *bc = bytecode; - int len = bytecode_len; - - while (len > 0) { - struct inet_diag_bc_op *op = (struct inet_diag_bc_op*)bc; - -//printk("BC: %d %d %d {%d} / %d\n", op->code, op->yes, op->no, op[1].no, len); - switch (op->code) { - case INET_DIAG_BC_AUTO: - case INET_DIAG_BC_S_COND: - case INET_DIAG_BC_D_COND: - case INET_DIAG_BC_S_GE: - case INET_DIAG_BC_S_LE: - case INET_DIAG_BC_D_GE: - case INET_DIAG_BC_D_LE: - if (op->yes < 4 || op->yes > len+4) - return -EINVAL; - case INET_DIAG_BC_JMP: - if (op->no < 4 || op->no > len+4) - return -EINVAL; - if (op->no < len && - !valid_cc(bytecode, bytecode_len, len-op->no)) - return -EINVAL; - break; - case INET_DIAG_BC_NOP: - if (op->yes < 4 || op->yes > len+4) - return -EINVAL; - break; - default: - return -EINVAL; - } - bc += op->yes; - len -= op->yes; - } - return len == 0 ? 
0 : -EINVAL; -} - -static int inet_diag_dump_sock(struct sk_buff *skb, struct sock *sk, - struct netlink_callback *cb) -{ - struct inet_diag_req *r = NLMSG_DATA(cb->nlh); - - if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) { - struct inet_diag_entry entry; - struct rtattr *bc = (struct rtattr *)(r + 1); - struct inet_sock *inet = inet_sk(sk); - - entry.family = sk->sk_family; -#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) - if (entry.family == AF_INET6) { - struct ipv6_pinfo *np = inet6_sk(sk); - - entry.saddr = np->rcv_saddr.s6_addr32; - entry.daddr = np->daddr.s6_addr32; - } else -#endif - { - entry.saddr = &inet->rcv_saddr; - entry.daddr = &inet->daddr; - } - entry.sport = inet->num; - entry.dport = ntohs(inet->dport); - entry.userlocks = sk->sk_userlocks; - - if (!inet_diag_bc_run(RTA_DATA(bc), RTA_PAYLOAD(bc), &entry)) - return 0; - } - - return inet_diag_fill(skb, sk, r->idiag_ext, NETLINK_CB(cb->skb).pid, - cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh); -} - -static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk, - struct request_sock *req, - u32 pid, u32 seq, - const struct nlmsghdr *unlh) -{ - const struct inet_request_sock *ireq = inet_rsk(req); - struct inet_sock *inet = inet_sk(sk); - unsigned char *b = skb->tail; - struct inet_diag_msg *r; - struct nlmsghdr *nlh; - long tmo; - - nlh = NLMSG_PUT(skb, pid, seq, unlh->nlmsg_type, sizeof(*r)); - nlh->nlmsg_flags = NLM_F_MULTI; - r = NLMSG_DATA(nlh); - - r->idiag_family = sk->sk_family; - r->idiag_state = TCP_SYN_RECV; - r->idiag_timer = 1; - r->idiag_retrans = req->retrans; - - r->id.idiag_if = sk->sk_bound_dev_if; - r->id.idiag_cookie[0] = (u32)(unsigned long)req; - r->id.idiag_cookie[1] = (u32)(((unsigned long)req >> 31) >> 1); - - tmo = req->expires - jiffies; - if (tmo < 0) - tmo = 0; - - r->id.idiag_sport = inet->sport; - r->id.idiag_dport = ireq->rmt_port; - r->id.idiag_src[0] = ireq->loc_addr; - r->id.idiag_dst[0] = ireq->rmt_addr; - r->idiag_expires = jiffies_to_msecs(tmo); - r->idiag_rqueue = 0; - r->idiag_wqueue = 0; - r->idiag_uid = sock_i_uid(sk); - r->idiag_inode = 0; -#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) - if (r->idiag_family == AF_INET6) { - ipv6_addr_copy((struct in6_addr *)r->id.idiag_src, - &tcp6_rsk(req)->loc_addr); - ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst, - &tcp6_rsk(req)->rmt_addr); - } -#endif - nlh->nlmsg_len = skb->tail - b; - - return skb->len; - -nlmsg_failure: - skb_trim(skb, b - skb->data); - return -1; -} - -static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk, - struct netlink_callback *cb) -{ - struct inet_diag_entry entry; - struct inet_diag_req *r = NLMSG_DATA(cb->nlh); - struct inet_connection_sock *icsk = inet_csk(sk); - struct listen_sock *lopt; - struct rtattr *bc = NULL; - struct inet_sock *inet = inet_sk(sk); - int j, s_j; - int reqnum, s_reqnum; - int err = 0; - - s_j = cb->args[3]; - s_reqnum = cb->args[4]; - - if (s_j > 0) - s_j--; - - entry.family = sk->sk_family; - - read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock); - - lopt = icsk->icsk_accept_queue.listen_opt; - if (!lopt || !lopt->qlen) - goto out; - - if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) { - bc = (struct rtattr *)(r + 1); - entry.sport = inet->num; - entry.userlocks = sk->sk_userlocks; - } - - for (j = s_j; j < lopt->nr_table_entries; j++) { - struct request_sock *req, *head = lopt->syn_table[j]; - - reqnum = 0; - for (req = head; req; reqnum++, req = req->dl_next) { - struct inet_request_sock *ireq = inet_rsk(req); - - if (reqnum < 
s_reqnum) - continue; - if (r->id.idiag_dport != ireq->rmt_port && - r->id.idiag_dport) - continue; - - if (bc) { - entry.saddr = -#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) - (entry.family == AF_INET6) ? - tcp6_rsk(req)->loc_addr.s6_addr32 : -#endif - &ireq->loc_addr; - entry.daddr = -#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) - (entry.family == AF_INET6) ? - tcp6_rsk(req)->rmt_addr.s6_addr32 : -#endif - &ireq->rmt_addr; - entry.dport = ntohs(ireq->rmt_port); - - if (!inet_diag_bc_run(RTA_DATA(bc), - RTA_PAYLOAD(bc), &entry)) - continue; - } - - err = inet_diag_fill_req(skb, sk, req, - NETLINK_CB(cb->skb).pid, - cb->nlh->nlmsg_seq, cb->nlh); - if (err < 0) { - cb->args[3] = j + 1; - cb->args[4] = reqnum; - goto out; - } - } - - s_reqnum = 0; - } - -out: - read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); - - return err; -} - -static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) -{ - int i, num; - int s_i, s_num; - struct inet_diag_req *r = NLMSG_DATA(cb->nlh); - const struct inet_diag_handler *handler; - struct inet_hashinfo *hashinfo; - - handler = inet_diag_table[cb->nlh->nlmsg_type]; - BUG_ON(handler == NULL); - hashinfo = handler->idiag_hashinfo; - - s_i = cb->args[1]; - s_num = num = cb->args[2]; - - if (cb->args[0] == 0) { - if (!(r->idiag_states & (TCPF_LISTEN | TCPF_SYN_RECV))) - goto skip_listen_ht; - - inet_listen_lock(hashinfo); - for (i = s_i; i < INET_LHTABLE_SIZE; i++) { - struct sock *sk; - struct hlist_node *node; - - num = 0; - sk_for_each(sk, node, &hashinfo->listening_hash[i]) { - struct inet_sock *inet = inet_sk(sk); - - if (num < s_num) { - num++; - continue; - } - - if (r->id.idiag_sport != inet->sport && - r->id.idiag_sport) - goto next_listen; - - if (!(r->idiag_states & TCPF_LISTEN) || - r->id.idiag_dport || - cb->args[3] > 0) - goto syn_recv; - - if (inet_diag_dump_sock(skb, sk, cb) < 0) { - inet_listen_unlock(hashinfo); - goto done; - } - -syn_recv: - if (!(r->idiag_states & TCPF_SYN_RECV)) - goto next_listen; - - if (inet_diag_dump_reqs(skb, sk, cb) < 0) { - inet_listen_unlock(hashinfo); - goto done; - } - -next_listen: - cb->args[3] = 0; - cb->args[4] = 0; - ++num; - } - - s_num = 0; - cb->args[3] = 0; - cb->args[4] = 0; - } - inet_listen_unlock(hashinfo); -skip_listen_ht: - cb->args[0] = 1; - s_i = num = s_num = 0; - } - - if (!(r->idiag_states & ~(TCPF_LISTEN | TCPF_SYN_RECV))) - return skb->len; - - for (i = s_i; i < hashinfo->ehash_size; i++) { - struct inet_ehash_bucket *head = &hashinfo->ehash[i]; - struct sock *sk; - struct hlist_node *node; - - if (i > s_i) - s_num = 0; - - read_lock_bh(&head->lock); - - num = 0; - sk_for_each(sk, node, &head->chain) { - struct inet_sock *inet = inet_sk(sk); - - if (num < s_num) - goto next_normal; - if (!(r->idiag_states & (1 << sk->sk_state))) - goto next_normal; - if (r->id.idiag_sport != inet->sport && - r->id.idiag_sport) - goto next_normal; - if (r->id.idiag_dport != inet->dport && r->id.idiag_dport) - goto next_normal; - if (inet_diag_dump_sock(skb, sk, cb) < 0) { - read_unlock_bh(&head->lock); - goto done; - } -next_normal: - ++num; - } - - if (r->idiag_states & TCPF_TIME_WAIT) { - sk_for_each(sk, node, - &hashinfo->ehash[i + hashinfo->ehash_size].chain) { - struct inet_sock *inet = inet_sk(sk); - - if (num < s_num) - goto next_dying; - if (r->id.idiag_sport != inet->sport && - r->id.idiag_sport) - goto next_dying; - if (r->id.idiag_dport != inet->dport && - r->id.idiag_dport) - goto next_dying; - if (inet_diag_dump_sock(skb, sk, cb) < 0) { - 
read_unlock_bh(&head->lock); - goto done; - } -next_dying: - ++num; - } - } - read_unlock_bh(&head->lock); - } - -done: - cb->args[1] = i; - cb->args[2] = num; - return skb->len; -} - -static int inet_diag_dump_done(struct netlink_callback *cb) -{ - return 0; -} - - -static __inline__ int -inet_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) -{ - if (!(nlh->nlmsg_flags&NLM_F_REQUEST)) - return 0; - - if (nlh->nlmsg_type >= INET_DIAG_GETSOCK_MAX) - goto err_inval; - - if (inet_diag_table[nlh->nlmsg_type] == NULL) - return -ENOENT; - - if (NLMSG_LENGTH(sizeof(struct inet_diag_req)) > skb->len) - goto err_inval; - - if (nlh->nlmsg_flags&NLM_F_DUMP) { - if (nlh->nlmsg_len > - (4 + NLMSG_SPACE(sizeof(struct inet_diag_req)))) { - struct rtattr *rta = (void *)(NLMSG_DATA(nlh) + - sizeof(struct inet_diag_req)); - if (rta->rta_type != INET_DIAG_REQ_BYTECODE || - rta->rta_len < 8 || - rta->rta_len > - (nlh->nlmsg_len - - NLMSG_SPACE(sizeof(struct inet_diag_req)))) - goto err_inval; - if (inet_diag_bc_audit(RTA_DATA(rta), RTA_PAYLOAD(rta))) - goto err_inval; - } - return netlink_dump_start(idiagnl, skb, nlh, - inet_diag_dump, - inet_diag_dump_done); - } else { - return inet_diag_get_exact(skb, nlh); - } - -err_inval: - return -EINVAL; -} - - -static inline void inet_diag_rcv_skb(struct sk_buff *skb) -{ - int err; - struct nlmsghdr * nlh; - - if (skb->len >= NLMSG_SPACE(0)) { - nlh = (struct nlmsghdr *)skb->data; - if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len) - return; - err = inet_diag_rcv_msg(skb, nlh); - if (err || nlh->nlmsg_flags & NLM_F_ACK) - netlink_ack(skb, nlh, err); - } -} - -static void inet_diag_rcv(struct sock *sk, int len) -{ - struct sk_buff *skb; - unsigned int qlen = skb_queue_len(&sk->sk_receive_queue); - - while (qlen-- && (skb = skb_dequeue(&sk->sk_receive_queue))) { - inet_diag_rcv_skb(skb); - kfree_skb(skb); - } -} - -static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, - void *_info) -{ - const struct tcp_sock *tp = tcp_sk(sk); - struct tcp_info *info = _info; - - r->idiag_rqueue = tp->rcv_nxt - tp->copied_seq; - r->idiag_wqueue = tp->write_seq - tp->snd_una; - if (info != NULL) - tcp_get_info(sk, info); -} - -static struct inet_diag_handler tcp_diag_handler = { - .idiag_hashinfo = &tcp_hashinfo, - .idiag_get_info = tcp_diag_get_info, - .idiag_type = TCPDIAG_GETSOCK, - .idiag_info_size = sizeof(struct tcp_info), -}; - -static DEFINE_SPINLOCK(inet_diag_register_lock); - -int inet_diag_register(const struct inet_diag_handler *h) -{ - const __u16 type = h->idiag_type; - int err = -EINVAL; - - if (type >= INET_DIAG_GETSOCK_MAX) - goto out; - - spin_lock(&inet_diag_register_lock); - err = -EEXIST; - if (inet_diag_table[type] == NULL) { - inet_diag_table[type] = h; - err = 0; - } - spin_unlock(&inet_diag_register_lock); -out: - return err; -} -EXPORT_SYMBOL_GPL(inet_diag_register); - -void inet_diag_unregister(const struct inet_diag_handler *h) -{ - const __u16 type = h->idiag_type; - - if (type >= INET_DIAG_GETSOCK_MAX) - return; - - spin_lock(&inet_diag_register_lock); - inet_diag_table[type] = NULL; - spin_unlock(&inet_diag_register_lock); - - synchronize_rcu(); -} -EXPORT_SYMBOL_GPL(inet_diag_unregister); - -static int __init inet_diag_init(void) -{ - const int inet_diag_table_size = (INET_DIAG_GETSOCK_MAX * - sizeof(struct inet_diag_handler *)); - int err = -ENOMEM; - - inet_diag_table = kmalloc(inet_diag_table_size, GFP_KERNEL); - if (!inet_diag_table) - goto out; - - memset(inet_diag_table, 0, inet_diag_table_size); - - idiagnl = 
netlink_kernel_create(NETLINK_INET_DIAG, inet_diag_rcv, - THIS_MODULE); - if (idiagnl == NULL) - goto out_free_table; - - err = inet_diag_register(&tcp_diag_handler); - if (err) - goto out_sock_release; -out: - return err; -out_sock_release: - sock_release(idiagnl->sk_socket); -out_free_table: - kfree(inet_diag_table); - goto out; -} - -static void __exit inet_diag_exit(void) -{ - sock_release(idiagnl->sk_socket); - kfree(inet_diag_table); -} - -module_init(inet_diag_init); -module_exit(inet_diag_exit); -MODULE_LICENSE("GPL"); diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index 8cef9dc11fb..93c5f92070f 100644 --- a/net/ipv4/tcp_vegas.c +++ b/net/ipv4/tcp_vegas.c @@ -35,7 +35,7 @@ #include #include #include -#include +#include #include diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c index 39510031787..0c340c3756c 100644 --- a/net/ipv4/tcp_westwood.c +++ b/net/ipv4/tcp_westwood.c @@ -8,7 +8,7 @@ #include #include #include -#include +#include #include /* TCP Westwood structure */ -- cgit v1.2.3 From 17b085eacef81a6286bd478f2ec75e04abb091cb Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 12 Aug 2005 12:59:17 -0300 Subject: [INET_DIAG]: Move the tcp_diag interface to the proper place With this the previous setup is back, i.e. tcp_diag can be built as a module, as can dccp_diag, and both share the infrastructure available in inet_diag. If CONFIG_INET_DIAG is selected as a module, CONFIG_INET_TCP_DIAG will also be built as a module, as will CONFIG_INET_DCCP_DIAG if CONFIG_IP_DCCP was selected static or as a module; if CONFIG_INET_DIAG is y, i.e. statically linked, CONFIG_INET_TCP_DIAG will follow suit and CONFIG_INET_DCCP_DIAG will be built in the same manner as CONFIG_IP_DCCP. Next we aim at UDP, converting it to use inet_hashinfo, so that we can use iproute2 for UDP sockets as well. Ah, just to show an example of this new infrastructure working for DCCP :-) [root@qemu ~]# ./ss -dane State Recv-Q Send-Q Local Address:Port Peer Address:Port LISTEN 0 0 *:5001 *:* ino:942 sk:cfd503a0 ESTAB 0 0 127.0.0.1:5001 127.0.0.1:32770 ino:943 sk:cfd50a60 ESTAB 0 0 127.0.0.1:32770 127.0.0.1:5001 ino:947 sk:cfd50700 TIME-WAIT 0 0 127.0.0.1:32769 127.0.0.1:5001 timer:(timewait,3.430ms,0) ino:0 sk:cf209620 Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- net/ipv4/Kconfig | 8 ++++++-- net/ipv4/Makefile | 3 ++- net/ipv4/inet_diag.c | 27 +------------------------- net/ipv4/tcp_diag.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 63 insertions(+), 29 deletions(-) create mode 100644 net/ipv4/tcp_diag.c (limited to 'net/ipv4') diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 019e88d8f29..e55136ae09f 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -413,8 +413,8 @@ config INET_TUNNEL If unsure, say Y. -config IP_INET_DIAG - tristate "IP: INET socket monitoring interface" +config INET_DIAG + tristate "INET: socket monitoring interface" default y ---help--- Support for INET (TCP, DCCP, etc) socket monitoring interface used by @@ -423,6 +423,10 @@ config IP_INET_DIAG If unsure, say Y.
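The ss session shown in the changelog above drives this interface over netlink. As a rough orientation only, and not part of these patches, a minimal user-space client could look like the sketch below; it assumes the NETLINK_INET_DIAG constant, the TCPDIAG_GETSOCK message type and the inet_diag_req layout used in this series (header paths are likewise assumptions), and it elides the reply-parsing loop:

#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/inet_diag.h>

int main(void)
{
	struct {
		struct nlmsghdr nlh;
		struct inet_diag_req req;
	} msg;
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_INET_DIAG);

	if (fd < 0)
		return 1;
	memset(&msg, 0, sizeof(msg));
	msg.nlh.nlmsg_len = sizeof(msg);
	msg.nlh.nlmsg_type = TCPDIAG_GETSOCK;	/* dispatched via inet_diag_table[] */
	msg.nlh.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
	msg.req.idiag_family = AF_INET;
	msg.req.idiag_states = ~0U;		/* sockets in every TCP state */
	if (send(fd, &msg, sizeof(msg), 0) < 0)
		return 1;
	/* a real client would now recv() and walk the inet_diag_msg records */
	close(fd);
	return 0;
}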
+config INET_TCP_DIAG + depends on INET_DIAG + def_tristate INET_DIAG + config TCP_CONG_ADVANCED bool "TCP: advanced congestion control" ---help--- diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index fe5accbb56b..f0435d00db6 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -30,8 +30,9 @@ obj-$(CONFIG_IP_ROUTE_MULTIPATH_WRANDOM) += multipath_wrandom.o obj-$(CONFIG_IP_ROUTE_MULTIPATH_DRR) += multipath_drr.o obj-$(CONFIG_NETFILTER) += netfilter/ obj-$(CONFIG_IP_VS) += ipvs/ -obj-$(CONFIG_IP_INET_DIAG) += inet_diag.o +obj-$(CONFIG_INET_DIAG) += inet_diag.o obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o +obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 3bd510941da..1880ad8575d 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -797,25 +797,6 @@ static void inet_diag_rcv(struct sock *sk, int len) } } -static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, - void *_info) -{ - const struct tcp_sock *tp = tcp_sk(sk); - struct tcp_info *info = _info; - - r->idiag_rqueue = tp->rcv_nxt - tp->copied_seq; - r->idiag_wqueue = tp->write_seq - tp->snd_una; - if (info != NULL) - tcp_get_info(sk, info); -} - -static struct inet_diag_handler tcp_diag_handler = { - .idiag_hashinfo = &tcp_hashinfo, - .idiag_get_info = tcp_diag_get_info, - .idiag_type = TCPDIAG_GETSOCK, - .idiag_info_size = sizeof(struct tcp_info), -}; - static DEFINE_SPINLOCK(inet_diag_register_lock); int inet_diag_register(const struct inet_diag_handler *h) @@ -864,19 +845,13 @@ static int __init inet_diag_init(void) goto out; memset(inet_diag_table, 0, inet_diag_table_size); - idiagnl = netlink_kernel_create(NETLINK_INET_DIAG, inet_diag_rcv, THIS_MODULE); if (idiagnl == NULL) goto out_free_table; - - err = inet_diag_register(&tcp_diag_handler); - if (err) - goto out_sock_release; + err = 0; out: return err; -out_sock_release: - sock_release(idiagnl->sk_socket); out_free_table: kfree(inet_diag_table); goto out; diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c new file mode 100644 index 00000000000..c148c108188 --- /dev/null +++ b/net/ipv4/tcp_diag.c @@ -0,0 +1,54 @@ +/* + * tcp_diag.c Module for monitoring TCP transport protocols sockets. + * + * Version: $Id: tcp_diag.c,v 1.3 2002/02/01 22:01:04 davem Exp $ + * + * Authors: Alexey Kuznetsov, + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include + +#include +#include + +#include + +#include + +static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, + void *_info) +{ + const struct tcp_sock *tp = tcp_sk(sk); + struct tcp_info *info = _info; + + r->idiag_rqueue = tp->rcv_nxt - tp->copied_seq; + r->idiag_wqueue = tp->write_seq - tp->snd_una; + if (info != NULL) + tcp_get_info(sk, info); +} + +static struct inet_diag_handler tcp_diag_handler = { + .idiag_hashinfo = &tcp_hashinfo, + .idiag_get_info = tcp_diag_get_info, + .idiag_type = TCPDIAG_GETSOCK, + .idiag_info_size = sizeof(struct tcp_info), +}; + +static int __init tcp_diag_init(void) +{ + return inet_diag_register(&tcp_diag_handler); +} + +static void __exit tcp_diag_exit(void) +{ + inet_diag_unregister(&tcp_diag_handler); +} + +module_init(tcp_diag_init); +module_exit(tcp_diag_exit); +MODULE_LICENSE("GPL"); -- cgit v1.2.3 From 9d810fd2d28a9d672eca3136476af1a54a380bb2 Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Sat, 13 Aug 2005 13:56:26 -0700 Subject: [NETFILTER]: Add new iptables "connbytes" match This patch adds a new "connbytes" match that utilizes the CONFIG_IP_NF_CT_ACCT per-connection byte and packet counters. Using it you can do things like packet classification on average packet size within a connection. Signed-off-by: Harald Welte Signed-off-by: David S. Miller --- net/ipv4/netfilter/Kconfig | 11 ++- net/ipv4/netfilter/Makefile | 1 + net/ipv4/netfilter/ipt_connbytes.c | 166 +++++++++++++++++++++++++++++++++++++ 3 files changed, 177 insertions(+), 1 deletion(-) create mode 100644 net/ipv4/netfilter/ipt_connbytes.c (limited to 'net/ipv4')
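As a hedged illustration of what the average-packet-size mode of this match computes, the user-space sketch below mirrors the IPT_CONNBYTES_WHAT_AVGPKT logic from the file added by this patch; plain 64-bit division stands in for the kernel's div64_64() helper, the zero-packet guard exists only for the sketch, and all names are illustrative:

#include <stdint.h>
#include <stdio.h>

static int connbytes_avgpkt_match(uint64_t bytes, uint64_t pkts,
				  uint64_t from, uint64_t to)
{
	uint64_t avg;

	if (pkts == 0)			/* guard for the sketch only */
		return 0;
	avg = bytes / pkts;		/* the kernel uses div64_64() here */
	if (to)				/* bounded range, as in the match below */
		return avg >= from && avg <= to;
	return avg >= from;		/* open-ended upper bound */
}

int main(void)
{
	/* 150000 bytes over 100 packets: average 1500, inside [1000, 2000] */
	printf("%d\n", connbytes_avgpkt_match(150000, 100, 1000, 2000));
	return 0;
}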
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 9f5e1d769b5..3f7e6e49cbd 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -386,6 +386,16 @@ config IP_NF_MATCH_CONNMARK . The module will be called ipt_connmark.o. If unsure, say `N'. +config IP_NF_MATCH_CONNBYTES + tristate 'Connection byte/packet counter match support' + depends on IP_NF_CT_ACCT && IP_NF_IPTABLES + help + This option adds a `connbytes' match, which allows you to match the + number of bytes and/or packets for each direction within a connection. + + If you want to compile it as a module, say M here and read + . If unsure, say `N'. + config IP_NF_MATCH_HASHLIMIT tristate 'hashlimit match support' depends on IP_NF_IPTABLES @@ -723,6 +733,5 @@ config IP_NF_CONNTRACK_NETLINK help This option enables support for a netlink-based userspace interface - endmenu diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 58aa7c616e1..7c8ae858aa4 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -59,6 +59,7 @@ obj-$(CONFIG_IP_NF_MATCH_TTL) += ipt_ttl.o obj-$(CONFIG_IP_NF_MATCH_STATE) += ipt_state.o obj-$(CONFIG_IP_NF_MATCH_CONNMARK) += ipt_connmark.o obj-$(CONFIG_IP_NF_MATCH_CONNTRACK) += ipt_conntrack.o +obj-$(CONFIG_IP_NF_MATCH_CONNBYTES) += ipt_connbytes.o obj-$(CONFIG_IP_NF_MATCH_TCPMSS) += ipt_tcpmss.o obj-$(CONFIG_IP_NF_MATCH_REALM) += ipt_realm.o obj-$(CONFIG_IP_NF_MATCH_ADDRTYPE) += ipt_addrtype.o diff --git a/net/ipv4/netfilter/ipt_connbytes.c b/net/ipv4/netfilter/ipt_connbytes.c new file mode 100644 index 00000000000..0dfb52c0e80 --- /dev/null +++ b/net/ipv4/netfilter/ipt_connbytes.c @@ -0,0 +1,166 @@ +/* Kernel module to match connection tracking byte counter. + * GPL (C) 2002 Martin Devera (devik@cdi.cz). + * + * 2004-07-20 Harald Welte + * - reimplemented to use per-connection accounting counters + * - add functionality to match number of packets + * - add functionality to match average packet size + * - add support to match directions separately + * + */ +#include +#include +#include +#include +#include + +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Harald Welte "); +MODULE_DESCRIPTION("iptables match for matching number of pkts/bytes per connection"); + +/* 64bit divisor, dividend and result. dynamic precision */ +static u_int64_t div64_64(u_int64_t divisor, u_int64_t dividend) +{ + u_int64_t result = divisor; + + if (dividend > 0xffffffff) { + int first_bit = find_first_bit((unsigned long *) &dividend, sizeof(dividend)); + /* calculate number of bits to shift. shift exactly enough + * bits to make dividend fit in 32bits. */ + int num_shift = (64 - 32 - first_bit); + /* first bit has to be < 32, since dividend was > 0xffffffff */ + result = result >> num_shift; + dividend = dividend >> num_shift; + } + + do_div(divisor, dividend); + + return divisor; +} + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + int *hotdrop) +{ + const struct ipt_connbytes_info *sinfo = matchinfo; + enum ip_conntrack_info ctinfo; + struct ip_conntrack *ct; + u_int64_t what = 0; /* initialize to make gcc happy */ + + if (!(ct = ip_conntrack_get((struct sk_buff *)skb, &ctinfo))) + return 0; /* no match */ + + switch (sinfo->what) { + case IPT_CONNBYTES_WHAT_PKTS: + switch (sinfo->direction) { + case IPT_CONNBYTES_DIR_ORIGINAL: + what = ct->counters[IP_CT_DIR_ORIGINAL].packets; + break; + case IPT_CONNBYTES_DIR_REPLY: + what = ct->counters[IP_CT_DIR_REPLY].packets; + break; + case IPT_CONNBYTES_DIR_BOTH: + what = ct->counters[IP_CT_DIR_ORIGINAL].packets; + what += ct->counters[IP_CT_DIR_REPLY].packets; + break; + } + break; + case IPT_CONNBYTES_WHAT_BYTES: + switch (sinfo->direction) { + case IPT_CONNBYTES_DIR_ORIGINAL: + what = ct->counters[IP_CT_DIR_ORIGINAL].bytes; + break; + case IPT_CONNBYTES_DIR_REPLY: + what = ct->counters[IP_CT_DIR_REPLY].bytes; + break; + case IPT_CONNBYTES_DIR_BOTH: + what = ct->counters[IP_CT_DIR_ORIGINAL].bytes; + what += ct->counters[IP_CT_DIR_REPLY].bytes; + break; + } + break; + case IPT_CONNBYTES_WHAT_AVGPKT: + switch (sinfo->direction) { + case IPT_CONNBYTES_DIR_ORIGINAL: + what = div64_64(ct->counters[IP_CT_DIR_ORIGINAL].bytes, + ct->counters[IP_CT_DIR_ORIGINAL].packets); + break; + case IPT_CONNBYTES_DIR_REPLY: + what = div64_64(ct->counters[IP_CT_DIR_REPLY].bytes, + ct->counters[IP_CT_DIR_REPLY].packets); + break; + case IPT_CONNBYTES_DIR_BOTH: + { + u_int64_t bytes; + u_int64_t pkts; + bytes = ct->counters[IP_CT_DIR_ORIGINAL].bytes + + ct->counters[IP_CT_DIR_REPLY].bytes; + pkts = ct->counters[IP_CT_DIR_ORIGINAL].packets+ + ct->counters[IP_CT_DIR_REPLY].packets; + + /* FIXME_THEORETICAL: what to do if sum + * overflows ?
*/ + + what = div64_64(bytes, pkts); + } + break; + } + break; + } + + if (sinfo->count.to) + return (what <= sinfo->count.to && what >= sinfo->count.from); + else + return (what >= sinfo->count.from); +} + +static int check(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + const struct ipt_connbytes_info *sinfo = matchinfo; + + if (matchsize != IPT_ALIGN(sizeof(struct ipt_connbytes_info))) + return 0; + + if (sinfo->what != IPT_CONNBYTES_WHAT_PKTS && + sinfo->what != IPT_CONNBYTES_WHAT_BYTES && + sinfo->what != IPT_CONNBYTES_WHAT_AVGPKT) + return 0; + + if (sinfo->direction != IPT_CONNBYTES_DIR_ORIGINAL && + sinfo->direction != IPT_CONNBYTES_DIR_REPLY && + sinfo->direction != IPT_CONNBYTES_DIR_BOTH) + return 0; + + return 1; +} + +static struct ipt_match state_match = { + .name = "connbytes", + .match = &match, + .checkentry = &check, + .me = THIS_MODULE +}; + +static int __init init(void) +{ + return ipt_register_match(&state_match); +} + +static void __exit fini(void) +{ + ipt_unregister_match(&state_match); +} + +module_init(init); +module_exit(fini); -- cgit v1.2.3 From 8ffde671730df0b392ca478643b88ef7153244c0 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Sat, 13 Aug 2005 13:57:58 -0700 Subject: [NETFILTER]: Fix div64_64 in ipt_connbytes Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/ipt_connbytes.c | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ipt_connbytes.c b/net/ipv4/netfilter/ipt_connbytes.c index 0dfb52c0e80..47128c073d8 100644 --- a/net/ipv4/netfilter/ipt_connbytes.c +++ b/net/ipv4/netfilter/ipt_connbytes.c @@ -22,23 +22,19 @@ MODULE_AUTHOR("Harald Welte "); MODULE_DESCRIPTION("iptables match for matching number of pkts/bytes per connection"); /* 64bit divisor, dividend and result. dynamic precision */ -static u_int64_t div64_64(u_int64_t divisor, u_int64_t dividend) +static u_int64_t div64_64(u_int64_t dividend, u_int64_t divisor) { - u_int64_t result = divisor; - - if (dividend > 0xffffffff) { - int first_bit = find_first_bit((unsigned long *) ÷nd, sizeof(dividend)); - /* calculate number of bits to shift. shift exactly enough - * bits to make dividend fit in 32bits. */ - int num_shift = (64 - 32 - first_bit); - /* first bit has to be < 32, since dividend was > 0xffffffff */ - result = result >> num_shift; - dividend = dividend >> num_shift; - } + u_int32_t d = divisor; + + if (divisor > 0xffffffffULL) { + unsigned int shift = fls(divisor >> 32); - do_div(divisor, dividend); + d = divisor >> shift; + dividend >>= shift; + } - return divisor; + do_div(dividend, d); + return dividend; } static int -- cgit v1.2.3 From 25ed891019b84498c83903ecf53df7ce35e9cff6 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Sat, 13 Aug 2005 13:58:21 -0700 Subject: [NETFILTER]: Nicer names for ipt_connbytes constants Signed-off-by: Patrick McHardy Signed-off-by: David S. 
Miller --- net/ipv4/netfilter/ipt_connbytes.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ipt_connbytes.c b/net/ipv4/netfilter/ipt_connbytes.c index 47128c073d8..df4a42c6da2 100644 --- a/net/ipv4/netfilter/ipt_connbytes.c +++ b/net/ipv4/netfilter/ipt_connbytes.c @@ -54,7 +54,7 @@ match(const struct sk_buff *skb, return 0; /* no match */ switch (sinfo->what) { - case IPT_CONNBYTES_WHAT_PKTS: + case IPT_CONNBYTES_PKTS: switch (sinfo->direction) { case IPT_CONNBYTES_DIR_ORIGINAL: what = ct->counters[IP_CT_DIR_ORIGINAL].packets; @@ -68,7 +68,7 @@ match(const struct sk_buff *skb, break; } break; - case IPT_CONNBYTES_WHAT_BYTES: + case IPT_CONNBYTES_BYTES: switch (sinfo->direction) { case IPT_CONNBYTES_DIR_ORIGINAL: what = ct->counters[IP_CT_DIR_ORIGINAL].bytes; @@ -82,7 +82,7 @@ match(const struct sk_buff *skb, break; } break; - case IPT_CONNBYTES_WHAT_AVGPKT: + case IPT_CONNBYTES_AVGPKT: switch (sinfo->direction) { case IPT_CONNBYTES_DIR_ORIGINAL: what = div64_64(ct->counters[IP_CT_DIR_ORIGINAL].bytes, @@ -128,9 +128,9 @@ static int check(const char *tablename, if (matchsize != IPT_ALIGN(sizeof(struct ipt_connbytes_info))) return 0; - if (sinfo->what != IPT_CONNBYTES_WHAT_PKTS && - sinfo->what != IPT_CONNBYTES_WHAT_BYTES && - sinfo->what != IPT_CONNBYTES_WHAT_AVGPKT) + if (sinfo->what != IPT_CONNBYTES_PKTS && + sinfo->what != IPT_CONNBYTES_BYTES && + sinfo->what != IPT_CONNBYTES_AVGPKT) return 0; if (sinfo->direction != IPT_CONNBYTES_DIR_ORIGINAL && -- cgit v1.2.3 From a61bbcf28a8cb0ba56f8193d512f7222e711a294 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Sun, 14 Aug 2005 17:24:31 -0700 Subject: [NET]: Store skb->timestamp as offset to a base timestamp Reduces skb size by 8 bytes on 64-bit. Signed-off-by: Patrick McHardy Signed-off-by: David S. 
Miller --- net/ipv4/arp.c | 4 +++- net/ipv4/ip_fragment.c | 4 ++-- net/ipv4/netfilter/ip_queue.c | 4 ++-- net/ipv4/netfilter/ipt_ULOG.c | 8 ++++---- net/ipv4/tcp_input.c | 10 +++++++--- net/ipv4/tcp_output.c | 4 ++-- 6 files changed, 20 insertions(+), 14 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 6eb9c549d64..8bf312bdea1 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -865,7 +865,7 @@ static int arp_process(struct sk_buff *skb) if (n) neigh_release(n); - if (skb->stamp.tv_sec == LOCALLY_ENQUEUED || + if (NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED || skb->pkt_type == PACKET_HOST || in_dev->arp_parms->proxy_delay == 0) { arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha); @@ -948,6 +948,8 @@ int arp_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) goto out_of_mem; + memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb)); + return NF_HOOK(NF_ARP, NF_ARP_IN, skb, dev, NULL, arp_process); freeskb: diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 1ac64c0c5b3..9e6e683cc34 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -533,7 +533,7 @@ static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb) if (skb->dev) qp->iif = skb->dev->ifindex; skb->dev = NULL; - qp->stamp = skb->stamp; + skb_get_timestamp(skb, &qp->stamp); qp->meat += skb->len; atomic_add(skb->truesize, &ip_frag_mem); if (offset == 0) @@ -615,7 +615,7 @@ static struct sk_buff *ip_frag_reasm(struct ipq *qp, struct net_device *dev) head->next = NULL; head->dev = dev; - head->stamp = qp->stamp; + skb_set_timestamp(head, &qp->stamp); iph = head->nh.iph; iph->frag_off = 0; diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c index 1c49833e00a..7f2bcc7198f 100644 --- a/net/ipv4/netfilter/ip_queue.c +++ b/net/ipv4/netfilter/ip_queue.c @@ -240,8 +240,8 @@ ipq_build_packet_message(struct ipq_queue_entry *entry, int *errp) pmsg->packet_id = (unsigned long )entry; pmsg->data_len = data_len; - pmsg->timestamp_sec = entry->skb->stamp.tv_sec; - pmsg->timestamp_usec = entry->skb->stamp.tv_usec; + pmsg->timestamp_sec = skb_tv_base.tv_sec + entry->skb->tstamp.off_sec; + pmsg->timestamp_usec = skb_tv_base.tv_usec + entry->skb->tstamp.off_usec; pmsg->mark = entry->skb->nfmark; pmsg->hook = entry->info->hook; pmsg->hw_protocol = entry->skb->protocol; diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c index b86f06ec976..1d8ac4595e1 100644 --- a/net/ipv4/netfilter/ipt_ULOG.c +++ b/net/ipv4/netfilter/ipt_ULOG.c @@ -220,13 +220,13 @@ static void ipt_ulog_packet(unsigned int hooknum, pm = NLMSG_DATA(nlh); /* We might not have a timestamp, get one */ - if (skb->stamp.tv_sec == 0) - do_gettimeofday((struct timeval *)&skb->stamp); + if (skb->tstamp.off_sec == 0) + __net_timestamp((struct sk_buff *)skb); /* copy hook, prefix, timestamp, payload, etc. 
*/ pm->data_len = copy_len; - pm->timestamp_sec = skb->stamp.tv_sec; - pm->timestamp_usec = skb->stamp.tv_usec; + pm->timestamp_sec = skb_tv_base.tv_sec + skb->tstamp.off_sec; + pm->timestamp_usec = skb_tv_base.tv_usec + skb->tstamp.off_usec; pm->mark = skb->nfmark; pm->hook = hooknum; if (prefix != NULL) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index fdd9547fb78..ebb8654e3de 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2097,9 +2097,13 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, s32 *seq_usrtt seq_rtt = -1; } else if (seq_rtt < 0) seq_rtt = now - scb->when; - if (seq_usrtt) - *seq_usrtt = (usnow.tv_sec - skb->stamp.tv_sec) * 1000000 - + (usnow.tv_usec - skb->stamp.tv_usec); + if (seq_usrtt) { + struct timeval tv; + + skb_get_timestamp(skb, &tv); + *seq_usrtt = (usnow.tv_sec - tv.tv_sec) * 1000000 + + (usnow.tv_usec - tv.tv_usec); + } if (sacked & TCPCB_SACKED_ACKED) tp->sacked_out -= tcp_skb_pcount(skb); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 267b0fcbfc9..8d92ab562ae 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -282,7 +282,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) /* If congestion control is doing timestamping */ if (icsk->icsk_ca_ops->rtt_sample) - do_gettimeofday(&skb->stamp); + __net_timestamp(skb); sysctl_flags = 0; if (tcb->flags & TCPCB_FLAG_SYN) { @@ -483,7 +483,7 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned * skbs, which it never sent before. --ANK */ TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when; - buff->stamp = skb->stamp; + buff->tstamp = skb->tstamp; if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) { tp->lost_out -= tcp_skb_pcount(skb); -- cgit v1.2.3 From 9baa5c67ff4ce57b6b9f68c90714a1bb876fccd7 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Sun, 14 Aug 2005 17:32:50 -0700 Subject: [NETFILTER]: Don't exclude local packets from MASQUERADING Increases consistency in source-address selection. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/ipt_MASQUERADE.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c index 91e74502c3d..2f3e181c8e9 100644 --- a/net/ipv4/netfilter/ipt_MASQUERADE.c +++ b/net/ipv4/netfilter/ipt_MASQUERADE.c @@ -86,11 +86,6 @@ masquerade_target(struct sk_buff **pskb, IP_NF_ASSERT(hooknum == NF_IP_POST_ROUTING); - /* FIXME: For the moment, don't do local packets, breaks - testsuite for 2.3.49 --RR */ - if ((*pskb)->sk) - return NF_ACCEPT; - ct = ip_conntrack_get(*pskb, &ctinfo); IP_NF_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY)); -- cgit v1.2.3 From 000efe1d86620244b8e017429e57fab4170ab05a Mon Sep 17 00:00:00 2001 From: Gary Wayne Smith Date: Sun, 14 Aug 2005 17:33:24 -0700 Subject: [NETFILTER]: Make NETMAP target usable in OUTPUT Signed-off-by: Gary Wayne Smith Signed-off-by: Patrick McHardy Signed-off-by: David S. 
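
Before the diff, a standalone sketch of the 1:1 mapping NETMAP performs: host bits are preserved and network bits are replaced by the target range. The netmask derivation matches the module code (netmask = ~(min_ip ^ max_ip)); the helper below works on host-byte-order values purely for readability, whereas the kernel manipulates network-byte-order addresses in place:

    #include <stdio.h>
    #include <stdint.h>
    #include <arpa/inet.h>

    static uint32_t netmap_addr(uint32_t addr, uint32_t min_ip, uint32_t max_ip)
    {
        uint32_t netmask = ~(min_ip ^ max_ip);   /* as in the target code */

        return (addr & ~netmask) | (min_ip & netmask);
    }

    int main(void)
    {
        uint32_t addr   = ntohl(inet_addr("11.12.13.14"));
        uint32_t min_ip = ntohl(inet_addr("10.0.0.0"));
        uint32_t max_ip = ntohl(inet_addr("10.255.255.255"));
        uint32_t new_ip = netmap_addr(addr, min_ip, max_ip);

        printf("%u.%u.%u.%u\n", new_ip >> 24, (new_ip >> 16) & 0xff,
               (new_ip >> 8) & 0xff, new_ip & 0xff);   /* 10.12.13.14 */
        return 0;
    }

With LOCAL_OUT accepted, a rule along the lines of "iptables -t nat -A OUTPUT -d 11.0.0.0/8 -j NETMAP --to 10.0.0.0/8" (userspace NETMAP syntax) should rewrite locally generated traffic as well, not only forwarded packets.
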
Miller --- net/ipv4/netfilter/ipt_NETMAP.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ipt_NETMAP.c b/net/ipv4/netfilter/ipt_NETMAP.c index 06254b29d03..e6e7b609536 100644 --- a/net/ipv4/netfilter/ipt_NETMAP.c +++ b/net/ipv4/netfilter/ipt_NETMAP.c @@ -46,7 +46,8 @@ check(const char *tablename, DEBUGP(MODULENAME":check: size %u.\n", targinfosize); return 0; } - if (hook_mask & ~((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_POST_ROUTING))) { + if (hook_mask & ~((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_POST_ROUTING) | + (1 << NF_IP_LOCAL_OUT))) { DEBUGP(MODULENAME":check: bad hooks %x.\n", hook_mask); return 0; } @@ -76,12 +77,13 @@ target(struct sk_buff **pskb, struct ip_nat_range newrange; IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING - || hooknum == NF_IP_POST_ROUTING); + || hooknum == NF_IP_POST_ROUTING + || hooknum == NF_IP_LOCAL_OUT); ct = ip_conntrack_get(*pskb, &ctinfo); netmask = ~(mr->range[0].min_ip ^ mr->range[0].max_ip); - if (hooknum == NF_IP_PRE_ROUTING) + if (hooknum == NF_IP_PRE_ROUTING || hooknum == NF_IP_LOCAL_OUT) new_ip = (*pskb)->nh.iph->daddr & ~netmask; else new_ip = (*pskb)->nh.iph->saddr & ~netmask; -- cgit v1.2.3 From 34b4a4a624bafe089107966a6c56d2a1aca026d4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 14 Aug 2005 17:33:59 -0700 Subject: [NETFILTER]: Remove tasklist_lock abuse in ipt{,6}owner Rip out cmd/sid/pid matching since its unfixable broken and stands in the way of locking changes to tasklist_lock. Signed-off-by: Christoph Hellwig Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/ipt_owner.c | 132 +++-------------------------------------- 1 file changed, 7 insertions(+), 125 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ipt_owner.c b/net/ipv4/netfilter/ipt_owner.c index 3b9065e0638..c1889f88262 100644 --- a/net/ipv4/netfilter/ipt_owner.c +++ b/net/ipv4/netfilter/ipt_owner.c @@ -20,106 +20,6 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Marc Boucher "); MODULE_DESCRIPTION("iptables owner match"); -static int -match_comm(const struct sk_buff *skb, const char *comm) -{ - struct task_struct *g, *p; - struct files_struct *files; - int i; - - read_lock(&tasklist_lock); - do_each_thread(g, p) { - if(strncmp(p->comm, comm, sizeof(p->comm))) - continue; - - task_lock(p); - files = p->files; - if(files) { - spin_lock(&files->file_lock); - for (i=0; i < files->max_fds; i++) { - if (fcheck_files(files, i) == - skb->sk->sk_socket->file) { - spin_unlock(&files->file_lock); - task_unlock(p); - read_unlock(&tasklist_lock); - return 1; - } - } - spin_unlock(&files->file_lock); - } - task_unlock(p); - } while_each_thread(g, p); - read_unlock(&tasklist_lock); - return 0; -} - -static int -match_pid(const struct sk_buff *skb, pid_t pid) -{ - struct task_struct *p; - struct files_struct *files; - int i; - - read_lock(&tasklist_lock); - p = find_task_by_pid(pid); - if (!p) - goto out; - task_lock(p); - files = p->files; - if(files) { - spin_lock(&files->file_lock); - for (i=0; i < files->max_fds; i++) { - if (fcheck_files(files, i) == - skb->sk->sk_socket->file) { - spin_unlock(&files->file_lock); - task_unlock(p); - read_unlock(&tasklist_lock); - return 1; - } - } - spin_unlock(&files->file_lock); - } - task_unlock(p); -out: - read_unlock(&tasklist_lock); - return 0; -} - -static int -match_sid(const struct sk_buff *skb, pid_t sid) -{ - struct task_struct *g, *p; - struct file *file = skb->sk->sk_socket->file; - int i, found=0; - - 
read_lock(&tasklist_lock); - do_each_thread(g, p) { - struct files_struct *files; - if (p->signal->session != sid) - continue; - - task_lock(p); - files = p->files; - if (files) { - spin_lock(&files->file_lock); - for (i=0; i < files->max_fds; i++) { - if (fcheck_files(files, i) == file) { - found = 1; - break; - } - } - spin_unlock(&files->file_lock); - } - task_unlock(p); - if (found) - goto out; - } while_each_thread(g, p); -out: - read_unlock(&tasklist_lock); - - return found; -} - static int match(const struct sk_buff *skb, const struct net_device *in, @@ -145,24 +45,6 @@ match(const struct sk_buff *skb, return 0; } - if(info->match & IPT_OWNER_PID) { - if (!match_pid(skb, info->pid) ^ - !!(info->invert & IPT_OWNER_PID)) - return 0; - } - - if(info->match & IPT_OWNER_SID) { - if (!match_sid(skb, info->sid) ^ - !!(info->invert & IPT_OWNER_SID)) - return 0; - } - - if(info->match & IPT_OWNER_COMM) { - if (!match_comm(skb, info->comm) ^ - !!(info->invert & IPT_OWNER_COMM)) - return 0; - } - return 1; } @@ -173,6 +55,8 @@ checkentry(const char *tablename, unsigned int matchsize, unsigned int hook_mask) { + const struct ipt_owner_info *info = matchinfo; + if (hook_mask & ~((1 << NF_IP_LOCAL_OUT) | (1 << NF_IP_POST_ROUTING))) { printk("ipt_owner: only valid for LOCAL_OUT or POST_ROUTING.\n"); @@ -184,15 +68,13 @@ checkentry(const char *tablename, IPT_ALIGN(sizeof(struct ipt_owner_info))); return 0; } -#ifdef CONFIG_SMP - /* files->file_lock can not be used in a BH */ - if (((struct ipt_owner_info *)matchinfo)->match - & (IPT_OWNER_PID|IPT_OWNER_SID|IPT_OWNER_COMM)) { - printk("ipt_owner: pid, sid and command matching is broken " - "on SMP.\n"); + + if (info->match & (IPT_OWNER_PID|IPT_OWNER_SID|IPT_OWNER_COMM)) { + printk("ipt_owner: pid, sid and command matching " + "not supported anymore\n"); return 0; } -#endif + return 1; } -- cgit v1.2.3 From db080529798b497eb5a37b92a25e966be5a7dd5d Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Sun, 14 Aug 2005 19:26:34 -0700 Subject: [NETLINK]: Remove unused groups member from struct netlink_skb_parms Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/fib_frontend.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index b5e2f1550c9..75d03e37b9a 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -558,7 +558,6 @@ static void nl_fib_input(struct sock *sk, int len) nl_fib_lookup(frn, tb); pid = nlh->nlmsg_pid; /*pid of sending process */ - NETLINK_CB(skb).groups = 0; /* not in mcast group */ NETLINK_CB(skb).pid = 0; /* from kernel */ NETLINK_CB(skb).dst_pid = pid; NETLINK_CB(skb).dst_groups = 0; /* unicast */ -- cgit v1.2.3 From ac6d439d2097b72ea0cbc2322ce1263a38bc1fd0 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Sun, 14 Aug 2005 19:29:52 -0700 Subject: [NETLINK]: Convert netlink users to use group numbers instead of bitmasks Signed-off-by: Patrick McHardy Signed-off-by: David S. 
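
The convention change is easy to miss in the hunks that follow: callers used to pass a one-hot bitmask such as (1 << n) and now pass the 1-based group number itself, which is why the ULOG hunk writes nlgroupnum + 1. A small sketch of the equivalence, using POSIX ffs() to recover the group number from an old-style mask; the variable names are illustrative:

    #include <stdio.h>
    #include <strings.h>

    int main(void)
    {
        unsigned int nlgroupnum = 4;            /* 0-based index, as in ULOG */
        unsigned int old_mask = 1u << nlgroupnum;
        unsigned int group = ffs(old_mask);     /* 1-based group number: 5 */

        printf("mask=0x%x -> group=%u (nlgroupnum + 1 = %u)\n",
               old_mask, group, nlgroupnum + 1);
        return 0;
    }
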
Miller --- net/ipv4/devinet.c | 7 +++---- net/ipv4/fib_frontend.c | 2 +- net/ipv4/fib_semantics.c | 4 ++-- net/ipv4/netfilter/ip_conntrack_netlink.c | 12 ++++++------ net/ipv4/netfilter/ipt_ULOG.c | 8 ++++---- 5 files changed, 16 insertions(+), 17 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index d8a10e3dd77..ba2895ae815 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1111,13 +1111,12 @@ static void rtmsg_ifa(int event, struct in_ifaddr* ifa) struct sk_buff *skb = alloc_skb(size, GFP_KERNEL); if (!skb) - netlink_set_err(rtnl, 0, RTMGRP_IPV4_IFADDR, ENOBUFS); + netlink_set_err(rtnl, 0, RTNLGRP_IPV4_IFADDR, ENOBUFS); else if (inet_fill_ifaddr(skb, ifa, current->pid, 0, event, 0) < 0) { kfree_skb(skb); - netlink_set_err(rtnl, 0, RTMGRP_IPV4_IFADDR, EINVAL); + netlink_set_err(rtnl, 0, RTNLGRP_IPV4_IFADDR, EINVAL); } else { - NETLINK_CB(skb).dst_groups = RTMGRP_IPV4_IFADDR; - netlink_broadcast(rtnl, skb, 0, RTMGRP_IPV4_IFADDR, GFP_KERNEL); + netlink_broadcast(rtnl, skb, 0, RTNLGRP_IPV4_IFADDR, GFP_KERNEL); } } diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 75d03e37b9a..d4e7b578a25 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -560,7 +560,7 @@ static void nl_fib_input(struct sock *sk, int len) pid = nlh->nlmsg_pid; /*pid of sending process */ NETLINK_CB(skb).pid = 0; /* from kernel */ NETLINK_CB(skb).dst_pid = pid; - NETLINK_CB(skb).dst_groups = 0; /* unicast */ + NETLINK_CB(skb).dst_group = 0; /* unicast */ netlink_unicast(sk, skb, pid, MSG_DONTWAIT); } diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index e278cb9d007..7e4651b3caa 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -290,10 +290,10 @@ void rtmsg_fib(int event, u32 key, struct fib_alias *fa, kfree_skb(skb); return; } - NETLINK_CB(skb).dst_groups = RTMGRP_IPV4_ROUTE; + NETLINK_CB(skb).dst_group = RTNLGRP_IPV4_ROUTE; if (n->nlmsg_flags&NLM_F_ECHO) atomic_inc(&skb->users); - netlink_broadcast(rtnl, skb, pid, RTMGRP_IPV4_ROUTE, GFP_KERNEL); + netlink_broadcast(rtnl, skb, pid, RTNLGRP_IPV4_ROUTE, GFP_KERNEL); if (n->nlmsg_flags&NLM_F_ECHO) netlink_unicast(rtnl, skb, pid, MSG_DONTWAIT); } diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c index 1221a9c8bac..a4e9278db4e 100644 --- a/net/ipv4/netfilter/ip_conntrack_netlink.c +++ b/net/ipv4/netfilter/ip_conntrack_netlink.c @@ -297,7 +297,7 @@ static int ctnetlink_conntrack_event(struct notifier_block *this, struct sk_buff *skb; unsigned int type; unsigned char *b; - unsigned int flags = 0, groups; + unsigned int flags = 0, group; /* ignore our fake conntrack entry */ if (ct == &ip_conntrack_untracked) @@ -305,7 +305,7 @@ static int ctnetlink_conntrack_event(struct notifier_block *this, if (events & IPCT_DESTROY) { type = IPCTNL_MSG_CT_DELETE; - groups = NF_NETLINK_CONNTRACK_DESTROY; + group = NFNLGRP_CONNTRACK_DESTROY; goto alloc_skb; } if (events & (IPCT_NEW | IPCT_RELATED)) { @@ -313,7 +313,7 @@ static int ctnetlink_conntrack_event(struct notifier_block *this, flags = NLM_F_CREATE|NLM_F_EXCL; /* dump everything */ events = ~0UL; - groups = NF_NETLINK_CONNTRACK_NEW; + group = NFNLGRP_CONNTRACK_NEW; goto alloc_skb; } if (events & (IPCT_STATUS | @@ -322,7 +322,7 @@ static int ctnetlink_conntrack_event(struct notifier_block *this, IPCT_HELPINFO | IPCT_NATINFO)) { type = IPCTNL_MSG_CT_NEW; - groups = NF_NETLINK_CONNTRACK_UPDATE; + group = NFNLGRP_CONNTRACK_UPDATE; goto alloc_skb; } @@ -375,7 +375,7 @@ 
alloc_skb: goto nfattr_failure; nlh->nlmsg_len = skb->tail - b; - nfnetlink_send(skb, 0, groups, 0); + nfnetlink_send(skb, 0, group, 0); return NOTIFY_DONE; nlmsg_failure: @@ -1194,7 +1194,7 @@ static int ctnetlink_expect_event(struct notifier_block *this, nlh->nlmsg_len = skb->tail - b; proto = exp->tuple.dst.protonum; - nfnetlink_send(skb, 0, NF_NETLINK_CONNTRACK_EXP_NEW, 0); + nfnetlink_send(skb, 0, NFNLGRP_CONNTRACK_EXP_NEW, 0); return NOTIFY_DONE; nlmsg_failure: diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c index 1d8ac4595e1..89816b83455 100644 --- a/net/ipv4/netfilter/ipt_ULOG.c +++ b/net/ipv4/netfilter/ipt_ULOG.c @@ -116,10 +116,10 @@ static void ulog_send(unsigned int nlgroupnum) if (ub->qlen > 1) ub->lastnlh->nlmsg_type = NLMSG_DONE; - NETLINK_CB(ub->skb).dst_groups = (1 << nlgroupnum); - DEBUGP("ipt_ULOG: throwing %d packets to netlink mask %u\n", - ub->qlen, nlgroupnum); - netlink_broadcast(nflognl, ub->skb, 0, (1 << nlgroupnum), GFP_ATOMIC); + NETLINK_CB(ub->skb).dst_group = nlgroupnum + 1; + DEBUGP("ipt_ULOG: throwing %d packets to netlink group %u\n", + ub->qlen, nlgroupnum + 1); + netlink_broadcast(nflognl, ub->skb, 0, nlgroupnum + 1, GFP_ATOMIC); ub->qlen = 0; ub->skb = NULL; -- cgit v1.2.3 From 066286071d3542243baa68166acb779187c848b3 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 15 Aug 2005 12:33:26 -0700 Subject: [NETLINK]: Add "groups" argument to netlink_kernel_create Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/fib_frontend.c | 2 +- net/ipv4/inet_diag.c | 2 +- net/ipv4/netfilter/ip_queue.c | 2 +- net/ipv4/netfilter/ipt_ULOG.c | 3 ++- 4 files changed, 5 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index d4e7b578a25..4e1379f7126 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -566,7 +566,7 @@ static void nl_fib_input(struct sock *sk, int len) static void nl_fib_lookup_init(void) { - netlink_kernel_create(NETLINK_FIB_LOOKUP, nl_fib_input, THIS_MODULE); + netlink_kernel_create(NETLINK_FIB_LOOKUP, 0, nl_fib_input, THIS_MODULE); } static void fib_disable_ip(struct net_device *dev, int force) diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 1880ad8575d..71f3c7350c6 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -845,7 +845,7 @@ static int __init inet_diag_init(void) goto out; memset(inet_diag_table, 0, inet_diag_table_size); - idiagnl = netlink_kernel_create(NETLINK_INET_DIAG, inet_diag_rcv, + idiagnl = netlink_kernel_create(NETLINK_INET_DIAG, 0, inet_diag_rcv, THIS_MODULE); if (idiagnl == NULL) goto out_free_table; diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c index 7f2bcc7198f..d54f14d926f 100644 --- a/net/ipv4/netfilter/ip_queue.c +++ b/net/ipv4/netfilter/ip_queue.c @@ -671,7 +671,7 @@ init_or_cleanup(int init) goto cleanup; netlink_register_notifier(&ipq_nl_notifier); - ipqnl = netlink_kernel_create(NETLINK_FIREWALL, ipq_rcv_sk, + ipqnl = netlink_kernel_create(NETLINK_FIREWALL, 0, ipq_rcv_sk, THIS_MODULE); if (ipqnl == NULL) { printk(KERN_ERR "ip_queue: failed to create netlink socket\n"); diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c index 89816b83455..e2c14f3cb2f 100644 --- a/net/ipv4/netfilter/ipt_ULOG.c +++ b/net/ipv4/netfilter/ipt_ULOG.c @@ -388,7 +388,8 @@ static int __init init(void) ulog_buffers[i].timer.data = i; } - nflognl = netlink_kernel_create(NETLINK_NFLOG, NULL, THIS_MODULE); + nflognl = 
netlink_kernel_create(NETLINK_NFLOG, ULOG_MAXNLGROUPS, NULL, + THIS_MODULE); if (!nflognl) return -ENOMEM; -- cgit v1.2.3 From 20380731bc2897f2952ae055420972ded4cd786e Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 16 Aug 2005 02:18:02 -0300 Subject: [NET]: Fix sparse warnings Of this type, mostly: CHECK net/ipv6/netfilter.c net/ipv6/netfilter.c:96:12: warning: symbol 'ipv6_netfilter_init' was not declared. Should it be static? net/ipv6/netfilter.c:101:6: warning: symbol 'ipv6_netfilter_fini' was not declared. Should it be static? Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- net/ipv4/af_inet.c | 14 -------------- net/ipv4/datagram.c | 1 + net/ipv4/inetpeer.c | 1 + net/ipv4/ip_sockglue.c | 2 -- net/ipv4/proc.c | 3 --- net/ipv4/syncookies.c | 2 -- net/ipv4/sysctl_net_ipv4.c | 43 +++++++------------------------------------ net/ipv4/tcp_input.c | 2 +- net/ipv4/tcp_ipv4.c | 2 +- 9 files changed, 11 insertions(+), 59 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 20f52b5f5de..5810f9d1491 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -859,10 +859,6 @@ static struct net_proto_family inet_family_ops = { .owner = THIS_MODULE, }; - -extern void tcp_init(void); -extern void tcp_v4_init(struct net_proto_family *); - /* Upon startup we insert all the elements in inetsw_array[] into * the linked list inetsw. */ @@ -1132,7 +1128,6 @@ static int __init init_ipv4_mibs(void) } static int ipv4_proc_init(void); -extern void ipfrag_init(void); /* * IP protocol layer initialiser @@ -1253,19 +1248,10 @@ module_init(inet_init); /* ------------------------------------------------------------------------ */ #ifdef CONFIG_PROC_FS -extern int fib_proc_init(void); -extern void fib_proc_exit(void); #ifdef CONFIG_IP_FIB_TRIE extern int fib_stat_proc_init(void); extern void fib_stat_proc_exit(void); #endif -extern int ip_misc_proc_init(void); -extern int raw_proc_init(void); -extern void raw_proc_exit(void); -extern int tcp4_proc_init(void); -extern void tcp4_proc_exit(void); -extern int udp4_proc_init(void); -extern void udp4_proc_exit(void); static int __init ipv4_proc_init(void) { diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c index 3fd49f4282a..c1b42b5257f 100644 --- a/net/ipv4/datagram.c +++ b/net/ipv4/datagram.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index 3c513ceaca7..4410b9dc03e 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -20,6 +20,7 @@ #include #include #include +#include #include /* diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index ddb1aedbdc6..aca088b3707 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -614,7 +614,6 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval, } case IP_MSFILTER: { - extern int sysctl_optmem_max; extern int sysctl_igmp_max_msf; struct ip_msfilter *msf; @@ -769,7 +768,6 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval, } case MCAST_MSFILTER: { - extern int sysctl_optmem_max; extern int sysctl_igmp_max_msf; struct sockaddr_in *psin; struct ip_msfilter *msf = NULL; diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 3eadbb27187..f7943ba1f43 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -59,9 +59,6 @@ static int fold_prot_inuse(struct proto *proto) */ static int sockstat_seq_show(struct seq_file *seq, void *v) { - /* From net/socket.c */ - extern void 
socket_seq_show(struct seq_file *seq); - socket_seq_show(seq); seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %d\n", fold_prot_inuse(&tcp_prot), atomic_read(&tcp_orphan_count), diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 8692cb9d4bd..a34e60ea48a 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -169,8 +169,6 @@ static inline int cookie_check(struct sk_buff *skb, __u32 cookie) return mssind < NUM_MSS ? msstab[mssind] + 1 : 0; } -extern struct request_sock_ops tcp_request_sock_ops; - static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct dst_entry *dst) diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index ce47a345ecc..65268562351 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -11,7 +11,9 @@ #include #include #include +#include #include +#include #include #include #include @@ -19,36 +21,6 @@ /* From af_inet.c */ extern int sysctl_ip_nonlocal_bind; -/* From icmp.c */ -extern int sysctl_icmp_echo_ignore_all; -extern int sysctl_icmp_echo_ignore_broadcasts; -extern int sysctl_icmp_ignore_bogus_error_responses; -extern int sysctl_icmp_errors_use_inbound_ifaddr; - -/* From ip_fragment.c */ -extern int sysctl_ipfrag_low_thresh; -extern int sysctl_ipfrag_high_thresh; -extern int sysctl_ipfrag_time; -extern int sysctl_ipfrag_secret_interval; - -/* From ip_output.c */ -extern int sysctl_ip_dynaddr; - -/* From icmp.c */ -extern int sysctl_icmp_ratelimit; -extern int sysctl_icmp_ratemask; - -/* From igmp.c */ -extern int sysctl_igmp_max_memberships; -extern int sysctl_igmp_max_msf; - -/* From inetpeer.c */ -extern int inet_peer_threshold; -extern int inet_peer_minttl; -extern int inet_peer_maxttl; -extern int inet_peer_gc_mintime; -extern int inet_peer_gc_maxtime; - #ifdef CONFIG_SYSCTL static int tcp_retr1_max = 255; static int ip_local_port_range_min[] = { 1, 1 }; @@ -57,8 +29,6 @@ static int ip_local_port_range_max[] = { 65535, 65535 }; struct ipv4_config ipv4_config; -extern ctl_table ipv4_route_table[]; - #ifdef CONFIG_SYSCTL static @@ -136,10 +106,11 @@ static int proc_tcp_congestion_control(ctl_table *ctl, int write, struct file * return ret; } -int sysctl_tcp_congestion_control(ctl_table *table, int __user *name, int nlen, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen, - void **context) +static int sysctl_tcp_congestion_control(ctl_table *table, int __user *name, + int nlen, void __user *oldval, + size_t __user *oldlenp, + void __user *newval, size_t newlen, + void **context) { char val[TCP_CA_NAME_MAX]; ctl_table tbl = { diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index ebb8654e3de..1afb080bdf0 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4229,7 +4229,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, */ if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && !tp->srtt) - tcp_ack_saw_tstamp(sk, 0, 0); + tcp_ack_saw_tstamp(sk, NULL, 0); if (tp->rx_opt.tstamp_ok) tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 97bbf595230..13dfb391cdf 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -66,6 +66,7 @@ #include #include #include +#include #include #include #include @@ -76,7 +77,6 @@ #include #include -extern int sysctl_ip_dynaddr; int sysctl_tcp_tw_reuse; int sysctl_tcp_low_latency; -- cgit v1.2.3 From 4c6ea29d82e0d1b9b37e6b879e0a7fd6c409333d Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo 
Date: Tue, 16 Aug 2005 19:46:48 -0300 Subject: [IP]: Introduce ip_options_get_from_user This variant is needed to satisfy sparse __user annotations. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- net/ipv4/ip_options.c | 49 ++++++++++++++++++++++++++++++++++--------------- net/ipv4/ip_sockglue.c | 4 ++-- 2 files changed, 36 insertions(+), 17 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index 7e02ba58407..bce4e875193 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -489,23 +489,18 @@ void ip_options_undo(struct ip_options * opt) } } -int ip_options_get(struct ip_options **optp, unsigned char *data, int optlen, int user) +static struct ip_options *ip_options_get_alloc(const int optlen) { - struct ip_options *opt; + struct ip_options *opt = kmalloc(sizeof(*opt) + ((optlen + 3) & ~3), + GFP_KERNEL); + if (opt) + memset(opt, 0, sizeof(*opt)); + return opt; +} - opt = kmalloc(sizeof(struct ip_options)+((optlen+3)&~3), GFP_KERNEL); - if (!opt) - return -ENOMEM; - memset(opt, 0, sizeof(struct ip_options)); - if (optlen) { - if (user) { - if (copy_from_user(opt->__data, data, optlen)) { - kfree(opt); - return -EFAULT; - } - } else - memcpy(opt->__data, data, optlen); - } +static int ip_options_get_finish(struct ip_options **optp, + struct ip_options *opt, int optlen) +{ while (optlen & 3) opt->__data[optlen++] = IPOPT_END; opt->optlen = optlen; @@ -521,6 +516,30 @@ int ip_options_get(struct ip_options **optp, unsigned char *data, int optlen, in return 0; } +int ip_options_get_from_user(struct ip_options **optp, unsigned char __user *data, int optlen) +{ + struct ip_options *opt = ip_options_get_alloc(optlen); + + if (!opt) + return -ENOMEM; + if (optlen && copy_from_user(opt->__data, data, optlen)) { + kfree(opt); + return -EFAULT; + } + return ip_options_get_finish(optp, opt, optlen); +} + +int ip_options_get(struct ip_options **optp, unsigned char *data, int optlen) +{ + struct ip_options *opt = ip_options_get_alloc(optlen); + + if (!opt) + return -ENOMEM; + if (optlen) + memcpy(opt->__data, data, optlen); + return ip_options_get_finish(optp, opt, optlen); +} + void ip_forward_options(struct sk_buff *skb) { struct ip_options * opt = &(IPCB(skb)->opt); diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index aca088b3707..2f0b47da5b3 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -153,7 +153,7 @@ int ip_cmsg_send(struct msghdr *msg, struct ipcm_cookie *ipc) switch (cmsg->cmsg_type) { case IP_RETOPTS: err = cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr)); - err = ip_options_get(&ipc->opt, CMSG_DATA(cmsg), err < 40 ? err : 40, 0); + err = ip_options_get(&ipc->opt, CMSG_DATA(cmsg), err < 40 ? err : 40); if (err) return err; break; @@ -425,7 +425,7 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval, struct ip_options * opt = NULL; if (optlen > 40 || optlen < 0) goto e_inval; - err = ip_options_get(&opt, optval, optlen, 1); + err = ip_options_get_from_user(&opt, optval, optlen); if (err) break; if (sk->sk_type == SOCK_STREAM) { -- cgit v1.2.3 From ba602a816132dcc66e875dddf2c62512a9f6f8cb Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 16 Aug 2005 20:50:16 -0700 Subject: [IPVS]: Rename tcp_{init,exit}() --> ip_vs_tcp_{init,exit}() Conflicts with global namespace functions with the same name. Signed-off-by: David S. 
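
The ip_options_get() split above factors one allocation step and one finishing step out of the two copy paths, so the __user and kernel-buffer variants differ only in how the bytes arrive. A userspace sketch of the same shape, with memcpy() standing in for copy_from_user() and all names invented for illustration (the 40-byte cap mirrors the IP options limit checked in ip_setsockopt):

    #include <stdlib.h>
    #include <string.h>
    #include <errno.h>

    struct opts { int len; unsigned char data[40]; };  /* 40 = IP options max */

    static struct opts *opts_alloc(void)
    {
        return calloc(1, sizeof(struct opts));
    }

    static int opts_finish(struct opts **optp, struct opts *opt, int len)
    {
        while (len & 3)
            opt->data[len++] = 0;   /* pad to a multiple of 4; IPOPT_END is 0 */
        opt->len = len;
        *optp = opt;
        return 0;
    }

    static int opts_get(struct opts **optp, const unsigned char *data, int len)
    {
        struct opts *opt;

        if (len < 0 || len > 40)
            return -EINVAL;
        opt = opts_alloc();
        if (!opt)
            return -ENOMEM;
        memcpy(opt->data, data, len); /* copy_from_user() in the __user variant */
        return opts_finish(optp, opt, len);
    }

    int main(void)
    {
        struct opts *o = NULL;
        unsigned char raw[3] = { 1, 2, 3 };
        int err = opts_get(&o, raw, sizeof(raw));

        free(o);
        return err;
    }
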
Miller --- net/ipv4/ipvs/ip_vs_proto_tcp.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c index e65de675da7..c19408973c0 100644 --- a/net/ipv4/ipvs/ip_vs_proto_tcp.c +++ b/net/ipv4/ipvs/ip_vs_proto_tcp.c @@ -604,14 +604,14 @@ void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp) } -static void tcp_init(struct ip_vs_protocol *pp) +static void ip_vs_tcp_init(struct ip_vs_protocol *pp) { IP_VS_INIT_HASH_TABLE(tcp_apps); pp->timeout_table = tcp_timeouts; } -static void tcp_exit(struct ip_vs_protocol *pp) +static void ip_vs_tcp_exit(struct ip_vs_protocol *pp) { } @@ -621,8 +621,8 @@ struct ip_vs_protocol ip_vs_protocol_tcp = { .protocol = IPPROTO_TCP, .dont_defrag = 0, .appcnt = ATOMIC_INIT(0), - .init = tcp_init, - .exit = tcp_exit, + .init = ip_vs_tcp_init, + .exit = ip_vs_tcp_exit, .register_app = tcp_register_app, .unregister_app = tcp_unregister_app, .conn_schedule = tcp_conn_schedule, -- cgit v1.2.3 From d179cd12928443f3ec29cfbc3567439644bd0afc Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 17 Aug 2005 14:57:30 -0700 Subject: [NET]: Implement SKB fast cloning. Protocols that make extensive use of SKB cloning, for example TCP, eat at least 2 allocations per packet sent as a result. To cut the kmalloc() count in half, we implement a pre-allocation scheme wherein we allocate 2 sk_buff objects in advance, then use a simple reference count to free up the memory at the correct time. Based upon an initial patch by Thomas Graf and suggestions from Herbert Xu. Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 8d92ab562ae..75b68116682 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1582,7 +1582,7 @@ void tcp_send_fin(struct sock *sk) } else { /* Socket is locked, keep trying until memory is available. */ for (;;) { - skb = alloc_skb(MAX_TCP_HEADER, GFP_KERNEL); + skb = alloc_skb_fclone(MAX_TCP_HEADER, GFP_KERNEL); if (skb) break; yield(); @@ -1804,7 +1804,7 @@ int tcp_connect(struct sock *sk) tcp_connect_init(sk); - buff = alloc_skb(MAX_TCP_HEADER + 15, sk->sk_allocation); + buff = alloc_skb_fclone(MAX_TCP_HEADER + 15, sk->sk_allocation); if (unlikely(buff == NULL)) return -ENOBUFS; -- cgit v1.2.3 From bf0ff9e578ba7dd8331005f00ad7310122011f43 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Fri, 19 Aug 2005 16:37:30 -0700 Subject: [IPVS]: ipv4_table --> ipvs_ipv4_table Fix conflict with symbol of same name in global namespace. Signed-off-by: David S. 
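
A few commits back, the SKB fast-clone patch pre-allocates a second buffer next to the first and shares a reference count between them, so the first clone needs no allocator call. The toy scheme below follows that description; the layout and names are invented for illustration and do not reflect the real sk_buff arrangement:

    #include <stdlib.h>
    #include <stddef.h>

    struct buf {
        int fclone;   /* 0 = plain, 1 = pre-allocated parent, 2 = its child */
    };

    struct buf_fclones {
        struct buf b1;
        struct buf b2;
        int ref;      /* pair is freed when this drops to zero */
    };

    static struct buf *buf_alloc_fclone(void)
    {
        struct buf_fclones *pair = calloc(1, sizeof(*pair));

        if (!pair)
            return NULL;
        pair->b1.fclone = 1;
        pair->ref = 1;
        return &pair->b1;
    }

    static struct buf *buf_clone(struct buf *b)
    {
        struct buf_fclones *pair = (struct buf_fclones *)b;

        if (b->fclone == 1 && pair->b2.fclone == 0) {
            pair->b2.fclone = 2;    /* hand out the prebuilt sibling */
            pair->ref++;
            return &pair->b2;
        }
        return calloc(1, sizeof(struct buf));  /* slow path: fresh allocation */
    }

    static void buf_free(struct buf *b)
    {
        struct buf_fclones *pair;

        if (b->fclone == 0) {       /* ordinary buffer */
            free(b);
            return;
        }
        pair = (b->fclone == 1) ? (struct buf_fclones *)b
             : (struct buf_fclones *)((char *)b - offsetof(struct buf_fclones, b2));
        if (--pair->ref == 0)
            free(pair);
    }

    int main(void)
    {
        struct buf *b = buf_alloc_fclone();
        struct buf *c;

        if (!b)
            return 1;
        c = buf_clone(b);           /* reuses the prebuilt sibling */
        buf_free(b);
        buf_free(c);                /* ref hits zero, pair freed here */
        return 0;
    }
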
Miller --- net/ipv4/ipvs/ip_vs_ctl.c | 4 ++-- net/ipv4/ipvs/ip_vs_lblc.c | 4 ++-- net/ipv4/ipvs/ip_vs_lblcr.c | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c index 7d99ede2ef7..2d66848e7aa 100644 --- a/net/ipv4/ipvs/ip_vs_ctl.c +++ b/net/ipv4/ipvs/ip_vs_ctl.c @@ -1598,7 +1598,7 @@ static ctl_table vs_table[] = { { .ctl_name = 0 } }; -static ctl_table ipv4_table[] = { +static ctl_table ipvs_ipv4_table[] = { { .ctl_name = NET_IPV4, .procname = "ipv4", @@ -1613,7 +1613,7 @@ static ctl_table vs_root_table[] = { .ctl_name = CTL_NET, .procname = "net", .mode = 0555, - .child = ipv4_table, + .child = ipvs_ipv4_table, }, { .ctl_name = 0 } }; diff --git a/net/ipv4/ipvs/ip_vs_lblc.c b/net/ipv4/ipvs/ip_vs_lblc.c index c035838b780..561cda326fa 100644 --- a/net/ipv4/ipvs/ip_vs_lblc.c +++ b/net/ipv4/ipvs/ip_vs_lblc.c @@ -131,7 +131,7 @@ static ctl_table vs_table[] = { { .ctl_name = 0 } }; -static ctl_table ipv4_table[] = { +static ctl_table ipvs_ipv4_table[] = { { .ctl_name = NET_IPV4, .procname = "ipv4", @@ -146,7 +146,7 @@ static ctl_table lblc_root_table[] = { .ctl_name = CTL_NET, .procname = "net", .mode = 0555, - .child = ipv4_table + .child = ipvs_ipv4_table }, { .ctl_name = 0 } }; diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c index 22b5dd55d27..ce456dbf09a 100644 --- a/net/ipv4/ipvs/ip_vs_lblcr.c +++ b/net/ipv4/ipvs/ip_vs_lblcr.c @@ -320,7 +320,7 @@ static ctl_table vs_table[] = { { .ctl_name = 0 } }; -static ctl_table ipv4_table[] = { +static ctl_table ipvs_ipv4_table[] = { { .ctl_name = NET_IPV4, .procname = "ipv4", @@ -335,7 +335,7 @@ static ctl_table lblcr_root_table[] = { .ctl_name = CTL_NET, .procname = "net", .mode = 0555, - .child = ipv4_table + .child = ipvs_ipv4_table }, { .ctl_name = 0 } }; -- cgit v1.2.3 From 58615242417638794a5ba299c49e3fbd6f47c2a3 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Sat, 20 Aug 2005 17:25:29 -0700 Subject: [IPV4]: Consistency and whitespace cleanup of ip_rcv() Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- net/ipv4/ip_input.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 81e18023dc1..322b082ede1 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -361,6 +361,7 @@ drop: int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct iphdr *iph; + u32 len; /* When the interface is in promisc. mode, drop all the crap * that it receives, do not try to analyse it. @@ -392,7 +393,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, */ if (iph->ihl < 5 || iph->version != 4) - goto inhdr_error; + goto inhdr_error; if (!pskb_may_pull(skb, iph->ihl*4)) goto inhdr_error; @@ -400,21 +401,19 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, iph = skb->nh.iph; if (ip_fast_csum((u8 *)iph, iph->ihl) != 0) - goto inhdr_error; + goto inhdr_error; - { - __u32 len = ntohs(iph->tot_len); - if (skb->len < len || len < (iph->ihl<<2)) - goto inhdr_error; + len = ntohs(iph->tot_len); + if (skb->len < len || len < (iph->ihl*4)) + goto inhdr_error; - /* Our transport medium may have padded the buffer out. Now we know it - * is IP we can trim to the true length of the frame. - * Note this now means skb->len holds ntohs(iph->tot_len). 
- */ - if (pskb_trim_rcsum(skb, len)) { - IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS); - goto drop; - } + /* Our transport medium may have padded the buffer out. Now we know it + * is IP we can trim to the true length of the frame. + * Note this now means skb->len holds ntohs(iph->tot_len). + */ + if (pskb_trim_rcsum(skb, len)) { + IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS); + goto drop; } return NF_HOOK(PF_INET, NF_IP_PRE_ROUTING, skb, dev, NULL, -- cgit v1.2.3 From e9c604227391308b185aa6b14c7f93b0a0c2e51b Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Sat, 20 Aug 2005 17:25:52 -0700 Subject: [IPV4]: Avoid common branch misprediction while checking csum in ip_rcv() Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- net/ipv4/ip_input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 322b082ede1..6a06e15694d 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -400,7 +400,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, iph = skb->nh.iph; - if (ip_fast_csum((u8 *)iph, iph->ihl) != 0) + if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) goto inhdr_error; len = ntohs(iph->tot_len); -- cgit v1.2.3 From d245407e758b14c464c609b632873f85709360c7 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Sat, 20 Aug 2005 17:26:12 -0700 Subject: [IPV4]: Move ip options parsing out of ip_rcv_finish() Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- net/ipv4/ip_input.c | 93 +++++++++++++++++++++++++++++++---------------------- 1 file changed, 54 insertions(+), 39 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 6a06e15694d..48e4ddc1e33 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -279,6 +279,58 @@ int ip_local_deliver(struct sk_buff *skb) ip_local_deliver_finish); } +static inline int ip_rcv_options(struct sk_buff *skb) +{ + struct ip_options *opt; + struct iphdr *iph; + struct net_device *dev = skb->dev; + + /* It looks as overkill, because not all + IP options require packet mangling. + But it is the easiest for now, especially taking + into account that combination of IP options + and running sniffer is extremely rare condition. + --ANK (980813) + */ + if (skb_cow(skb, skb_headroom(skb))) { + IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS); + goto drop; + } + + iph = skb->nh.iph; + + if (ip_options_compile(NULL, skb)) { + IP_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS); + goto drop; + } + + opt = &(IPCB(skb)->opt); + if (unlikely(opt->srr)) { + struct in_device *in_dev = in_dev_get(dev); + if (in_dev) { + if (!IN_DEV_SOURCE_ROUTE(in_dev)) { + if (IN_DEV_LOG_MARTIANS(in_dev) && + net_ratelimit()) + printk(KERN_INFO "source route option " + "%u.%u.%u.%u -> %u.%u.%u.%u\n", + NIPQUAD(iph->saddr), + NIPQUAD(iph->daddr)); + in_dev_put(in_dev); + goto drop; + } + + in_dev_put(in_dev); + } + + if (ip_options_rcv_srr(skb)) + goto drop; + } + + return 0; +drop: + return -1; +} + static inline int ip_rcv_finish(struct sk_buff *skb) { struct net_device *dev = skb->dev; @@ -308,48 +360,11 @@ static inline int ip_rcv_finish(struct sk_buff *skb) } #endif - if (iph->ihl > 5) { - struct ip_options *opt; - - /* It looks as overkill, because not all - IP options require packet mangling. - But it is the easiest for now, especially taking - into account that combination of IP options - and running sniffer is extremely rare condition. 
- --ANK (980813) - */ - - if (skb_cow(skb, skb_headroom(skb))) { - IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS); - goto drop; - } - iph = skb->nh.iph; - - if (ip_options_compile(NULL, skb)) - goto inhdr_error; - - opt = &(IPCB(skb)->opt); - if (opt->srr) { - struct in_device *in_dev = in_dev_get(dev); - if (in_dev) { - if (!IN_DEV_SOURCE_ROUTE(in_dev)) { - if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) - printk(KERN_INFO "source route option %u.%u.%u.%u -> %u.%u.%u.%u\n", - NIPQUAD(iph->saddr), NIPQUAD(iph->daddr)); - in_dev_put(in_dev); - goto drop; - } - in_dev_put(in_dev); - } - if (ip_options_rcv_srr(skb)) - goto drop; - } - } + if (iph->ihl > 5 && ip_rcv_options(skb)) + goto drop; return dst_input(skb); -inhdr_error: - IP_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS); drop: kfree_skb(skb); return NET_RX_DROP; -- cgit v1.2.3 From 3e192beaf5ef260a31e84a12c0a04eff2eec02ab Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Sat, 20 Aug 2005 17:26:30 -0700 Subject: [IPV4]: Avoid common branch mispredictions in ip_rcv_finish() Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- net/ipv4/ip_input.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 48e4ddc1e33..7e78095baef 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -333,16 +333,16 @@ drop: static inline int ip_rcv_finish(struct sk_buff *skb) { - struct net_device *dev = skb->dev; struct iphdr *iph = skb->nh.iph; - int err; /* * Initialise the virtual path cache for the packet. It describes * how the packet travels inside Linux networking. */ - if (skb->dst == NULL) { - if ((err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev))) { + if (likely(skb->dst == NULL)) { + int err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, + skb->dev); + if (unlikely(err)) { if (err == -EHOSTUNREACH) IP_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS); goto drop; @@ -350,7 +350,7 @@ static inline int ip_rcv_finish(struct sk_buff *skb) } #ifdef CONFIG_NET_CLS_ROUTE - if (skb->dst->tclassid) { + if (unlikely(skb->dst->tclassid)) { struct ip_rt_acct *st = ip_rt_acct + 256*smp_processor_id(); u32 idx = skb->dst->tclassid; st[idx&0xFF].o_packets++; -- cgit v1.2.3 From 9070683bdac59a3b26e2ce6dd0d05fbfcb3fc7d8 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Sat, 20 Aug 2005 17:27:09 -0700 Subject: [IPV4]: Remove some dead code from ip_forward() Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- net/ipv4/ip_forward.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 77094aac6c2..0923add122b 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -76,16 +76,12 @@ int ip_forward(struct sk_buff *skb) * that reaches zero, we must reply an ICMP control message telling * that the packet's lifetime expired. */ - - iph = skb->nh.iph; - - if (iph->ttl <= 1) + if (skb->nh.iph->ttl <= 1) goto too_many_hops; if (!xfrm4_route_forward(skb)) goto drop; - iph = skb->nh.iph; rt = (struct rtable*)skb->dst; if (opt->is_strictroute && rt->rt_dst != rt->rt_gateway) -- cgit v1.2.3 From 33d043d65bbd3d97efca96c9bbada443cac3c4da Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Sat, 20 Aug 2005 17:27:34 -0700 Subject: [IPV4]: ip_finish_output() can be inlined Signed-off-by: Thomas Graf Signed-off-by: David S. 
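
The ip_rcv_finish() patches above lean on likely() and unlikely(), which expand to GCC's __builtin_expect() so that the compiler lays the expected path out as the straight-line fall-through. The macro definitions below match the kernel's; the surrounding program is only an illustration:

    #include <stdio.h>

    #define likely(x)   __builtin_expect(!!(x), 1)
    #define unlikely(x) __builtin_expect(!!(x), 0)

    static int process(const char *pkt)
    {
        if (unlikely(pkt == NULL))   /* error path, kept off the hot path */
            return -1;
        return 0;                    /* fast path falls straight through */
    }

    int main(void)
    {
        printf("%d %d\n", process("data"), process(NULL));
        return 0;
    }
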
Miller --- net/ipv4/ip_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 19f24f778dc..3f1a263e124 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -200,7 +200,7 @@ static inline int ip_finish_output2(struct sk_buff *skb) return -EINVAL; } -static int ip_finish_output(struct sk_buff *skb) +static inline int ip_finish_output(struct sk_buff *skb) { struct net_device *dev = skb->dst->dev; -- cgit v1.2.3 From 7567662ba896ee0c33d6215f32e2011488a6d1bf Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 21 Aug 2005 23:30:34 -0700 Subject: [NETFILTER]: Add string match Signed-off-by: Pablo Neira Ayuso Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/Kconfig | 12 ++++++ net/ipv4/netfilter/Makefile | 1 + net/ipv4/netfilter/ipt_string.c | 91 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 104 insertions(+) create mode 100644 net/ipv4/netfilter/ipt_string.c (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 3f7e6e49cbd..f2bea6ecb22 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -410,6 +410,18 @@ config IP_NF_MATCH_HASHLIMIT destination IP' or `500pps from any given source IP' with a single IPtables rule. +config IP_NF_MATCH_STRING + tristate 'string match support' + depends on IP_NF_IPTABLES + select TEXTSEARCH + select TEXTSEARCH_KMP + select TEXTSEARCH_FSM + help + This option adds a `string' match, which allows you to look for + pattern matchings in packets. + + To compile it as a module, choose M here. If unsure, say N. + # `filter', generic and specific targets config IP_NF_FILTER tristate "Packet filtering" diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 7c8ae858aa4..89cae69ee20 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -65,6 +65,7 @@ obj-$(CONFIG_IP_NF_MATCH_REALM) += ipt_realm.o obj-$(CONFIG_IP_NF_MATCH_ADDRTYPE) += ipt_addrtype.o obj-$(CONFIG_IP_NF_MATCH_PHYSDEV) += ipt_physdev.o obj-$(CONFIG_IP_NF_MATCH_COMMENT) += ipt_comment.o +obj-$(CONFIG_IP_NF_MATCH_STRING) += ipt_string.o # targets obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o diff --git a/net/ipv4/netfilter/ipt_string.c b/net/ipv4/netfilter/ipt_string.c new file mode 100644 index 00000000000..b5def204d79 --- /dev/null +++ b/net/ipv4/netfilter/ipt_string.c @@ -0,0 +1,91 @@ +/* String matching match for iptables + * + * (C) 2005 Pablo Neira Ayuso + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +MODULE_AUTHOR("Pablo Neira Ayuso "); +MODULE_DESCRIPTION("IP tables string match module"); +MODULE_LICENSE("GPL"); + +static int match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + int *hotdrop) +{ + struct ts_state state; + struct ipt_string_info *conf = (struct ipt_string_info *) matchinfo; + + memset(&state, 0, sizeof(struct ts_state)); + + return (skb_find_text((struct sk_buff *)skb, conf->from_offset, + conf->to_offset, conf->config, &state) + != UINT_MAX) && !conf->invert; +} + +#define STRING_TEXT_PRIV(m) ((struct ipt_string_info *) m) + +static int checkentry(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + struct ipt_string_info *conf = matchinfo; + struct ts_config *ts_conf; + + if (matchsize != IPT_ALIGN(sizeof(struct ipt_string_info))) + return 0; + + /* Damn, can't handle this case properly with iptables... */ + if (conf->from_offset > conf->to_offset) + return 0; + + ts_conf = textsearch_prepare(conf->algo, conf->pattern, conf->patlen, + GFP_KERNEL, TS_AUTOLOAD); + if (IS_ERR(ts_conf)) + return 0; + + conf->config = ts_conf; + + return 1; +} + +static void destroy(void *matchinfo, unsigned int matchsize) +{ + textsearch_destroy(STRING_TEXT_PRIV(matchinfo)->config); +} + +static struct ipt_match string_match = { + .name = "string", + .match = match, + .checkentry = checkentry, + .destroy = destroy, + .me = THIS_MODULE +}; + +static int __init init(void) +{ + return ipt_register_match(&string_match); +} + +static void __exit fini(void) +{ + ipt_unregister_match(&string_match); +} + +module_init(init); +module_exit(fini); -- cgit v1.2.3 From 05465343bf74e00c8c2c5a310740157de3149f27 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Sun, 21 Aug 2005 23:31:43 -0700 Subject: [NETFILTER]: Add goto target Originally written by Henrik Nordstrom , taken from netfilter patch-o-matic and added ip6_tables support. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_tables.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index ff8d85d2070..eef99a1b5de 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -340,8 +340,8 @@ ipt_do_table(struct sk_buff **pskb, back->comefrom); continue; } - if (table_base + v - != (void *)e + e->next_offset) { + if (table_base + v != (void *)e + e->next_offset + && !(e->ip.flags & IPT_F_GOTO)) { /* Save old back ptr in next entry */ struct ipt_entry *next = (void *)e + e->next_offset; -- cgit v1.2.3 From dc40c7bc76054f5e4382835ca2bafb895b993a8a Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 23 Aug 2005 21:52:58 -0700 Subject: [ICSK]: Generalise tcp_listen_poll Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 02848e72e9c..68626de6d69 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -309,15 +309,6 @@ void tcp_enter_memory_pressure(void) EXPORT_SYMBOL(tcp_enter_memory_pressure); -/* - * LISTEN is a special case for poll.. 
- */ -static __inline__ unsigned int tcp_listen_poll(struct sock *sk, - poll_table *wait) -{ - return !reqsk_queue_empty(&inet_csk(sk)->icsk_accept_queue) ? (POLLIN | POLLRDNORM) : 0; -} - /* * Wait for a TCP event. * @@ -333,7 +324,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) poll_wait(file, sk->sk_sleep, wait); if (sk->sk_state == TCP_LISTEN) - return tcp_listen_poll(sk, wait); + return inet_csk_listen_poll(sk); /* Socket is not locked. We are protected from async events by poll logic and correct handling of state changes -- cgit v1.2.3 From 0c7770c740156c8802c23d24fc094d06967d997d Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 23 Aug 2005 21:59:41 -0700 Subject: [IPV4]: FIB trie cleanup This is a redo of earlier cleanup stuff: * replace DBG() macro with pr_debug() * get rid of duplicate extern's that are already in fib_lookup.h * use BUG_ON and WARN_ON * don't use BUG checks for null pointers where next statement would get a fault anyway * remove debug printout when rebalance causes deep tree * remove trailing blanks Signed-off-by: Stephen Hemminger Signed-off-by: Robert Olsson Signed-off-by: David S. Miller --- net/ipv4/fib_trie.c | 100 ++++++++++++++++------------------------------------ 1 file changed, 31 insertions(+), 69 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 395f64df6f9..9c4c7f0367b 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -157,10 +157,6 @@ struct trie { unsigned int revision; }; -static int trie_debug = 0; - -#define DBG(x...) do { if (trie_debug) printk(x); } while (0) - static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n); static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull); static struct node *resize(struct trie *t, struct tnode *tn); @@ -168,12 +164,6 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn); static struct tnode *halve(struct trie *t, struct tnode *tn); static void tnode_free(struct tnode *tn); static void trie_dump_seq(struct seq_file *seq, struct trie *t); -extern struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio); -extern int fib_detect_death(struct fib_info *fi, int order, - struct fib_info **last_resort, int *last_idx, int *dflt); - -extern void rtmsg_fib(int event, u32 key, struct fib_alias *fa, int z, int tb_id, - struct nlmsghdr *n, struct netlink_skb_parms *req); static kmem_cache_t *fn_alias_kmem; static struct trie *trie_local = NULL, *trie_main = NULL; @@ -294,11 +284,9 @@ static void fn_free_alias(struct fib_alias *fa) */ -static void check_tnode(struct tnode *tn) +static inline void check_tnode(const struct tnode *tn) { - if (tn && tn->pos+tn->bits > 32) { - printk("TNODE ERROR tn=%p, pos=%d, bits=%d\n", tn, tn->pos, tn->bits); - } + WARN_ON(tn && tn->pos+tn->bits > 32); } static int halve_threshold = 25; @@ -374,21 +362,19 @@ static struct tnode* tnode_new(t_key key, int pos, int bits) tn->empty_children = 1<= 1<bits) { - printk("bits=%d, i=%d\n", tn->bits, i); - BUG(); - } + BUG_ON(i >= 1<bits); + write_lock_bh(&fib_lock); chi = tn->child[i]; @@ -459,8 +443,8 @@ static struct node *resize(struct trie *t, struct tnode *tn) if (!tn) return NULL; - DBG("In tnode_resize %p inflate_threshold=%d threshold=%d\n", - tn, inflate_threshold, halve_threshold); + pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n", + tn, inflate_threshold, halve_threshold); /* No children */ if (tn->empty_children == 
tnode_child_length(tn)) { @@ -625,11 +609,11 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn) int olen = tnode_child_length(tn); int i; - DBG("In inflate\n"); + pr_debug("In inflate\n"); tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1); - if (!tn) + if (!tn) return ERR_PTR(-ENOMEM); /* @@ -749,12 +733,12 @@ nomem: int size = tnode_child_length(tn); int j; - for(j = 0; j < size; j++) + for (j = 0; j < size; j++) if (tn->child[j]) tnode_free((struct tnode *)tn->child[j]); tnode_free(tn); - + return ERR_PTR(-ENOMEM); } } @@ -766,7 +750,7 @@ static struct tnode *halve(struct trie *t, struct tnode *tn) int i; int olen = tnode_child_length(tn); - DBG("In halve\n"); + pr_debug("In halve\n"); tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1); @@ -785,14 +769,14 @@ static struct tnode *halve(struct trie *t, struct tnode *tn) right = tnode_get_child(oldtnode, i+1); /* Two nonempty children */ - if (left && right) { + if (left && right) { struct tnode *newn; - + newn = tnode_new(left->key, tn->pos + tn->bits, 1); - - if (!newn) + + if (!newn) goto nomem; - + put_child(t, tn, i/2, (struct node *)newn); } @@ -810,7 +794,7 @@ static struct tnode *halve(struct trie *t, struct tnode *tn) continue; put_child(t, tn, i/2, right); continue; - } + } if (right == NULL) { put_child(t, tn, i/2, left); @@ -820,9 +804,6 @@ static struct tnode *halve(struct trie *t, struct tnode *tn) /* Two nonempty children */ newBinNode = (struct tnode *) tnode_get_child(tn, i/2); put_child(t, tn, i/2, NULL); - - BUG_ON(!newBinNode); - put_child(t, newBinNode, 0, left); put_child(t, newBinNode, 1, right); put_child(t, tn, i/2, resize(t, newBinNode)); @@ -834,12 +815,12 @@ nomem: int size = tnode_child_length(tn); int j; - for(j = 0; j < size; j++) + for (j = 0; j < size; j++) if (tn->child[j]) tnode_free((struct tnode *)tn->child[j]); tnode_free(tn); - + return ERR_PTR(-ENOMEM); } } @@ -939,22 +920,10 @@ static struct node *trie_rebalance(struct trie *t, struct tnode *tn) t_key cindex, key; struct tnode *tp = NULL; - BUG_ON(!tn); - key = tn->key; i = 0; while (tn != NULL && NODE_PARENT(tn) != NULL) { - if (i > 10) { - printk("Rebalance tn=%p \n", tn); - if (tn) - printk("tn->parent=%p \n", NODE_PARENT(tn)); - - printk("Rebalance tp=%p \n", tp); - if (tp) - printk("tp->parent=%p \n", NODE_PARENT(tp)); - } - BUG_ON(i > 12); /* Why is this a bug? 
-ojn */ i++; @@ -1019,10 +988,7 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen) pos = tn->pos + tn->bits; n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits)); - if (n && NODE_PARENT(n) != tn) { - printk("BUG tn=%p, n->parent=%p\n", tn, NODE_PARENT(n)); - BUG(); - } + BUG_ON(n && NODE_PARENT(n) != tn); } else break; } @@ -1076,8 +1042,6 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen) NODE_SET_PARENT(l, tp); - BUG_ON(!tp); - cindex = tkey_extract_bits(key, tp->pos, tp->bits); put_child(t, (struct tnode *)tp, cindex, (struct node *)l); } else { @@ -1158,7 +1122,7 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, key = ntohl(key); - DBG("Insert table=%d %08x/%d\n", tb->tb_id, key, plen); + pr_debug("Insert table=%d %08x/%d\n", tb->tb_id, key, plen); mask = ntohl(inet_make_mask(plen)); @@ -1282,7 +1246,8 @@ err: return err; } -static inline int check_leaf(struct trie *t, struct leaf *l, t_key key, int *plen, const struct flowi *flp, +static inline int check_leaf(struct trie *t, struct leaf *l, + t_key key, int *plen, const struct flowi *flp, struct fib_result *res) { int err, i; @@ -1511,7 +1476,7 @@ static int trie_leaf_remove(struct trie *t, t_key key) struct node *n = t->trie; struct leaf *l; - DBG("entering trie_leaf_remove(%p)\n", n); + pr_debug("entering trie_leaf_remove(%p)\n", n); /* Note that in the case skipped bits, those bits are *not* checked! * When we finish this, we will have NULL or a T_LEAF, and the @@ -1523,10 +1488,7 @@ static int trie_leaf_remove(struct trie *t, t_key key) check_tnode(tn); n = tnode_get_child(tn ,tkey_extract_bits(key, tn->pos, tn->bits)); - if (n && NODE_PARENT(n) != tn) { - printk("BUG tn=%p, n->parent=%p\n", tn, NODE_PARENT(n)); - BUG(); - } + BUG_ON(n && NODE_PARENT(n) != tn); } l = (struct leaf *) n; @@ -1594,7 +1556,7 @@ fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, if (!fa) return -ESRCH; - DBG("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t); + pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t); fa_to_delete = NULL; fa_head = fa->fa_list.prev; @@ -1762,7 +1724,7 @@ static int fn_trie_flush(struct fib_table *tb) if (ll && hlist_empty(&ll->list)) trie_leaf_remove(t, ll->key); - DBG("trie_flush found=%d\n", found); + pr_debug("trie_flush found=%d\n", found); return found; } -- cgit v1.2.3 From 3625796806419d97641d90e0f197eab9b952212e Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Wed, 24 Aug 2005 11:38:53 -0700 Subject: [IPV4]: Module export of ip_rcv() no longer needed. With ip_rcv nowhere outside the IP stack being used anymore it's EXPORT_SYMBOL is not needed any longer either. Signed-off-by: Ralf Baechle DL5RB Signed-off-by: David S. Miller --- net/ipv4/ip_input.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 7e78095baef..220a8b5920e 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -442,5 +442,4 @@ out: return NET_RX_DROP; } -EXPORT_SYMBOL(ip_rcv); EXPORT_SYMBOL(ip_statistics); -- cgit v1.2.3 From e5b4376074e02b783e56a8f7c42d544e18112c4e Mon Sep 17 00:00:00 2001 From: Robert Olsson Date: Thu, 25 Aug 2005 13:01:03 -0700 Subject: [IPV4]: Prepare FIB core for RCU. * RCU versions of hlist_***_rcu * fib_alias partial rcu port just whats needed now. Signed-off-by: Robert Olsson Signed-off-by: Stephen Hemminger Signed-off-by: David S. 
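
The RCU work in the next two commits rests on a publish/read discipline: a writer initialises an object fully and only then publishes the pointer with rcu_assign_pointer(); readers pick it up with rcu_dereference() inside rcu_read_lock(). A userspace approximation using C11 atomics, where release/acquire ordering stands in for the kernel primitives and the grace-period machinery is not modelled at all:

    #include <stdatomic.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct node { int key; };

    static _Atomic(struct node *) head;

    static void publish(int key)       /* roughly rcu_assign_pointer() */
    {
        struct node *n = malloc(sizeof(*n));

        if (!n)
            return;
        n->key = key;                  /* initialise before publishing */
        atomic_store_explicit(&head, n, memory_order_release);
    }

    static void reader(void)           /* roughly rcu_dereference() */
    {
        struct node *n = atomic_load_explicit(&head, memory_order_acquire);

        if (n)
            printf("key=%d\n", n->key);
    }

    int main(void)
    {
        publish(42);
        reader();
        return 0;
    }
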
Miller --- net/ipv4/fib_lookup.h | 1 + net/ipv4/fib_semantics.c | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h index b729d97cfa9..ef6609ea0eb 100644 --- a/net/ipv4/fib_lookup.h +++ b/net/ipv4/fib_lookup.h @@ -7,6 +7,7 @@ struct fib_alias { struct list_head fa_list; + struct rcu_head rcu; struct fib_info *fa_info; u8 fa_tos; u8 fa_type; diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 7e4651b3caa..d41219e8037 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -854,6 +854,7 @@ failure: return NULL; } +/* Note! fib_semantic_match intentionally uses RCU list functions. */ int fib_semantic_match(struct list_head *head, const struct flowi *flp, struct fib_result *res, __u32 zone, __u32 mask, int prefixlen) @@ -861,7 +862,7 @@ int fib_semantic_match(struct list_head *head, const struct flowi *flp, struct fib_alias *fa; int nh_sel = 0; - list_for_each_entry(fa, head, fa_list) { + list_for_each_entry_rcu(fa, head, fa_list) { int err; if (fa->fa_tos && -- cgit v1.2.3 From 2373ce1ca04dd46bf2b8b0f9a799eb2a90da92fb Mon Sep 17 00:00:00 2001 From: Robert Olsson Date: Thu, 25 Aug 2005 13:01:29 -0700 Subject: [IPV4]: Convert FIB Trie to RCU. * Removes RW-lock * Protected read functions use rcu_dereference(), protected with rcu_read_lock() * writing of protected pointers with rcu_assign_pointer() * Insert/Replace atomic via list_replace_rcu * A BUG_ON condition removed in trie_rebalance With help from Paul E. McKenney. Signed-off-by: Robert Olsson Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/fib_trie.c | 389 +++++++++++++++++++++++++++------------------- 1 file changed, 202 insertions(+), 187 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 9c4c7f0367b..ff21748248e 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -43,7 +43,7 @@ * 2 of the License, or (at your option) any later version. 
From 2373ce1ca04dd46bf2b8b0f9a799eb2a90da92fb Mon Sep 17 00:00:00 2001 From: Robert Olsson Date: Thu, 25 Aug 2005 13:01:29 -0700 Subject: [IPV4]: Convert FIB Trie to RCU. * Removes the RW-lock * Protected read functions use rcu_dereference(), guarded by rcu_read_lock() * Writes of protected pointers use rcu_assign_pointer() * Insert/replace is atomic via list_replace_rcu() * A BUG_ON condition removed in trie_rebalance With help from Paul E. McKenney. Signed-off-by: Robert Olsson Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/fib_trie.c | 389 +++++++++++++++++++++++++++------------------------- 1 file changed, 202 insertions(+), 187 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 9c4c7f0367b..ff21748248e 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -43,7 +43,7 @@ * 2 of the License, or (at your option) any later version. */ -#define VERSION "0.325" +#define VERSION "0.402" #include #include @@ -62,6 +62,7 @@ #include #include #include +#include #include #include #include @@ -81,22 +82,19 @@ #define MASK_PFX(k, l) (((l)==0)?0:(k >> (KEYLENGTH-l)) << (KEYLENGTH-l)) #define TKEY_GET_MASK(offset, bits) (((bits)==0)?0:((t_key)(-1) << (KEYLENGTH - bits) >> offset)) -static DEFINE_RWLOCK(fib_lock); - typedef unsigned int t_key; #define T_TNODE 0 #define T_LEAF 1 #define NODE_TYPE_MASK 0x1UL #define NODE_PARENT(node) \ - ((struct tnode *)((node)->parent & ~NODE_TYPE_MASK)) -#define NODE_SET_PARENT(node, ptr) \ - ((node)->parent = (((unsigned long)(ptr)) | \ - ((node)->parent & NODE_TYPE_MASK))) -#define NODE_INIT_PARENT(node, type) \ - ((node)->parent = (type)) -#define NODE_TYPE(node) \ - ((node)->parent & NODE_TYPE_MASK) + ((struct tnode *)rcu_dereference(((node)->parent & ~NODE_TYPE_MASK))) + +#define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK) + +#define NODE_SET_PARENT(node, ptr) \ + rcu_assign_pointer((node)->parent, \ + ((unsigned long)(ptr)) | NODE_TYPE(node)) #define IS_TNODE(n) (!(n->parent & T_LEAF)) #define IS_LEAF(n) (n->parent & T_LEAF) @@ -110,10 +108,12 @@ struct leaf { t_key key; unsigned long parent; struct hlist_head list; + struct rcu_head rcu; }; struct leaf_info { struct hlist_node hlist; + struct rcu_head rcu; int plen; struct list_head falh; }; @@ -125,6 +125,7 @@ struct tnode { unsigned short bits:5; /* 2log(KEYLENGTH) bits needed */ unsigned short full_children; /* KEYLENGTH bits needed */ unsigned short empty_children; /* KEYLENGTH bits needed */ + struct rcu_head rcu; struct node *child[0]; }; @@ -168,11 +169,14 @@ static void trie_dump_seq(struct seq_file *seq, struct trie *t); static kmem_cache_t *fn_alias_kmem; static struct trie *trie_local = NULL, *trie_main = NULL; + +/* rcu_read_lock must be held by the caller on the read side */ + static inline struct node *tnode_get_child(struct tnode *tn, int i) { BUG_ON(i >= 1 << tn->bits); - return tn->child[i]; + return rcu_dereference(tn->child[i]); } static inline int tnode_child_length(const struct tnode *tn) @@ -213,14 +217,6 @@ static inline int tkey_mismatch(t_key a, int offset, t_key b) return i; } -/* Candidate for fib_semantics */ - -static void fn_free_alias(struct fib_alias *fa) -{ - fib_release_info(fa->fa_info); - kmem_cache_free(fn_alias_kmem, fa); -} - /* To understand this stuff, an understanding of keys and all their bits is necessary.
Every node in the trie has a key associated with it, but not @@ -292,53 +288,57 @@ static inline void check_tnode(const struct tnode *tn) static int halve_threshold = 25; static int inflate_threshold = 50; -static struct leaf *leaf_new(void) + +static void __alias_free_mem(struct rcu_head *head) { - struct leaf *l = kmalloc(sizeof(struct leaf), GFP_KERNEL); - if (l) { - NODE_INIT_PARENT(l, T_LEAF); - INIT_HLIST_HEAD(&l->list); - } - return l; + struct fib_alias *fa = container_of(head, struct fib_alias, rcu); + kmem_cache_free(fn_alias_kmem, fa); } -static struct leaf_info *leaf_info_new(int plen) +static inline void alias_free_mem_rcu(struct fib_alias *fa) { - struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL); - - if (!li) - return NULL; + call_rcu(&fa->rcu, __alias_free_mem); +} - li->plen = plen; - INIT_LIST_HEAD(&li->falh); +static void __leaf_free_rcu(struct rcu_head *head) +{ + kfree(container_of(head, struct leaf, rcu)); +} - return li; +static inline void free_leaf(struct leaf *leaf) +{ + call_rcu(&leaf->rcu, __leaf_free_rcu); } -static inline void free_leaf(struct leaf *l) +static void __leaf_info_free_rcu(struct rcu_head *head) { - kfree(l); + kfree(container_of(head, struct leaf_info, rcu)); } -static inline void free_leaf_info(struct leaf_info *li) +static inline void free_leaf_info(struct leaf_info *leaf) { - kfree(li); + call_rcu(&leaf->rcu, __leaf_info_free_rcu); } static struct tnode *tnode_alloc(unsigned int size) { - if (size <= PAGE_SIZE) { - return kmalloc(size, GFP_KERNEL); - } else { - return (struct tnode *) - __get_free_pages(GFP_KERNEL, get_order(size)); - } + struct page *pages; + + if (size <= PAGE_SIZE) + return kcalloc(size, 1, GFP_KERNEL); + + pages = alloc_pages(GFP_KERNEL|__GFP_ZERO, get_order(size)); + if (!pages) + return NULL; + + return page_address(pages); } -static void __tnode_free(struct tnode *tn) +static void __tnode_free_rcu(struct rcu_head *head) { + struct tnode *tn = container_of(head, struct tnode, rcu); unsigned int size = sizeof(struct tnode) + - (1 << tn->bits) * sizeof(struct node *); + (1 << tn->bits) * sizeof(struct node *); if (size <= PAGE_SIZE) kfree(tn); @@ -346,6 +346,31 @@ static void __tnode_free(struct tnode *tn) free_pages((unsigned long)tn, get_order(size)); } +static inline void tnode_free(struct tnode *tn) +{ + call_rcu(&tn->rcu, __tnode_free_rcu); +} + +static struct leaf *leaf_new(void) +{ + struct leaf *l = kmalloc(sizeof(struct leaf), GFP_KERNEL); + if (l) { + l->parent = T_LEAF; + INIT_HLIST_HEAD(&l->list); + } + return l; +} + +static struct leaf_info *leaf_info_new(int plen) +{ + struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL); + if (li) { + li->plen = plen; + INIT_LIST_HEAD(&li->falh); + } + return li; +} + static struct tnode* tnode_new(t_key key, int pos, int bits) { int nchildren = 1<<bits; int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *); struct tnode *tn = tnode_alloc(sz); if (tn) { memset(tn, 0, sz); tn->parent = T_TNODE; tn->pos = pos; tn->bits = bits; tn->key = key; @@ -367,17 +392,6 @@ static struct tnode* tnode_new(t_key key, int pos, int bits) return tn; } -static void tnode_free(struct tnode *tn) -{ - if (IS_LEAF(tn)) { - free_leaf((struct leaf *)tn); - pr_debug("FL %p \n", tn); - } else { - __tnode_free(tn); - pr_debug("FT %p \n", tn); - } -} - /* * Check whether a tnode 'n' is "full", i.e. it is an internal node * and no bits are skipped. See discussion in dyntree paper p.
6 @@ -403,13 +417,11 @@ static inline void put_child(struct trie *t, struct tnode *tn, int i, struct nod static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull) { - struct node *chi; + struct node *chi = tn->child[i]; int isfull; BUG_ON(i >= 1<<tn->bits); - write_lock_bh(&fib_lock); - chi = tn->child[i]; /* update emptyChildren */ if (n == NULL && chi != NULL) @@ -430,8 +442,7 @@ static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int w if (n) NODE_SET_PARENT(n, tn); - tn->child[i] = n; - write_unlock_bh(&fib_lock); + rcu_assign_pointer(tn->child[i], n); } static struct node *resize(struct trie *t, struct tnode *tn) @@ -456,17 +467,12 @@ static struct node *resize(struct trie *t, struct tnode *tn) for (i = 0; i < tnode_child_length(tn); i++) { struct node *n; - write_lock_bh(&fib_lock); n = tn->child[i]; - if (!n) { - write_unlock_bh(&fib_lock); + if (!n) continue; - } /* compress one level */ - NODE_INIT_PARENT(n, NODE_TYPE(n)); - - write_unlock_bh(&fib_lock); + NODE_SET_PARENT(n, NULL); tnode_free(tn); return n; } @@ -577,24 +583,17 @@ static struct node *resize(struct trie *t, struct tnode *tn) /* Only one child remains */ - if (tn->empty_children == tnode_child_length(tn) - 1) for (i = 0; i < tnode_child_length(tn); i++) { struct node *n; - write_lock_bh(&fib_lock); - n = tn->child[i]; - if (!n) { - write_unlock_bh(&fib_lock); + if (!n) continue; - } /* compress one level */ - NODE_INIT_PARENT(n, NODE_TYPE(n)); - - write_unlock_bh(&fib_lock); + NODE_SET_PARENT(n, NULL); tnode_free(tn); return n; } @@ -831,19 +830,22 @@ static void trie_init(struct trie *t) return; t->size = 0; - t->trie = NULL; + rcu_assign_pointer(t->trie, NULL); t->revision = 0; #ifdef CONFIG_IP_FIB_TRIE_STATS memset(&t->stats, 0, sizeof(struct trie_use_stats)); #endif } +/* The read side must use rcu_read_lock; currently that is only the dump + routines, via get_fa_head and dump */ + static struct leaf_info *find_leaf_info(struct hlist_head *head, int plen) { struct hlist_node *node; struct leaf_info *li; - hlist_for_each_entry(li, node, head, hlist) + hlist_for_each_entry_rcu(li, node, head, hlist) if (li->plen == plen) return li; @@ -862,28 +864,27 @@ static inline struct list_head * get_fa_head(struct leaf *l, int plen) static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new) { - struct leaf_info *li = NULL, *last = NULL; - struct hlist_node *node; + struct leaf_info *li = NULL, *last = NULL; + struct hlist_node *node; - write_lock_bh(&fib_lock); + if (hlist_empty(head)) { + hlist_add_head_rcu(&new->hlist, head); + } else { + hlist_for_each_entry(li, node, head, hlist) { + if (new->plen > li->plen) + break; - if (hlist_empty(head)) { - hlist_add_head(&new->hlist, head); - } else { - hlist_for_each_entry(li, node, head, hlist) { - if (new->plen > li->plen) - break; - - last = li; - } - if (last) - hlist_add_after(&last->hlist, &new->hlist); - else - hlist_add_before(&new->hlist, &li->hlist); - } - write_unlock_bh(&fib_lock); + last = li; + } + if (last) + hlist_add_after_rcu(&last->hlist, &new->hlist); + else + hlist_add_before_rcu(&new->hlist, &li->hlist); + } } +/* rcu_read_lock must be held by the caller on the read side */ + static struct leaf * fib_find_node(struct trie *t, u32 key) { @@ -892,7 +893,7 @@ fib_find_node(struct trie *t, u32 key) struct node *n; pos = 0; - n = t->trie; + n = rcu_dereference(t->trie); while (n != NULL && NODE_TYPE(n) == T_TNODE) { tn = (struct tnode *) n; @@ -915,17 +916,13 @@ fib_find_node(struct trie *t, u32 key) static struct node
*trie_rebalance(struct trie *t, struct tnode *tn) { - int i; int wasfull; t_key cindex, key; struct tnode *tp = NULL; key = tn->key; - i = 0; while (tn != NULL && NODE_PARENT(tn) != NULL) { - BUG_ON(i > 12); /* Why is this a bug? -ojn */ - i++; tp = NODE_PARENT(tn); cindex = tkey_extract_bits(key, tp->pos, tp->bits); @@ -945,6 +942,8 @@ static struct node *trie_rebalance(struct trie *t, struct tnode *tn) return (struct node*) tn; } +/* only used from updater-side */ + static struct list_head * fib_insert_node(struct trie *t, int *err, u32 key, int plen) { @@ -1081,7 +1080,7 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen) cindex = tkey_extract_bits(key, tp->pos, tp->bits); put_child(t, (struct tnode *)tp, cindex, (struct node *)tn); } else { - t->trie = (struct node*) tn; /* First tnode */ + rcu_assign_pointer(t->trie, (struct node *)tn); /* First tnode */ tp = tn; } } @@ -1091,7 +1090,8 @@ fib_insert_node(struct trie *t, int *err, u32 key, int plen) tp, tp->pos, tp->bits, key, plen); /* Rebalance the trie */ - t->trie = trie_rebalance(t, tp); + + rcu_assign_pointer(t->trie, trie_rebalance(t, tp)); done: t->revision++; err: @@ -1166,16 +1166,21 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, struct fib_info *fi_drop; u8 state; - write_lock_bh(&fib_lock); + err = -ENOBUFS; + new_fa = kmem_cache_alloc(fn_alias_kmem, SLAB_KERNEL); + if (new_fa == NULL) + goto out; fi_drop = fa->fa_info; - fa->fa_info = fi; - fa->fa_type = type; - fa->fa_scope = r->rtm_scope; + new_fa->fa_tos = fa->fa_tos; + new_fa->fa_info = fi; + new_fa->fa_type = type; + new_fa->fa_scope = r->rtm_scope; state = fa->fa_state; - fa->fa_state &= ~FA_S_ACCESSED; + new_fa->fa_state &= ~FA_S_ACCESSED; - write_unlock_bh(&fib_lock); + list_replace_rcu(&fa->fa_list, &new_fa->fa_list); + alias_free_mem_rcu(fa); fib_release_info(fi_drop); if (state & FA_S_ACCESSED) @@ -1227,11 +1232,8 @@ fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, goto out_free_new_fa; } - write_lock_bh(&fib_lock); - - list_add_tail(&new_fa->fa_list, (fa ? &fa->fa_list : fa_head)); - - write_unlock_bh(&fib_lock); + list_add_tail_rcu(&new_fa->fa_list, + (fa ? 
&fa->fa_list : fa_head)); rt_cache_flush(-1); rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, nlhdr, req); succeeded: return 0; out_free_new_fa: kmem_cache_free(fn_alias_kmem, new_fa); out: fib_release_info(fi); err: return err; } @@ -1246,6 +1248,8 @@ err: return err; } + +/* should be called with rcu_read_lock */ static inline int check_leaf(struct trie *t, struct leaf *l, t_key key, int *plen, const struct flowi *flp, struct fib_result *res) @@ -1256,7 +1260,7 @@ static inline int check_leaf(struct trie *t, struct leaf *l, struct hlist_head *hhead = &l->list; struct hlist_node *node; - hlist_for_each_entry(li, node, hhead, hlist) { + hlist_for_each_entry_rcu(li, node, hhead, hlist) { i = li->plen; mask = ntohl(inet_make_mask(i)); if (l->key != (key & mask)) @@ -1292,10 +1296,9 @@ fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result t_key node_prefix, key_prefix, pref_mismatch; int mp; - n = t->trie; - - read_lock(&fib_lock); + rcu_read_lock(); + n = rcu_dereference(t->trie); if (!n) goto failed; @@ -1465,10 +1468,11 @@ backtrace: failed: ret = 1; found: - read_unlock(&fib_lock); + rcu_read_unlock(); return ret; } +/* only called from updater side */ static int trie_leaf_remove(struct trie *t, t_key key) { t_key cindex; @@ -1503,15 +1507,17 @@ static int trie_leaf_remove(struct trie *t, t_key key) t->revision++; t->size--; + preempt_disable(); tp = NODE_PARENT(n); tnode_free((struct tnode *) n); if (tp) { cindex = tkey_extract_bits(key, tp->pos, tp->bits); put_child(t, (struct tnode *)tp, cindex, NULL); - t->trie = trie_rebalance(t, tp); + rcu_assign_pointer(t->trie, trie_rebalance(t, tp)); } else - t->trie = NULL; + rcu_assign_pointer(t->trie, NULL); + preempt_enable(); return 1; } @@ -1527,7 +1533,6 @@ fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, struct fib_alias *fa, *fa_to_delete; struct list_head *fa_head; struct leaf *l; - int kill_li = 0; struct leaf_info *li; @@ -1560,6 +1565,7 @@ fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, fa_to_delete = NULL; fa_head = fa->fa_list.prev; + list_for_each_entry(fa, fa_head, fa_list) { struct fib_info *fi = fa->fa_info; @@ -1587,18 +1593,12 @@ fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, l = fib_find_node(t, key); li = find_leaf_info(&l->list, plen); - write_lock_bh(&fib_lock); - - list_del(&fa->fa_list); + list_del_rcu(&fa->fa_list); if (list_empty(fa_head)) { - hlist_del(&li->hlist); - kill_li = 1; - } - write_unlock_bh(&fib_lock); - - if (kill_li) + hlist_del_rcu(&li->hlist); free_leaf_info(li); + } if (hlist_empty(&l->list)) trie_leaf_remove(t, key); @@ -1606,7 +1606,8 @@ fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, if (fa->fa_state & FA_S_ACCESSED) rt_cache_flush(-1); - fn_free_alias(fa); + fib_release_info(fa->fa_info); + alias_free_mem_rcu(fa); return 0; } @@ -1618,12 +1619,10 @@ static int trie_flush_list(struct trie *t, struct list_head *head) list_for_each_entry_safe(fa, fa_node, head, fa_list) { struct fib_info *fi = fa->fa_info; - if (fi && (fi->fib_flags&RTNH_F_DEAD)) { - write_lock_bh(&fib_lock); - list_del(&fa->fa_list); - write_unlock_bh(&fib_lock); - - fn_free_alias(fa); + if (fi && (fi->fib_flags & RTNH_F_DEAD)) { + list_del_rcu(&fa->fa_list); + fib_release_info(fa->fa_info); + alias_free_mem_rcu(fa); found++; } } @@ -1641,30 +1640,30 @@ static int trie_flush_leaf(struct trie *t, struct leaf *l) found += trie_flush_list(t, &li->falh); if (list_empty(&li->falh)) { - write_lock_bh(&fib_lock); - hlist_del(&li->hlist); - write_unlock_bh(&fib_lock); - + hlist_del_rcu(&li->hlist);
free_leaf_info(li); } } return found; } +/* rcu_read_lock must be held by the caller on the read side */ + static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf) { struct node *c = (struct node *) thisleaf; struct tnode *p; int idx; + struct node *trie = rcu_dereference(t->trie); if (c == NULL) { - if (t->trie == NULL) + if (trie == NULL) return NULL; - if (IS_LEAF(t->trie)) /* trie w. just a leaf */ - return (struct leaf *) t->trie; + if (IS_LEAF(trie)) /* trie w. just a leaf */ + return (struct leaf *) trie; - p = (struct tnode*) t->trie; /* Start */ + p = (struct tnode*) trie; /* Start */ } else p = (struct tnode *) NODE_PARENT(c); @@ -1679,23 +1678,26 @@ static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf) last = 1 << p->bits; for (idx = pos; idx < last ; idx++) { - if (!p->child[idx]) + c = rcu_dereference(p->child[idx]); + + if (!c) continue; /* Decend if tnode */ - while (IS_TNODE(p->child[idx])) { - p = (struct tnode*) p->child[idx]; - idx = 0; + while (IS_TNODE(c)) { + p = (struct tnode *) c; + idx = 0; /* Rightmost non-NULL branch */ if (p && IS_TNODE(p)) - while (p->child[idx] == NULL && idx < (1 << p->bits)) idx++; + while (!(c = rcu_dereference(p->child[idx])) + && idx < (1<<p->bits)) idx++; /* Done with this tnode? */ - if (idx >= (1 << p->bits) || p->child[idx] == NULL) + if (idx >= (1 << p->bits) || !c) goto up; } - return (struct leaf*) p->child[idx]; + return (struct leaf *) c; } up: /* No more children go up one step */ @@ -1713,6 +1715,7 @@ static int fn_trie_flush(struct fib_table *tb) t->revision++; + rcu_read_lock(); for (h = 0; (l = nextleaf(t, l)) != NULL; h++) { found += trie_flush_leaf(t, l); @@ -1720,6 +1723,7 @@ static int fn_trie_flush(struct fib_table *tb) trie_leaf_remove(t, ll->key); ll = l; } + rcu_read_unlock(); if (ll && hlist_empty(&ll->list)) trie_leaf_remove(t, ll->key); @@ -1745,7 +1749,7 @@ fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib last_resort = NULL; order = -1; - read_lock(&fib_lock); + rcu_read_lock(); l = fib_find_node(t, 0); if (!l) @@ -1758,7 +1762,7 @@ fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib if (list_empty(fa_head)) goto out; - list_for_each_entry(fa, fa_head, fa_list) { + list_for_each_entry_rcu(fa, fa_head, fa_list) { struct fib_info *next_fi = fa->fa_info; if (fa->fa_scope != res->scope || @@ -1809,7 +1813,7 @@ fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib } trie_last_dflt = last_idx; out:; - read_unlock(&fib_lock); + rcu_read_unlock(); } static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb, @@ -1823,7 +1827,9 @@ static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fi s_i = cb->args[3]; i = 0; - list_for_each_entry(fa, fah, fa_list) { + /* rcu_read_lock is held by the caller */ + + list_for_each_entry_rcu(fa, fah, fa_list) { if (i < s_i) { i++; continue; @@ -1898,7 +1904,7 @@ static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, struct netlin s_m = cb->args[1]; - read_lock(&fib_lock); + rcu_read_lock(); for (m = 0; m <= 32; m++) { if (m < s_m) continue; @@ -1911,11 +1917,11 @@ static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, struct netlin goto out; } } - read_unlock(&fib_lock); + rcu_read_unlock(); cb->args[1] = m; return skb->len; out: - read_unlock(&fib_lock); + rcu_read_unlock(); return -1; } @@ -2016,7 +2022,7 @@ static void printnode_seq(struct seq_file *seq, int indent, struct node *n, putspace_seq(seq,
indent+2); seq_printf(seq, "{/%d...dumping}\n", i); - list_for_each_entry(fa, fa_head, fa_list) { + list_for_each_entry_rcu(fa, fa_head, fa_list) { putspace_seq(seq, indent+2); if (fa->fa_info == NULL) { seq_printf(seq, "Error fa_info=NULL\n"); @@ -2056,28 +2062,28 @@ static void printnode_seq(struct seq_file *seq, int indent, struct node *n, static void trie_dump_seq(struct seq_file *seq, struct trie *t) { - struct node *n = t->trie; + struct node *n; int cindex = 0; int indent = 1; int pend = 0; int depth = 0; struct tnode *tn; - read_lock(&fib_lock); - + rcu_read_lock(); + n = rcu_dereference(t->trie); seq_printf(seq, "------ trie_dump of t=%p ------\n", t); if (!n) { seq_printf(seq, "------ trie is empty\n"); - read_unlock(&fib_lock); + rcu_read_unlock(); return; } printnode_seq(seq, indent, n, pend, cindex, 0); if (!IS_TNODE(n)) { - read_unlock(&fib_lock); + rcu_read_unlock(); return; } @@ -2088,26 +2094,32 @@ static void trie_dump_seq(struct seq_file *seq, struct trie *t) depth++; while (tn && cindex < (1 << tn->bits)) { - if (tn->child[cindex]) { + struct node *child = rcu_dereference(tn->child[cindex]); + if (!child) + cindex++; + else { /* Got a child */ + printnode_seq(seq, indent, child, pend, + cindex, tn->bits); - printnode_seq(seq, indent, tn->child[cindex], pend, cindex, tn->bits); - if (IS_LEAF(tn->child[cindex])) { + if (IS_LEAF(child)) cindex++; - } else { + + else { /* * New tnode. Decend one level */ depth++; - tn = (struct tnode *)tn->child[cindex]; - pend = tn->pos + tn->bits; - putspace_seq(seq, indent); seq_printf(seq, "\\--\n"); + n = child; + tn = (struct tnode *)n; + pend = tn->pos+tn->bits; + putspace_seq(seq, indent); + seq_printf(seq, "\\--\n"); indent += 3; cindex = 0; } - } else - cindex++; + } /* * Test if we are done @@ -2132,8 +2144,7 @@ static void trie_dump_seq(struct seq_file *seq, struct trie *t) depth--; } } - - read_unlock(&fib_lock); + rcu_read_unlock(); } static struct trie_stat *trie_stat_new(void) @@ -2159,7 +2170,7 @@ static struct trie_stat *trie_stat_new(void) static struct trie_stat *trie_collect_stats(struct trie *t) { - struct node *n = t->trie; + struct node *n; struct trie_stat *s = trie_stat_new(); int cindex = 0; int pend = 0; @@ -2167,11 +2178,13 @@ static struct trie_stat *trie_collect_stats(struct trie *t) if (!s) return NULL; + + rcu_read_lock(); + n = rcu_dereference(t->trie); + if (!n) return s; - read_lock(&fib_lock); - if (IS_TNODE(n)) { struct tnode *tn = (struct tnode *)n; pend = tn->pos+tn->bits; @@ -2179,7 +2192,9 @@ static struct trie_stat *trie_collect_stats(struct trie *t) depth++; while (tn && cindex < (1 << tn->bits)) { - if (tn->child[cindex]) { + struct node *ch = rcu_dereference(tn->child[cindex]); + if (ch) { + /* Got a child */ if (IS_LEAF(tn->child[cindex])) { @@ -2199,7 +2214,7 @@ static struct trie_stat *trie_collect_stats(struct trie *t) s->nodesizes[tn->bits]++; depth++; - n = tn->child[cindex]; + n = ch; tn = (struct tnode *)n; pend = tn->pos+tn->bits; @@ -2236,7 +2251,7 @@ static struct trie_stat *trie_collect_stats(struct trie *t) } } - read_unlock(&fib_lock); + rcu_read_unlock(); return s; } -- cgit v1.2.3
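Beside the list and hlist conversions, the patch above guards the single root pointer t->trie with rcu_assign_pointer()/rcu_dereference(). The general discipline: the writer initializes the new object fully before publishing the pointer, and the reader fetches the pointer once inside its read-side critical section and never uses it afterwards. A hedged, self-contained sketch of that pattern (tree_root, my_node and the function names are invented, not from the patch):

#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/string.h>

struct my_node {
	int value;
};

struct tree_root {
	struct my_node *root;	/* written via rcu_assign_pointer, read via rcu_dereference */
};

/* Writer side: fully initialize the object, then publish the pointer. */
static int my_publish(struct tree_root *t, int value)
{
	struct my_node *n = kmalloc(sizeof(*n), GFP_KERNEL);

	if (!n)
		return -ENOMEM;
	memset(n, 0, sizeof(*n));
	n->value = value;
	/* Orders the initialization above before the pointer store. */
	rcu_assign_pointer(t->root, n);
	return 0;
}

/* Reader side: fetch the pointer once, use it only inside the section. */
static int my_read(struct tree_root *t)
{
	struct my_node *n;
	int value = -1;

	rcu_read_lock();
	n = rcu_dereference(t->root);
	if (n)
		value = n->value;	/* n must not be dereferenced after unlock */
	rcu_read_unlock();
	return value;
}

Freeing a replaced object still has to go through call_rcu (or synchronize_rcu), exactly as tnode_free does above, since a reader may still hold the old pointer.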
From 29cb9f9c5502f6218cd3ea574efe46a5e55522d2 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 25 Aug 2005 16:23:11 -0700 Subject: [LIB]: Make TEXTSEARCH_BM plain tristate like the others And select it when the relevant modules are enabled. Signed-off-by: David S. Miller --- net/ipv4/netfilter/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index f2bea6ecb22..c4213f3de50 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -415,6 +415,7 @@ config IP_NF_MATCH_STRING depends on IP_NF_IPTABLES select TEXTSEARCH select TEXTSEARCH_KMP + select TEXTSEARCH_BM select TEXTSEARCH_FSM help This option adds a `string' match, which allows you to look for -- cgit v1.2.3
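For reference, the `string' match drives these algorithms through the lib/textsearch API, roughly as sketched below; "bm" picks the Boyer-Moore backend that the Kconfig change enables. This is a hedged sketch of the textsearch interface as commonly used, not code from the patch, and find_pattern is an invented helper:

#include <linux/kernel.h>
#include <linux/textsearch.h>
#include <linux/err.h>
#include <linux/string.h>

/* Returns the match offset, or -1 if the needle does not occur. */
static int find_pattern(const char *haystack, const char *needle)
{
	struct ts_config *conf;
	struct ts_state state;
	unsigned int pos;

	/* "kmp" or "fsm" would select the other backends from the list above. */
	conf = textsearch_prepare("bm", needle, strlen(needle),
				  GFP_KERNEL, TS_AUTOLOAD);
	if (IS_ERR(conf))
		return PTR_ERR(conf);

	pos = textsearch_find_continuous(conf, &state,
					 haystack, strlen(haystack));
	textsearch_destroy(conf);

	return pos != UINT_MAX ? (int)pos : -1;	/* UINT_MAX means no match */
}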
From ba89966c1984513f4f2cc0a6c182266be44ddd03 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 26 Aug 2005 12:05:31 -0700 Subject: [NET]: use __read_mostly on kmem_cache_t , DEFINE_SNMP_STAT pointers This patch puts mostly-read-only data in the right section (read_mostly), to help share these data between CPUs without memory ping-pongs. On one of my production machines, tcp_statistics was sitting in a heavily modified cache line, so *every* SNMP update had to force a reload. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/af_inet.c | 2 +- net/ipv4/fib_hash.c | 4 ++-- net/ipv4/fib_trie.c | 2 +- net/ipv4/icmp.c | 2 +- net/ipv4/inetpeer.c | 2 +- net/ipv4/ip_input.c | 2 +- net/ipv4/ipmr.c | 2 +- net/ipv4/ipvs/ip_vs_conn.c | 2 +- net/ipv4/netfilter/ip_conntrack_core.c | 4 ++-- net/ipv4/netfilter/ipt_hashlimit.c | 2 +- net/ipv4/tcp.c | 2 +- net/ipv4/udp.c | 2 +- 12 files changed, 14 insertions(+), 14 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 5810f9d1491..bf147f8db39 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -113,7 +113,7 @@ #include #endif -DEFINE_SNMP_STAT(struct linux_mib, net_statistics); +DEFINE_SNMP_STAT(struct linux_mib, net_statistics) __read_mostly; extern void ip_mc_drop_socket(struct sock *sk); diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c index b10d6bb5ef3..2a8c9afc369 100644 --- a/net/ipv4/fib_hash.c +++ b/net/ipv4/fib_hash.c @@ -45,8 +45,8 @@ #include "fib_lookup.h" -static kmem_cache_t *fn_hash_kmem; -static kmem_cache_t *fn_alias_kmem; +static kmem_cache_t *fn_hash_kmem __read_mostly; +static kmem_cache_t *fn_alias_kmem __read_mostly; struct fib_node { struct hlist_node fn_hash; diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index ff21748248e..b2dea4e5da7 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -166,7 +166,7 @@ static struct tnode *halve(struct trie *t, struct tnode *tn); static void tnode_free(struct tnode *tn); static void trie_dump_seq(struct seq_file *seq, struct trie *t); -static kmem_cache_t *fn_alias_kmem; +static kmem_cache_t *fn_alias_kmem __read_mostly; static struct trie *trie_local = NULL, *trie_main = NULL; diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 25f66b750fd..24eb56ae1b5 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -114,7 +114,7 @@ struct icmp_bxm { /* * Statistics */ -DEFINE_SNMP_STAT(struct icmp_mib, icmp_statistics); +DEFINE_SNMP_STAT(struct icmp_mib, icmp_statistics) __read_mostly; /* An array of errno for error messages from dest unreach. */ /* RFC 1122: 3.2.2.1 States that NET_UNREACH, HOST_UNREACH and SR_FAILED MUST be considered 'transient errs'. */ diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index 4410b9dc03e..f84ba9c9655 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -73,7 +73,7 @@ /* Exported for inet_getid inline function. */ DEFINE_SPINLOCK(inet_peer_idlock); -static kmem_cache_t *peer_cachep; +static kmem_cache_t *peer_cachep __read_mostly; #define node_height(x) x->avl_height static struct inet_peer peer_fake_node = { diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 220a8b5920e..473d0f2b2e0 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -150,7 +150,7 @@ * SNMP management statistics */ -DEFINE_SNMP_STAT(struct ipstats_mib, ip_statistics); +DEFINE_SNMP_STAT(struct ipstats_mib, ip_statistics) __read_mostly; /* * Process Router Attention IP option diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index dc806b57842..9dbf5909f3a 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -103,7 +103,7 @@ static DEFINE_SPINLOCK(mfc_unres_lock); In this case data path is free of exclusive locks at all. */ -static kmem_cache_t *mrt_cachep; +static kmem_cache_t *mrt_cachep __read_mostly; static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local); static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert); diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c index d0145a8b155..e11952ea17a 100644 --- a/net/ipv4/ipvs/ip_vs_conn.c +++ b/net/ipv4/ipvs/ip_vs_conn.c @@ -40,7 +40,7 @@ static struct list_head *ip_vs_conn_tab; /* SLAB cache for IPVS connections */ -static kmem_cache_t *ip_vs_conn_cachep; +static kmem_cache_t *ip_vs_conn_cachep __read_mostly; /* counter for current IPVS connections */ static atomic_t ip_vs_conn_count = ATOMIC_INIT(0); diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c index 285743bfbed..a0648600190 100644 --- a/net/ipv4/netfilter/ip_conntrack_core.c +++ b/net/ipv4/netfilter/ip_conntrack_core.c @@ -70,8 +70,8 @@ static LIST_HEAD(helpers); unsigned int ip_conntrack_htable_size = 0; int ip_conntrack_max; struct list_head *ip_conntrack_hash; -static kmem_cache_t *ip_conntrack_cachep; -static kmem_cache_t *ip_conntrack_expect_cachep; +static kmem_cache_t *ip_conntrack_cachep __read_mostly; +static kmem_cache_t *ip_conntrack_expect_cachep __read_mostly; struct ip_conntrack ip_conntrack_untracked; unsigned int ip_ct_log_invalid; static LIST_HEAD(unconfirmed); diff --git a/net/ipv4/netfilter/ipt_hashlimit.c b/net/ipv4/netfilter/ipt_hashlimit.c index 564b49bfebc..2dd1cccbdab 100644 --- a/net/ipv4/netfilter/ipt_hashlimit.c +++ b/net/ipv4/netfilter/ipt_hashlimit.c @@ -94,7 +94,7 @@ struct ipt_hashlimit_htable { static DEFINE_SPINLOCK(hashlimit_lock); /* protects htables list */ static DECLARE_MUTEX(hlimit_mutex); /* additional checkentry protection */ static HLIST_HEAD(hashlimit_htables); -static kmem_cache_t *hashlimit_cachep; +static kmem_cache_t *hashlimit_cachep __read_mostly; static inline int dst_cmp(const struct dsthash_ent *ent, struct dsthash_dst *b) { diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 68626de6d69..02fdda68718 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -269,7 +269,7 @@ int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; -DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics); +DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics) __read_mostly; atomic_t tcp_orphan_count = ATOMIC_INIT(0); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 3a5bbbe7dd8..e5beca7de86 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -113,7 +113,7 @@ * Snmp MIB for the UDP layer */ -DEFINE_SNMP_STAT(struct udp_mib, udp_statistics); +DEFINE_SNMP_STAT(struct udp_mib, udp_statistics) __read_mostly; struct hlist_head udp_hash[UDP_HTABLE_SIZE]; DEFINE_RWLOCK(udp_hash_lock); -- cgit v1.2.3
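__read_mostly places a variable in the .data.read_mostly section, so that objects written once at init but read on every packet do not share cache lines with hot, frequently written data (false sharing). A minimal sketch of the pattern as used above; my_cache and the slab parameters are made up:

#include <linux/cache.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/errno.h>

/* Written once at init, read on every packet: a good __read_mostly fit. */
static kmem_cache_t *my_cache __read_mostly;

static int __init my_cache_init(void)
{
	/* kmem_cache_t and the ctor/dtor arguments match the era's slab API. */
	my_cache = kmem_cache_create("my_cache", 64, 0, 0, NULL, NULL);
	return my_cache ? 0 : -ENOMEM;
}

The annotation is purely a placement hint; it changes no semantics, which is why the patch can touch twelve files without any functional risk.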
From 5f2c3b910744f68e1a507f027398f404b3feb5fb Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Sat, 27 Aug 2005 22:37:03 -0700 Subject: [NETFILTER]: Add new iptables TTL target This new iptables target allows manipulation of the TTL of an IPv4 packet. Signed-off-by: Harald Welte Signed-off-by: David S. Miller --- net/ipv4/netfilter/Kconfig | 14 +++++ net/ipv4/netfilter/Makefile | 1 + net/ipv4/netfilter/ipt_TTL.c | 119 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 134 insertions(+) create mode 100644 net/ipv4/netfilter/ipt_TTL.c (limited to 'net/ipv4') diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index c4213f3de50..e046f552181 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -664,6 +664,20 @@ config IP_NF_TARGET_CLASSIFY To compile it as a module, choose M here. If unsure, say N. +config IP_NF_TARGET_TTL + tristate 'TTL target support' + depends on IP_NF_MANGLE + help + This option adds a `TTL' target, which enables the user to modify + the TTL value of the IP header. + + While it is safe to decrement/lower the TTL, this target also enables + functionality to increment and set the TTL value of the IP header to + arbitrary values. This is EXTREMELY DANGEROUS since you can easily + create immortal packets that loop forever on the network. + + To compile it as a module, choose M here. If unsure, say N. + config IP_NF_TARGET_CONNMARK tristate 'CONNMARK target support' depends on IP_NF_CONNTRACK_MARK && IP_NF_MANGLE diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 89cae69ee20..a7bd38f5052 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -85,6 +85,7 @@ obj-$(CONFIG_IP_NF_TARGET_ULOG) += ipt_ULOG.o obj-$(CONFIG_IP_NF_TARGET_TCPMSS) += ipt_TCPMSS.o obj-$(CONFIG_IP_NF_TARGET_NOTRACK) += ipt_NOTRACK.o obj-$(CONFIG_IP_NF_TARGET_CLUSTERIP) += ipt_CLUSTERIP.o +obj-$(CONFIG_IP_NF_TARGET_TTL) += ipt_TTL.o # generic ARP tables obj-$(CONFIG_IP_NF_ARPTABLES) += arp_tables.o diff --git a/net/ipv4/netfilter/ipt_TTL.c b/net/ipv4/netfilter/ipt_TTL.c new file mode 100644 index 00000000000..b9ae6a9382f --- /dev/null +++ b/net/ipv4/netfilter/ipt_TTL.c @@ -0,0 +1,119 @@ +/* TTL modification target for IP tables + * (C) 2000,2005 by Harald Welte + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation.
+ * + */ + +#include +#include +#include +#include + +#include +#include + +MODULE_AUTHOR("Harald Welte "); +MODULE_DESCRIPTION("IP tables TTL modification module"); +MODULE_LICENSE("GPL"); + +static unsigned int +ipt_ttl_target(struct sk_buff **pskb, const struct net_device *in, + const struct net_device *out, unsigned int hooknum, + const void *targinfo, void *userinfo) +{ + struct iphdr *iph; + const struct ipt_TTL_info *info = targinfo; + u_int16_t diffs[2]; + int new_ttl; + + if (!skb_make_writable(pskb, (*pskb)->len)) + return NF_DROP; + + iph = (*pskb)->nh.iph; + + switch (info->mode) { + case IPT_TTL_SET: + new_ttl = info->ttl; + break; + case IPT_TTL_INC: + new_ttl = iph->ttl + info->ttl; + if (new_ttl > 255) + new_ttl = 255; + break; + case IPT_TTL_DEC: + new_ttl = iph->ttl - info->ttl; + if (new_ttl < 0) + new_ttl = 0; + break; + default: + new_ttl = iph->ttl; + break; + } + + if (new_ttl != iph->ttl) { + diffs[0] = htons(((unsigned)iph->ttl) << 8) ^ 0xFFFF; + iph->ttl = new_ttl; + diffs[1] = htons(((unsigned)iph->ttl) << 8); + iph->check = csum_fold(csum_partial((char *)diffs, + sizeof(diffs), + iph->check^0xFFFF)); + } + + return IPT_CONTINUE; +} + +static int ipt_ttl_checkentry(const char *tablename, + const struct ipt_entry *e, + void *targinfo, + unsigned int targinfosize, + unsigned int hook_mask) +{ + struct ipt_TTL_info *info = targinfo; + + if (targinfosize != IPT_ALIGN(sizeof(struct ipt_TTL_info))) { + printk(KERN_WARNING "ipt_TTL: targinfosize %u != %Zu\n", + targinfosize, + IPT_ALIGN(sizeof(struct ipt_TTL_info))); + return 0; + } + + if (strcmp(tablename, "mangle")) { + printk(KERN_WARNING "ipt_TTL: can only be called from " + "\"mangle\" table, not \"%s\"\n", tablename); + return 0; + } + + if (info->mode > IPT_TTL_MAXMODE) { + printk(KERN_WARNING "ipt_TTL: invalid or unknown Mode %u\n", + info->mode); + return 0; + } + + if ((info->mode != IPT_TTL_SET) && (info->ttl == 0)) + return 0; + + return 1; +} + +static struct ipt_target ipt_TTL = { + .name = "TTL", + .target = ipt_ttl_target, + .checkentry = ipt_ttl_checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ipt_register_target(&ipt_TTL); +} + +static void __exit fini(void) +{ + ipt_unregister_target(&ipt_TTL); +} + +module_init(init); +module_exit(fini); -- cgit v1.2.3
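ipt_ttl_target() above avoids recomputing the IP header checksum from scratch: since TTL and protocol share one 16-bit header word, it folds the old word out of the checksum and the new one in; the csum_partial() over diffs[] implements the RFC 1624 incremental update HC' = ~(~HC + ~m + m'). A small userspace demonstration of the same arithmetic (the sample TTL, protocol and checksum values are arbitrary):

#include <stdint.h>
#include <stdio.h>

/* RFC 1624: new_check = ~(~old_check + ~old_word + new_word), with
 * end-around carry folding on the 32-bit accumulator. */
static uint16_t csum_update16(uint16_t check, uint16_t old_word,
			      uint16_t new_word)
{
	uint32_t sum = (uint16_t)~check;

	sum += (uint16_t)~old_word;
	sum += new_word;
	sum = (sum & 0xffff) + (sum >> 16);	/* fold carries back in */
	sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}

int main(void)
{
	/* TTL and protocol share one 16-bit header word: (ttl << 8) | proto */
	uint8_t old_ttl = 64, new_ttl = 63, proto = 6;	/* 6 = TCP */
	uint16_t old_word = (uint16_t)(old_ttl << 8 | proto);
	uint16_t new_word = (uint16_t)(new_ttl << 8 | proto);
	uint16_t check = 0xb1e6;	/* arbitrary example checksum */

	printf("old check 0x%04x -> new check 0x%04x\n",
	       check, csum_update16(check, old_word, new_word));
	return 0;
}

The kernel version reaches the same result by summing the complemented old word and the new word with csum_partial() and folding with csum_fold(); only two 16-bit words are touched per packet instead of the whole 20-byte header.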