aboutsummaryrefslogtreecommitdiff
path: root/net/ipv4
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/ip_input.c14
-rw-r--r--net/ipv4/ip_output.c6
-rw-r--r--net/ipv4/tcp.c14
-rw-r--r--net/ipv4/tcp_input.c30
-rw-r--r--net/ipv4/tcp_output.c8
-rw-r--r--net/ipv4/udp.c171
6 files changed, 162 insertions, 81 deletions
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 324e7e0fdb2..97069399d86 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -329,6 +329,7 @@ drop:
static inline int ip_rcv_finish(struct sk_buff *skb)
{
const struct iphdr *iph = ip_hdr(skb);
+ struct rtable *rt;
/*
* Initialise the virtual path cache for the packet. It describes
@@ -340,6 +341,8 @@ static inline int ip_rcv_finish(struct sk_buff *skb)
if (unlikely(err)) {
if (err == -EHOSTUNREACH)
IP_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS);
+ else if (err == -ENETUNREACH)
+ IP_INC_STATS_BH(IPSTATS_MIB_INNOROUTES);
goto drop;
}
}
@@ -358,6 +361,12 @@ static inline int ip_rcv_finish(struct sk_buff *skb)
if (iph->ihl > 5 && ip_rcv_options(skb))
goto drop;
+ rt = (struct rtable*)skb->dst;
+ if (rt->rt_type == RTN_MULTICAST)
+ IP_INC_STATS_BH(IPSTATS_MIB_INMCASTPKTS);
+ else if (rt->rt_type == RTN_BROADCAST)
+ IP_INC_STATS_BH(IPSTATS_MIB_INBCASTPKTS);
+
return dst_input(skb);
drop:
@@ -414,7 +423,10 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
goto inhdr_error;
len = ntohs(iph->tot_len);
- if (skb->len < len || len < (iph->ihl*4))
+ if (skb->len < len) {
+ IP_INC_STATS_BH(IPSTATS_MIB_INTRUNCATEDPKTS);
+ goto drop;
+ } else if (len < (iph->ihl*4))
goto inhdr_error;
/* Our transport medium may have padded the buffer out. Now we know it
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 534650cad3a..d6427d91851 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -160,9 +160,15 @@ EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
static inline int ip_finish_output2(struct sk_buff *skb)
{
struct dst_entry *dst = skb->dst;
+ struct rtable *rt = (struct rtable *)dst;
struct net_device *dev = dst->dev;
int hh_len = LL_RESERVED_SPACE(dev);
+ if (rt->rt_type == RTN_MULTICAST)
+ IP_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS);
+ else if (rt->rt_type == RTN_BROADCAST)
+ IP_INC_STATS(IPSTATS_MIB_OUTBCASTPKTS);
+
/* Be paranoid, rather than too clever. */
if (unlikely(skb_headroom(skb) < hh_len && dev->hard_header)) {
struct sk_buff *skb2;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 2cf9a898ce5..d6e48866817 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1573,14 +1573,12 @@ void tcp_close(struct sock *sk, long timeout)
sk_stream_mem_reclaim(sk);
- /* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
- * 3.10, we send a RST here because data was lost. To
- * witness the awful effects of the old behavior of always
- * doing a FIN, run an older 2.1.x kernel or 2.0.x, start
- * a bulk GET in an FTP client, suspend the process, wait
- * for the client to advertise a zero window, then kill -9
- * the FTP client, wheee... Note: timeout is always zero
- * in such a case.
+ /* As outlined in RFC 2525, section 2.17, we send a RST here because
+ * data was lost. To witness the awful effects of the old behavior of
+ * always doing a FIN, run an older 2.1.x kernel or 2.0.x, start a bulk
+ * GET in an FTP client, suspend the process, wait for the client to
+ * advertise a zero window, then kill -9 the FTP client, wheee...
+ * Note: timeout is always zero in such a case.
*/
if (data_was_unread) {
/* Unread data was tossed, zap the connection. */
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 051f0f815f1..7641b2761a1 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1265,20 +1265,15 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
return flag;
}
-/* F-RTO can only be used if these conditions are satisfied:
- * - there must be some unsent new data
- * - the advertised window should allow sending it
- * - TCP has never retransmitted anything other than head (SACK enhanced
- * variant from Appendix B of RFC4138 is more robust here)
+/* F-RTO can only be used if TCP has never retransmitted anything other than
+ * head (SACK enhanced variant from Appendix B of RFC4138 is more robust here)
*/
int tcp_use_frto(struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
- if (!sysctl_tcp_frto || !tcp_send_head(sk) ||
- after(TCP_SKB_CB(tcp_send_head(sk))->end_seq,
- tp->snd_una + tp->snd_wnd))
+ if (!sysctl_tcp_frto)
return 0;
if (IsSackFrto())
@@ -2642,7 +2637,9 @@ static void tcp_undo_spur_to_response(struct sock *sk, int flag)
* algorithm is not part of the F-RTO detection algorithm
* given in RFC4138 but can be selected separately).
* Otherwise (basically on duplicate ACK), RTO was (likely) caused by a loss
- * and TCP falls back to conventional RTO recovery.
+ * and TCP falls back to conventional RTO recovery. F-RTO allows overriding
+ * of Nagle, this is done using frto_counter states 2 and 3, when a new data
+ * segment of any size sent during F-RTO, state 2 is upgraded to 3.
*
* Rationale: if the RTO was spurious, new ACKs should arrive from the
* original window even after we transmit two new data segments.
@@ -2671,7 +2668,7 @@ static int tcp_process_frto(struct sock *sk, u32 prior_snd_una, int flag)
inet_csk(sk)->icsk_retransmits = 0;
if (!before(tp->snd_una, tp->frto_highmark)) {
- tcp_enter_frto_loss(sk, tp->frto_counter + 1, flag);
+ tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 2 : 3), flag);
return 1;
}
@@ -2697,7 +2694,7 @@ static int tcp_process_frto(struct sock *sk, u32 prior_snd_una, int flag)
return 1;
}
- if ((tp->frto_counter == 2) &&
+ if ((tp->frto_counter >= 2) &&
(!(flag&FLAG_FORWARD_PROGRESS) ||
((flag&FLAG_DATA_SACKED) && !(flag&FLAG_ONLY_ORIG_SACKED)))) {
/* RFC4138 shortcoming (see comment above) */
@@ -2710,10 +2707,19 @@ static int tcp_process_frto(struct sock *sk, u32 prior_snd_una, int flag)
}
if (tp->frto_counter == 1) {
+ /* Sending of the next skb must be allowed or no FRTO */
+ if (!tcp_send_head(sk) ||
+ after(TCP_SKB_CB(tcp_send_head(sk))->end_seq,
+ tp->snd_una + tp->snd_wnd)) {
+ tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 2 : 3),
+ flag);
+ return 1;
+ }
+
tp->snd_cwnd = tcp_packets_in_flight(tp) + 2;
tp->frto_counter = 2;
return 1;
- } else /* frto_counter == 2 */ {
+ } else {
switch (sysctl_tcp_frto_response) {
case 2:
tcp_undo_spur_to_response(sk, flag);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index e70a6840cb6..0faacf9c419 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1035,8 +1035,10 @@ static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb,
if (nonagle & TCP_NAGLE_PUSH)
return 1;
- /* Don't use the nagle rule for urgent data (or for the final FIN). */
- if (tp->urg_mode ||
+ /* Don't use the nagle rule for urgent data (or for the final FIN).
+ * Nagle can be ignored during F-RTO too (see RFC4138).
+ */
+ if (tp->urg_mode || (tp->frto_counter == 2) ||
(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN))
return 1;
@@ -2035,7 +2037,7 @@ void tcp_send_fin(struct sock *sk)
/* We get here when a process closes a file descriptor (either due to
* an explicit close() or as a byproduct of exit()'ing) and there
* was unread data in the receive queue. This behavior is recommended
- * by draft-ietf-tcpimpl-prob-03.txt section 3.10. -DaveM
+ * by RFC 2525, section 2.17. -DaveM
*/
void tcp_send_active_reset(struct sock *sk, gfp_t priority)
{
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index cec0f2cc49b..144970704c2 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -114,14 +114,33 @@ DEFINE_RWLOCK(udp_hash_lock);
static int udp_port_rover;
-static inline int __udp_lib_lport_inuse(__u16 num, struct hlist_head udptable[])
+/*
+ * Note about this hash function :
+ * Typical use is probably daddr = 0, only dport is going to vary hash
+ */
+static inline unsigned int hash_port_and_addr(__u16 port, __be32 addr)
+{
+ addr ^= addr >> 16;
+ addr ^= addr >> 8;
+ return port ^ addr;
+}
+
+static inline int __udp_lib_port_inuse(unsigned int hash, int port,
+ __be32 daddr, struct hlist_head udptable[])
{
struct sock *sk;
struct hlist_node *node;
+ struct inet_sock *inet;
- sk_for_each(sk, node, &udptable[num & (UDP_HTABLE_SIZE - 1)])
- if (sk->sk_hash == num)
+ sk_for_each(sk, node, &udptable[hash & (UDP_HTABLE_SIZE - 1)]) {
+ if (sk->sk_hash != hash)
+ continue;
+ inet = inet_sk(sk);
+ if (inet->num != port)
+ continue;
+ if (inet->rcv_saddr == daddr)
return 1;
+ }
return 0;
}
@@ -142,6 +161,7 @@ int __udp_lib_get_port(struct sock *sk, unsigned short snum,
struct hlist_node *node;
struct hlist_head *head;
struct sock *sk2;
+ unsigned int hash;
int error = 1;
write_lock_bh(&udp_hash_lock);
@@ -156,7 +176,9 @@ int __udp_lib_get_port(struct sock *sk, unsigned short snum,
for (i = 0; i < UDP_HTABLE_SIZE; i++, result++) {
int size;
- head = &udptable[result & (UDP_HTABLE_SIZE - 1)];
+ hash = hash_port_and_addr(result,
+ inet_sk(sk)->rcv_saddr);
+ head = &udptable[hash & (UDP_HTABLE_SIZE - 1)];
if (hlist_empty(head)) {
if (result > sysctl_local_port_range[1])
result = sysctl_local_port_range[0] +
@@ -181,7 +203,10 @@ int __udp_lib_get_port(struct sock *sk, unsigned short snum,
result = sysctl_local_port_range[0]
+ ((result - sysctl_local_port_range[0]) &
(UDP_HTABLE_SIZE - 1));
- if (! __udp_lib_lport_inuse(result, udptable))
+ hash = hash_port_and_addr(result,
+ inet_sk(sk)->rcv_saddr);
+ if (! __udp_lib_port_inuse(hash, result,
+ inet_sk(sk)->rcv_saddr, udptable))
break;
}
if (i >= (1 << 16) / UDP_HTABLE_SIZE)
@@ -189,11 +214,13 @@ int __udp_lib_get_port(struct sock *sk, unsigned short snum,
gotit:
*port_rover = snum = result;
} else {
- head = &udptable[snum & (UDP_HTABLE_SIZE - 1)];
+ hash = hash_port_and_addr(snum, inet_sk(sk)->rcv_saddr);
+ head = &udptable[hash & (UDP_HTABLE_SIZE - 1)];
sk_for_each(sk2, node, head)
- if (sk2->sk_hash == snum &&
+ if (sk2->sk_hash == hash &&
sk2 != sk &&
+ inet_sk(sk2)->num == snum &&
(!sk2->sk_reuse || !sk->sk_reuse) &&
(!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if
|| sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
@@ -201,9 +228,9 @@ gotit:
goto fail;
}
inet_sk(sk)->num = snum;
- sk->sk_hash = snum;
+ sk->sk_hash = hash;
if (sk_unhashed(sk)) {
- head = &udptable[snum & (UDP_HTABLE_SIZE - 1)];
+ head = &udptable[hash & (UDP_HTABLE_SIZE - 1)];
sk_add_node(sk, head);
sock_prot_inc_use(sk->sk_prot);
}
@@ -242,63 +269,78 @@ static struct sock *__udp4_lib_lookup(__be32 saddr, __be16 sport,
{
struct sock *sk, *result = NULL;
struct hlist_node *node;
- unsigned short hnum = ntohs(dport);
- int badness = -1;
+ unsigned int hash, hashwild;
+ int score, best = -1;
+
+ hash = hash_port_and_addr(ntohs(dport), daddr);
+ hashwild = hash_port_and_addr(ntohs(dport), 0);
read_lock(&udp_hash_lock);
- sk_for_each(sk, node, &udptable[hnum & (UDP_HTABLE_SIZE - 1)]) {
+
+lookup:
+
+ sk_for_each(sk, node, &udptable[hash & (UDP_HTABLE_SIZE - 1)]) {
struct inet_sock *inet = inet_sk(sk);
- if (sk->sk_hash == hnum && !ipv6_only_sock(sk)) {
- int score = (sk->sk_family == PF_INET ? 1 : 0);
- if (inet->rcv_saddr) {
- if (inet->rcv_saddr != daddr)
- continue;
- score+=2;
- }
- if (inet->daddr) {
- if (inet->daddr != saddr)
- continue;
- score+=2;
- }
- if (inet->dport) {
- if (inet->dport != sport)
- continue;
- score+=2;
- }
- if (sk->sk_bound_dev_if) {
- if (sk->sk_bound_dev_if != dif)
- continue;
- score+=2;
- }
- if (score == 9) {
- result = sk;
- break;
- } else if (score > badness) {
- result = sk;
- badness = score;
- }
+ if (sk->sk_hash != hash || ipv6_only_sock(sk) ||
+ inet->num != dport)
+ continue;
+
+ score = (sk->sk_family == PF_INET ? 1 : 0);
+ if (inet->rcv_saddr) {
+ if (inet->rcv_saddr != daddr)
+ continue;
+ score+=2;
+ }
+ if (inet->daddr) {
+ if (inet->daddr != saddr)
+ continue;
+ score+=2;
+ }
+ if (inet->dport) {
+ if (inet->dport != sport)
+ continue;
+ score+=2;
+ }
+ if (sk->sk_bound_dev_if) {
+ if (sk->sk_bound_dev_if != dif)
+ continue;
+ score+=2;
+ }
+ if (score == 9) {
+ result = sk;
+ goto found;
+ } else if (score > best) {
+ result = sk;
+ best = score;
}
}
+
+ if (hash != hashwild) {
+ hash = hashwild;
+ goto lookup;
+ }
+found:
if (result)
sock_hold(result);
read_unlock(&udp_hash_lock);
return result;
}
-static inline struct sock *udp_v4_mcast_next(struct sock *sk,
- __be16 loc_port, __be32 loc_addr,
- __be16 rmt_port, __be32 rmt_addr,
- int dif)
+static inline struct sock *udp_v4_mcast_next(
+ struct sock *sk,
+ unsigned int hnum, __be16 loc_port, __be32 loc_addr,
+ __be16 rmt_port, __be32 rmt_addr,
+ int dif)
{
struct hlist_node *node;
struct sock *s = sk;
- unsigned short hnum = ntohs(loc_port);
sk_for_each_from(s, node) {
struct inet_sock *inet = inet_sk(s);
if (s->sk_hash != hnum ||
+ inet->num != loc_port ||
(inet->daddr && inet->daddr != rmt_addr) ||
(inet->dport != rmt_port && inet->dport) ||
(inet->rcv_saddr && inet->rcv_saddr != loc_addr) ||
@@ -1129,29 +1171,44 @@ static int __udp4_lib_mcast_deliver(struct sk_buff *skb,
__be32 saddr, __be32 daddr,
struct hlist_head udptable[])
{
- struct sock *sk;
+ struct sock *sk, *skw, *sknext;
int dif;
+ unsigned int hash = hash_port_and_addr(ntohs(uh->dest), daddr);
+ unsigned int hashwild = hash_port_and_addr(ntohs(uh->dest), 0);
- read_lock(&udp_hash_lock);
- sk = sk_head(&udptable[ntohs(uh->dest) & (UDP_HTABLE_SIZE - 1)]);
dif = skb->dev->ifindex;
- sk = udp_v4_mcast_next(sk, uh->dest, daddr, uh->source, saddr, dif);
- if (sk) {
- struct sock *sknext = NULL;
+ read_lock(&udp_hash_lock);
+
+ sk = sk_head(&udptable[hash & (UDP_HTABLE_SIZE - 1)]);
+ skw = sk_head(&udptable[hashwild & (UDP_HTABLE_SIZE - 1)]);
+
+ sk = udp_v4_mcast_next(sk, hash, uh->dest, daddr, uh->source, saddr, dif);
+ if (!sk) {
+ hash = hashwild;
+ sk = udp_v4_mcast_next(skw, hash, uh->dest, daddr, uh->source,
+ saddr, dif);
+ }
+ if (sk) {
do {
struct sk_buff *skb1 = skb;
-
- sknext = udp_v4_mcast_next(sk_next(sk), uh->dest, daddr,
- uh->source, saddr, dif);
+ sknext = udp_v4_mcast_next(sk_next(sk), hash, uh->dest,
+ daddr, uh->source, saddr, dif);
+ if (!sknext && hash != hashwild) {
+ hash = hashwild;
+ sknext = udp_v4_mcast_next(skw, hash, uh->dest,
+ daddr, uh->source, saddr, dif);
+ }
if (sknext)
skb1 = skb_clone(skb, GFP_ATOMIC);
if (skb1) {
int ret = udp_queue_rcv_skb(sk, skb1);
if (ret > 0)
- /* we should probably re-process instead
- * of dropping packets here. */
+ /*
+ * we should probably re-process
+ * instead of dropping packets here.
+ */
kfree_skb(skb1);
}
sk = sknext;