From 9bb7bc942d3da606f184ac6a4dfc7e4d470c831b Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Mon, 30 May 2005 15:35:26 -0700 Subject: [NETFILTER]: Fix deadlock with ip_queue and tcp local input path. When we have ip_queue being used from LOCAL_IN, then we end up with a situation where the verdicts coming back from userspace traverse the TCP input path from syscall context. While this seems to work most of the time, there's an ugly deadlock: syscall context is interrupted by the timer interrupt. When the timer interrupt leaves, the timer softirq get's scheduled and calls tcp_delack_timer() and alike. They themselves do bh_lock_sock(sk), which is already held from somewhere else -> boom. I've now tested the suggested solution by Patrick McHardy and Herbert Xu to simply use local_bh_{en,dis}able(). Signed-off-by: Harald Welte Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_queue.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c index e5746b67441..eda1fba431a 100644 --- a/net/ipv4/netfilter/ip_queue.c +++ b/net/ipv4/netfilter/ip_queue.c @@ -3,6 +3,7 @@ * communicating with userspace via netlink. * * (C) 2000-2002 James Morris + * (C) 2003-2005 Netfilter Core Team * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -17,6 +18,7 @@ * 2005-01-10: Added /proc counter for dropped packets; fixed so * packets aren't delivered to user space if they're going * to be dropped. + * 2005-05-26: local_bh_{disable,enable} around nf_reinject (Harald Welte) * */ #include @@ -71,7 +73,15 @@ static DECLARE_MUTEX(ipqnl_sem); static void ipq_issue_verdict(struct ipq_queue_entry *entry, int verdict) { + /* TCP input path (and probably other bits) assume to be called + * from softirq context, not from syscall, like ipq_issue_verdict is + * called. TCP input path deadlocks with locks taken from timer + * softirq, e.g. We therefore emulate this by local_bh_disable() */ + + local_bh_disable(); nf_reinject(entry->skb, entry->info, verdict); + local_bh_enable(); + kfree(entry); } -- cgit v1.2.3 From 208d89843b7b03978d8e748b8b991c1be81c4f43 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 30 May 2005 15:50:15 -0700 Subject: [IPV4]: Fix BUG() in 2.6.x, udp_poll(), fragments + CONFIG_HIGHMEM Steven Hand wrote: > > Reconstructed forward trace: > > net/ipv4/udp.c:1334 spin_lock_irq() > net/ipv4/udp.c:1336 udp_checksum_complete() > net/core/skbuff.c:1069 skb_shinfo(skb)->nr_frags > 1 > net/core/skbuff.c:1086 kunmap_skb_frag() > net/core/skbuff.h:1087 local_bh_enable() > kernel/softirq.c:0140 WARN_ON(irqs_disabled()); The receive queue lock is never taken in IRQs (and should never be) so we can simply substitute bh for irq. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/ipv4/udp.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 4a6952e3fee..7c24e64b443 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -738,7 +738,7 @@ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg) unsigned long amount; amount = 0; - spin_lock_irq(&sk->sk_receive_queue.lock); + spin_lock_bh(&sk->sk_receive_queue.lock); skb = skb_peek(&sk->sk_receive_queue); if (skb != NULL) { /* @@ -748,7 +748,7 @@ int udp_ioctl(struct sock *sk, int cmd, unsigned long arg) */ amount = skb->len - sizeof(struct udphdr); } - spin_unlock_irq(&sk->sk_receive_queue.lock); + spin_unlock_bh(&sk->sk_receive_queue.lock); return put_user(amount, (int __user *)arg); } @@ -848,12 +848,12 @@ csum_copy_err: /* Clear queue. */ if (flags&MSG_PEEK) { int clear = 0; - spin_lock_irq(&sk->sk_receive_queue.lock); + spin_lock_bh(&sk->sk_receive_queue.lock); if (skb == skb_peek(&sk->sk_receive_queue)) { __skb_unlink(skb, &sk->sk_receive_queue); clear = 1; } - spin_unlock_irq(&sk->sk_receive_queue.lock); + spin_unlock_bh(&sk->sk_receive_queue.lock); if (clear) kfree_skb(skb); } @@ -1334,7 +1334,7 @@ unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait) struct sk_buff_head *rcvq = &sk->sk_receive_queue; struct sk_buff *skb; - spin_lock_irq(&rcvq->lock); + spin_lock_bh(&rcvq->lock); while ((skb = skb_peek(rcvq)) != NULL) { if (udp_checksum_complete(skb)) { UDP_INC_STATS_BH(UDP_MIB_INERRORS); @@ -1345,7 +1345,7 @@ unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait) break; } } - spin_unlock_irq(&rcvq->lock); + spin_unlock_bh(&rcvq->lock); /* nothing to see, move along */ if (skb == NULL) -- cgit v1.2.3 From 0451eb074eef30240c6c06dacf2911bee26831e1 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 31 May 2005 15:15:58 -0700 Subject: [PKT_SCHED]: Fix dsmark to count ignored indices while walking Unused indices which are ignored while walking must still be counted to avoid dumping the same index twice. Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- net/sched/sch_dsmark.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c index 8a3db9d95ba..acbe9d2b3e1 100644 --- a/net/sched/sch_dsmark.c +++ b/net/sched/sch_dsmark.c @@ -163,14 +163,15 @@ static void dsmark_walk(struct Qdisc *sch,struct qdisc_walker *walker) return; for (i = 0; i < p->indices; i++) { if (p->mask[i] == 0xff && !p->value[i]) - continue; + goto ignore; if (walker->count >= walker->skip) { if (walker->fn(sch, i+1, walker) < 0) { walker->stop = 1; break; } } - walker->count++; +ignore: + walker->count++; } } -- cgit v1.2.3 From 486b53e59ca8cd07d91ad88375c1c884b15cc9bd Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 31 May 2005 15:16:52 -0700 Subject: [PKT_SCHED]: make dsmark try using pfifo instead of noop while grafting Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- net/sched/sch_dsmark.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c index acbe9d2b3e1..a8c948f6297 100644 --- a/net/sched/sch_dsmark.c +++ b/net/sched/sch_dsmark.c @@ -73,8 +73,13 @@ static int dsmark_graft(struct Qdisc *sch,unsigned long arg, DPRINTK("dsmark_graft(sch %p,[qdisc %p],new %p,old %p)\n",sch,p,new, old); - if (!new) - new = &noop_qdisc; + + if (new == NULL) { + new = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops); + if (new == NULL) + new = &noop_qdisc; + } + sch_tree_lock(sch); *old = xchg(&p->q,new); if (*old) -- cgit v1.2.3 From 08e9cd1fc559c00bc05df3fc551efe3b87c57ee3 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 31 May 2005 15:17:28 -0700 Subject: [PKT_SCHED]: Disable dsmark debugging messages by default Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- net/sched/sch_dsmark.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c index a8c948f6297..d8bd2a569c7 100644 --- a/net/sched/sch_dsmark.c +++ b/net/sched/sch_dsmark.c @@ -18,7 +18,7 @@ #include -#if 1 /* control */ +#if 0 /* control */ #define DPRINTK(format,args...) printk(KERN_DEBUG format,##args) #else #define DPRINTK(format,args...) -- cgit v1.2.3 From 36839836e8132731e0cadddce452423036a1d5b3 Mon Sep 17 00:00:00 2001 From: Edgar E Iglesias Date: Tue, 31 May 2005 17:08:05 -0700 Subject: [IPSEC]: Fix esp_decap_data size verification in esp4. Signed-off-by: Edgar E Iglesias Signed-off-by: David S. Miller --- net/ipv4/esp4.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 053a883247b..eae84cc39d3 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -478,7 +478,7 @@ static int __init esp4_init(void) { struct xfrm_decap_state decap; - if (sizeof(struct esp_decap_data) < + if (sizeof(struct esp_decap_data) > sizeof(decap.decap_data)) { extern void decap_data_too_small(void); -- cgit v1.2.3