From b4528762ca92261c6ed3f03e76adeb1dc587aacb Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 11 May 2008 12:18:51 -0700 Subject: SUNRPC: AUTH_SYS "machine creds" shouldn't use negative valued uid/gid Apparently this causes Solaris 10 servers to refuse our NFSv4 SETCLIENTID calls. Fall back to root creds for now, since most servers that care are very likely to have root squashing enabled. Signed-off-by: Trond Myklebust --- net/sunrpc/auth_generic.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/sunrpc/auth_generic.c b/net/sunrpc/auth_generic.c index d927d9f5741..744b79fdcb1 100644 --- a/net/sunrpc/auth_generic.c +++ b/net/sunrpc/auth_generic.c @@ -17,8 +17,8 @@ # define RPCDBG_FACILITY RPCDBG_AUTH #endif -#define RPC_ANONYMOUS_USERID ((uid_t)-2) -#define RPC_ANONYMOUS_GROUPID ((gid_t)-2) +#define RPC_MACHINE_CRED_USERID ((uid_t)0) +#define RPC_MACHINE_CRED_GROUPID ((gid_t)0) struct generic_cred { struct rpc_cred gc_base; @@ -44,8 +44,8 @@ EXPORT_SYMBOL_GPL(rpc_lookup_cred); struct rpc_cred *rpc_lookup_machine_cred(void) { struct auth_cred acred = { - .uid = RPC_ANONYMOUS_USERID, - .gid = RPC_ANONYMOUS_GROUPID, + .uid = RPC_MACHINE_CRED_USERID, + .gid = RPC_MACHINE_CRED_GROUPID, .machine_cred = 1, }; -- cgit v1.2.3 From d71a4dd72e67210ae0767ccae69c79f1c933ff64 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 9 May 2008 12:01:19 -0700 Subject: svcrpc: fix proc/net/rpc/auth.unix.ip/content display Commit f15364bd4cf8799a7677b6daeed7b67d9139d974 ("IPv6 support for NFS server export caches") dropped a couple spaces, rendering the output here difficult to read. (However note that we expect the output to be parsed only by humans, not machines, so this shouldn't have broken any userland software.) Signed-off-by: J. Bruce Fields --- net/sunrpc/svcauth_unix.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c index 3f30ee6006a..f24800f2c09 100644 --- a/net/sunrpc/svcauth_unix.c +++ b/net/sunrpc/svcauth_unix.c @@ -278,7 +278,7 @@ static int ip_map_show(struct seq_file *m, dom = im->m_client->h.name; if (ipv6_addr_v4mapped(&addr)) { - seq_printf(m, "%s" NIPQUAD_FMT "%s\n", + seq_printf(m, "%s " NIPQUAD_FMT " %s\n", im->m_class, ntohl(addr.s6_addr32[3]) >> 24 & 0xff, ntohl(addr.s6_addr32[3]) >> 16 & 0xff, @@ -286,7 +286,7 @@ static int ip_map_show(struct seq_file *m, ntohl(addr.s6_addr32[3]) >> 0 & 0xff, dom); } else { - seq_printf(m, "%s" NIP6_FMT "%s\n", + seq_printf(m, "%s " NIP6_FMT " %s\n", im->m_class, NIP6(addr), dom); } return 0; -- cgit v1.2.3 From fc63a050861a53ba99a6222229cda555796d669e Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Fri, 25 Apr 2008 11:07:10 -0500 Subject: svc: Remove extra check for XPT_DEAD bit in svc_xprt_enqueue Remove a redundant check for the XPT_DEAD bit in the svc_xprt_enqueue function. This same bit is checked below while holding the pool lock and prints a debug message if found to be dead. Signed-off-by: Tom Tucker --- net/sunrpc/svc_xprt.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index d8e8d79a845..cac3f82fca6 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -296,8 +296,6 @@ void svc_xprt_enqueue(struct svc_xprt *xprt) if (!(xprt->xpt_flags & ((1<xpt_flags)) - return; cpu = get_cpu(); pool = svc_pool_for_cpu(xprt->xpt_server, cpu); -- cgit v1.2.3 From aa3314c8d6da673b3454549eed45547a79f7cbe1 Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Thu, 24 Apr 2008 21:30:47 -0500 Subject: svc: Remove unused header files from svc_xprt.c This cosmetic patch removes unused header files that svc_xprt.c inherited from svcsock.c Signed-off-by: Tom Tucker --- net/sunrpc/svc_xprt.c | 21 --------------------- 1 file changed, 21 deletions(-) (limited to 'net') diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index cac3f82fca6..e46c825f495 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -6,30 +6,9 @@ #include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include #include #include #include -#include -#include -#include -#include -#include -#include - -#include -#include -#include #include #include -- cgit v1.2.3 From 0e7f011a19696cc25d68a8d6631fc6c5aa60a54c Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Wed, 23 Apr 2008 16:49:54 -0500 Subject: svcrdma: Simplify receive buffer posting The svcrdma transport provider currently allocates receive buffers to the RQ through the xpo_release_rqst method. This approach is overly complicated since it means that the rqstp rq_xprt_ctxt has to be selectively set based on whether the RPC is going to be processed immediately or deferred. Instead, just post the receive buffer when we are certain that we are replying in the send_reply function. Signed-off-by: Tom Tucker --- net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 17 +---------------- net/sunrpc/xprtrdma/svc_rdma_sendto.c | 10 ++++++++++ net/sunrpc/xprtrdma/svc_rdma_transport.c | 19 ------------------- 3 files changed, 11 insertions(+), 35 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index c22d6b6f2db..f3a108a864a 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -457,8 +457,6 @@ static int rdma_read_complete(struct svc_rqst *rqstp, ret, rqstp->rq_arg.len, rqstp->rq_arg.head[0].iov_base, rqstp->rq_arg.head[0].iov_len); - /* Indicate that we've consumed an RQ credit */ - rqstp->rq_xprt_ctxt = rqstp->rq_xprt; svc_xprt_received(rqstp->rq_xprt); return ret; } @@ -480,13 +478,6 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) dprintk("svcrdma: rqstp=%p\n", rqstp); - /* - * The rq_xprt_ctxt indicates if we've consumed an RQ credit - * or not. It is used in the rdma xpo_release_rqst function to - * determine whether or not to return an RQ WQE to the RQ. - */ - rqstp->rq_xprt_ctxt = NULL; - spin_lock_bh(&rdma_xprt->sc_read_complete_lock); if (!list_empty(&rdma_xprt->sc_read_complete_q)) { ctxt = list_entry(rdma_xprt->sc_read_complete_q.next, @@ -550,9 +541,6 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) return 0; } - /* Indicate we've consumed an RQ credit */ - rqstp->rq_xprt_ctxt = rqstp->rq_xprt; - ret = rqstp->rq_arg.head[0].iov_len + rqstp->rq_arg.page_len + rqstp->rq_arg.tail[0].iov_len; @@ -569,11 +557,8 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) return ret; close_out: - if (ctxt) { + if (ctxt) svc_rdma_put_context(ctxt, 1); - /* Indicate we've consumed an RQ credit */ - rqstp->rq_xprt_ctxt = rqstp->rq_xprt; - } dprintk("svcrdma: transport %p is closing\n", xprt); /* * Set the close bit and enqueue it. svc_recv will see the diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 981f190c1b3..f61d7bd105f 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -389,6 +389,16 @@ static int send_reply(struct svcxprt_rdma *rdma, int page_no; int ret; + /* Post a recv buffer to handle another request. */ + ret = svc_rdma_post_recv(rdma); + if (ret) { + printk(KERN_INFO + "svcrdma: could not post a receive buffer, err=%d." + "Closing transport %p.\n", ret, rdma); + set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags); + return 0; + } + /* Prepare the context */ ctxt->pages[0] = page; ctxt->count = 1; diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index af408fc1263..1e0af2f205e 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -910,27 +910,8 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) return NULL; } -/* - * Post an RQ WQE to the RQ when the rqst is being released. This - * effectively returns an RQ credit to the client. The rq_xprt_ctxt - * will be null if the request is deferred due to an RDMA_READ or the - * transport had no data ready (EAGAIN). Note that an RPC deferred in - * svc_process will still return the credit, this is because the data - * is copied and no longer consume a WQE/WC. - */ static void svc_rdma_release_rqst(struct svc_rqst *rqstp) { - int err; - struct svcxprt_rdma *rdma = - container_of(rqstp->rq_xprt, struct svcxprt_rdma, sc_xprt); - if (rqstp->rq_xprt_ctxt) { - BUG_ON(rqstp->rq_xprt_ctxt != rdma); - err = svc_rdma_post_recv(rdma); - if (err) - dprintk("svcrdma: failed to post an RQ WQE error=%d\n", - err); - } - rqstp->rq_xprt_ctxt = NULL; } /* -- cgit v1.2.3 From dbcd00eba99945acfc433508a58eadc5dcd18cad Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Tue, 6 May 2008 11:33:11 -0500 Subject: svcrdma: Fix race with dto_tasklet in svc_rdma_send The svc_rdma_send function will attempt to reap SQ WR to make room for a new request if it finds the SQ full. This function races with the dto_tasklet that also reaps SQ WR. To avoid polling and arming the CQ unnecessarily move the test_and_clear_bit of the RDMAXPRT_SQ_PENDING flag and arming of the CQ to the sq_cq_reap function. Refactor the rq_cq_reap function to match sq_cq_reap so that the code is easier to follow. Signed-off-by: Tom Tucker --- net/sunrpc/xprtrdma/svc_rdma_transport.c | 40 ++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 1e0af2f205e..73734173f99 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -228,23 +228,8 @@ static void dto_tasklet_func(unsigned long data) list_del_init(&xprt->sc_dto_q); spin_unlock_irqrestore(&dto_lock, flags); - if (test_and_clear_bit(RDMAXPRT_RQ_PENDING, &xprt->sc_flags)) { - ib_req_notify_cq(xprt->sc_rq_cq, IB_CQ_NEXT_COMP); - rq_cq_reap(xprt); - set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags); - /* - * If data arrived before established event, - * don't enqueue. This defers RPC I/O until the - * RDMA connection is complete. - */ - if (!test_bit(RDMAXPRT_CONN_PENDING, &xprt->sc_flags)) - svc_xprt_enqueue(&xprt->sc_xprt); - } - - if (test_and_clear_bit(RDMAXPRT_SQ_PENDING, &xprt->sc_flags)) { - ib_req_notify_cq(xprt->sc_sq_cq, IB_CQ_NEXT_COMP); - sq_cq_reap(xprt); - } + rq_cq_reap(xprt); + sq_cq_reap(xprt); svc_xprt_put(&xprt->sc_xprt); spin_lock_irqsave(&dto_lock, flags); @@ -297,6 +282,10 @@ static void rq_cq_reap(struct svcxprt_rdma *xprt) struct ib_wc wc; struct svc_rdma_op_ctxt *ctxt = NULL; + if (!test_and_clear_bit(RDMAXPRT_RQ_PENDING, &xprt->sc_flags)) + return; + + ib_req_notify_cq(xprt->sc_rq_cq, IB_CQ_NEXT_COMP); atomic_inc(&rdma_stat_rq_poll); spin_lock_bh(&xprt->sc_rq_dto_lock); @@ -316,6 +305,15 @@ static void rq_cq_reap(struct svcxprt_rdma *xprt) if (ctxt) atomic_inc(&rdma_stat_rq_prod); + + set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags); + /* + * If data arrived before established event, + * don't enqueue. This defers RPC I/O until the + * RDMA connection is complete. + */ + if (!test_bit(RDMAXPRT_CONN_PENDING, &xprt->sc_flags)) + svc_xprt_enqueue(&xprt->sc_xprt); } /* @@ -328,6 +326,11 @@ static void sq_cq_reap(struct svcxprt_rdma *xprt) struct ib_cq *cq = xprt->sc_sq_cq; int ret; + + if (!test_and_clear_bit(RDMAXPRT_SQ_PENDING, &xprt->sc_flags)) + return; + + ib_req_notify_cq(xprt->sc_sq_cq, IB_CQ_NEXT_COMP); atomic_inc(&rdma_stat_sq_poll); while ((ret = ib_poll_cq(cq, 1, &wc)) > 0) { ctxt = (struct svc_rdma_op_ctxt *)(unsigned long)wc.wr_id; @@ -1010,7 +1013,8 @@ int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr) if (xprt->sc_sq_depth == atomic_read(&xprt->sc_sq_count)) { spin_unlock_bh(&xprt->sc_lock); atomic_inc(&rdma_stat_sq_starve); - /* See if we can reap some SQ WR */ + + /* See if we can opportunistically reap SQ WR to make room */ sq_cq_reap(xprt); /* Wait until SQ WR available if SQ still full */ -- cgit v1.2.3 From 9d6347acd2134373c3a4c65a4d43e4f1d59aa012 Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Fri, 25 Apr 2008 15:51:27 -0500 Subject: svcrdma: Fix return value in svc_rdma_send Fix the return value on close to -ENOTCONN so caller knows to free context. Also if a thread is waiting for free SQ space, check for close when waking to avoid posting WR to a closing transport. Signed-off-by: Tom Tucker --- net/sunrpc/xprtrdma/svc_rdma_transport.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 73734173f99..17f036b23a9 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -1002,7 +1002,7 @@ int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr) int ret; if (test_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags)) - return 0; + return -ENOTCONN; BUG_ON(wr->send_flags != IB_SEND_SIGNALED); BUG_ON(((struct svc_rdma_op_ctxt *)(unsigned long)wr->wr_id)->wr_op != -- cgit v1.2.3 From 120693d12cde0cc735d784c951b53381efec918f Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Thu, 24 Apr 2008 14:17:21 -0500 Subject: svcrdma: Add put of connection ESTABLISHED reference in rdma_cma_handler The svcrdma transport takes a reference when it gets the ESTABLISHED event from the provider. This reference is supposed to be removed when the DISCONNECT event is received, however, the call to svc_xprt_put was missing in the switch statement. This results in the memory associated with the transport never being freed. Signed-off-by: Tom Tucker --- net/sunrpc/xprtrdma/svc_rdma_transport.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 17f036b23a9..4bf8b5ad167 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -630,6 +630,7 @@ static int rdma_cma_handler(struct rdma_cm_id *cma_id, if (xprt) { set_bit(XPT_CLOSE, &xprt->xpt_flags); svc_xprt_enqueue(xprt); + svc_xprt_put(xprt); } break; case RDMA_CM_EVENT_DEVICE_REMOVAL: -- cgit v1.2.3 From 05a0826a6e6d95ab6e9c3e4a10b58e10f233cc2b Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Fri, 25 Apr 2008 14:11:31 -0500 Subject: svcrdma: Free context on ib_post_recv error If there is an error posting the recv WR to the RQ, free the context associated with the WR. This would leak a context when asynchronous errors occurred on the transport while conccurent threads were processing their RPC. Signed-off-by: Tom Tucker --- net/sunrpc/xprtrdma/svc_rdma_transport.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 4bf8b5ad167..e85ac77f495 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -524,6 +524,8 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt) recv_wr.wr_id = (u64)(unsigned long)ctxt; ret = ib_post_recv(xprt->sc_qp, &recv_wr, &bad_recv_wr); + if (ret) + svc_rdma_put_context(ctxt, 1); return ret; } -- cgit v1.2.3 From 5ac461a6f05499fa233ea43b1de80b679d1eec21 Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Fri, 25 Apr 2008 18:08:59 -0500 Subject: svcrdma: Free context on post_recv error in send_reply If an error is encountered trying to post a recv buffer in send_reply, free the passed in context. Return an error to the caller so it is aware that the request was not posted. Signed-off-by: Tom Tucker --- net/sunrpc/xprtrdma/svc_rdma_sendto.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index f61d7bd105f..fb82b1b683f 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -396,7 +396,8 @@ static int send_reply(struct svcxprt_rdma *rdma, "svcrdma: could not post a receive buffer, err=%d." "Closing transport %p.\n", ret, rdma); set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags); - return 0; + svc_rdma_put_context(ctxt, 0); + return -ENOTCONN; } /* Prepare the context */ -- cgit v1.2.3 From 58e8f62137f1c55fe3d31234167660f2ce509297 Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Tue, 6 May 2008 09:45:54 -0500 Subject: svcrdma: Fix error handling during listening endpoint creation A listening endpoint isn't known to the generic transport switch until the svc_create_xprt function returns without error. Calling svc_xprt_put within the xpo_create function causes the module reference count to be erroneously decremented. Signed-off-by: Tom Tucker --- net/sunrpc/xprtrdma/svc_rdma_transport.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index e85ac77f495..d9ed5f24c36 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -667,31 +667,27 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv, cma_xprt = rdma_create_xprt(serv, 1); if (!cma_xprt) - return ERR_PTR(ENOMEM); + return ERR_PTR(-ENOMEM); xprt = &cma_xprt->sc_xprt; listen_id = rdma_create_id(rdma_listen_handler, cma_xprt, RDMA_PS_TCP); if (IS_ERR(listen_id)) { - svc_xprt_put(&cma_xprt->sc_xprt); - dprintk("svcrdma: rdma_create_id failed = %ld\n", - PTR_ERR(listen_id)); - return (void *)listen_id; + ret = PTR_ERR(listen_id); + dprintk("svcrdma: rdma_create_id failed = %d\n", ret); + goto err0; } + ret = rdma_bind_addr(listen_id, sa); if (ret) { - rdma_destroy_id(listen_id); - svc_xprt_put(&cma_xprt->sc_xprt); dprintk("svcrdma: rdma_bind_addr failed = %d\n", ret); - return ERR_PTR(ret); + goto err1; } cma_xprt->sc_cm_id = listen_id; ret = rdma_listen(listen_id, RPCRDMA_LISTEN_BACKLOG); if (ret) { - rdma_destroy_id(listen_id); - svc_xprt_put(&cma_xprt->sc_xprt); dprintk("svcrdma: rdma_listen failed = %d\n", ret); - return ERR_PTR(ret); + goto err1; } /* @@ -702,6 +698,12 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv, svc_xprt_set_local(&cma_xprt->sc_xprt, sa, salen); return &cma_xprt->sc_xprt; + + err1: + rdma_destroy_id(listen_id); + err0: + kfree(cma_xprt); + return ERR_PTR(ret); } /* -- cgit v1.2.3 From d16d40093a95f2b31007d7a7abefc50e6b27e236 Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Tue, 6 May 2008 10:04:50 -0500 Subject: svcrdma: Return error from rdma_read_xdr so caller knows to free context The rdma_read_xdr function did not discriminate between no read-list and an error posting the read-list. This results in a leak of a page if there is an error posting the read-list. Signed-off-by: Tom Tucker --- net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index f3a108a864a..5e03d95b25e 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -260,11 +260,16 @@ static int rdma_read_max_sge(struct svcxprt_rdma *xprt, int sge_count) * On our side, we need to read into a pagelist. The first page immediately * follows the RPC header. * - * This function returns 1 to indicate success. The data is not yet in + * This function returns: + * 0 - No error and no read-list found. + * + * 1 - Successful read-list processing. The data is not yet in * the pagelist and therefore the RPC request must be deferred. The * I/O completion will enqueue the transport again and * svc_rdma_recvfrom will complete the request. * + * <0 - Error processing/posting read-list. + * * NOTE: The ctxt must not be touched after the last WR has been posted * because the I/O completion processing may occur on another * processor and free / modify the context. Ne touche pas! @@ -398,7 +403,7 @@ next_sge: svc_rdma_put_context(head, 1); head = ctxt; } - return 0; + return err; } return 1; @@ -532,14 +537,18 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) goto close_out; } - /* Read read-list data. If we would need to wait, defer - * it. Not that in this case, we don't return the RQ credit - * until after the read completes. - */ - if (rdma_read_xdr(rdma_xprt, rmsgp, rqstp, ctxt)) { + /* Read read-list data. */ + ret = rdma_read_xdr(rdma_xprt, rmsgp, rqstp, ctxt); + if (ret > 0) { + /* read-list posted, defer until data received from client. */ svc_xprt_received(xprt); return 0; } + if (ret < 0) { + /* Post of read-list failed, free context. */ + svc_rdma_put_context(ctxt, 1); + return 0; + } ret = rqstp->rq_arg.head[0].iov_len + rqstp->rq_arg.page_len -- cgit v1.2.3 From 10a38c33f46d128d11e299acba744bc325cde420 Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Wed, 30 Apr 2008 17:32:17 -0500 Subject: svcrdma: Remove unused READ_DONE context flags bit The RDMACTXT_F_READ_DONE bit is not longer used. Remove it. Signed-off-by: Tom Tucker --- net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 1 - net/sunrpc/xprtrdma/svc_rdma_transport.c | 1 - 2 files changed, 2 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 5e03d95b25e..80c6ee82c34 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -325,7 +325,6 @@ next_sge: } ctxt->next = NULL; ctxt->direction = DMA_FROM_DEVICE; - clear_bit(RDMACTXT_F_READ_DONE, &ctxt->flags); clear_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags); /* Prepare READ WR */ diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index d9ed5f24c36..4a79dfda146 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -353,7 +353,6 @@ static void sq_cq_reap(struct svcxprt_rdma *xprt) case IB_WR_RDMA_READ: if (test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags)) { set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags); - set_bit(RDMACTXT_F_READ_DONE, &ctxt->flags); spin_lock_bh(&xprt->sc_read_complete_lock); list_add_tail(&ctxt->dto_q, &xprt->sc_read_complete_q); -- cgit v1.2.3 From 02e7452de74d308ca642f54f7e5ef801ced60a92 Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Wed, 30 Apr 2008 19:50:56 -0500 Subject: svcrdma: Simplify RDMA_READ deferral buffer management An NFS_WRITE requires a set of RDMA_READ requests to fetch the write data from the client. There are two principal pieces of data that need to be tracked: the list of pages that comprise the completed RPC and the SGE of dma mapped pages to refer to this list of pages. Previously this whole bit was managed as a linked list of contexts with the context containing the page list buried in this list. This patch simplifies this processing by not keeping a linked list, but rather only a pionter from the last submitted RDMA_READ's context to the context that maps the set of pages that describe the RPC. This significantly simplifies this code path. SGE contexts are cleaned up inline in the DTO path instead of at read completion time. Signed-off-by: Tom Tucker --- net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 58 ++++++-------------------------- net/sunrpc/xprtrdma/svc_rdma_transport.c | 5 ++- 2 files changed, 15 insertions(+), 48 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 80c6ee82c34..21a1e625ef0 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -289,7 +289,6 @@ static int rdma_read_xdr(struct svcxprt_rdma *xprt, u64 sgl_offset; struct rpcrdma_read_chunk *ch; struct svc_rdma_op_ctxt *ctxt = NULL; - struct svc_rdma_op_ctxt *head; struct svc_rdma_op_ctxt *tmp_sge_ctxt; struct svc_rdma_op_ctxt *tmp_ch_ctxt; struct chunk_sge *ch_sge_ary; @@ -310,20 +309,13 @@ static int rdma_read_xdr(struct svcxprt_rdma *xprt, sge_count = rdma_rcl_to_sge(xprt, rqstp, hdr_ctxt, rmsgp, sge, ch_sge_ary, ch_count, byte_count); - head = svc_rdma_get_context(xprt); sgl_offset = 0; ch_no = 0; for (ch = (struct rpcrdma_read_chunk *)&rmsgp->rm_body.rm_chunks[0]; ch->rc_discrim != 0; ch++, ch_no++) { next_sge: - if (!ctxt) - ctxt = head; - else { - ctxt->next = svc_rdma_get_context(xprt); - ctxt = ctxt->next; - } - ctxt->next = NULL; + ctxt = svc_rdma_get_context(xprt); ctxt->direction = DMA_FROM_DEVICE; clear_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags); @@ -351,20 +343,15 @@ next_sge: * the client and the RPC needs to be enqueued. */ set_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags); - ctxt->next = hdr_ctxt; - hdr_ctxt->next = head; + ctxt->read_hdr = hdr_ctxt; } /* Post the read */ err = svc_rdma_send(xprt, &read_wr); if (err) { - printk(KERN_ERR "svcrdma: Error posting send = %d\n", + printk(KERN_ERR "svcrdma: Error %d posting RDMA_READ\n", err); - /* - * Break the circular list so free knows when - * to stop if the error happened to occur on - * the last read - */ - ctxt->next = NULL; + set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); + svc_rdma_put_context(ctxt, 0); goto out; } atomic_inc(&rdma_stat_read); @@ -375,7 +362,7 @@ next_sge: goto next_sge; } sgl_offset = 0; - err = 0; + err = 1; } out: @@ -393,25 +380,12 @@ next_sge: while (rqstp->rq_resused) rqstp->rq_respages[--rqstp->rq_resused] = NULL; - if (err) { - printk(KERN_ERR "svcrdma : RDMA_READ error = %d\n", err); - set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); - /* Free the linked list of read contexts */ - while (head != NULL) { - ctxt = head->next; - svc_rdma_put_context(head, 1); - head = ctxt; - } - return err; - } - - return 1; + return err; } static int rdma_read_complete(struct svc_rqst *rqstp, - struct svc_rdma_op_ctxt *data) + struct svc_rdma_op_ctxt *head) { - struct svc_rdma_op_ctxt *head = data->next; int page_no; int ret; @@ -437,22 +411,12 @@ static int rdma_read_complete(struct svc_rqst *rqstp, rqstp->rq_arg.len = head->arg.len; rqstp->rq_arg.buflen = head->arg.buflen; + /* Free the context */ + svc_rdma_put_context(head, 0); + /* XXX: What should this be? */ rqstp->rq_prot = IPPROTO_MAX; - /* - * Free the contexts we used to build the RDMA_READ. We have - * to be careful here because the context list uses the same - * next pointer used to chain the contexts associated with the - * RDMA_READ - */ - data->next = NULL; /* terminate circular list */ - do { - data = head->next; - svc_rdma_put_context(head, 0); - head = data; - } while (head != NULL); - ret = rqstp->rq_arg.head[0].iov_len + rqstp->rq_arg.page_len + rqstp->rq_arg.tail[0].iov_len; diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 4a79dfda146..34141eaf25a 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -352,13 +352,16 @@ static void sq_cq_reap(struct svcxprt_rdma *xprt) case IB_WR_RDMA_READ: if (test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags)) { + struct svc_rdma_op_ctxt *read_hdr = ctxt->read_hdr; + BUG_ON(!read_hdr); set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags); spin_lock_bh(&xprt->sc_read_complete_lock); - list_add_tail(&ctxt->dto_q, + list_add_tail(&read_hdr->dto_q, &xprt->sc_read_complete_q); spin_unlock_bh(&xprt->sc_read_complete_lock); svc_xprt_enqueue(&xprt->sc_xprt); } + svc_rdma_put_context(ctxt, 0); break; default: -- cgit v1.2.3 From 8740767376b32a7772607e1b2b07cde0c24120cc Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Wed, 30 Apr 2008 20:44:39 -0500 Subject: svcrdma: Use standard Linux lists for context cache Replace the one-off linked list implementation used to implement the context cache with the standard Linux list_head lists. Add a context counter to catch resource leaks. A WARN_ON will be added later to ensure that we've freed all contexts. Signed-off-by: Tom Tucker --- net/sunrpc/xprtrdma/svc_rdma_transport.c | 47 ++++++++++++++++++-------------- 1 file changed, 26 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 34141eaf25a..817cf4de746 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -103,8 +103,8 @@ static int rdma_bump_context_cache(struct svcxprt_rdma *xprt) spin_lock_bh(&xprt->sc_ctxt_lock); if (ctxt) { at_least_one = 1; - ctxt->next = xprt->sc_ctxt_head; - xprt->sc_ctxt_head = ctxt; + INIT_LIST_HEAD(&ctxt->free_list); + list_add(&ctxt->free_list, &xprt->sc_ctxt_free); } else { /* kmalloc failed...give up for now */ xprt->sc_ctxt_cnt--; @@ -123,7 +123,7 @@ struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt) while (1) { spin_lock_bh(&xprt->sc_ctxt_lock); - if (unlikely(xprt->sc_ctxt_head == NULL)) { + if (unlikely(list_empty(&xprt->sc_ctxt_free))) { /* Try to bump my cache. */ spin_unlock_bh(&xprt->sc_ctxt_lock); @@ -136,12 +136,15 @@ struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt) schedule_timeout_uninterruptible(msecs_to_jiffies(500)); continue; } - ctxt = xprt->sc_ctxt_head; - xprt->sc_ctxt_head = ctxt->next; + ctxt = list_entry(xprt->sc_ctxt_free.next, + struct svc_rdma_op_ctxt, + free_list); + list_del_init(&ctxt->free_list); spin_unlock_bh(&xprt->sc_ctxt_lock); ctxt->xprt = xprt; INIT_LIST_HEAD(&ctxt->dto_q); ctxt->count = 0; + atomic_inc(&xprt->sc_ctxt_used); break; } return ctxt; @@ -163,10 +166,11 @@ void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages) ctxt->sge[i].addr, ctxt->sge[i].length, ctxt->direction); + spin_lock_bh(&xprt->sc_ctxt_lock); - ctxt->next = xprt->sc_ctxt_head; - xprt->sc_ctxt_head = ctxt; + list_add(&ctxt->free_list, &xprt->sc_ctxt_free); spin_unlock_bh(&xprt->sc_ctxt_lock); + atomic_dec(&xprt->sc_ctxt_used); } /* ib_cq event handler */ @@ -412,28 +416,29 @@ static void create_context_cache(struct svcxprt_rdma *xprt, xprt->sc_ctxt_max = ctxt_max; xprt->sc_ctxt_bump = ctxt_bump; xprt->sc_ctxt_cnt = 0; - xprt->sc_ctxt_head = NULL; + atomic_set(&xprt->sc_ctxt_used, 0); + + INIT_LIST_HEAD(&xprt->sc_ctxt_free); for (i = 0; i < ctxt_count; i++) { ctxt = kmalloc(sizeof(*ctxt), GFP_KERNEL); if (ctxt) { - ctxt->next = xprt->sc_ctxt_head; - xprt->sc_ctxt_head = ctxt; + INIT_LIST_HEAD(&ctxt->free_list); + list_add(&ctxt->free_list, &xprt->sc_ctxt_free); xprt->sc_ctxt_cnt++; } } } -static void destroy_context_cache(struct svc_rdma_op_ctxt *ctxt) +static void destroy_context_cache(struct svcxprt_rdma *xprt) { - struct svc_rdma_op_ctxt *next; - if (!ctxt) - return; - - do { - next = ctxt->next; + while (!list_empty(&xprt->sc_ctxt_free)) { + struct svc_rdma_op_ctxt *ctxt; + ctxt = list_entry(xprt->sc_ctxt_free.next, + struct svc_rdma_op_ctxt, + free_list); + list_del_init(&ctxt->free_list); kfree(ctxt); - ctxt = next; - } while (next); + } } static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv, @@ -470,7 +475,7 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv, reqs + cma_xprt->sc_sq_depth + RPCRDMA_MAX_THREADS + 1); /* max */ - if (!cma_xprt->sc_ctxt_head) { + if (list_empty(&cma_xprt->sc_ctxt_free)) { kfree(cma_xprt); return NULL; } @@ -976,7 +981,7 @@ static void svc_rdma_free(struct svc_xprt *xprt) if (rdma->sc_pd && !IS_ERR(rdma->sc_pd)) ib_dealloc_pd(rdma->sc_pd); - destroy_context_cache(rdma->sc_ctxt_head); + destroy_context_cache(rdma); kfree(rdma); } -- cgit v1.2.3 From 47698e083e40bbd3ef87f5561390ae33abb13cd0 Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Tue, 6 May 2008 11:49:05 -0500 Subject: svcrdma: Shrink scope of spinlock on RQ CQ The rq_cq_reap function is only called from the dto_tasklet. The only resource shared with other threads is the sc_rq_dto_q. Move the spin lock to protect only this list. Signed-off-by: Tom Tucker --- net/sunrpc/xprtrdma/svc_rdma_transport.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 817cf4de746..78303f0fad9 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -292,7 +292,6 @@ static void rq_cq_reap(struct svcxprt_rdma *xprt) ib_req_notify_cq(xprt->sc_rq_cq, IB_CQ_NEXT_COMP); atomic_inc(&rdma_stat_rq_poll); - spin_lock_bh(&xprt->sc_rq_dto_lock); while ((ret = ib_poll_cq(xprt->sc_rq_cq, 1, &wc)) > 0) { ctxt = (struct svc_rdma_op_ctxt *)(unsigned long)wc.wr_id; ctxt->wc_status = wc.status; @@ -303,9 +302,10 @@ static void rq_cq_reap(struct svcxprt_rdma *xprt) svc_rdma_put_context(ctxt, 1); continue; } + spin_lock_bh(&xprt->sc_rq_dto_lock); list_add_tail(&ctxt->dto_q, &xprt->sc_rq_dto_q); + spin_unlock_bh(&xprt->sc_rq_dto_lock); } - spin_unlock_bh(&xprt->sc_rq_dto_lock); if (ctxt) atomic_inc(&rdma_stat_rq_prod); -- cgit v1.2.3 From 8da91ea8de873ee8be82377ff18637d05e882058 Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Wed, 30 Apr 2008 22:00:46 -0500 Subject: svcrdma: Move destroy to kernel thread Some providers may wait while destroying adapter resources. Since it is possible that the last reference is put on the dto_tasklet, the actual destroy must be scheduled as a work item. Signed-off-by: Tom Tucker --- net/sunrpc/xprtrdma/svc_rdma_transport.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 78303f0fad9..028c6cf8936 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -963,12 +963,15 @@ static void svc_rdma_detach(struct svc_xprt *xprt) rdma_destroy_id(rdma->sc_cm_id); } -static void svc_rdma_free(struct svc_xprt *xprt) +static void __svc_rdma_free(struct work_struct *work) { - struct svcxprt_rdma *rdma = (struct svcxprt_rdma *)xprt; + struct svcxprt_rdma *rdma = + container_of(work, struct svcxprt_rdma, sc_work); dprintk("svcrdma: svc_rdma_free(%p)\n", rdma); + /* We should only be called from kref_put */ - BUG_ON(atomic_read(&xprt->xpt_ref.refcount) != 0); + BUG_ON(atomic_read(&rdma->sc_xprt.xpt_ref.refcount) != 0); + if (rdma->sc_sq_cq && !IS_ERR(rdma->sc_sq_cq)) ib_destroy_cq(rdma->sc_sq_cq); @@ -985,6 +988,14 @@ static void svc_rdma_free(struct svc_xprt *xprt) kfree(rdma); } +static void svc_rdma_free(struct svc_xprt *xprt) +{ + struct svcxprt_rdma *rdma = + container_of(xprt, struct svcxprt_rdma, sc_xprt); + INIT_WORK(&rdma->sc_work, __svc_rdma_free); + schedule_work(&rdma->sc_work); +} + static int svc_rdma_has_wspace(struct svc_xprt *xprt) { struct svcxprt_rdma *rdma = -- cgit v1.2.3 From 0905c0f0a2346516ecd12f0a4f33dca571b0dccd Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Thu, 1 May 2008 10:49:03 -0500 Subject: svcrdma: Add reference for each SQ/RQ WR Add a reference on the transport for every outstanding WR. Signed-off-by: Tom Tucker --- net/sunrpc/xprtrdma/svc_rdma_transport.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 028c6cf8936..31b1927b5ee 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -279,6 +279,8 @@ static void rq_comp_handler(struct ib_cq *cq, void *cq_context) * * Take all completing WC off the CQE and enqueue the associated DTO * context on the dto_q for the transport. + * + * Note that caller must hold a transport reference. */ static void rq_cq_reap(struct svcxprt_rdma *xprt) { @@ -298,13 +300,16 @@ static void rq_cq_reap(struct svcxprt_rdma *xprt) ctxt->byte_len = wc.byte_len; if (wc.status != IB_WC_SUCCESS) { /* Close the transport */ + dprintk("svcrdma: transport closing putting ctxt %p\n", ctxt); set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags); svc_rdma_put_context(ctxt, 1); + svc_xprt_put(&xprt->sc_xprt); continue; } spin_lock_bh(&xprt->sc_rq_dto_lock); list_add_tail(&ctxt->dto_q, &xprt->sc_rq_dto_q); spin_unlock_bh(&xprt->sc_rq_dto_lock); + svc_xprt_put(&xprt->sc_xprt); } if (ctxt) @@ -322,6 +327,8 @@ static void rq_cq_reap(struct svcxprt_rdma *xprt) /* * Send Queue Completion Handler - potentially called on interrupt context. + * + * Note that caller must hold a transport reference. */ static void sq_cq_reap(struct svcxprt_rdma *xprt) { @@ -374,6 +381,7 @@ static void sq_cq_reap(struct svcxprt_rdma *xprt) wc.opcode, wc.status); break; } + svc_xprt_put(&xprt->sc_xprt); } if (ctxt) @@ -530,9 +538,12 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt) recv_wr.num_sge = ctxt->count; recv_wr.wr_id = (u64)(unsigned long)ctxt; + svc_xprt_get(&xprt->sc_xprt); ret = ib_post_recv(xprt->sc_qp, &recv_wr, &bad_recv_wr); - if (ret) + if (ret) { + svc_xprt_put(&xprt->sc_xprt); svc_rdma_put_context(ctxt, 1); + } return ret; } @@ -1049,14 +1060,17 @@ int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr) continue; } /* Bumped used SQ WR count and post */ + svc_xprt_get(&xprt->sc_xprt); ret = ib_post_send(xprt->sc_qp, wr, &bad_wr); if (!ret) atomic_inc(&xprt->sc_sq_count); - else + else { + svc_xprt_put(&xprt->sc_xprt); dprintk("svcrdma: failed to post SQ WR rc=%d, " "sc_sq_count=%d, sc_sq_depth=%d\n", ret, atomic_read(&xprt->sc_sq_count), xprt->sc_sq_depth); + } spin_unlock_bh(&xprt->sc_lock); break; } -- cgit v1.2.3 From 1711386c62c97f7fb086a2247d44cdb1f8867640 Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Thu, 1 May 2008 11:13:50 -0500 Subject: svcrdma: Move the QP and cm_id destruction to svc_rdma_free Move the destruction of the QP and CM_ID to the free path so that the QP cleanup code doesn't race with the dto_tasklet handling flushed WR. The QP reference is not needed because we now have a reference for every WR. Also add a guard in the SQ and RQ completion handlers to ignore calls generated by some providers when the QP is destroyed. Signed-off-by: Tom Tucker --- net/sunrpc/xprtrdma/svc_rdma_transport.c | 40 ++++++++++++++++---------------- 1 file changed, 20 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 31b1927b5ee..b412a49c46f 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -252,11 +252,15 @@ static void rq_comp_handler(struct ib_cq *cq, void *cq_context) struct svcxprt_rdma *xprt = cq_context; unsigned long flags; + /* Guard against unconditional flush call for destroyed QP */ + if (atomic_read(&xprt->sc_xprt.xpt_ref.refcount)==0) + return; + /* * Set the bit regardless of whether or not it's on the list * because it may be on the list already due to an SQ * completion. - */ + */ set_bit(RDMAXPRT_RQ_PENDING, &xprt->sc_flags); /* @@ -393,11 +397,15 @@ static void sq_comp_handler(struct ib_cq *cq, void *cq_context) struct svcxprt_rdma *xprt = cq_context; unsigned long flags; + /* Guard against unconditional flush call for destroyed QP */ + if (atomic_read(&xprt->sc_xprt.xpt_ref.refcount)==0) + return; + /* * Set the bit regardless of whether or not it's on the list * because it may be on the list already due to an RQ * completion. - */ + */ set_bit(RDMAXPRT_SQ_PENDING, &xprt->sc_flags); /* @@ -852,7 +860,6 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) newxprt->sc_sq_depth = qp_attr.cap.max_send_wr; newxprt->sc_max_requests = qp_attr.cap.max_recv_wr; } - svc_xprt_get(&newxprt->sc_xprt); newxprt->sc_qp = newxprt->sc_cm_id->qp; /* Register all of physical memory */ @@ -926,10 +933,8 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) dprintk("svcrdma: failure accepting new connection rc=%d.\n", ret); /* Take a reference in case the DTO handler runs */ svc_xprt_get(&newxprt->sc_xprt); - if (newxprt->sc_qp && !IS_ERR(newxprt->sc_qp)) { + if (newxprt->sc_qp && !IS_ERR(newxprt->sc_qp)) ib_destroy_qp(newxprt->sc_qp); - svc_xprt_put(&newxprt->sc_xprt); - } rdma_destroy_id(newxprt->sc_cm_id); /* This call to put will destroy the transport */ svc_xprt_put(&newxprt->sc_xprt); @@ -941,10 +946,7 @@ static void svc_rdma_release_rqst(struct svc_rqst *rqstp) } /* - * When connected, an svc_xprt has at least three references: - * - * - A reference held by the QP. We still hold that here because this - * code deletes the QP and puts the reference. + * When connected, an svc_xprt has at least two references: * * - A reference held by the cm_id between the ESTABLISHED and * DISCONNECTED events. If the remote peer disconnected first, this @@ -953,7 +955,7 @@ static void svc_rdma_release_rqst(struct svc_rqst *rqstp) * - A reference held by the svc_recv code that called this function * as part of close processing. * - * At a minimum two references should still be held. + * At a minimum one references should still be held. */ static void svc_rdma_detach(struct svc_xprt *xprt) { @@ -963,15 +965,6 @@ static void svc_rdma_detach(struct svc_xprt *xprt) /* Disconnect and flush posted WQE */ rdma_disconnect(rdma->sc_cm_id); - - /* Destroy the QP if present (not a listener) */ - if (rdma->sc_qp && !IS_ERR(rdma->sc_qp)) { - ib_destroy_qp(rdma->sc_qp); - svc_xprt_put(xprt); - } - - /* Destroy the CM ID */ - rdma_destroy_id(rdma->sc_cm_id); } static void __svc_rdma_free(struct work_struct *work) @@ -983,6 +976,13 @@ static void __svc_rdma_free(struct work_struct *work) /* We should only be called from kref_put */ BUG_ON(atomic_read(&rdma->sc_xprt.xpt_ref.refcount) != 0); + /* Destroy the QP if present (not a listener) */ + if (rdma->sc_qp && !IS_ERR(rdma->sc_qp)) + ib_destroy_qp(rdma->sc_qp); + + /* Destroy the CM ID */ + rdma_destroy_id(rdma->sc_cm_id); + if (rdma->sc_sq_cq && !IS_ERR(rdma->sc_sq_cq)) ib_destroy_cq(rdma->sc_sq_cq); -- cgit v1.2.3 From 356d0a1519867422c3f17f79e2183f8c2d44f8ee Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Thu, 1 May 2008 11:25:02 -0500 Subject: svcrdma: Cleanup queued, but unprocessed I/O in svc_rdma_free When the transport is closing, the DTO tasklet may queue data that never gets processed. Clean up resources associated with this I/O. Signed-off-by: Tom Tucker --- net/sunrpc/xprtrdma/svc_rdma_transport.c | 38 +++++++++++++++++++++++++++++--- 1 file changed, 35 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index b412a49c46f..b1ff08d7da6 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -976,13 +976,42 @@ static void __svc_rdma_free(struct work_struct *work) /* We should only be called from kref_put */ BUG_ON(atomic_read(&rdma->sc_xprt.xpt_ref.refcount) != 0); + /* + * Destroy queued, but not processed read completions. Note + * that this cleanup has to be done before destroying the + * cm_id because the device ptr is needed to unmap the dma in + * svc_rdma_put_context. + */ + spin_lock_bh(&rdma->sc_read_complete_lock); + while (!list_empty(&rdma->sc_read_complete_q)) { + struct svc_rdma_op_ctxt *ctxt; + ctxt = list_entry(rdma->sc_read_complete_q.next, + struct svc_rdma_op_ctxt, + dto_q); + list_del_init(&ctxt->dto_q); + svc_rdma_put_context(ctxt, 1); + } + spin_unlock_bh(&rdma->sc_read_complete_lock); + + /* Destroy queued, but not processed recv completions */ + spin_lock_bh(&rdma->sc_rq_dto_lock); + while (!list_empty(&rdma->sc_rq_dto_q)) { + struct svc_rdma_op_ctxt *ctxt; + ctxt = list_entry(rdma->sc_rq_dto_q.next, + struct svc_rdma_op_ctxt, + dto_q); + list_del_init(&ctxt->dto_q); + svc_rdma_put_context(ctxt, 1); + } + spin_unlock_bh(&rdma->sc_rq_dto_lock); + + /* Warn if we leaked a resource or under-referenced */ + WARN_ON(atomic_read(&rdma->sc_ctxt_used) != 0); + /* Destroy the QP if present (not a listener) */ if (rdma->sc_qp && !IS_ERR(rdma->sc_qp)) ib_destroy_qp(rdma->sc_qp); - /* Destroy the CM ID */ - rdma_destroy_id(rdma->sc_cm_id); - if (rdma->sc_sq_cq && !IS_ERR(rdma->sc_sq_cq)) ib_destroy_cq(rdma->sc_sq_cq); @@ -995,6 +1024,9 @@ static void __svc_rdma_free(struct work_struct *work) if (rdma->sc_pd && !IS_ERR(rdma->sc_pd)) ib_dealloc_pd(rdma->sc_pd); + /* Destroy the CM ID */ + rdma_destroy_id(rdma->sc_cm_id); + destroy_context_cache(rdma); kfree(rdma); } -- cgit v1.2.3 From 97a3df382e01c49555ea844bd7c4e5a08f245b9d Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Thu, 1 May 2008 14:02:45 -0500 Subject: svcrdma: Use ib verbs version of dma_unmap Use the ib_verbs version of the dma_unmap service in the svc_rdma_put_context function. This should support providers using software rdma. Signed-off-by: Tom Tucker --- net/sunrpc/xprtrdma/svc_rdma_transport.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index b1ff08d7da6..0b72c4c7d7c 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -162,10 +162,10 @@ void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages) put_page(ctxt->pages[i]); for (i = 0; i < ctxt->count; i++) - dma_unmap_single(xprt->sc_cm_id->device->dma_device, - ctxt->sge[i].addr, - ctxt->sge[i].length, - ctxt->direction); + ib_dma_unmap_single(xprt->sc_cm_id->device, + ctxt->sge[i].addr, + ctxt->sge[i].length, + ctxt->direction); spin_lock_bh(&xprt->sc_ctxt_lock); list_add(&ctxt->free_list, &xprt->sc_ctxt_free); -- cgit v1.2.3 From 69500c43b45f7155b72dcadad31cd55cda789c93 Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Wed, 7 May 2008 13:49:58 -0500 Subject: svcrdma: Set rqstp transport address in rdma_read_complete function The rdma_read_complete function needs to copy the rqstp transport address from the transport. Failure to do so can result in using the wrong authentication method for the RPC or bug checking if the rqstp address is not valid. Signed-off-by: Tom Tucker --- net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 21a1e625ef0..c016f5ca0ce 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -416,6 +416,7 @@ static int rdma_read_complete(struct svc_rqst *rqstp, /* XXX: What should this be? */ rqstp->rq_prot = IPPROTO_MAX; + svc_xprt_copy_addrs(rqstp, rqstp->rq_xprt); ret = rqstp->rq_arg.head[0].iov_len + rqstp->rq_arg.page_len -- cgit v1.2.3 From af261af4db14230fb35bcdc0ba9ef78ed6cf7bc1 Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Wed, 7 May 2008 13:52:42 -0500 Subject: svcrdma: Copy transport address and arm CQ before calling rdma_accept This race was found by inspection. Messages can be received from the peer immediately following the rdma_accept call, however, the CQ have not yet been armed and the transport address has not yet been set. Set the transport address in the connect request handler and arm the CQ prior to calling rdma_accept. Signed-off-by: Tom Tucker --- net/sunrpc/xprtrdma/svc_rdma_transport.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 0b72c4c7d7c..c7545203f4b 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -570,6 +570,7 @@ static void handle_connect_req(struct rdma_cm_id *new_cma_id) { struct svcxprt_rdma *listen_xprt = new_cma_id->context; struct svcxprt_rdma *newxprt; + struct sockaddr *sa; /* Create a new transport */ newxprt = rdma_create_xprt(listen_xprt->sc_xprt.xpt_server, 0); @@ -582,6 +583,12 @@ static void handle_connect_req(struct rdma_cm_id *new_cma_id) dprintk("svcrdma: Creating newxprt=%p, cm_id=%p, listenxprt=%p\n", newxprt, newxprt->sc_cm_id, listen_xprt); + /* Set the local and remote addresses in the transport */ + sa = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.dst_addr; + svc_xprt_set_remote(&newxprt->sc_xprt, sa, svc_addr_len(sa)); + sa = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.src_addr; + svc_xprt_set_local(&newxprt->sc_xprt, sa, svc_addr_len(sa)); + /* * Enqueue the new transport on the accept queue of the listening * transport @@ -750,7 +757,6 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) struct rdma_conn_param conn_param; struct ib_qp_init_attr qp_attr; struct ib_device_attr devattr; - struct sockaddr *sa; int ret; int i; @@ -883,6 +889,13 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) /* Swap out the handler */ newxprt->sc_cm_id->event_handler = rdma_cma_handler; + /* + * Arm the CQs for the SQ and RQ before accepting so we can't + * miss the first message + */ + ib_req_notify_cq(newxprt->sc_sq_cq, IB_CQ_NEXT_COMP); + ib_req_notify_cq(newxprt->sc_rq_cq, IB_CQ_NEXT_COMP); + /* Accept Connection */ set_bit(RDMAXPRT_CONN_PENDING, &newxprt->sc_flags); memset(&conn_param, 0, sizeof conn_param); @@ -919,14 +932,6 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) newxprt->sc_max_requests, newxprt->sc_ord); - /* Set the local and remote addresses in the transport */ - sa = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.dst_addr; - svc_xprt_set_remote(&newxprt->sc_xprt, sa, svc_addr_len(sa)); - sa = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.src_addr; - svc_xprt_set_local(&newxprt->sc_xprt, sa, svc_addr_len(sa)); - - ib_req_notify_cq(newxprt->sc_sq_cq, IB_CQ_NEXT_COMP); - ib_req_notify_cq(newxprt->sc_rq_cq, IB_CQ_NEXT_COMP); return &newxprt->sc_xprt; errout: -- cgit v1.2.3 From 008fdbc57164b0ac237ad6ee2766944f02ac9c28 Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Wed, 7 May 2008 15:47:42 -0500 Subject: svcrdma: Change svc_rdma_send_error return type to void The svc_rdma_send_error function is called when an RPCRDMA protocol error is detected. This function attempts to post an error reply message. Since an error posting to a transport in error is ignored, change the return type to void. Signed-off-by: Tom Tucker --- net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 2 +- net/sunrpc/xprtrdma/svc_rdma_transport.c | 9 ++++----- 2 files changed, 5 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index c016f5ca0ce..6b16d8cd568 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -497,7 +497,7 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) /* If the request is invalid, reply with an error */ if (len < 0) { if (len == -ENOSYS) - (void)svc_rdma_send_error(rdma_xprt, rmsgp, ERR_VERS); + svc_rdma_send_error(rdma_xprt, rmsgp, ERR_VERS); goto close_out; } diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index c7545203f4b..e132509d1db 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -1114,8 +1114,8 @@ int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr) return ret; } -int svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp, - enum rpcrdma_errcode err) +void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp, + enum rpcrdma_errcode err) { struct ib_send_wr err_wr; struct ib_sge sge; @@ -1153,9 +1153,8 @@ int svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp, /* Post It */ ret = svc_rdma_send(xprt, &err_wr); if (ret) { - dprintk("svcrdma: Error posting send = %d\n", ret); + dprintk("svcrdma: Error %d posting send for protocol error\n", + ret); svc_rdma_put_context(ctxt, 1); } - - return ret; } -- cgit v1.2.3 From a6f911c04e20b98feb4b33d3aba2976851977d6a Mon Sep 17 00:00:00 2001 From: Tom Tucker Date: Tue, 13 May 2008 09:16:05 -0500 Subject: svcrdma: Verify read-list fits within RPCSVC_MAXPAGES A RDMA read-list cannot contain more elements than RPCSVC_MAXPAGES or it will overflow the DTO context. Verify this when processing the protocol header. Signed-off-by: Tom Tucker --- net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 6b16d8cd568..06ab4841537 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -306,6 +306,8 @@ static int rdma_read_xdr(struct svcxprt_rdma *xprt, ch_sge_ary = (struct chunk_sge *)tmp_ch_ctxt->sge; svc_rdma_rcl_chunk_counts(ch, &ch_count, &byte_count); + if (ch_count > RPCSVC_MAXPAGES) + return -EINVAL; sge_count = rdma_rcl_to_sge(xprt, rqstp, hdr_ctxt, rmsgp, sge, ch_sge_ary, ch_count, byte_count); -- cgit v1.2.3 From 6079a463cf95fafcc704a4e5e92a4da12444bd3c Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Tue, 27 May 2008 06:22:38 -0700 Subject: dccp: Fix to handle short sequence numbers packet correctly RFC4340 said: 8.5. Pseudocode ... If P.type is not Data, Ack, or DataAck and P.X == 0 (the packet has short sequence numbers), drop packet and return But DCCP has some mistake to handle short sequence numbers packet, now it drop packet only if P.type is Data, Ack, or DataAck and P.X == 0. Signed-off-by: Wei Yongjun Acked-by: Gerrit Renker Acked-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- net/dccp/ipv4.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index b348dd70c68..c22a3780c14 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -739,8 +739,8 @@ int dccp_invalid_packet(struct sk_buff *skb) * If P.type is not Data, Ack, or DataAck and P.X == 0 (the packet * has short sequence numbers), drop packet and return */ - if (dh->dccph_type >= DCCP_PKT_DATA && - dh->dccph_type <= DCCP_PKT_DATAACK && dh->dccph_x == 0) { + if ((dh->dccph_type < DCCP_PKT_DATA || + dh->dccph_type > DCCP_PKT_DATAACK) && dh->dccph_x == 0) { DCCP_WARN("P.type (%s) not Data || [Data]Ack, while P.X == 0\n", dccp_packet_name(dh->dccph_type)); return 1; -- cgit v1.2.3 From 825de27d9e40b3117b29a79d412b7a4b78c5d815 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Tue, 27 May 2008 06:33:54 -0700 Subject: dccp ccid-3: Fix "t_ipi explosion" bug The identification of this bug is thanks to Cheng Wei and Tomasz Grobelny. To avoid divide-by-zero, the implementation previously ignored RTTs smaller than 4 microseconds when performing integer division RTT/4. When the RTT reached a value less than 4 microseconds (as observed on loopback), this prevented the Window Counter CCVal value from advancing. As a result, the receiver stopped sending feedback. This in turn caused non-ending expiries of the nofeedback timer at the sender, so that the sending rate was progressively reduced until reaching the minimum of one packet per 64 seconds. The patch fixes this bug by handling integer division more intelligently. Due to consistent use of dccp_sample_rtt(), divide-by-zero-RTT is avoided. Signed-off-by: Gerrit Renker Signed-off-by: David S. Miller --- net/dccp/ccids/ccid3.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index cd61dea2eea..f813077234b 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -193,22 +193,17 @@ static inline void ccid3_hc_tx_update_s(struct ccid3_hc_tx_sock *hctx, int len) /* * Update Window Counter using the algorithm from [RFC 4342, 8.1]. - * The algorithm is not applicable if RTT < 4 microseconds. + * As elsewhere, RTT > 0 is assumed by using dccp_sample_rtt(). */ static inline void ccid3_hc_tx_update_win_count(struct ccid3_hc_tx_sock *hctx, ktime_t now) { - u32 quarter_rtts; - - if (unlikely(hctx->ccid3hctx_rtt < 4)) /* avoid divide-by-zero */ - return; - - quarter_rtts = ktime_us_delta(now, hctx->ccid3hctx_t_last_win_count); - quarter_rtts /= hctx->ccid3hctx_rtt / 4; + u32 delta = ktime_us_delta(now, hctx->ccid3hctx_t_last_win_count), + quarter_rtts = (4 * delta) / hctx->ccid3hctx_rtt; if (quarter_rtts > 0) { hctx->ccid3hctx_t_last_win_count = now; - hctx->ccid3hctx_last_win_count += min_t(u32, quarter_rtts, 5); + hctx->ccid3hctx_last_win_count += min(quarter_rtts, 5U); hctx->ccid3hctx_last_win_count &= 0xF; /* mod 16 */ } } -- cgit v1.2.3 From 679fda1aa49fddf938bb699df7867c01988371ab Mon Sep 17 00:00:00 2001 From: Nicolas Kaiser Date: Tue, 20 May 2008 18:42:54 +0200 Subject: net/mac80211: always true conditionals Correct always true conditionals. Signed-off-by: Nicolas Kaiser Signed-off-by: John W. Linville --- net/mac80211/cfg.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 699d97b8de5..a9fce4afdf2 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -672,7 +672,7 @@ static int ieee80211_add_station(struct wiphy *wiphy, struct net_device *dev, if (params->vlan) { sdata = IEEE80211_DEV_TO_SUB_IF(params->vlan); - if (sdata->vif.type != IEEE80211_IF_TYPE_VLAN || + if (sdata->vif.type != IEEE80211_IF_TYPE_VLAN && sdata->vif.type != IEEE80211_IF_TYPE_AP) return -EINVAL; } else @@ -760,7 +760,7 @@ static int ieee80211_change_station(struct wiphy *wiphy, if (params->vlan && params->vlan != sta->sdata->dev) { vlansdata = IEEE80211_DEV_TO_SUB_IF(params->vlan); - if (vlansdata->vif.type != IEEE80211_IF_TYPE_VLAN || + if (vlansdata->vif.type != IEEE80211_IF_TYPE_VLAN && vlansdata->vif.type != IEEE80211_IF_TYPE_AP) { rcu_read_unlock(); return -EINVAL; -- cgit v1.2.3 From 167ad6f7a2b2ae58dfaa46620b9b3212594f38e6 Mon Sep 17 00:00:00 2001 From: Tomas Winkler Date: Wed, 21 May 2008 18:17:05 +0300 Subject: mac80211: fix ieee80211_rx_bss_put/get imbalance This patch fixes iee80211_rx_bss_put/get imbalance introduced by 'mac80211: enable IBSS merging' patch. Signed-off-by: Tomas Winkler Signed-off-by: John W. Linville --- net/mac80211/mlme.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 7cfd12e0d1e..0ef5993e785 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -2479,8 +2479,6 @@ static int ieee80211_sta_join_ibss(struct net_device *dev, ifsta->state = IEEE80211_IBSS_JOINED; mod_timer(&ifsta->timer, jiffies + IEEE80211_IBSS_MERGE_INTERVAL); - ieee80211_rx_bss_put(dev, bss); - return res; } @@ -3523,6 +3521,7 @@ static int ieee80211_sta_create_ibss(struct net_device *dev, struct ieee80211_supported_band *sband; u8 bssid[ETH_ALEN], *pos; int i; + int ret; DECLARE_MAC_BUF(mac); #if 0 @@ -3567,7 +3566,9 @@ static int ieee80211_sta_create_ibss(struct net_device *dev, *pos++ = (u8) (rate / 5); } - return ieee80211_sta_join_ibss(dev, ifsta, bss); + ret = ieee80211_sta_join_ibss(dev, ifsta, bss); + ieee80211_rx_bss_put(dev, bss); + return ret; } @@ -3615,10 +3616,13 @@ static int ieee80211_sta_find_ibss(struct net_device *dev, (bss = ieee80211_rx_bss_get(dev, bssid, local->hw.conf.channel->center_freq, ifsta->ssid, ifsta->ssid_len))) { + int ret; printk(KERN_DEBUG "%s: Selected IBSS BSSID %s" " based on configured SSID\n", dev->name, print_mac(mac, bssid)); - return ieee80211_sta_join_ibss(dev, ifsta, bss); + ret = ieee80211_sta_join_ibss(dev, ifsta, bss); + ieee80211_rx_bss_put(dev, bss); + return ret; } #ifdef CONFIG_MAC80211_IBSS_DEBUG printk(KERN_DEBUG " did not try to join ibss\n"); -- cgit v1.2.3 From 9381be059bf5831d259e8735005cfa35b7488543 Mon Sep 17 00:00:00 2001 From: Tomas Winkler Date: Fri, 23 May 2008 01:36:36 +0300 Subject: mac80211: reorder channel and freq reporting in wext scan report This patch switch order of channel and freq (SIOCGIWFREQ) reports in scan results in order to overcome wpa_supplicant inability to handle channel numbers in 5.2Ghz band. Wext reporting channel number is ambiguous as channels 7-12 (802.11j) exist on both bands. Signed-off-by: Tomas Winkler Signed-off-by: Emmanuel Grumbach Acked-by: Dan Williams Signed-off-by: John W. Linville --- net/mac80211/mlme.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 0ef5993e785..c29927c4977 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -4099,18 +4099,17 @@ ieee80211_sta_scan_result(struct net_device *dev, memset(&iwe, 0, sizeof(iwe)); iwe.cmd = SIOCGIWFREQ; - iwe.u.freq.m = bss->freq; - iwe.u.freq.e = 6; + iwe.u.freq.m = ieee80211_frequency_to_channel(bss->freq); + iwe.u.freq.e = 0; current_ev = iwe_stream_add_event(current_ev, end_buf, &iwe, IW_EV_FREQ_LEN); memset(&iwe, 0, sizeof(iwe)); iwe.cmd = SIOCGIWFREQ; - iwe.u.freq.m = ieee80211_frequency_to_channel(bss->freq); - iwe.u.freq.e = 0; + iwe.u.freq.m = bss->freq; + iwe.u.freq.e = 6; current_ev = iwe_stream_add_event(current_ev, end_buf, &iwe, IW_EV_FREQ_LEN); - memset(&iwe, 0, sizeof(iwe)); iwe.cmd = IWEVQUAL; iwe.u.qual.qual = bss->signal; -- cgit v1.2.3 From d4231ca3e162387a2b6964dacaa83604e065c4e9 Mon Sep 17 00:00:00 2001 From: Abhijeet Kolekar Date: Fri, 23 May 2008 10:15:26 -0700 Subject: mac80211 : Fixes the status message for iwconfig iwconfig was showing incorrect status messages when disassociated. Patch fixes this by always checking for association status in ioctl calls for getting ap address. Signed-off-by: Abhijeet Kolekar Acked-by: Dan Williams Signed-off-by: John W. Linville --- net/mac80211/wext.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/mac80211/wext.c b/net/mac80211/wext.c index 457ebf9e85a..8311bb24f9f 100644 --- a/net/mac80211/wext.c +++ b/net/mac80211/wext.c @@ -489,9 +489,14 @@ static int ieee80211_ioctl_giwap(struct net_device *dev, sdata = IEEE80211_DEV_TO_SUB_IF(dev); if (sdata->vif.type == IEEE80211_IF_TYPE_STA || sdata->vif.type == IEEE80211_IF_TYPE_IBSS) { - ap_addr->sa_family = ARPHRD_ETHER; - memcpy(&ap_addr->sa_data, sdata->u.sta.bssid, ETH_ALEN); - return 0; + if (sdata->u.sta.state == IEEE80211_ASSOCIATED) { + ap_addr->sa_family = ARPHRD_ETHER; + memcpy(&ap_addr->sa_data, sdata->u.sta.bssid, ETH_ALEN); + return 0; + } else { + memset(&ap_addr->sa_data, 0, ETH_ALEN); + return 0; + } } else if (sdata->vif.type == IEEE80211_IF_TYPE_WDS) { ap_addr->sa_family = ARPHRD_ETHER; memcpy(&ap_addr->sa_data, sdata->u.wds.remote_addr, ETH_ALEN); -- cgit v1.2.3 From f6d97104890203ba9c2cf8e34894c4c8e64cb880 Mon Sep 17 00:00:00 2001 From: Yi Zhu Date: Tue, 27 May 2008 17:50:50 +0300 Subject: mac80211: fix a typo in ieee80211_handle_filtered_frame comment fix a typo in ieee80211_handle_filtered_frame comment Signed-off-by: Yi Zhu Signed-off-by: John W. Linville --- net/mac80211/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 915afadb060..5c876450b14 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -1313,7 +1313,7 @@ static void ieee80211_handle_filtered_frame(struct ieee80211_local *local, /* * Clear the TX filter mask for this STA when sending the next * packet. If the STA went to power save mode, this will happen - * happen when it wakes up for the next time. + * when it wakes up for the next time. */ sta->flags |= WLAN_STA_CLEAR_PS_FILT; -- cgit v1.2.3 From 70d251b24c44ab2fcba1807a5206e844cf10eb38 Mon Sep 17 00:00:00 2001 From: Senthil Balasubramanian Date: Wed, 28 May 2008 20:08:12 +0530 Subject: mac80211: Fix for NULL pointer dereference in sta_info_get() This addresses a NULL pointer dereference in sta_info_get(). TID and sta_info are extracted in ADDBA Timer expiry function through the timer handler's argument. The problem is extracging the TID (which was stored in timer_to_tid[] array of type "u8") through "int *" typecast which may also yield unwanted bytes for the MSB of TID that results in incorrect sta_info and ieee80211_local pointers. ieee80211_local pointer is NULL as illustrated below, it crashes in sta_info_get(). The problem started when extracting ieee80211_local pointer out of sta_info iteself and eventually crashed in stat_info_get(). The proper way to fix is to change the data type of TID to u8 instead of u16. However changing all the occurences requires some prototype changes as well. We should fix this in upcoming patches. Signed-off-by: Senthil Balasubramanian Signed-off-by: Luis Rodriguez Signed-off-by: John W. Linville --- net/mac80211/mlme.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index c29927c4977..33a356e7b66 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -1614,7 +1614,7 @@ void sta_addba_resp_timer_expired(unsigned long data) * only one argument, and both sta_info and TID are needed, so init * flow in sta_info_create gives the TID as data, while the timer_to_id * array gives the sta through container_of */ - u16 tid = *(int *)data; + u16 tid = *(u8 *)data; struct sta_info *temp_sta = container_of((void *)data, struct sta_info, timer_to_tid[tid]); @@ -1662,7 +1662,7 @@ timer_expired_exit: void sta_rx_agg_session_timer_expired(unsigned long data) { /* not an elegant detour, but there is no choice as the timer passes - * only one argument, and verious sta_info are needed here, so init + * only one argument, and various sta_info are needed here, so init * flow in sta_info_create gives the TID as data, while the timer_to_id * array gives the sta through container_of */ u8 *ptid = (u8 *)data; -- cgit v1.2.3 From c97c23e38625f59e3e9869664eeeb0cab1822948 Mon Sep 17 00:00:00 2001 From: Senthil Balasubramanian Date: Wed, 28 May 2008 23:15:32 +0530 Subject: mac80211: fix alignment issue with compare_ether_addr() This addresses an alignment issue with compare_ether_addr(). The addresses passed to compare_ether_addr should be two bytes aligned. It may function properly in x86 platform. However may not work properly on IA-64 or ARM processor. This also fixes a typo in mlme.c where the sk_buff struct name is incorect. Though sizeof() works for any incorrect structure pointer name as its just a pointer length that we want, lets just fix it. Signed-off-by: Senthil Balasubramanian Signed-off-by: Luis R. Rodriguez Signed-off-by: John W. Linville --- net/mac80211/mlme.c | 4 ++-- net/mac80211/rx.c | 4 ++-- net/mac80211/util.c | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 33a356e7b66..841278f1df8 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -1325,7 +1325,7 @@ static void ieee80211_sta_process_addba_request(struct net_device *dev, /* prepare reordering buffer */ tid_agg_rx->reorder_buf = - kmalloc(buf_size * sizeof(struct sk_buf *), GFP_ATOMIC); + kmalloc(buf_size * sizeof(struct sk_buff *), GFP_ATOMIC); if (!tid_agg_rx->reorder_buf) { if (net_ratelimit()) printk(KERN_ERR "can not allocate reordering buffer " @@ -1334,7 +1334,7 @@ static void ieee80211_sta_process_addba_request(struct net_device *dev, goto end; } memset(tid_agg_rx->reorder_buf, 0, - buf_size * sizeof(struct sk_buf *)); + buf_size * sizeof(struct sk_buff *)); if (local->ops->ampdu_action) ret = local->ops->ampdu_action(hw, IEEE80211_AMPDU_RX_START, diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 1958bfb361c..0941e5d6a52 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -1091,7 +1091,7 @@ ieee80211_data_to_8023(struct ieee80211_rx_data *rx) u16 fc, hdrlen, ethertype; u8 *payload; u8 dst[ETH_ALEN]; - u8 src[ETH_ALEN]; + u8 src[ETH_ALEN] __aligned(2); struct sk_buff *skb = rx->skb; struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); DECLARE_MAC_BUF(mac); @@ -1234,7 +1234,7 @@ ieee80211_data_to_8023(struct ieee80211_rx_data *rx) */ static bool ieee80211_frame_allowed(struct ieee80211_rx_data *rx) { - static const u8 pae_group_addr[ETH_ALEN] + static const u8 pae_group_addr[ETH_ALEN] __aligned(2) = { 0x01, 0x80, 0xC2, 0x00, 0x00, 0x03 }; struct ethhdr *ehdr = (struct ethhdr *) rx->skb->data; diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 131e9e6c8a3..4e97b266f90 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -34,11 +34,11 @@ void *mac80211_wiphy_privid = &mac80211_wiphy_privid; /* See IEEE 802.1H for LLC/SNAP encapsulation/decapsulation */ /* Ethernet-II snap header (RFC1042 for most EtherTypes) */ -const unsigned char rfc1042_header[] = +const unsigned char rfc1042_header[] __aligned(2) = { 0xaa, 0xaa, 0x03, 0x00, 0x00, 0x00 }; /* Bridge-Tunnel header (for EtherTypes ETH_P_AARP and ETH_P_IPX) */ -const unsigned char bridge_tunnel_header[] = +const unsigned char bridge_tunnel_header[] __aligned(2) = { 0xaa, 0xaa, 0x03, 0x00, 0x00, 0xf8 }; -- cgit v1.2.3 From 4c8411f8c115def968820a4df6658ccfd55d7f1a Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Thu, 29 May 2008 01:32:47 -0700 Subject: bluetooth: fix locking bug in the rfcomm socket cleanup handling in net/bluetooth/rfcomm/sock.c, rfcomm_sk_state_change() does the following operation: if (parent && sock_flag(sk, SOCK_ZAPPED)) { /* We have to drop DLC lock here, otherwise * rfcomm_sock_destruct() will dead lock. */ rfcomm_dlc_unlock(d); rfcomm_sock_kill(sk); rfcomm_dlc_lock(d); } } which is fine, since rfcomm_sock_kill() will call sk_free() which will call rfcomm_sock_destruct() which takes the rfcomm_dlc_lock()... so far so good. HOWEVER, this assumes that the rfcomm_sk_state_change() function always gets called with the rfcomm_dlc_lock() taken. This is the case for all but one case, and in that case where we don't have the lock, we do a double unlock followed by an attempt to take the lock, which due to underflow isn't going anywhere fast. This patch fixes this by moving the stragling case inside the lock, like the other usages of the same call are doing in this code. This was found with the help of the www.kerneloops.org project, where this deadlock was observed 51 times at this point in time: http://www.kerneloops.org/search.php?search=rfcomm_sock_destruct Signed-off-by: Arjan van de Ven Acked-by: Marcel Holtmann Signed-off-by: David S. Miller --- net/bluetooth/rfcomm/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c index eb62558e9b0..0c2c93735e9 100644 --- a/net/bluetooth/rfcomm/core.c +++ b/net/bluetooth/rfcomm/core.c @@ -423,8 +423,8 @@ static int __rfcomm_dlc_close(struct rfcomm_dlc *d, int err) rfcomm_dlc_lock(d); d->state = BT_CLOSED; - rfcomm_dlc_unlock(d); d->state_change(d, err); + rfcomm_dlc_unlock(d); skb_queue_purge(&d->tx_queue); rfcomm_dlc_unlink(d); -- cgit v1.2.3 From 12293bf91126ad253a25e2840b307fdc7c2754c3 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 29 May 2008 03:19:37 -0700 Subject: netfilter: nf_conntrack_expect: fix error path unwind in nf_conntrack_expect_init() Signed-off-by: Alexey Dobriyan Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/netfilter/nf_conntrack_expect.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index e31beeb33b2..e8f0dead267 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -587,10 +587,10 @@ int __init nf_conntrack_expect_init(void) return 0; err3: + kmem_cache_destroy(nf_ct_expect_cachep); +err2: nf_ct_free_hashtable(nf_ct_expect_hash, nf_ct_expect_vmalloc, nf_ct_expect_hsize); -err2: - kmem_cache_destroy(nf_ct_expect_cachep); err1: return err; } -- cgit v1.2.3 From 3446b9d57edd0b96a89715fef222879e4919a115 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 30 May 2008 02:57:29 -0700 Subject: llc: Fix double accounting of received packets llc_sap_rcv was being preceded by skb_set_owner_r, then calling llc_state_process that calls sock_queue_rcv_skb, that in turn calls skb_set_owner_r again making the space allowed to be used by the socket to be leaked, making the socket to get stuck. Fix it by setting skb->sk at llc_sap_rcv and leave the accounting to be done only at sock_queue_rcv_skb. Reported-by: Dmitry Petukhov Tested-by: Dmitry Petukhov Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- net/llc/llc_sap.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/llc/llc_sap.c b/net/llc/llc_sap.c index e2ddde75501..008de1fc42c 100644 --- a/net/llc/llc_sap.c +++ b/net/llc/llc_sap.c @@ -286,12 +286,14 @@ void llc_build_and_send_xid_pkt(struct llc_sap *sap, struct sk_buff *skb, * * Sends received pdus to the sap state machine. */ -static void llc_sap_rcv(struct llc_sap *sap, struct sk_buff *skb) +static void llc_sap_rcv(struct llc_sap *sap, struct sk_buff *skb, + struct sock *sk) { struct llc_sap_state_ev *ev = llc_sap_ev(skb); ev->type = LLC_SAP_EV_TYPE_PDU; ev->reason = 0; + skb->sk = sk; llc_sap_state_process(sap, skb); } @@ -360,8 +362,7 @@ static void llc_sap_mcast(struct llc_sap *sap, break; sock_hold(sk); - skb_set_owner_r(skb1, sk); - llc_sap_rcv(sap, skb1); + llc_sap_rcv(sap, skb1, sk); sock_put(sk); } read_unlock_bh(&sap->sk_list.lock); @@ -381,8 +382,7 @@ void llc_sap_handler(struct llc_sap *sap, struct sk_buff *skb) } else { struct sock *sk = llc_lookup_dgram(sap, &laddr); if (sk) { - skb_set_owner_r(skb, sk); - llc_sap_rcv(sap, skb); + llc_sap_rcv(sap, skb, sk); sock_put(sk); } else kfree_skb(skb); -- cgit v1.2.3 From 537d59af73d894750cff14f90fe2b6d77fbab15b Mon Sep 17 00:00:00 2001 From: Dave Young Date: Sun, 1 Jun 2008 23:50:52 -0700 Subject: bluetooth: rfcomm_dev_state_change deadlock fix There's logic in __rfcomm_dlc_close: rfcomm_dlc_lock(d); d->state = BT_CLOSED; d->state_changed(d, err); rfcomm_dlc_unlock(d); In rfcomm_dev_state_change, it's possible that rfcomm_dev_put try to take the dlc lock, then we will deadlock. Here fixed it by unlock dlc before rfcomm_dev_get in rfcomm_dev_state_change. why not unlock just before rfcomm_dev_put? it's because there's another problem. rfcomm_dev_get/rfcomm_dev_del will take rfcomm_dev_lock, but in rfcomm_dev_add the lock order is : rfcomm_dev_lock --> dlc lock so I unlock dlc before the taken of rfcomm_dev_lock. Actually it's a regression caused by commit 1905f6c736cb618e07eca0c96e60e3c024023428 ("bluetooth : __rfcomm_dlc_close lock fix"), the dlc state_change could be two callbacks : rfcomm_sk_state_change and rfcomm_dev_state_change. I missed the rfcomm_sk_state_change that time. Thanks Arjan van de Ven for the effort in commit 4c8411f8c115def968820a4df6658ccfd55d7f1a ("bluetooth: fix locking bug in the rfcomm socket cleanup handling") but he missed the rfcomm_dev_state_change lock issue. Signed-off-by: Dave Young Acked-by: Marcel Holtmann Signed-off-by: David S. Miller --- net/bluetooth/rfcomm/tty.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c index c3f749abb2d..c9191871c1e 100644 --- a/net/bluetooth/rfcomm/tty.c +++ b/net/bluetooth/rfcomm/tty.c @@ -566,11 +566,22 @@ static void rfcomm_dev_state_change(struct rfcomm_dlc *dlc, int err) if (dlc->state == BT_CLOSED) { if (!dev->tty) { if (test_bit(RFCOMM_RELEASE_ONHUP, &dev->flags)) { - if (rfcomm_dev_get(dev->id) == NULL) + /* Drop DLC lock here to avoid deadlock + * 1. rfcomm_dev_get will take rfcomm_dev_lock + * but in rfcomm_dev_add there's lock order: + * rfcomm_dev_lock -> dlc lock + * 2. rfcomm_dev_put will deadlock if it's + * the last reference + */ + rfcomm_dlc_unlock(dlc); + if (rfcomm_dev_get(dev->id) == NULL) { + rfcomm_dlc_lock(dlc); return; + } rfcomm_dev_del(dev); rfcomm_dev_put(dev); + rfcomm_dlc_lock(dlc); } } else tty_hangup(dev->tty); -- cgit v1.2.3 From 7dccf1f4e1696c79bff064c3770867cc53cbc71c Mon Sep 17 00:00:00 2001 From: Jarek Poplawski Date: Tue, 3 Jun 2008 14:53:46 -0700 Subject: ax25: Fix NULL pointer dereference and lockup. From: Jarek Poplawski There is only one function in AX25 calling skb_append(), and it really looks suspicious: appends skb after previously enqueued one, but in the meantime this previous skb could be removed from the queue. This patch Fixes it the simple way, so this is not fully compatible with the current method, but testing hasn't shown any problems. Signed-off-by: Ralf Baechle Signed-off-by: David S. Miller --- net/ax25/ax25_subr.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/ax25/ax25_subr.c b/net/ax25/ax25_subr.c index d8f21573317..034aa10a519 100644 --- a/net/ax25/ax25_subr.c +++ b/net/ax25/ax25_subr.c @@ -64,20 +64,15 @@ void ax25_frames_acked(ax25_cb *ax25, unsigned short nr) void ax25_requeue_frames(ax25_cb *ax25) { - struct sk_buff *skb, *skb_prev = NULL; + struct sk_buff *skb; /* * Requeue all the un-ack-ed frames on the output queue to be picked * up by ax25_kick called from the timer. This arrangement handles the * possibility of an empty output queue. */ - while ((skb = skb_dequeue(&ax25->ack_queue)) != NULL) { - if (skb_prev == NULL) - skb_queue_head(&ax25->write_queue, skb); - else - skb_append(skb_prev, skb, &ax25->write_queue); - skb_prev = skb; - } + while ((skb = skb_dequeue_tail(&ax25->ack_queue)) != NULL) + skb_queue_head(&ax25->write_queue, skb); } /* -- cgit v1.2.3 From 9ecad877948deb2871d29e03786a7d7911687009 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Tue, 3 Jun 2008 15:18:36 -0700 Subject: irda: Sock leak on error path in irda_create. Bad type/protocol specified result in sk leak. Fix is simple - release the sk if bad values are given, but to make it possible just to call sk_free(), I move some sk initialization a bit lower. Signed-off-by: Pavel Emelyanov Signed-off-by: David S. Miller --- net/irda/af_irda.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c index ae54b20d047..3eb5bcc75f9 100644 --- a/net/irda/af_irda.c +++ b/net/irda/af_irda.c @@ -1093,11 +1093,6 @@ static int irda_create(struct net *net, struct socket *sock, int protocol) init_waitqueue_head(&self->query_wait); - /* Initialise networking socket struct */ - sock_init_data(sock, sk); /* Note : set sk->sk_refcnt to 1 */ - sk->sk_family = PF_IRDA; - sk->sk_protocol = protocol; - switch (sock->type) { case SOCK_STREAM: sock->ops = &irda_stream_ops; @@ -1124,13 +1119,20 @@ static int irda_create(struct net *net, struct socket *sock, int protocol) self->max_sdu_size_rx = TTP_SAR_UNBOUND; break; default: + sk_free(sk); return -ESOCKTNOSUPPORT; } break; default: + sk_free(sk); return -ESOCKTNOSUPPORT; } + /* Initialise networking socket struct */ + sock_init_data(sock, sk); /* Note : set sk->sk_refcnt to 1 */ + sk->sk_family = PF_IRDA; + sk->sk_protocol = protocol; + /* Register as a client with IrLMP */ self->ckey = irlmp_register_client(0, NULL, NULL, NULL); self->mask.word = 0xffff; -- cgit v1.2.3 From b9f5f52cca3e94f1e7509f366aa250ebbe1ed0b5 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 3 Jun 2008 16:03:15 -0700 Subject: net: neighbour table ABI problem The neighbor table time of last use information is returned in the incorrect unit. Kernel to user space ABI's need to use USER_HZ (or milliseconds), otherwise the application has to try and discover the real system HZ value which is problematic. Linux has standardized on keeping USER_HZ consistent (100hz) even when kernel is running internally at some other value. This change is small, but it breaks the ABI for older version of iproute2 utilities. But these utilities are already broken since they are looking at the psched_hz values which are completely different. So let's just go ahead and fix both kernel and user space. Older utilities will just print wrong values. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/core/neighbour.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 5d9d7130bd6..3896de79dfb 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -2057,9 +2057,9 @@ static int neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh, goto nla_put_failure; } - ci.ndm_used = now - neigh->used; - ci.ndm_confirmed = now - neigh->confirmed; - ci.ndm_updated = now - neigh->updated; + ci.ndm_used = jiffies_to_clock_t(now - neigh->used); + ci.ndm_confirmed = jiffies_to_clock_t(now - neigh->confirmed); + ci.ndm_updated = jiffies_to_clock_t(now - neigh->updated); ci.ndm_refcnt = atomic_read(&neigh->refcnt) - 1; read_unlock_bh(&neigh->lock); -- cgit v1.2.3 From 7557af25155a82ac2dad73eec6b0166868bf8ea2 Mon Sep 17 00:00:00 2001 From: Brice Goglin Date: Tue, 3 Jun 2008 16:07:45 -0700 Subject: net_dma: remove duplicate assignment in dma_skb_copy_datagram_iovec No need to compute copy twice in the frags loop in dma_skb_copy_datagram_iovec(). Signed-off-by: Brice Goglin Acked-by: Shannon Nelson Signed-off-by: Maciej Sosnowski Signed-off-by: Dan Williams Signed-off-by: David S. Miller --- net/core/user_dma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/user_dma.c b/net/core/user_dma.c index 0ad1cd57bc3..c77aff9c6eb 100644 --- a/net/core/user_dma.c +++ b/net/core/user_dma.c @@ -75,7 +75,7 @@ int dma_skb_copy_datagram_iovec(struct dma_chan *chan, end = start + skb_shinfo(skb)->frags[i].size; copy = end - offset; - if ((copy = end - offset) > 0) { + if (copy > 0) { skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; struct page *page = frag->page; -- cgit v1.2.3 From 51b77cae0d5aa8e1546fca855dcfe48ddfadfa9c Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 3 Jun 2008 16:36:01 -0700 Subject: route: Mark unused route cache flags as such. Also removes an obsolete check for the unused flag RTCF_MASQ. Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- net/ipv4/route.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index df41026b60d..96be336064f 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1792,7 +1792,7 @@ static int __mkroute_input(struct sk_buff *skb, if (err) flags |= RTCF_DIRECTSRC; - if (out_dev == in_dev && err && !(flags & RTCF_MASQ) && + if (out_dev == in_dev && err && (IN_DEV_SHARED_MEDIA(out_dev) || inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) flags |= RTCF_DOREDIRECT; -- cgit v1.2.3 From 1f9d11c7c99da706e33646c3a9080dd5a8ef9a0b Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 3 Jun 2008 16:36:27 -0700 Subject: route: Mark unused routing attributes as such Also removes an unused policy entry for an attribute which is only used in kernel->user direction. Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- net/ipv4/fib_frontend.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 0f1557a4ac7..0b2ac6a3d90 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -506,7 +506,6 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX+1] = { [RTA_PREFSRC] = { .type = NLA_U32 }, [RTA_METRICS] = { .type = NLA_NESTED }, [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) }, - [RTA_PROTOINFO] = { .type = NLA_U32 }, [RTA_FLOW] = { .type = NLA_U32 }, }; -- cgit v1.2.3 From bc3ed28caaef55e7e3a9316464256353c5f9b1df Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 3 Jun 2008 16:36:54 -0700 Subject: netlink: Improve returned error codes Make nlmsg_trim(), nlmsg_cancel(), genlmsg_cancel(), and nla_nest_cancel() void functions. Return -EMSGSIZE instead of -1 if the provided message buffer is not big enough. Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- net/core/neighbour.c | 3 ++- net/core/rtnetlink.c | 3 ++- net/netlink/attr.c | 12 ++++++------ net/netlink/genetlink.c | 6 ++++-- net/sched/sch_dsmark.c | 6 ++++-- net/sched/sch_gred.c | 3 ++- net/sched/sch_hfsc.c | 2 +- net/sched/sch_red.c | 3 ++- net/wireless/nl80211.c | 12 ++++++++---- 9 files changed, 31 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 3896de79dfb..65f01f71b3f 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1714,7 +1714,8 @@ static int neightbl_fill_parms(struct sk_buff *skb, struct neigh_parms *parms) return nla_nest_end(skb, nest); nla_put_failure: - return nla_nest_cancel(skb, nest); + nla_nest_cancel(skb, nest); + return -EMSGSIZE; } static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl, diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index cf857c4dc7b..a9a77216310 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -498,7 +498,8 @@ int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics) return nla_nest_end(skb, mx); nla_put_failure: - return nla_nest_cancel(skb, mx); + nla_nest_cancel(skb, mx); + return -EMSGSIZE; } int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id, diff --git a/net/netlink/attr.c b/net/netlink/attr.c index feb326f4a75..47bbf45ae5d 100644 --- a/net/netlink/attr.c +++ b/net/netlink/attr.c @@ -400,13 +400,13 @@ void __nla_put_nohdr(struct sk_buff *skb, int attrlen, const void *data) * @attrlen: length of attribute payload * @data: head of attribute payload * - * Returns -1 if the tailroom of the skb is insufficient to store + * Returns -EMSGSIZE if the tailroom of the skb is insufficient to store * the attribute header and payload. */ int nla_put(struct sk_buff *skb, int attrtype, int attrlen, const void *data) { if (unlikely(skb_tailroom(skb) < nla_total_size(attrlen))) - return -1; + return -EMSGSIZE; __nla_put(skb, attrtype, attrlen, data); return 0; @@ -418,13 +418,13 @@ int nla_put(struct sk_buff *skb, int attrtype, int attrlen, const void *data) * @attrlen: length of attribute payload * @data: head of attribute payload * - * Returns -1 if the tailroom of the skb is insufficient to store + * Returns -EMSGSIZE if the tailroom of the skb is insufficient to store * the attribute payload. */ int nla_put_nohdr(struct sk_buff *skb, int attrlen, const void *data) { if (unlikely(skb_tailroom(skb) < NLA_ALIGN(attrlen))) - return -1; + return -EMSGSIZE; __nla_put_nohdr(skb, attrlen, data); return 0; @@ -436,13 +436,13 @@ int nla_put_nohdr(struct sk_buff *skb, int attrlen, const void *data) * @attrlen: length of attribute payload * @data: head of attribute payload * - * Returns -1 if the tailroom of the skb is insufficient to store + * Returns -EMSGSIZE if the tailroom of the skb is insufficient to store * the attribute payload. */ int nla_append(struct sk_buff *skb, int attrlen, const void *data) { if (unlikely(skb_tailroom(skb) < NLA_ALIGN(attrlen))) - return -1; + return -EMSGSIZE; memcpy(skb_put(skb, attrlen), data, attrlen); return 0; diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index d16929c9b4b..f5aa23c3e88 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -554,7 +554,8 @@ static int ctrl_fill_info(struct genl_family *family, u32 pid, u32 seq, return genlmsg_end(skb, hdr); nla_put_failure: - return genlmsg_cancel(skb, hdr); + genlmsg_cancel(skb, hdr); + return -EMSGSIZE; } static int ctrl_fill_mcgrp_info(struct genl_multicast_group *grp, u32 pid, @@ -590,7 +591,8 @@ static int ctrl_fill_mcgrp_info(struct genl_multicast_group *grp, u32 pid, return genlmsg_end(skb, hdr); nla_put_failure: - return genlmsg_cancel(skb, hdr); + genlmsg_cancel(skb, hdr); + return -EMSGSIZE; } static int ctrl_dumpfamily(struct sk_buff *skb, struct netlink_callback *cb) diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c index 0df911fd67b..64465bacbe7 100644 --- a/net/sched/sch_dsmark.c +++ b/net/sched/sch_dsmark.c @@ -444,7 +444,8 @@ static int dsmark_dump_class(struct Qdisc *sch, unsigned long cl, return nla_nest_end(skb, opts); nla_put_failure: - return nla_nest_cancel(skb, opts); + nla_nest_cancel(skb, opts); + return -EMSGSIZE; } static int dsmark_dump(struct Qdisc *sch, struct sk_buff *skb) @@ -466,7 +467,8 @@ static int dsmark_dump(struct Qdisc *sch, struct sk_buff *skb) return nla_nest_end(skb, opts); nla_put_failure: - return nla_nest_cancel(skb, opts); + nla_nest_cancel(skb, opts); + return -EMSGSIZE; } static const struct Qdisc_class_ops dsmark_class_ops = { diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c index 3a9d226ff1e..c89fba56db5 100644 --- a/net/sched/sch_gred.c +++ b/net/sched/sch_gred.c @@ -582,7 +582,8 @@ append_opt: return nla_nest_end(skb, opts); nla_put_failure: - return nla_nest_cancel(skb, opts); + nla_nest_cancel(skb, opts); + return -EMSGSIZE; } static void gred_destroy(struct Qdisc *sch) diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index 87293d0db1d..fdfaa3fcc16 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -1360,7 +1360,7 @@ hfsc_dump_class(struct Qdisc *sch, unsigned long arg, struct sk_buff *skb, nla_put_failure: nla_nest_cancel(skb, nest); - return -1; + return -EMSGSIZE; } static int diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index 3dcd493f4f4..5c569853b9c 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -281,7 +281,8 @@ static int red_dump(struct Qdisc *sch, struct sk_buff *skb) return nla_nest_end(skb, opts); nla_put_failure: - return nla_nest_cancel(skb, opts); + nla_nest_cancel(skb, opts); + return -EMSGSIZE; } static int red_dump_stats(struct Qdisc *sch, struct gnet_dump *d) diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 2bdd4dddc0e..fb75f265b39 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -187,7 +187,8 @@ static int nl80211_send_wiphy(struct sk_buff *msg, u32 pid, u32 seq, int flags, return genlmsg_end(msg, hdr); nla_put_failure: - return genlmsg_cancel(msg, hdr); + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; } static int nl80211_dump_wiphy(struct sk_buff *skb, struct netlink_callback *cb) @@ -273,7 +274,8 @@ static int nl80211_send_iface(struct sk_buff *msg, u32 pid, u32 seq, int flags, return genlmsg_end(msg, hdr); nla_put_failure: - return genlmsg_cancel(msg, hdr); + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; } static int nl80211_dump_interface(struct sk_buff *skb, struct netlink_callback *cb) @@ -928,7 +930,8 @@ static int nl80211_send_station(struct sk_buff *msg, u32 pid, u32 seq, return genlmsg_end(msg, hdr); nla_put_failure: - return genlmsg_cancel(msg, hdr); + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; } static int nl80211_dump_station(struct sk_buff *skb, @@ -1267,7 +1270,8 @@ static int nl80211_send_mpath(struct sk_buff *msg, u32 pid, u32 seq, return genlmsg_end(msg, hdr); nla_put_failure: - return genlmsg_cancel(msg, hdr); + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; } static int nl80211_dump_mpath(struct sk_buff *skb, -- cgit v1.2.3 From ab32cd793dca21eec846a8204390d9594ed994d5 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Tue, 3 Jun 2008 16:37:33 -0700 Subject: route: Remove unused ifa_anycast field The field was supposed to allow the creation of an anycast route by assigning an anycast address to an address prefix. It was never implemented so this field is unused and serves no purpose. Remove it. Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- net/ipv4/devinet.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'net') diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 6848e4760f3..79a7ef6209f 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -90,7 +90,6 @@ static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = { [IFA_LOCAL] = { .type = NLA_U32 }, [IFA_ADDRESS] = { .type = NLA_U32 }, [IFA_BROADCAST] = { .type = NLA_U32 }, - [IFA_ANYCAST] = { .type = NLA_U32 }, [IFA_LABEL] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 }, }; @@ -536,9 +535,6 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh) if (tb[IFA_BROADCAST]) ifa->ifa_broadcast = nla_get_be32(tb[IFA_BROADCAST]); - if (tb[IFA_ANYCAST]) - ifa->ifa_anycast = nla_get_be32(tb[IFA_ANYCAST]); - if (tb[IFA_LABEL]) nla_strlcpy(ifa->ifa_label, tb[IFA_LABEL], IFNAMSIZ); else @@ -745,7 +741,6 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg) break; inet_del_ifa(in_dev, ifap, 0); ifa->ifa_broadcast = 0; - ifa->ifa_anycast = 0; ifa->ifa_scope = 0; } @@ -1113,7 +1108,6 @@ static inline size_t inet_nlmsg_size(void) + nla_total_size(4) /* IFA_ADDRESS */ + nla_total_size(4) /* IFA_LOCAL */ + nla_total_size(4) /* IFA_BROADCAST */ - + nla_total_size(4) /* IFA_ANYCAST */ + nla_total_size(IFNAMSIZ); /* IFA_LABEL */ } @@ -1143,9 +1137,6 @@ static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa, if (ifa->ifa_broadcast) NLA_PUT_BE32(skb, IFA_BROADCAST, ifa->ifa_broadcast); - if (ifa->ifa_anycast) - NLA_PUT_BE32(skb, IFA_ANYCAST, ifa->ifa_anycast); - if (ifa->ifa_label[0]) NLA_PUT_STRING(skb, IFA_LABEL, ifa->ifa_label); -- cgit v1.2.3 From d2ee3f2c4b1db1320c1efb4dcaceeaf6c7e6c2d3 Mon Sep 17 00:00:00 2001 From: Dong Wei Date: Wed, 4 Jun 2008 09:57:51 -0700 Subject: netfilter: xt_connlimit: fix accouning when receive RST packet in ESTABLISHED state In xt_connlimit match module, the counter of an IP is decreased when the TCP packet is go through the chain with ip_conntrack state TW. Well, it's very natural that the server and client close the socket with FIN packet. But when the client/server close the socket with RST packet(using so_linger), the counter for this connection still exsit. The following patch can fix it which is based on linux-2.6.25.4 Signed-off-by: Dong Wei Acked-by: Jan Engelhardt Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/netfilter/xt_connlimit.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c index 2e89a00df92..70907f6baac 100644 --- a/net/netfilter/xt_connlimit.c +++ b/net/netfilter/xt_connlimit.c @@ -73,7 +73,8 @@ connlimit_iphash6(const union nf_inet_addr *addr, static inline bool already_closed(const struct nf_conn *conn) { if (nf_ct_protonum(conn) == IPPROTO_TCP) - return conn->proto.tcp.state == TCP_CONNTRACK_TIME_WAIT; + return conn->proto.tcp.state == TCP_CONNTRACK_TIME_WAIT || + conn->proto.tcp.state == TCP_CONNTRACK_CLOSE; else return 0; } -- cgit v1.2.3 From b9c698964614f71b9c8afeca163a945b4c2e2d20 Mon Sep 17 00:00:00 2001 From: Jarek Poplawski Date: Wed, 4 Jun 2008 09:58:27 -0700 Subject: netfilter: nf_conntrack_ipv6: fix inconsistent lock state in nf_ct_frag6_gather() [ 63.531438] ================================= [ 63.531520] [ INFO: inconsistent lock state ] [ 63.531520] 2.6.26-rc4 #7 [ 63.531520] --------------------------------- [ 63.531520] inconsistent {softirq-on-W} -> {in-softirq-W} usage. [ 63.531520] tcpsic6/3864 [HC0[0]:SC1[1]:HE1:SE0] takes: [ 63.531520] (&q->lock#2){-+..}, at: [] ipv6_frag_rcv+0xd0/0xbd0 [ 63.531520] {softirq-on-W} state was registered at: [ 63.531520] [] __lock_acquire+0x3aa/0x1080 [ 63.531520] [] lock_acquire+0x76/0xa0 [ 63.531520] [] _spin_lock+0x2b/0x40 [ 63.531520] [] nf_ct_frag6_gather+0x3f6/0x910 ... According to this and another similar lockdep report inet_fragment locks are taken from nf_ct_frag6_gather() with softirqs enabled, but these locks are mainly used in softirq context, so disabling BHs is necessary. Reported-and-tested-by: Eric Sesterhenn Signed-off-by: Jarek Poplawski Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv6/netfilter/nf_conntrack_reasm.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 2dccad48058..e65e26e210e 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -209,7 +209,9 @@ fq_find(__be32 id, struct in6_addr *src, struct in6_addr *dst) arg.dst = dst; hash = ip6qhashfn(id, src, dst); + local_bh_disable(); q = inet_frag_find(&nf_init_frags, &nf_frags, &arg, hash); + local_bh_enable(); if (q == NULL) goto oom; @@ -638,10 +640,10 @@ struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb) goto ret_orig; } - spin_lock(&fq->q.lock); + spin_lock_bh(&fq->q.lock); if (nf_ct_frag6_queue(fq, clone, fhdr, nhoff) < 0) { - spin_unlock(&fq->q.lock); + spin_unlock_bh(&fq->q.lock); pr_debug("Can't insert skb to queue\n"); fq_put(fq); goto ret_orig; @@ -653,7 +655,7 @@ struct sk_buff *nf_ct_frag6_gather(struct sk_buff *skb) if (ret_skb == NULL) pr_debug("Can't reassemble fragmented packets\n"); } - spin_unlock(&fq->q.lock); + spin_unlock_bh(&fq->q.lock); fq_put(fq); return ret_skb; -- cgit v1.2.3 From 8aca6cb1179ed9bef9351028c8d8af852903eae2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Wed, 4 Jun 2008 11:34:22 -0700 Subject: tcp: Fix inconsistency source (CA_Open only when !tcp_left_out(tp)) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It is possible that this skip path causes TCP to end up into an invalid state where ca_state was left to CA_Open while some segments already came into sacked_out. If next valid ACK doesn't contain new SACK information TCP fails to enter into tcp_fastretrans_alert(). Thus at least high_seq is set incorrectly to a too high seqno because some new data segments could be sent in between (and also, limited transmit is not being correctly invoked there). Reordering in both directions can easily cause this situation to occur. I guess we would want to use tcp_moderate_cwnd(tp) there as well as it may be possible to use this to trigger oversized burst to network by sending an old ACK with huge amount of SACK info, but I'm a bit unsure about its effects (mainly to FlightSize), so to be on the safe side I just currently fixed it minimally to keep TCP's state consistent (obviously, such nasty ACKs have been possible this far). Though it seems that FlightSize is already underestimated by some amount, so probably on the long term we might want to trigger recovery there too, if appropriate, to make FlightSize calculation to resemble reality at the time when the losses where discovered (but such change scares me too much now and requires some more thinking anyway how to do that as it likely involves some code shuffling). This bug was found by Brian Vowell while running my TCP debug patch to find cause of another TCP issue (fackets_out miscount). Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index b54d9d37b63..54a0b741278 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2483,6 +2483,20 @@ static inline void tcp_complete_cwr(struct sock *sk) tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR); } +static void tcp_try_keep_open(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + int state = TCP_CA_Open; + + if (tcp_left_out(tp) || tp->retrans_out || tp->undo_marker) + state = TCP_CA_Disorder; + + if (inet_csk(sk)->icsk_ca_state != state) { + tcp_set_ca_state(sk, state); + tp->high_seq = tp->snd_nxt; + } +} + static void tcp_try_to_open(struct sock *sk, int flag) { struct tcp_sock *tp = tcp_sk(sk); @@ -2496,15 +2510,7 @@ static void tcp_try_to_open(struct sock *sk, int flag) tcp_enter_cwr(sk, 1); if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) { - int state = TCP_CA_Open; - - if (tcp_left_out(tp) || tp->retrans_out || tp->undo_marker) - state = TCP_CA_Disorder; - - if (inet_csk(sk)->icsk_ca_state != state) { - tcp_set_ca_state(sk, state); - tp->high_seq = tp->snd_nxt; - } + tcp_try_keep_open(sk); tcp_moderate_cwnd(tp); } else { tcp_cwnd_down(sk, flag); @@ -3310,8 +3316,11 @@ no_queue: return 1; old_ack: - if (TCP_SKB_CB(skb)->sacked) + if (TCP_SKB_CB(skb)->sacked) { tcp_sacktag_write_queue(sk, skb, prior_snd_una); + if (icsk->icsk_ca_state == TCP_CA_Open) + tcp_try_keep_open(sk); + } uninteresting_ack: SOCK_DEBUG(sk, "Ack %u out of %u:%u\n", ack, tp->snd_una, tp->snd_nxt); -- cgit v1.2.3 From e51171019bb0e1f9fb57c25bd2e38ce652eaea27 Mon Sep 17 00:00:00 2001 From: YOSHIFUJI Hideaki Date: Thu, 29 May 2008 19:55:05 +0900 Subject: [SCTP]: Fix NULL dereference of asoc. Commit 7cbca67c073263c179f605bdbbdc565ab29d801d ("[IPV6]: Support Source Address Selection API (RFC5014)") introduced NULL dereference of asoc to sctp_v6_get_saddr in net/sctp/ipv6.c. Pointed out by Johann Felix Soden . Signed-off-by: YOSHIFUJI Hideaki --- net/sctp/ipv6.c | 5 +++-- net/sctp/protocol.c | 3 ++- net/sctp/transport.c | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index e45e44c6063..e4aac3266fc 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -299,7 +299,8 @@ static inline int sctp_v6_addr_match_len(union sctp_addr *s1, /* Fills in the source address(saddr) based on the destination address(daddr) * and asoc's bind address list. */ -static void sctp_v6_get_saddr(struct sctp_association *asoc, +static void sctp_v6_get_saddr(struct sctp_sock *sk, + struct sctp_association *asoc, struct dst_entry *dst, union sctp_addr *daddr, union sctp_addr *saddr) @@ -318,7 +319,7 @@ static void sctp_v6_get_saddr(struct sctp_association *asoc, if (!asoc) { ipv6_dev_get_saddr(dst ? ip6_dst_idev(dst)->dev : NULL, &daddr->v6.sin6_addr, - inet6_sk(asoc->base.sk)->srcprefs, + inet6_sk(&sk->inet.sk)->srcprefs, &saddr->v6.sin6_addr); SCTP_DEBUG_PRINTK("saddr from ipv6_get_saddr: " NIP6_FMT "\n", NIP6(saddr->v6.sin6_addr)); diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 0ec234b762c..13ee7fa92e0 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -519,7 +519,8 @@ out: /* For v4, the source address is cached in the route entry(dst). So no need * to cache it separately and hence this is an empty routine. */ -static void sctp_v4_get_saddr(struct sctp_association *asoc, +static void sctp_v4_get_saddr(struct sctp_sock *sk, + struct sctp_association *asoc, struct dst_entry *dst, union sctp_addr *daddr, union sctp_addr *saddr) diff --git a/net/sctp/transport.c b/net/sctp/transport.c index f4938f6c5ab..62082e7b797 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -291,7 +291,7 @@ void sctp_transport_route(struct sctp_transport *transport, if (saddr) memcpy(&transport->saddr, saddr, sizeof(union sctp_addr)); else - af->get_saddr(asoc, dst, daddr, &transport->saddr); + af->get_saddr(opt, asoc, dst, daddr, &transport->saddr); transport->dst = dst; if ((transport->param_flags & SPP_PMTUD_DISABLE) && transport->pathmtu) { -- cgit v1.2.3 From a3c960899e042bc1c2b730a2115fa32da7802039 Mon Sep 17 00:00:00 2001 From: YOSHIFUJI Hideaki Date: Wed, 4 Jun 2008 01:30:25 +0900 Subject: [IPV6] UDP: Possible dst leak in udpv6_sendmsg. ip6_sk_dst_lookup returns held dst entry. It should be released on all paths beyond this point. Add missed release when up->pending is set. Bug report and initial patch by Denis V. Lunev . Signed-off-by: YOSHIFUJI Hideaki Acked-by: Denis V. Lunev --- net/ipv6/udp.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 1fd784f3e2e..47123bf5eb0 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -848,12 +848,14 @@ do_append_data: } else { dst_release(dst); } + dst = NULL; } if (err > 0) err = np->recverr ? net_xmit_errno(err) : 0; release_sock(sk); out: + dst_release(dst); fl6_sock_release(flowlabel); if (!err) return len; -- cgit v1.2.3 From 24ef0da7b864435f221f668bc8a324160d063e78 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Wed, 28 May 2008 16:54:22 +0200 Subject: [IPV6] ADDRCONF: Check range of prefix length As of now, the prefix length is not vaildated when adding or deleting addresses. The value is passed directly into the inet6_ifaddr structure and later passed on to memcmp() as length indicator which relies on the value never to exceed 128 (bits). Due to the missing check, the currently code allows for any 8 bit value to be passed on as prefix length while using the netlink interface, and any 32 bit value while using the ioctl interface. [Use unsigned int instead to generate better code - yoshfuji] Signed-off-by: Thomas Graf Signed-off-by: YOSHIFUJI Hideaki --- net/ipv6/addrconf.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 3a835578fd1..c3b20c5afa3 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2027,7 +2027,7 @@ err_exit: * Manual configuration of address on an interface */ static int inet6_addr_add(struct net *net, int ifindex, struct in6_addr *pfx, - int plen, __u8 ifa_flags, __u32 prefered_lft, + unsigned int plen, __u8 ifa_flags, __u32 prefered_lft, __u32 valid_lft) { struct inet6_ifaddr *ifp; @@ -2039,6 +2039,9 @@ static int inet6_addr_add(struct net *net, int ifindex, struct in6_addr *pfx, ASSERT_RTNL(); + if (plen > 128) + return -EINVAL; + /* check the lifetime */ if (!valid_lft || prefered_lft > valid_lft) return -EINVAL; @@ -2095,12 +2098,15 @@ static int inet6_addr_add(struct net *net, int ifindex, struct in6_addr *pfx, } static int inet6_addr_del(struct net *net, int ifindex, struct in6_addr *pfx, - int plen) + unsigned int plen) { struct inet6_ifaddr *ifp; struct inet6_dev *idev; struct net_device *dev; + if (plen > 128) + return -EINVAL; + dev = __dev_get_by_index(net, ifindex); if (!dev) return -ENODEV; -- cgit v1.2.3 From 82836372311a5cbf9cc5f4f47f9b56cb9edfe90d Mon Sep 17 00:00:00 2001 From: Colin Date: Tue, 27 May 2008 00:04:43 +0800 Subject: [IPV6] TUNNEL6: Fix incoming packet length check for inter-protocol tunnel. I discover a strange behavior in [ipv4 in ipv6] tunnel. When IPv6 tunnel payload is less than 40(0x28), packet can be sent to network, received in physical interface, but not seen in IP tunnel interface. No counter increase in tunnel interface. Signed-off-by: YOSHIFUJI Hideaki --- net/ipv6/tunnel6.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/tunnel6.c b/net/ipv6/tunnel6.c index 6323921b40b..669f280989c 100644 --- a/net/ipv6/tunnel6.c +++ b/net/ipv6/tunnel6.c @@ -109,7 +109,7 @@ static int tunnel46_rcv(struct sk_buff *skb) { struct xfrm6_tunnel *handler; - if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) + if (!pskb_may_pull(skb, sizeof(struct iphdr))) goto drop; for (handler = tunnel46_handlers; handler; handler = handler->next) -- cgit v1.2.3 From baa2bfb8aef24bb7fe1875b256918724b3884662 Mon Sep 17 00:00:00 2001 From: YOSHIFUJI Hideaki Date: Fri, 30 May 2008 11:35:03 +0900 Subject: [IPV4] TUNNEL4: Fix incoming packet length check for inter-protocol tunnel. Signed-off-by: YOSHIFUJI Hideaki --- net/ipv4/tunnel4.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tunnel4.c b/net/ipv4/tunnel4.c index d3b709a6f26..cb1f0e83830 100644 --- a/net/ipv4/tunnel4.c +++ b/net/ipv4/tunnel4.c @@ -97,7 +97,7 @@ static int tunnel64_rcv(struct sk_buff *skb) { struct xfrm_tunnel *handler; - if (!pskb_may_pull(skb, sizeof(struct iphdr))) + if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) goto drop; for (handler = tunnel64_handlers; handler; handler = handler->next) -- cgit v1.2.3 From 4bed72e4f5502ea3322f0a00794815fa58951abe Mon Sep 17 00:00:00 2001 From: YOSHIFUJI Hideaki Date: Tue, 27 May 2008 17:37:49 +0900 Subject: [IPV6] ADDRCONF: Allow longer lifetime on 64bit archs. - Allow longer lifetimes (>= 0x7fffffff/HZ) on 64bit archs by using unsigned long. - Shadow this arithmetic overflow workaround by introducing helper functions: addrconf_timeout_fixup() and addrconf_finite_timeout(). Signed-off-by: YOSHIFUJI Hideaki --- net/ipv6/addrconf.c | 97 +++++++++++++++++++++++++++-------------------------- net/ipv6/route.c | 12 ++----- 2 files changed, 53 insertions(+), 56 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index c3b20c5afa3..147588f4c7c 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -731,8 +731,13 @@ static void ipv6_del_addr(struct inet6_ifaddr *ifp) onlink = -1; spin_lock(&ifa->lock); - lifetime = min_t(unsigned long, - ifa->valid_lft, 0x7fffffffUL/HZ); + + lifetime = addrconf_timeout_fixup(ifa->valid_lft, HZ); + /* + * Note: Because this address is + * not permanent, lifetime < + * LONG_MAX / HZ here. + */ if (time_before(expires, ifa->tstamp + lifetime * HZ)) expires = ifa->tstamp + lifetime * HZ; @@ -1722,7 +1727,6 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len) __u32 valid_lft; __u32 prefered_lft; int addr_type; - unsigned long rt_expires; struct inet6_dev *in6_dev; pinfo = (struct prefix_info *) opt; @@ -1764,28 +1768,23 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len) * 2) Configure prefixes with the auto flag set */ - if (valid_lft == INFINITY_LIFE_TIME) - rt_expires = ~0UL; - else if (valid_lft >= 0x7FFFFFFF/HZ) { + if (pinfo->onlink) { + struct rt6_info *rt; + unsigned long rt_expires; + /* Avoid arithmetic overflow. Really, we could * save rt_expires in seconds, likely valid_lft, * but it would require division in fib gc, that it * not good. */ - rt_expires = 0x7FFFFFFF - (0x7FFFFFFF % HZ); - } else - rt_expires = valid_lft * HZ; + if (HZ > USER_HZ) + rt_expires = addrconf_timeout_fixup(valid_lft, HZ); + else + rt_expires = addrconf_timeout_fixup(valid_lft, USER_HZ); - /* - * We convert this (in jiffies) to clock_t later. - * Avoid arithmetic overflow there as well. - * Overflow can happen only if HZ < USER_HZ. - */ - if (HZ < USER_HZ && ~rt_expires && rt_expires > 0x7FFFFFFF / USER_HZ) - rt_expires = 0x7FFFFFFF / USER_HZ; + if (addrconf_finite_timeout(rt_expires)) + rt_expires *= HZ; - if (pinfo->onlink) { - struct rt6_info *rt; rt = rt6_lookup(dev_net(dev), &pinfo->prefix, NULL, dev->ifindex, 1); @@ -1794,7 +1793,7 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len) if (valid_lft == 0) { ip6_del_rt(rt); rt = NULL; - } else if (~rt_expires) { + } else if (addrconf_finite_timeout(rt_expires)) { /* not infinity */ rt->rt6i_expires = jiffies + rt_expires; rt->rt6i_flags |= RTF_EXPIRES; @@ -1803,9 +1802,9 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len) rt->rt6i_expires = 0; } } else if (valid_lft) { - int flags = RTF_ADDRCONF | RTF_PREFIX_RT; clock_t expires = 0; - if (~rt_expires) { + int flags = RTF_ADDRCONF | RTF_PREFIX_RT; + if (addrconf_finite_timeout(rt_expires)) { /* not infinity */ flags |= RTF_EXPIRES; expires = jiffies_to_clock_t(rt_expires); @@ -2036,6 +2035,7 @@ static int inet6_addr_add(struct net *net, int ifindex, struct in6_addr *pfx, int scope; u32 flags; clock_t expires; + unsigned long timeout; ASSERT_RTNL(); @@ -2055,22 +2055,23 @@ static int inet6_addr_add(struct net *net, int ifindex, struct in6_addr *pfx, scope = ipv6_addr_scope(pfx); - if (valid_lft == INFINITY_LIFE_TIME) { - ifa_flags |= IFA_F_PERMANENT; - flags = 0; - expires = 0; - } else { - if (valid_lft >= 0x7FFFFFFF/HZ) - valid_lft = 0x7FFFFFFF/HZ; + timeout = addrconf_timeout_fixup(valid_lft, HZ); + if (addrconf_finite_timeout(timeout)) { + expires = jiffies_to_clock_t(timeout * HZ); + valid_lft = timeout; flags = RTF_EXPIRES; - expires = jiffies_to_clock_t(valid_lft * HZ); + } else { + expires = 0; + flags = 0; + ifa_flags |= IFA_F_PERMANENT; } - if (prefered_lft == 0) - ifa_flags |= IFA_F_DEPRECATED; - else if ((prefered_lft >= 0x7FFFFFFF/HZ) && - (prefered_lft != INFINITY_LIFE_TIME)) - prefered_lft = 0x7FFFFFFF/HZ; + timeout = addrconf_timeout_fixup(prefered_lft, HZ); + if (addrconf_finite_timeout(timeout)) { + if (timeout == 0) + ifa_flags |= IFA_F_DEPRECATED; + prefered_lft = timeout; + } ifp = ipv6_add_addr(idev, pfx, plen, scope, ifa_flags); @@ -3175,26 +3176,28 @@ static int inet6_addr_modify(struct inet6_ifaddr *ifp, u8 ifa_flags, { u32 flags; clock_t expires; + unsigned long timeout; if (!valid_lft || (prefered_lft > valid_lft)) return -EINVAL; - if (valid_lft == INFINITY_LIFE_TIME) { - ifa_flags |= IFA_F_PERMANENT; - flags = 0; - expires = 0; - } else { - if (valid_lft >= 0x7FFFFFFF/HZ) - valid_lft = 0x7FFFFFFF/HZ; + timeout = addrconf_timeout_fixup(valid_lft, HZ); + if (addrconf_finite_timeout(timeout)) { + expires = jiffies_to_clock_t(timeout * HZ); + valid_lft = timeout; flags = RTF_EXPIRES; - expires = jiffies_to_clock_t(valid_lft * HZ); + } else { + expires = 0; + flags = 0; + ifa_flags |= IFA_F_PERMANENT; } - if (prefered_lft == 0) - ifa_flags |= IFA_F_DEPRECATED; - else if ((prefered_lft >= 0x7FFFFFFF/HZ) && - (prefered_lft != INFINITY_LIFE_TIME)) - prefered_lft = 0x7FFFFFFF/HZ; + timeout = addrconf_timeout_fixup(prefered_lft, HZ); + if (addrconf_finite_timeout(timeout)) { + if (timeout == 0) + ifa_flags |= IFA_F_DEPRECATED; + prefered_lft = timeout; + } spin_lock_bh(&ifp->lock); ifp->flags = (ifp->flags & ~(IFA_F_DEPRECATED | IFA_F_PERMANENT | IFA_F_NODAD | IFA_F_HOMEADDRESS)) | ifa_flags; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 48534c6c073..220cffe9e63 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -446,7 +446,7 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, struct route_info *rinfo = (struct route_info *) opt; struct in6_addr prefix_buf, *prefix; unsigned int pref; - u32 lifetime; + unsigned long lifetime; struct rt6_info *rt; if (len < sizeof(struct route_info)) { @@ -472,13 +472,7 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, if (pref == ICMPV6_ROUTER_PREF_INVALID) pref = ICMPV6_ROUTER_PREF_MEDIUM; - lifetime = ntohl(rinfo->lifetime); - if (lifetime == 0xffffffff) { - /* infinity */ - } else if (lifetime > 0x7fffffff/HZ - 1) { - /* Avoid arithmetic overflow */ - lifetime = 0x7fffffff/HZ - 1; - } + lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ); if (rinfo->length == 3) prefix = (struct in6_addr *)rinfo->prefix; @@ -506,7 +500,7 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref); if (rt) { - if (lifetime == 0xffffffff) { + if (!addrconf_finite_timeout(lifetime)) { rt->rt6i_flags &= ~RTF_EXPIRES; } else { rt->rt6i_expires = jiffies + HZ * lifetime; -- cgit v1.2.3 From 05335c2220c4911b69cb1bdd79e603ab08088372 Mon Sep 17 00:00:00 2001 From: Yang Hongyang Date: Wed, 28 May 2008 16:23:47 +0800 Subject: [IPV6]: Fix the return value of get destination options with NULL data pointer If we pass NULL data buffer to getsockopt(), it will return 0, and the option length is set to -EFAULT: getsockopt(sk, IPPROTO_IPV6, IPV6_DSTOPTS, NULL, &len); This is because ipv6_getsockopt_sticky() will return -EFAULT or -EINVAL if some error occur. This patch fix this problem. Signed-off-by: Yang Hongyang Signed-off-by: YOSHIFUJI Hideaki --- net/ipv6/ipv6_sockglue.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 56d55fecf8e..aa7bedf780e 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -975,6 +975,9 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, len = ipv6_getsockopt_sticky(sk, np->opt, optname, optval, len); release_sock(sk); + /* check if ipv6_getsockopt_sticky() returns err code */ + if (len < 0) + return len; return put_user(len, optlen); } -- cgit v1.2.3 From 95b496b66615d8c43f77702049b1bd01e2f06595 Mon Sep 17 00:00:00 2001 From: Yang Hongyang Date: Wed, 28 May 2008 16:27:28 +0800 Subject: [IPV6]: Fix the data length of get destination options with short length If get destination options with length which is not enough for that option,getsockopt() will still return the real length of the option, which is larger then the buffer space. This is because ipv6_getsockopt_sticky() returns the real length of the option. This patch fix this problem. Signed-off-by: Yang Hongyang Signed-off-by: YOSHIFUJI Hideaki --- net/ipv6/ipv6_sockglue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index aa7bedf780e..9293b9f0ac2 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -832,7 +832,7 @@ static int ipv6_getsockopt_sticky(struct sock *sk, struct ipv6_txoptions *opt, len = min_t(unsigned int, len, ipv6_optlen(hdr)); if (copy_to_user(optval, hdr, len)) return -EFAULT; - return ipv6_optlen(hdr); + return len; } static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, -- cgit v1.2.3 From 187e38384c4abfbbb1b880fab234d16c2df23a25 Mon Sep 17 00:00:00 2001 From: YOSHIFUJI Hideaki Date: Wed, 4 Jun 2008 13:01:37 +0900 Subject: [IPV6]: Check outgoing interface even if source address is unspecified. The outgoing interface index (ipi6_ifindex) in IPV6_PKTINFO ancillary data, is not checked if the source address (ipi6_addr) is unspecified. If the ipi6_ifindex is the not-exist interface, it should be fail. Based on patch from Shan Wei and Brian Haley . Signed-off-by: Shan Wei Signed-off-by: Brian Haley Signed-off-by: YOSHIFUJI Hideaki --- net/ipv6/datagram.c | 42 ++++++++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 94fa6ae77cf..53e3883f766 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -509,7 +509,6 @@ int datagram_send_ctl(struct msghdr *msg, struct flowi *fl, for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) { int addr_type; - struct net_device *dev = NULL; if (!CMSG_OK(msg, cmsg)) { err = -EINVAL; @@ -522,6 +521,9 @@ int datagram_send_ctl(struct msghdr *msg, struct flowi *fl, switch (cmsg->cmsg_type) { case IPV6_PKTINFO: case IPV6_2292PKTINFO: + { + struct net_device *dev = NULL; + if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct in6_pktinfo))) { err = -EINVAL; goto exit_f; @@ -535,32 +537,32 @@ int datagram_send_ctl(struct msghdr *msg, struct flowi *fl, fl->oif = src_info->ipi6_ifindex; } - addr_type = ipv6_addr_type(&src_info->ipi6_addr); + addr_type = __ipv6_addr_type(&src_info->ipi6_addr); - if (addr_type == IPV6_ADDR_ANY) - break; + if (fl->oif) { + dev = dev_get_by_index(&init_net, fl->oif); + if (!dev) + return -ENODEV; + } else if (addr_type & IPV6_ADDR_LINKLOCAL) + return -EINVAL; - if (addr_type & IPV6_ADDR_LINKLOCAL) { - if (!src_info->ipi6_ifindex) - return -EINVAL; - else { - dev = dev_get_by_index(&init_net, src_info->ipi6_ifindex); - if (!dev) - return -ENODEV; - } - } - if (!ipv6_chk_addr(&init_net, &src_info->ipi6_addr, - dev, 0)) { - if (dev) - dev_put(dev); - err = -EINVAL; - goto exit_f; + if (addr_type != IPV6_ADDR_ANY) { + int strict = __ipv6_addr_src_scope(addr_type) <= IPV6_ADDR_SCOPE_LINKLOCAL; + if (!ipv6_chk_addr(&init_net, &src_info->ipi6_addr, + strict ? dev : NULL, 0)) + err = -EINVAL; + else + ipv6_addr_copy(&fl->fl6_src, &src_info->ipi6_addr); } + if (dev) dev_put(dev); - ipv6_addr_copy(&fl->fl6_src, &src_info->ipi6_addr); + if (err) + goto exit_f; + break; + } case IPV6_FLOWINFO: if (cmsg->cmsg_len < CMSG_LEN(4)) { -- cgit v1.2.3 From 91e1908f569dd96a25a3947de8771e6cc93999dd Mon Sep 17 00:00:00 2001 From: YOSHIFUJI Hideaki Date: Wed, 4 Jun 2008 13:02:49 +0900 Subject: [IPV6] NETNS: Handle ancillary data in appropriate namespace. Signed-off-by: YOSHIFUJI Hideaki --- net/ipv6/datagram.c | 7 ++++--- net/ipv6/ip6_flowlabel.c | 2 +- net/ipv6/ipv6_sockglue.c | 2 +- net/ipv6/raw.c | 2 +- net/ipv6/udp.c | 2 +- 5 files changed, 8 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 53e3883f766..b9c2de84a8a 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -496,7 +496,8 @@ int datagram_recv_ctl(struct sock *sk, struct msghdr *msg, struct sk_buff *skb) return 0; } -int datagram_send_ctl(struct msghdr *msg, struct flowi *fl, +int datagram_send_ctl(struct net *net, + struct msghdr *msg, struct flowi *fl, struct ipv6_txoptions *opt, int *hlimit, int *tclass) { @@ -540,7 +541,7 @@ int datagram_send_ctl(struct msghdr *msg, struct flowi *fl, addr_type = __ipv6_addr_type(&src_info->ipi6_addr); if (fl->oif) { - dev = dev_get_by_index(&init_net, fl->oif); + dev = dev_get_by_index(net, fl->oif); if (!dev) return -ENODEV; } else if (addr_type & IPV6_ADDR_LINKLOCAL) @@ -548,7 +549,7 @@ int datagram_send_ctl(struct msghdr *msg, struct flowi *fl, if (addr_type != IPV6_ADDR_ANY) { int strict = __ipv6_addr_src_scope(addr_type) <= IPV6_ADDR_SCOPE_LINKLOCAL; - if (!ipv6_chk_addr(&init_net, &src_info->ipi6_addr, + if (!ipv6_chk_addr(net, &src_info->ipi6_addr, strict ? dev : NULL, 0)) err = -EINVAL; else diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index eb7a940310f..37a4e777e34 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -354,7 +354,7 @@ fl_create(struct net *net, struct in6_flowlabel_req *freq, char __user *optval, msg.msg_control = (void*)(fl->opt+1); flowi.oif = 0; - err = datagram_send_ctl(&msg, &flowi, fl->opt, &junk, &junk); + err = datagram_send_ctl(net, &msg, &flowi, fl->opt, &junk, &junk); if (err) goto done; err = -EINVAL; diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 9293b9f0ac2..3eef8e5b363 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -416,7 +416,7 @@ sticky_done: msg.msg_controllen = optlen; msg.msg_control = (void*)(opt+1); - retv = datagram_send_ctl(&msg, &fl, opt, &junk, &junk); + retv = datagram_send_ctl(net, &msg, &fl, opt, &junk, &junk); if (retv) goto done; update: diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 232e0dc45bf..603df76e052 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -813,7 +813,7 @@ static int rawv6_sendmsg(struct kiocb *iocb, struct sock *sk, memset(opt, 0, sizeof(struct ipv6_txoptions)); opt->tot_len = sizeof(struct ipv6_txoptions); - err = datagram_send_ctl(msg, &fl, opt, &hlimit, &tclass); + err = datagram_send_ctl(sock_net(sk), msg, &fl, opt, &hlimit, &tclass); if (err < 0) { fl6_sock_release(flowlabel); return err; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 47123bf5eb0..1b35c472200 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -731,7 +731,7 @@ do_udp_sendmsg: memset(opt, 0, sizeof(struct ipv6_txoptions)); opt->tot_len = sizeof(*opt); - err = datagram_send_ctl(msg, &fl, opt, &hlimit, &tclass); + err = datagram_send_ctl(sock_net(sk), msg, &fl, opt, &hlimit, &tclass); if (err < 0) { fl6_sock_release(flowlabel); return err; -- cgit v1.2.3 From 49d074f4009a7b5ce9c17b040f978abcb4d7f6f6 Mon Sep 17 00:00:00 2001 From: "Denis V. Lunev" Date: Wed, 4 Jun 2008 15:49:06 +0400 Subject: [IPV6]: Do not change protocol for raw IPv6 sockets. It is not allowed to change underlying protocol for int fd = socket(PF_INET6, SOCK_RAW, IPPROTO_UDP); Signed-off-by: Denis V. Lunev Signed-off-by: YOSHIFUJI Hideaki --- net/ipv6/ipv6_sockglue.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 3eef8e5b363..1afe210d628 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -161,6 +161,9 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, struct ipv6_txoptions *opt; struct sk_buff *pktopt; + if (sk->sk_type == SOCK_RAW) + break; + if (sk->sk_protocol != IPPROTO_UDP && sk->sk_protocol != IPPROTO_UDPLITE && sk->sk_protocol != IPPROTO_TCP) -- cgit v1.2.3 From 36d926b94a9908937593e5669162305a071b9cc3 Mon Sep 17 00:00:00 2001 From: "Denis V. Lunev" Date: Wed, 4 Jun 2008 15:49:07 +0400 Subject: [IPV6]: inet_sk(sk)->cork.opt leak IPv6 UDP sockets wth IPv4 mapped address use udp_sendmsg to send the data actually. In this case ip_flush_pending_frames should be called instead of ip6_flush_pending_frames. Signed-off-by: Denis V. Lunev Signed-off-by: YOSHIFUJI Hideaki --- net/ipv4/udp.c | 3 ++- net/ipv6/udp.c | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index db1cb7c96d6..56fcda3694b 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -420,7 +420,7 @@ void udp_err(struct sk_buff *skb, u32 info) /* * Throw away all pending data and cancel the corking. Socket is locked. */ -static void udp_flush_pending_frames(struct sock *sk) +void udp_flush_pending_frames(struct sock *sk) { struct udp_sock *up = udp_sk(sk); @@ -430,6 +430,7 @@ static void udp_flush_pending_frames(struct sock *sk) ip_flush_pending_frames(sk); } } +EXPORT_SYMBOL(udp_flush_pending_frames); /** * udp4_hwcsum_outgoing - handle outgoing HW checksumming diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 1b35c472200..dd309626ae9 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -534,7 +534,9 @@ static void udp_v6_flush_pending_frames(struct sock *sk) { struct udp_sock *up = udp_sk(sk); - if (up->pending) { + if (up->pending == AF_INET) + udp_flush_pending_frames(sk); + else if (up->pending) { up->len = 0; up->pending = 0; ip6_flush_pending_frames(sk); -- cgit v1.2.3 From 9596cc826e2e52bfc318ca37a6c52fe3d72990a3 Mon Sep 17 00:00:00 2001 From: "Denis V. Lunev" Date: Wed, 4 Jun 2008 15:49:08 +0400 Subject: [IPV6]: Do not change protocol for UDPv6 sockets with pending sent data. Signed-off-by: Denis V. Lunev Signed-off-by: YOSHIFUJI Hideaki --- net/ipv6/ipv6_sockglue.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 1afe210d628..26b83e512a0 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -164,9 +164,14 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, if (sk->sk_type == SOCK_RAW) break; - if (sk->sk_protocol != IPPROTO_UDP && - sk->sk_protocol != IPPROTO_UDPLITE && - sk->sk_protocol != IPPROTO_TCP) + if (sk->sk_protocol == IPPROTO_UDP || + sk->sk_protocol == IPPROTO_UDPLITE) { + struct udp_sock *up = udp_sk(sk); + if (up->pending == AF_INET6) { + retv = -EBUSY; + break; + } + } else if (sk->sk_protocol != IPPROTO_TCP) break; if (sk->sk_state != TCP_ESTABLISHED) { -- cgit v1.2.3 From a13366c632132bb9f8f2950a79773d8f68f4871e Mon Sep 17 00:00:00 2001 From: Adrian-Ken Rueegsegger Date: Wed, 4 Jun 2008 12:04:55 -0700 Subject: xfrm: xfrm_algo: correct usage of RIPEMD-160 This patch fixes the usage of RIPEMD-160 in xfrm_algo which in turn allows hmac(rmd160) to be used as authentication mechanism in IPsec ESP and AH (see RFC 2857). Signed-off-by: Adrian-Ken Rueegsegger Acked-by: Herbert Xu Signed-off-by: David S. Miller --- net/xfrm/xfrm_algo.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_algo.c b/net/xfrm/xfrm_algo.c index ac765dd9c7f..23a2cc04b8c 100644 --- a/net/xfrm/xfrm_algo.c +++ b/net/xfrm/xfrm_algo.c @@ -200,8 +200,8 @@ static struct xfrm_algo_desc aalg_list[] = { } }, { - .name = "hmac(ripemd160)", - .compat = "ripemd160", + .name = "hmac(rmd160)", + .compat = "rmd160", .uinfo = { .auth = { -- cgit v1.2.3 From a6604471db5e7a33474a7f16c64d6b118fae3e74 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Wed, 4 Jun 2008 12:07:44 -0700 Subject: tcp: fix skb vs fack_count out-of-sync condition MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This bug is able to corrupt fackets_out in very rare cases. In order for this to cause corruption: 1) DSACK in the middle of previous SACK block must be generated. 2) In order to take that particular branch, part or all of the DSACKed segment must already be SACKed so that we have that in cache in the first place. 3) The new info must be top enough so that fackets_out will be updated on this iteration. ...then fack_count is updated while skb wasn't, then we walk again that particular segment thus updating fack_count twice for a single skb and finally that value is assigned to fackets_out by tcp_sacktag_one. It is safe to call tcp_sacktag_one just once for a segment (at DSACK), no need to call again for plain SACK. Potential problem of the miscount are limited to premature entry to recovery and to inflated reordering metric (which could even cancel each other out in the most the luckiest scenarios :-)). Both are quite insignificant in worst case too and there exists also code to reset them (fackets_out once sacked_out becomes zero and reordering metric on RTO). This has been reported by a number of people, because it occurred quite rarely, it has been very evasive. Andy Furniss was able to get it to occur couple of times so that a bit more info was collected about the problem using a debug patch, though it still required lot of checking around. Thanks also to others who have tried to help here. This is listed as Bugzilla #10346. The bug was introduced by me in commit 68f8353b48 ([TCP]: Rewrite SACK block processing & sack_recv_cache use), I probably thought back then that there's need to scan that entry twice or didn't dare to make it go through it just once there. Going through twice would have required restoring fack_count after the walk but as noted above, I chose to drop the additional walk step altogether here. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 54a0b741278..eba873e9b56 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1392,9 +1392,9 @@ static struct sk_buff *tcp_maybe_skipping_dsack(struct sk_buff *skb, if (before(next_dup->start_seq, skip_to_seq)) { skb = tcp_sacktag_skip(skb, sk, next_dup->start_seq, fack_count); - tcp_sacktag_walk(skb, sk, NULL, - next_dup->start_seq, next_dup->end_seq, - 1, fack_count, reord, flag); + skb = tcp_sacktag_walk(skb, sk, NULL, + next_dup->start_seq, next_dup->end_seq, + 1, fack_count, reord, flag); } return skb; -- cgit v1.2.3 From 4141ddc02a92a6e3e5793601554c6033e83c25b9 Mon Sep 17 00:00:00 2001 From: Gui Jianfeng Date: Wed, 4 Jun 2008 12:37:33 -0700 Subject: sctp: retran_path update bug fix If the current retran_path is the only active one, it should update it to the the next inactive one. Signed-off-by: Gui Jianfeng Signed-off-by: Vlad Yasevich Signed-off-by: David S. Miller --- net/sctp/associola.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/sctp/associola.c b/net/sctp/associola.c index b4cd2b71953..532634861db 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -1203,6 +1203,9 @@ void sctp_assoc_update_retran_path(struct sctp_association *asoc) struct list_head *head = &asoc->peer.transport_addr_list; struct list_head *pos; + if (asoc->peer.transport_count == 1) + return; + /* Find the next transport in a round-robin fashion. */ t = asoc->peer.retran_path; pos = &t->transports; @@ -1217,6 +1220,15 @@ void sctp_assoc_update_retran_path(struct sctp_association *asoc) t = list_entry(pos, struct sctp_transport, transports); + /* We have exhausted the list, but didn't find any + * other active transports. If so, use the next + * transport. + */ + if (t == asoc->peer.retran_path) { + t = next; + break; + } + /* Try to find an active transport. */ if ((t->state == SCTP_ACTIVE) || @@ -1229,15 +1241,6 @@ void sctp_assoc_update_retran_path(struct sctp_association *asoc) if (!next) next = t; } - - /* We have exhausted the list, but didn't find any - * other active transports. If so, use the next - * transport. - */ - if (t == asoc->peer.retran_path) { - t = next; - break; - } } asoc->peer.retran_path = t; -- cgit v1.2.3 From 159c6bea37c54dfae44409467e0f17600722d541 Mon Sep 17 00:00:00 2001 From: Gui Jianfeng Date: Wed, 4 Jun 2008 12:38:07 -0700 Subject: sctp: Move sctp_v4_dst_saddr out of loop There's no need to execute sctp_v4_dst_saddr() for each iteration, just move it out of loop. Signed-off-by: Gui Jianfeng Signed-off-by: Vlad Yasevich Signed-off-by: David S. Miller --- net/sctp/protocol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 13ee7fa92e0..56bdaf7fc42 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -470,11 +470,11 @@ static struct dst_entry *sctp_v4_get_dst(struct sctp_association *asoc, /* Walk through the bind address list and look for a bind * address that matches the source address of the returned dst. */ + sctp_v4_dst_saddr(&dst_saddr, dst, htons(bp->port)); rcu_read_lock(); list_for_each_entry_rcu(laddr, &bp->address_list, list) { if (!laddr->valid || (laddr->state != SCTP_ADDR_SRC)) continue; - sctp_v4_dst_saddr(&dst_saddr, dst, htons(bp->port)); if (sctp_v4_cmp_addr(&dst_saddr, &laddr->a)) goto out_unlock; } -- cgit v1.2.3 From a6465234814efda9ed1dccdba852953f7508e827 Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Wed, 4 Jun 2008 12:38:43 -0700 Subject: sctp: Correctly implement Fast Recovery cwnd manipulations. Correctly keep track of Fast Recovery state and do not reduce congestion window multiple times during sucht state. Signed-off-by: Vlad Yasevich Tested-by: Wei Yongjun Signed-off-by: David S. Miller --- net/sctp/transport.c | 44 ++++++++++++++++++++++++++++++++------------ 1 file changed, 32 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/sctp/transport.c b/net/sctp/transport.c index 62082e7b797..9647fb27722 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -79,6 +79,7 @@ static struct sctp_transport *sctp_transport_init(struct sctp_transport *peer, peer->rttvar = 0; peer->srtt = 0; peer->rto_pending = 0; + peer->fast_recovery = 0; peer->last_time_heard = jiffies; peer->last_time_used = jiffies; @@ -403,11 +404,16 @@ void sctp_transport_raise_cwnd(struct sctp_transport *transport, cwnd = transport->cwnd; flight_size = transport->flight_size; + /* See if we need to exit Fast Recovery first */ + if (transport->fast_recovery && + TSN_lte(transport->fast_recovery_exit, sack_ctsn)) + transport->fast_recovery = 0; + /* The appropriate cwnd increase algorithm is performed if, and only - * if the cumulative TSN has advanced and the congestion window is + * if the cumulative TSN whould advanced and the congestion window is * being fully utilized. */ - if ((transport->asoc->ctsn_ack_point >= sack_ctsn) || + if (TSN_lte(sack_ctsn, transport->asoc->ctsn_ack_point) || (flight_size < cwnd)) return; @@ -416,17 +422,23 @@ void sctp_transport_raise_cwnd(struct sctp_transport *transport, pmtu = transport->asoc->pathmtu; if (cwnd <= ssthresh) { - /* RFC 2960 7.2.1, sctpimpguide-05 2.14.2 When cwnd is less - * than or equal to ssthresh an SCTP endpoint MUST use the - * slow start algorithm to increase cwnd only if the current - * congestion window is being fully utilized and an incoming - * SACK advances the Cumulative TSN Ack Point. Only when these - * two conditions are met can the cwnd be increased otherwise - * the cwnd MUST not be increased. If these conditions are met - * then cwnd MUST be increased by at most the lesser of - * 1) the total size of the previously outstanding DATA - * chunk(s) acknowledged, and 2) the destination's path MTU. + /* RFC 4960 7.2.1 + * o When cwnd is less than or equal to ssthresh, an SCTP + * endpoint MUST use the slow-start algorithm to increase + * cwnd only if the current congestion window is being fully + * utilized, an incoming SACK advances the Cumulative TSN + * Ack Point, and the data sender is not in Fast Recovery. + * Only when these three conditions are met can the cwnd be + * increased; otherwise, the cwnd MUST not be increased. + * If these conditions are met, then cwnd MUST be increased + * by, at most, the lesser of 1) the total size of the + * previously outstanding DATA chunk(s) acknowledged, and + * 2) the destination's path MTU. This upper bound protects + * against the ACK-Splitting attack outlined in [SAVAGE99]. */ + if (transport->fast_recovery) + return; + if (bytes_acked > pmtu) cwnd += pmtu; else @@ -502,6 +514,13 @@ void sctp_transport_lower_cwnd(struct sctp_transport *transport, * cwnd = ssthresh * partial_bytes_acked = 0 */ + if (transport->fast_recovery) + return; + + /* Mark Fast recovery */ + transport->fast_recovery = 1; + transport->fast_recovery_exit = transport->asoc->next_tsn - 1; + transport->ssthresh = max(transport->cwnd/2, 4*transport->asoc->pathmtu); transport->cwnd = transport->ssthresh; @@ -586,6 +605,7 @@ void sctp_transport_reset(struct sctp_transport *t) t->flight_size = 0; t->error_count = 0; t->rto_pending = 0; + t->fast_recovery = 0; /* Initialize the state information for SFR-CACC */ t->cacc.changeover_active = 0; -- cgit v1.2.3 From 62aeaff5ccd96462b7077046357a6d7886175a57 Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Wed, 4 Jun 2008 12:39:11 -0700 Subject: sctp: Start T3-RTX timer when fast retransmitting lowest TSN When we are trying to fast retransmit the lowest outstanding TSN, we need to restart the T3-RTX timer, so that subsequent timeouts will correctly tag all the packets necessary for retransmissions. Signed-off-by: Vlad Yasevich Tested-by: Wei Yongjun Signed-off-by: David S. Miller --- net/sctp/outqueue.c | 42 +++++++++++++++++++++++++++++++----------- net/sctp/transport.c | 4 ++-- 2 files changed, 33 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index 59edfd25a19..5d3c441e84d 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -208,6 +208,7 @@ void sctp_outq_init(struct sctp_association *asoc, struct sctp_outq *q) INIT_LIST_HEAD(&q->sacked); INIT_LIST_HEAD(&q->abandoned); + q->fast_rtx = 0; q->outstanding_bytes = 0; q->empty = 1; q->cork = 0; @@ -500,6 +501,7 @@ void sctp_retransmit(struct sctp_outq *q, struct sctp_transport *transport, case SCTP_RTXR_FAST_RTX: SCTP_INC_STATS(SCTP_MIB_FAST_RETRANSMITS); sctp_transport_lower_cwnd(transport, SCTP_LOWER_CWND_FAST_RTX); + q->fast_rtx = 1; break; case SCTP_RTXR_PMTUD: SCTP_INC_STATS(SCTP_MIB_PMTUD_RETRANSMITS); @@ -543,10 +545,13 @@ static int sctp_outq_flush_rtx(struct sctp_outq *q, struct sctp_packet *pkt, sctp_xmit_t status; struct sctp_chunk *chunk, *chunk1; struct sctp_association *asoc; + int fast_rtx; int error = 0; + int timer = 0; asoc = q->asoc; lqueue = &q->retransmit; + fast_rtx = q->fast_rtx; /* RFC 2960 6.3.3 Handle T3-rtx Expiration * @@ -587,13 +592,12 @@ static int sctp_outq_flush_rtx(struct sctp_outq *q, struct sctp_packet *pkt, switch (status) { case SCTP_XMIT_PMTU_FULL: /* Send this packet. */ - if ((error = sctp_packet_transmit(pkt)) == 0) - *start_timer = 1; + error = sctp_packet_transmit(pkt); /* If we are retransmitting, we should only * send a single packet. */ - if (rtx_timeout) { + if (rtx_timeout || fast_rtx) { list_add(lchunk, lqueue); lchunk = NULL; } @@ -603,8 +607,7 @@ static int sctp_outq_flush_rtx(struct sctp_outq *q, struct sctp_packet *pkt, case SCTP_XMIT_RWND_FULL: /* Send this packet. */ - if ((error = sctp_packet_transmit(pkt)) == 0) - *start_timer = 1; + error = sctp_packet_transmit(pkt); /* Stop sending DATA as there is no more room * at the receiver. @@ -615,8 +618,7 @@ static int sctp_outq_flush_rtx(struct sctp_outq *q, struct sctp_packet *pkt, case SCTP_XMIT_NAGLE_DELAY: /* Send this packet. */ - if ((error = sctp_packet_transmit(pkt)) == 0) - *start_timer = 1; + error = sctp_packet_transmit(pkt); /* Stop sending DATA because of nagle delay. */ list_add(lchunk, lqueue); @@ -635,7 +637,14 @@ static int sctp_outq_flush_rtx(struct sctp_outq *q, struct sctp_packet *pkt, if (chunk->fast_retransmit > 0) chunk->fast_retransmit = -1; - *start_timer = 1; + /* Force start T3-rtx timer when fast retransmitting + * the earliest outstanding TSN + */ + if (!timer && fast_rtx && + ntohl(chunk->subh.data_hdr->tsn) == + asoc->ctsn_ack_point + 1) + timer = 2; + q->empty = 0; /* Retrieve a new chunk to bundle. */ @@ -643,12 +652,16 @@ static int sctp_outq_flush_rtx(struct sctp_outq *q, struct sctp_packet *pkt, break; } + /* Set the timer if there were no errors */ + if (!error && !timer) + timer = 1; + /* If we are here due to a retransmit timeout or a fast * retransmit and if there are any chunks left in the retransmit * queue that could not fit in the PMTU sized packet, they need * to be marked as ineligible for a subsequent fast retransmit. */ - if (rtx_timeout && !lchunk) { + if (rtx_timeout && fast_rtx) { list_for_each_entry(chunk1, lqueue, transmitted_list) { if (chunk1->fast_retransmit > 0) chunk1->fast_retransmit = -1; @@ -656,6 +669,12 @@ static int sctp_outq_flush_rtx(struct sctp_outq *q, struct sctp_packet *pkt, } } + *start_timer = timer; + + /* Clear fast retransmit hint */ + if (fast_rtx) + q->fast_rtx = 0; + return error; } @@ -862,7 +881,8 @@ int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout) rtx_timeout, &start_timer); if (start_timer) - sctp_transport_reset_timers(transport); + sctp_transport_reset_timers(transport, + start_timer-1); /* This can happen on COOKIE-ECHO resend. Only * one chunk can get bundled with a COOKIE-ECHO. @@ -977,7 +997,7 @@ int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout) list_add_tail(&chunk->transmitted_list, &transport->transmitted); - sctp_transport_reset_timers(transport); + sctp_transport_reset_timers(transport, start_timer-1); q->empty = 0; diff --git a/net/sctp/transport.c b/net/sctp/transport.c index 9647fb27722..3f34f61221e 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -191,7 +191,7 @@ static void sctp_transport_destroy(struct sctp_transport *transport) /* Start T3_rtx timer if it is not already running and update the heartbeat * timer. This routine is called every time a DATA chunk is sent. */ -void sctp_transport_reset_timers(struct sctp_transport *transport) +void sctp_transport_reset_timers(struct sctp_transport *transport, int force) { /* RFC 2960 6.3.2 Retransmission Timer Rules * @@ -201,7 +201,7 @@ void sctp_transport_reset_timers(struct sctp_transport *transport) * address. */ - if (!timer_pending(&transport->T3_rtx_timer)) + if (force || !timer_pending(&transport->T3_rtx_timer)) if (!mod_timer(&transport->T3_rtx_timer, jiffies + transport->rto)) sctp_transport_hold(transport); -- cgit v1.2.3 From 8b750ce54bd8ab5f75d519ee450e1b0c5226ebe9 Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Wed, 4 Jun 2008 12:39:36 -0700 Subject: sctp: Flush the queue only once during fast retransmit. When fast retransmit is triggered by a sack, we should flush the queue only once so that only 1 retransmit happens. Also, since we could potentially have non-fast-rtx chunks on the retransmit queue, we need make sure any chunks eligable for fast retransmit are sent first during fast retransmission. Signed-off-by: Vlad Yasevich Tested-by: Wei Yongjun Signed-off-by: David S. Miller --- net/sctp/outqueue.c | 82 +++++++++++++++++++++++++++++++---------------------- 1 file changed, 48 insertions(+), 34 deletions(-) (limited to 'net') diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index 5d3c441e84d..ace6770e904 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -520,9 +520,15 @@ void sctp_retransmit(struct sctp_outq *q, struct sctp_transport *transport, * the sender SHOULD try to advance the "Advanced.Peer.Ack.Point" by * following the procedures outlined in C1 - C5. */ - sctp_generate_fwdtsn(q, q->asoc->ctsn_ack_point); + if (reason == SCTP_RTXR_T3_RTX) + sctp_generate_fwdtsn(q, q->asoc->ctsn_ack_point); - error = sctp_outq_flush(q, /* rtx_timeout */ 1); + /* Flush the queues only on timeout, since fast_rtx is only + * triggered during sack processing and the queue + * will be flushed at the end. + */ + if (reason != SCTP_RTXR_FAST_RTX) + error = sctp_outq_flush(q, /* rtx_timeout */ 1); if (error) q->asoc->base.sk->sk_err = -error; @@ -540,7 +546,6 @@ static int sctp_outq_flush_rtx(struct sctp_outq *q, struct sctp_packet *pkt, int rtx_timeout, int *start_timer) { struct list_head *lqueue; - struct list_head *lchunk; struct sctp_transport *transport = pkt->transport; sctp_xmit_t status; struct sctp_chunk *chunk, *chunk1; @@ -548,12 +553,16 @@ static int sctp_outq_flush_rtx(struct sctp_outq *q, struct sctp_packet *pkt, int fast_rtx; int error = 0; int timer = 0; + int done = 0; asoc = q->asoc; lqueue = &q->retransmit; fast_rtx = q->fast_rtx; - /* RFC 2960 6.3.3 Handle T3-rtx Expiration + /* This loop handles time-out retransmissions, fast retransmissions, + * and retransmissions due to opening of whindow. + * + * RFC 2960 6.3.3 Handle T3-rtx Expiration * * E3) Determine how many of the earliest (i.e., lowest TSN) * outstanding DATA chunks for the address for which the @@ -568,12 +577,12 @@ static int sctp_outq_flush_rtx(struct sctp_outq *q, struct sctp_packet *pkt, * [Just to be painfully clear, if we are retransmitting * because a timeout just happened, we should send only ONE * packet of retransmitted data.] + * + * For fast retransmissions we also send only ONE packet. However, + * if we are just flushing the queue due to open window, we'll + * try to send as much as possible. */ - lchunk = sctp_list_dequeue(lqueue); - - while (lchunk) { - chunk = list_entry(lchunk, struct sctp_chunk, - transmitted_list); + list_for_each_entry_safe(chunk, chunk1, lqueue, transmitted_list) { /* Make sure that Gap Acked TSNs are not retransmitted. A * simple approach is just to move such TSNs out of the @@ -581,11 +590,18 @@ static int sctp_outq_flush_rtx(struct sctp_outq *q, struct sctp_packet *pkt, * next chunk. */ if (chunk->tsn_gap_acked) { - list_add_tail(lchunk, &transport->transmitted); - lchunk = sctp_list_dequeue(lqueue); + list_del(&chunk->transmitted_list); + list_add_tail(&chunk->transmitted_list, + &transport->transmitted); continue; } + /* If we are doing fast retransmit, ignore non-fast_rtransmit + * chunks + */ + if (fast_rtx && !chunk->fast_retransmit) + continue; + /* Attempt to append this chunk to the packet. */ status = sctp_packet_append_chunk(pkt, chunk); @@ -597,12 +613,10 @@ static int sctp_outq_flush_rtx(struct sctp_outq *q, struct sctp_packet *pkt, /* If we are retransmitting, we should only * send a single packet. */ - if (rtx_timeout || fast_rtx) { - list_add(lchunk, lqueue); - lchunk = NULL; - } + if (rtx_timeout || fast_rtx) + done = 1; - /* Bundle lchunk in the next round. */ + /* Bundle next chunk in the next round. */ break; case SCTP_XMIT_RWND_FULL: @@ -612,8 +626,7 @@ static int sctp_outq_flush_rtx(struct sctp_outq *q, struct sctp_packet *pkt, /* Stop sending DATA as there is no more room * at the receiver. */ - list_add(lchunk, lqueue); - lchunk = NULL; + done = 1; break; case SCTP_XMIT_NAGLE_DELAY: @@ -621,15 +634,16 @@ static int sctp_outq_flush_rtx(struct sctp_outq *q, struct sctp_packet *pkt, error = sctp_packet_transmit(pkt); /* Stop sending DATA because of nagle delay. */ - list_add(lchunk, lqueue); - lchunk = NULL; + done = 1; break; default: /* The append was successful, so add this chunk to * the transmitted list. */ - list_add_tail(lchunk, &transport->transmitted); + list_del(&chunk->transmitted_list); + list_add_tail(&chunk->transmitted_list, + &transport->transmitted); /* Mark the chunk as ineligible for fast retransmit * after it is retransmitted. @@ -646,9 +660,6 @@ static int sctp_outq_flush_rtx(struct sctp_outq *q, struct sctp_packet *pkt, timer = 2; q->empty = 0; - - /* Retrieve a new chunk to bundle. */ - lchunk = sctp_list_dequeue(lqueue); break; } @@ -656,16 +667,19 @@ static int sctp_outq_flush_rtx(struct sctp_outq *q, struct sctp_packet *pkt, if (!error && !timer) timer = 1; - /* If we are here due to a retransmit timeout or a fast - * retransmit and if there are any chunks left in the retransmit - * queue that could not fit in the PMTU sized packet, they need - * to be marked as ineligible for a subsequent fast retransmit. - */ - if (rtx_timeout && fast_rtx) { - list_for_each_entry(chunk1, lqueue, transmitted_list) { - if (chunk1->fast_retransmit > 0) - chunk1->fast_retransmit = -1; - } + if (done) + break; + } + + /* If we are here due to a retransmit timeout or a fast + * retransmit and if there are any chunks left in the retransmit + * queue that could not fit in the PMTU sized packet, they need + * to be marked as ineligible for a subsequent fast retransmit. + */ + if (rtx_timeout || fast_rtx) { + list_for_each_entry(chunk1, lqueue, transmitted_list) { + if (chunk1->fast_retransmit > 0) + chunk1->fast_retransmit = -1; } } -- cgit v1.2.3 From b9031d9d87b24e24cd32ea15b5f4220a1e8da909 Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Wed, 4 Jun 2008 12:40:15 -0700 Subject: sctp: Fix ECN markings for IPv6 Commit e9df2e8fd8fbc95c57dbd1d33dada66c4627b44c ("[IPV6]: Use appropriate sock tclass setting for routing lookup.") also changed the way that ECN capable transports mark this capability in IPv6. As a result, SCTP was not marking ECN capablity because the traffic class was never set. This patch brings back the markings for IPv6 traffic. Signed-off-by: Vlad Yasevich Signed-off-by: David S. Miller --- net/sctp/ipv6.c | 6 ++++++ net/sctp/output.c | 2 +- net/sctp/protocol.c | 6 ++++++ 3 files changed, 13 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index e4aac3266fc..a2f4d4d5159 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -727,6 +727,11 @@ static void sctp_v6_seq_dump_addr(struct seq_file *seq, union sctp_addr *addr) seq_printf(seq, NIP6_FMT " ", NIP6(addr->v6.sin6_addr)); } +static void sctp_v6_ecn_capable(struct sock *sk) +{ + inet6_sk(sk)->tclass |= INET_ECN_ECT_0; +} + /* Initialize a PF_INET6 socket msg_name. */ static void sctp_inet6_msgname(char *msgname, int *addr_len) { @@ -997,6 +1002,7 @@ static struct sctp_af sctp_af_inet6 = { .skb_iif = sctp_v6_skb_iif, .is_ce = sctp_v6_is_ce, .seq_dump_addr = sctp_v6_seq_dump_addr, + .ecn_capable = sctp_v6_ecn_capable, .net_header_len = sizeof(struct ipv6hdr), .sockaddr_len = sizeof(struct sockaddr_in6), #ifdef CONFIG_COMPAT diff --git a/net/sctp/output.c b/net/sctp/output.c index cf4f9fb6819..6d45bae93b4 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -548,7 +548,7 @@ int sctp_packet_transmit(struct sctp_packet *packet) * Note: The works for IPv6 layer checks this bit too later * in transmission. See IP6_ECN_flow_xmit(). */ - INET_ECN_xmit(nskb->sk); + (*tp->af_specific->ecn_capable)(nskb->sk); /* Set up the IP options. */ /* BUG: not implemented diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 56bdaf7fc42..b435a193c5d 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -617,6 +617,11 @@ static void sctp_v4_seq_dump_addr(struct seq_file *seq, union sctp_addr *addr) seq_printf(seq, "%d.%d.%d.%d ", NIPQUAD(addr->v4.sin_addr)); } +static void sctp_v4_ecn_capable(struct sock *sk) +{ + INET_ECN_xmit(sk); +} + /* Event handler for inet address addition/deletion events. * The sctp_local_addr_list needs to be protocted by a spin lock since * multiple notifiers (say IPv4 and IPv6) may be running at the same @@ -935,6 +940,7 @@ static struct sctp_af sctp_af_inet = { .skb_iif = sctp_v4_skb_iif, .is_ce = sctp_v4_is_ce, .seq_dump_addr = sctp_v4_seq_dump_addr, + .ecn_capable = sctp_v4_ecn_capable, .net_header_len = sizeof(struct iphdr), .sockaddr_len = sizeof(struct sockaddr_in), #ifdef CONFIG_COMPAT -- cgit v1.2.3 From 22dd485022f3d0b162ceb5e67d85de7c3806aa20 Mon Sep 17 00:00:00 2001 From: "Denis V. Lunev" Date: Wed, 4 Jun 2008 15:16:12 -0700 Subject: raw: Raw socket leak. The program below just leaks the raw kernel socket int main() { int fd = socket(PF_INET, SOCK_RAW, IPPROTO_UDP); struct sockaddr_in addr; memset(&addr, 0, sizeof(addr)); inet_aton("127.0.0.1", &addr.sin_addr); addr.sin_family = AF_INET; addr.sin_port = htons(2048); sendto(fd, "a", 1, MSG_MORE, &addr, sizeof(addr)); return 0; } Corked packet is allocated via sock_wmalloc which holds the owner socket, so one should uncork it and flush all pending data on close. Do this in the same way as in UDP. Signed-off-by: Denis V. Lunev Acked-by: Alexey Kuznetsov Signed-off-by: David S. Miller --- net/ipv4/raw.c | 9 +++++++++ net/ipv6/raw.c | 9 +++++++++ 2 files changed, 18 insertions(+) (limited to 'net') diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index fead049daf4..e7e091d365f 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -608,6 +608,14 @@ static void raw_close(struct sock *sk, long timeout) sk_common_release(sk); } +static int raw_destroy(struct sock *sk) +{ + lock_sock(sk); + ip_flush_pending_frames(sk); + release_sock(sk); + return 0; +} + /* This gets rid of all the nasties in af_inet. -DaveM */ static int raw_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) { @@ -820,6 +828,7 @@ struct proto raw_prot = { .name = "RAW", .owner = THIS_MODULE, .close = raw_close, + .destroy = raw_destroy, .connect = ip4_datagram_connect, .disconnect = udp_disconnect, .ioctl = raw_ioctl, diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 603df76e052..8fee9a15b2d 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -1164,6 +1164,14 @@ static void rawv6_close(struct sock *sk, long timeout) sk_common_release(sk); } +static int raw6_destroy(struct sock *sk) +{ + lock_sock(sk); + ip6_flush_pending_frames(sk); + release_sock(sk); + return 0; +} + static int rawv6_init_sk(struct sock *sk) { struct raw6_sock *rp = raw6_sk(sk); @@ -1187,6 +1195,7 @@ struct proto rawv6_prot = { .name = "RAWv6", .owner = THIS_MODULE, .close = rawv6_close, + .destroy = raw6_destroy, .connect = ip6_datagram_connect, .disconnect = udp_disconnect, .ioctl = rawv6_ioctl, -- cgit v1.2.3 From 26af65cbeb2467a486ae4fc7242c94e470c67c50 Mon Sep 17 00:00:00 2001 From: Sridhar Samudrala Date: Wed, 4 Jun 2008 15:19:35 -0700 Subject: tcp: Increment OUTRSTS in tcp_send_active_reset() TCP "resets sent" counter is not incremented when a TCP Reset is sent via tcp_send_active_reset(). Signed-off-by: Sridhar Samudrala Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index e399bde7813..ad993ecb481 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2131,6 +2131,8 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority) TCP_SKB_CB(skb)->when = tcp_time_stamp; if (tcp_transmit_skb(sk, skb, 0, priority)) NET_INC_STATS(LINUX_MIB_TCPABORTFAILED); + + TCP_INC_STATS(TCP_MIB_OUTRSTS); } /* WARNING: This routine must only be called when we have already sent -- cgit v1.2.3 From 293ad60401da621b8b329abbe8c388edb25f658a Mon Sep 17 00:00:00 2001 From: Octavian Purdila Date: Wed, 4 Jun 2008 15:45:58 -0700 Subject: tcp: Fix for race due to temporary drop of the socket lock in skb_splice_bits. skb_splice_bits temporary drops the socket lock while iterating over the socket queue in order to break a reverse locking condition which happens with sendfile. This, however, opens a window of opportunity for tcp_collapse() to aggregate skbs and thus potentially free the current skb used in skb_splice_bits and tcp_read_sock. This patch fixes the problem by (re-)getting the same "logical skb" after the lock has been temporary dropped. Based on idea and initial patch from Evgeniy Polyakov. Signed-off-by: Octavian Purdila Acked-by: Evgeniy Polyakov Signed-off-by: David S. Miller --- net/core/skbuff.c | 5 +++-- net/ipv4/tcp.c | 9 ++++++++- 2 files changed, 11 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 5c459f2b798..1e556d31211 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1445,6 +1445,7 @@ done: if (spd.nr_pages) { int ret; + struct sock *sk = __skb->sk; /* * Drop the socket lock, otherwise we have reverse @@ -1455,9 +1456,9 @@ done: * we call into ->sendpage() with the i_mutex lock held * and networking will grab the socket lock. */ - release_sock(__skb->sk); + release_sock(sk); ret = splice_to_pipe(pipe, &spd); - lock_sock(__skb->sk); + lock_sock(sk); return ret; } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index f8865313862..ab66683b804 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1227,7 +1227,14 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, copied += used; offset += used; } - if (offset != skb->len) + /* + * If recv_actor drops the lock (e.g. TCP splice + * receive) the skb pointer might be invalid when + * getting here: tcp_collapse might have deleted it + * while aggregating skbs from the socket queue. + */ + skb = tcp_recv_skb(sk, seq-1, &offset); + if (!skb || (offset+1 != skb->len)) break; } if (tcp_hdr(skb)->fin) { -- cgit v1.2.3 From 507b06d0622480f8026d49a94f86068bb0fd6ed6 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Tue, 3 Jun 2008 23:39:55 -0400 Subject: mac80211: send association event on IBSS create Otherwise userspace has no idea the IBSS creation succeeded. Signed-off-by: Dan Williams Signed-off-by: John W. Linville --- net/mac80211/mlme.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 841278f1df8..af375da9df2 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -2336,6 +2336,7 @@ static int ieee80211_sta_join_ibss(struct net_device *dev, u8 *pos; struct ieee80211_sub_if_data *sdata; struct ieee80211_supported_band *sband; + union iwreq_data wrqu; sband = local->hw.wiphy->bands[local->hw.conf.channel->band]; @@ -2479,6 +2480,10 @@ static int ieee80211_sta_join_ibss(struct net_device *dev, ifsta->state = IEEE80211_IBSS_JOINED; mod_timer(&ifsta->timer, jiffies + IEEE80211_IBSS_MERGE_INTERVAL); + memset(&wrqu, 0, sizeof(wrqu)); + memcpy(wrqu.ap_addr.sa_data, bss->bssid, ETH_ALEN); + wireless_send_event(dev, SIOCGIWAP, &wrqu, NULL); + return res; } -- cgit v1.2.3 From ad81b2f97d42e13ef78bb3798e046cd5f0492980 Mon Sep 17 00:00:00 2001 From: Assaf Krauss Date: Wed, 4 Jun 2008 20:27:59 +0300 Subject: mac80211: Fixing slow IBSS rejoin This patch fixes the issue of slow reconnection to an IBSS cell after disconnection from it. Now the interface's bssid is reset upon ifdown. ieee80211_sta_find_ibss: if (found && memcmp(ifsta->bssid, bssid, ETH_ALEN) != 0 && (bss = ieee80211_rx_bss_get(dev, bssid, local->hw.conf.channel->center_freq, ifsta->ssid, ifsta->ssid_len))) Note: In general disconnection is still not handled properly in mac80211 Signed-off-by: Assaf Krauss Signed-off-by: Tomas Winkler Signed-off-by: John W. Linville --- net/mac80211/main.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 5c876450b14..98c0b5e56ec 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -511,6 +511,7 @@ static int ieee80211_stop(struct net_device *dev) case IEEE80211_IF_TYPE_STA: case IEEE80211_IF_TYPE_IBSS: sdata->u.sta.state = IEEE80211_DISABLED; + memset(sdata->u.sta.bssid, 0, ETH_ALEN); del_timer_sync(&sdata->u.sta.timer); /* * When we get here, the interface is marked down. -- cgit v1.2.3 From 872ba53395b2a8be08c3ea2d39e225e5b4a8cb40 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 4 Jun 2008 13:59:34 -0400 Subject: mac80211: decrease IBSS creation latency Sufficient scans (at least 2 or 3) should have been done within 7 seconds to find an existing IBSS to join. This should improve IBSS creation latency; and since IBSS merging is still in effect, shouldn't have detrimental effects on eventual IBSS convergence. Signed-off-by: Dan Williams Signed-off-by: John W. Linville --- net/mac80211/mlme.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index af375da9df2..affe42f8484 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -44,7 +44,7 @@ #define IEEE80211_RETRY_AUTH_INTERVAL (1 * HZ) #define IEEE80211_SCAN_INTERVAL (2 * HZ) #define IEEE80211_SCAN_INTERVAL_SLOW (15 * HZ) -#define IEEE80211_IBSS_JOIN_TIMEOUT (20 * HZ) +#define IEEE80211_IBSS_JOIN_TIMEOUT (7 * HZ) #define IEEE80211_PROBE_DELAY (HZ / 33) #define IEEE80211_CHANNEL_TIME (HZ / 33) -- cgit v1.2.3 From be038b376465953c358d675cb38a611898a49dc2 Mon Sep 17 00:00:00 2001 From: Assaf Krauss Date: Thu, 5 Jun 2008 19:55:21 +0300 Subject: mac80211: Checking IBSS support while changing channel in ad-hoc mode This patch adds a check to the set_channel flow. When attempting to change the channel while in IBSS mode, and the new channel does not support IBSS mode, the flow return with an error value with no consequences on the mac80211 and driver state. Signed-off-by: Assaf Krauss Signed-off-by: Emmanuel Grumbach Signed-off-by: Tomas Winkler Signed-off-by: John W. Linville --- net/mac80211/ieee80211_i.h | 2 +- net/mac80211/mlme.c | 11 ++++------- net/mac80211/wext.c | 15 +++++++++++---- 3 files changed, 16 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index c7314bf4bec..006486b2672 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -899,7 +899,7 @@ extern const struct iw_handler_def ieee80211_iw_handler_def; /* ieee80211_ioctl.c */ -int ieee80211_set_freq(struct ieee80211_local *local, int freq); +int ieee80211_set_freq(struct net_device *dev, int freq); /* ieee80211_sta.c */ void ieee80211_sta_timer(unsigned long data); void ieee80211_sta_work(struct work_struct *work); diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index affe42f8484..4d2b582dd05 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -2359,13 +2359,10 @@ static int ieee80211_sta_join_ibss(struct net_device *dev, sdata->drop_unencrypted = bss->capability & WLAN_CAPABILITY_PRIVACY ? 1 : 0; - res = ieee80211_set_freq(local, bss->freq); + res = ieee80211_set_freq(dev, bss->freq); - if (local->oper_channel->flags & IEEE80211_CHAN_NO_IBSS) { - printk(KERN_DEBUG "%s: IBSS not allowed on frequency " - "%d MHz\n", dev->name, local->oper_channel->center_freq); - return -1; - } + if (res) + return res; /* Set beacon template */ skb = dev_alloc_skb(local->hw.extra_tx_headroom + 400); @@ -3491,7 +3488,7 @@ static int ieee80211_sta_config_auth(struct net_device *dev, spin_unlock_bh(&local->sta_bss_lock); if (selected) { - ieee80211_set_freq(local, selected->freq); + ieee80211_set_freq(dev, selected->freq); if (!(ifsta->flags & IEEE80211_STA_SSID_SET)) ieee80211_sta_set_ssid(dev, selected->ssid, selected->ssid_len); diff --git a/net/mac80211/wext.c b/net/mac80211/wext.c index 8311bb24f9f..a8bb8e31b1e 100644 --- a/net/mac80211/wext.c +++ b/net/mac80211/wext.c @@ -290,14 +290,22 @@ static int ieee80211_ioctl_giwmode(struct net_device *dev, return 0; } -int ieee80211_set_freq(struct ieee80211_local *local, int freqMHz) +int ieee80211_set_freq(struct net_device *dev, int freqMHz) { int ret = -EINVAL; struct ieee80211_channel *chan; + struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr); + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); chan = ieee80211_get_channel(local->hw.wiphy, freqMHz); if (chan && !(chan->flags & IEEE80211_CHAN_DISABLED)) { + if (sdata->vif.type == IEEE80211_IF_TYPE_IBSS && + chan->flags & IEEE80211_CHAN_NO_IBSS) { + printk(KERN_DEBUG "%s: IBSS not allowed on frequency " + "%d MHz\n", dev->name, chan->center_freq); + return ret; + } local->oper_channel = chan; if (local->sta_sw_scanning || local->sta_hw_scanning) @@ -315,7 +323,6 @@ static int ieee80211_ioctl_siwfreq(struct net_device *dev, struct iw_request_info *info, struct iw_freq *freq, char *extra) { - struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr); struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); if (sdata->vif.type == IEEE80211_IF_TYPE_STA) @@ -329,14 +336,14 @@ static int ieee80211_ioctl_siwfreq(struct net_device *dev, IEEE80211_STA_AUTO_CHANNEL_SEL; return 0; } else - return ieee80211_set_freq(local, + return ieee80211_set_freq(dev, ieee80211_channel_to_frequency(freq->m)); } else { int i, div = 1000000; for (i = 0; i < freq->e; i++) div /= 10; if (div > 0) - return ieee80211_set_freq(local, freq->m / div); + return ieee80211_set_freq(dev, freq->m / div); else return -EINVAL; } -- cgit v1.2.3 From 2e761e0532a784816e7e822dbaaece8c5d4be14d Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Mon, 9 Jun 2008 15:53:30 -0700 Subject: ipv6 netns: init net is used to set bindv6only for new sock The bindv6only is tuned via sysctl. It is already on a struct net and per-net sysctls allow for its modification (ipv6_sysctl_net_init). Despite this the value configured in the init net is used for the rest of them. Signed-off-by: Pavel Emelyanov Signed-off-by: David S. Miller --- net/ipv6/af_inet6.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 3c6aafb0218..e84b3fd17fb 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -191,7 +191,7 @@ lookup_protocol: np->mcast_hops = -1; np->mc_loop = 1; np->pmtudisc = IPV6_PMTUDISC_WANT; - np->ipv6only = init_net.ipv6.sysctl.bindv6only; + np->ipv6only = net->ipv6.sysctl.bindv6only; /* Init the ipv4 part of the socket since we can have sockets * using v6 API for ipv4. -- cgit v1.2.3