From 23475d66bd8600e0c5353f86c1b74f68df27bdb5 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 11 Aug 2005 16:25:08 -0400 Subject: [PATCH] RPC: Report connection errors properly when mounting with "soft" Fix up xprt_connect_status: the soft timeout logic was clobbering tk_status, so TCP connect errors were not properly reported on soft mounts. Test-plan: Destructive testing (unplugging the network temporarily). Connectathon with UDP and TCP. Version: Thu, 11 Aug 2005 16:01:28 -0400 Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/xprt.c | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 3c654e06b08..b28ea0cc0cb 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -592,24 +592,33 @@ xprt_connect_status(struct rpc_task *task) return; } - /* if soft mounted, just cause this RPC to fail */ - if (RPC_IS_SOFT(task)) - task->tk_status = -EIO; - switch (task->tk_status) { case -ECONNREFUSED: case -ECONNRESET: + dprintk("RPC: %4d xprt_connect_status: server %s refused connection\n", + task->tk_pid, task->tk_client->cl_server); + break; case -ENOTCONN: - return; + dprintk("RPC: %4d xprt_connect_status: connection broken\n", + task->tk_pid); + break; case -ETIMEDOUT: - dprintk("RPC: %4d xprt_connect_status: timed out\n", + dprintk("RPC: %4d xprt_connect_status: connect attempt timed out\n", task->tk_pid); break; default: - printk(KERN_ERR "RPC: error %d connecting to server %s\n", - -task->tk_status, task->tk_client->cl_server); + dprintk("RPC: %4d xprt_connect_status: error %d connecting to server %s\n", + task->tk_pid, -task->tk_status, task->tk_client->cl_server); + xprt_release_write(xprt, task); + task->tk_status = -EIO; + return; + } + + /* if soft mounted, just cause this RPC to fail */ + if (RPC_IS_SOFT(task)) { + xprt_release_write(xprt, task); + task->tk_status = -EIO; } - xprt_release_write(xprt, task); } /* -- cgit v1.2.3 From da35187801732397a7e05fb9e77f3700cc35f5db Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 11 Aug 2005 16:25:11 -0400 Subject: [PATCH] RPC: proper soft timeout behavior for rpcbind Implement a best practice: for soft mounts, an rpcbind timeout should cause an RPC request to fail. This also provides an FSM hook for retrying an rpcbind with a different rpcbind protocol version. We'll use this later to try multiple rpcbind protocol versions when binding. To enable this, expose the RPC error code returned during a portmap request to the FSM so it can make some decision about how to report, retry, or fail the request. Test-plan: Hundreds of passes with connectathon NFSv3 locking suite, on the client and server. Version: Thu, 11 Aug 2005 16:01:53 -0400 Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/clnt.c | 97 +++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 77 insertions(+), 20 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index f17e6153b68..2d3cf0a52d8 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -53,6 +53,7 @@ static void call_allocate(struct rpc_task *task); static void call_encode(struct rpc_task *task); static void call_decode(struct rpc_task *task); static void call_bind(struct rpc_task *task); +static void call_bind_status(struct rpc_task *task); static void call_transmit(struct rpc_task *task); static void call_status(struct rpc_task *task); static void call_refresh(struct rpc_task *task); @@ -734,43 +735,94 @@ static void call_bind(struct rpc_task *task) { struct rpc_clnt *clnt = task->tk_client; - struct rpc_xprt *xprt = clnt->cl_xprt; - - dprintk("RPC: %4d call_bind xprt %p %s connected\n", task->tk_pid, - xprt, (xprt_connected(xprt) ? "is" : "is not")); - task->tk_action = (xprt_connected(xprt)) ? call_transmit : call_connect; + dprintk("RPC: %4d call_bind (status %d)\n", + task->tk_pid, task->tk_status); + task->tk_action = call_connect; if (!clnt->cl_port) { - task->tk_action = call_connect; + task->tk_action = call_bind_status; task->tk_timeout = RPC_CONNECT_TIMEOUT; rpc_getport(task, clnt); } } /* - * 4a. Connect to the RPC server (TCP case) + * 4a. Sort out bind result + */ +static void +call_bind_status(struct rpc_task *task) +{ + int status = -EACCES; + + if (task->tk_status >= 0) { + dprintk("RPC: %4d call_bind_status (status %d)\n", + task->tk_pid, task->tk_status); + task->tk_status = 0; + task->tk_action = call_connect; + return; + } + + switch (task->tk_status) { + case -EACCES: + dprintk("RPC: %4d remote rpcbind: RPC program/version unavailable\n", + task->tk_pid); + break; + case -ETIMEDOUT: + dprintk("RPC: %4d rpcbind request timed out\n", + task->tk_pid); + if (RPC_IS_SOFT(task)) { + status = -EIO; + break; + } + goto retry_bind; + case -EPFNOSUPPORT: + dprintk("RPC: %4d remote rpcbind service unavailable\n", + task->tk_pid); + break; + case -EPROTONOSUPPORT: + dprintk("RPC: %4d remote rpcbind version 2 unavailable\n", + task->tk_pid); + break; + default: + dprintk("RPC: %4d unrecognized rpcbind error (%d)\n", + task->tk_pid, -task->tk_status); + status = -EIO; + break; + } + + rpc_exit(task, status); + return; + +retry_bind: + task->tk_status = 0; + task->tk_action = call_bind; + return; +} + +/* + * 4b. Connect to the RPC server */ static void call_connect(struct rpc_task *task) { - struct rpc_clnt *clnt = task->tk_client; + struct rpc_xprt *xprt = task->tk_xprt; - dprintk("RPC: %4d call_connect status %d\n", - task->tk_pid, task->tk_status); + dprintk("RPC: %4d call_connect xprt %p %s connected\n", + task->tk_pid, xprt, + (xprt_connected(xprt) ? "is" : "is not")); - if (xprt_connected(clnt->cl_xprt)) { - task->tk_action = call_transmit; - return; + task->tk_action = call_transmit; + if (!xprt_connected(xprt)) { + task->tk_action = call_connect_status; + if (task->tk_status < 0) + return; + xprt_connect(task); } - task->tk_action = call_connect_status; - if (task->tk_status < 0) - return; - xprt_connect(task); } /* - * 4b. Sort out connect result + * 4c. Sort out connect result */ static void call_connect_status(struct rpc_task *task) @@ -778,6 +830,9 @@ call_connect_status(struct rpc_task *task) struct rpc_clnt *clnt = task->tk_client; int status = task->tk_status; + dprintk("RPC: %5u call_connect_status (status %d)\n", + task->tk_pid, task->tk_status); + task->tk_status = 0; if (status >= 0) { clnt->cl_stats->netreconn++; @@ -785,17 +840,19 @@ call_connect_status(struct rpc_task *task) return; } - /* Something failed: we may have to rebind */ + /* Something failed: remote service port may have changed */ if (clnt->cl_autobind) clnt->cl_port = 0; + switch (status) { case -ENOTCONN: case -ETIMEDOUT: case -EAGAIN: - task->tk_action = (clnt->cl_port == 0) ? call_bind : call_connect; + task->tk_action = call_bind; break; default: rpc_exit(task, -EIO); + break; } } -- cgit v1.2.3 From eab5c084b858fd95a873fc2b97de9a9ad937b4ed Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 11 Aug 2005 16:25:14 -0400 Subject: [PATCH] NFS: use a constant value for TCP retransmit timeouts Implement a best practice: don't use exponential backoff when computing retransmit timeout values on TCP connections, but simply retransmit at regular intervals. This also fixes a bug introduced when xprt_reset_majortimeo() was added. Test-plan: Enable RPC debugging and watch timeout behavior on a NFS/TCP mount. Version: Thu, 11 Aug 2005 16:02:19 -0400 Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/xprt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index b28ea0cc0cb..0e4ffdaa012 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -1453,7 +1453,7 @@ xprt_default_timeout(struct rpc_timeout *to, int proto) if (proto == IPPROTO_UDP) xprt_set_timeout(to, 5, 5 * HZ); else - xprt_set_timeout(to, 5, 60 * HZ); + xprt_set_timeout(to, 2, 60 * HZ); } /* @@ -1464,7 +1464,7 @@ xprt_set_timeout(struct rpc_timeout *to, unsigned int retr, unsigned long incr) { to->to_initval = to->to_increment = incr; - to->to_maxval = incr * retr; + to->to_maxval = to->to_initval + (incr * retr); to->to_retries = retr; to->to_exponential = 0; } -- cgit v1.2.3 From 602f83273c89fdd25f24757564d8001cf723e740 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 11 Aug 2005 16:25:17 -0400 Subject: [PATCH] RPC: portmapper doesn't need a reserved port The in-kernel portmapper does not require a reserved port for making bind queries. Test-plan: Tens of runs of the Connectathon locking suite with TCP and UDP against several other NFS server implementations using NFSv3, not NFSv4 (which doesn't require rpcbind). Version: Thu, 11 Aug 2005 16:02:43 -0400 Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/pmap_clnt.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/sunrpc') diff --git a/net/sunrpc/pmap_clnt.c b/net/sunrpc/pmap_clnt.c index 4e81f276692..d8e3f220002 100644 --- a/net/sunrpc/pmap_clnt.c +++ b/net/sunrpc/pmap_clnt.c @@ -208,6 +208,7 @@ pmap_create(char *hostname, struct sockaddr_in *srvaddr, int proto) if (IS_ERR(xprt)) return (struct rpc_clnt *)xprt; xprt->addr.sin_port = htons(RPC_PMAP_PORT); + xprt->resvport = 0; /* printk("pmap: create clnt\n"); */ clnt = rpc_new_client(xprt, hostname, -- cgit v1.2.3 From 094bb20b9fcab3a1652a77741caba6b78097d622 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 11 Aug 2005 16:25:20 -0400 Subject: [PATCH] RPC: extract socket logic common to both client and server Clean-up: Move some code that is common to both RPC client- and server-side socket transports into its own source file, net/sunrpc/socklib.c. Test-plan: Compile kernel with CONFIG_NFS enabled. Millions of fsx operations over UDP, client and server. Connectathon over UDP. Version: Thu, 11 Aug 2005 16:03:09 -0400 Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/Makefile | 2 +- net/sunrpc/socklib.c | 175 +++++++++++++++++++++++++++++++++++++++++++++++++++ net/sunrpc/svcsock.c | 3 - net/sunrpc/xdr.c | 75 ---------------------- net/sunrpc/xprt.c | 64 ------------------- 5 files changed, 176 insertions(+), 143 deletions(-) create mode 100644 net/sunrpc/socklib.c (limited to 'net/sunrpc') diff --git a/net/sunrpc/Makefile b/net/sunrpc/Makefile index 46a2ce00a29..f0a95562717 100644 --- a/net/sunrpc/Makefile +++ b/net/sunrpc/Makefile @@ -6,7 +6,7 @@ obj-$(CONFIG_SUNRPC) += sunrpc.o obj-$(CONFIG_SUNRPC_GSS) += auth_gss/ -sunrpc-y := clnt.o xprt.o sched.o \ +sunrpc-y := clnt.o xprt.o socklib.o sched.o \ auth.o auth_null.o auth_unix.o \ svc.o svcsock.o svcauth.o svcauth_unix.o \ pmap_clnt.o timer.o xdr.o \ diff --git a/net/sunrpc/socklib.c b/net/sunrpc/socklib.c new file mode 100644 index 00000000000..8f97e90f36c --- /dev/null +++ b/net/sunrpc/socklib.c @@ -0,0 +1,175 @@ +/* + * linux/net/sunrpc/socklib.c + * + * Common socket helper routines for RPC client and server + * + * Copyright (C) 1995, 1996 Olaf Kirch + */ + +#include +#include +#include +#include + + +/** + * skb_read_bits - copy some data bits from skb to internal buffer + * @desc: sk_buff copy helper + * @to: copy destination + * @len: number of bytes to copy + * + * Possibly called several times to iterate over an sk_buff and copy + * data out of it. + */ +static size_t skb_read_bits(skb_reader_t *desc, void *to, size_t len) +{ + if (len > desc->count) + len = desc->count; + if (skb_copy_bits(desc->skb, desc->offset, to, len)) + return 0; + desc->count -= len; + desc->offset += len; + return len; +} + +/** + * skb_read_and_csum_bits - copy and checksum from skb to buffer + * @desc: sk_buff copy helper + * @to: copy destination + * @len: number of bytes to copy + * + * Same as skb_read_bits, but calculate a checksum at the same time. + */ +static size_t skb_read_and_csum_bits(skb_reader_t *desc, void *to, size_t len) +{ + unsigned int csum2, pos; + + if (len > desc->count) + len = desc->count; + pos = desc->offset; + csum2 = skb_copy_and_csum_bits(desc->skb, pos, to, len, 0); + desc->csum = csum_block_add(desc->csum, csum2, pos); + desc->count -= len; + desc->offset += len; + return len; +} + +/** + * xdr_partial_copy_from_skb - copy data out of an skb + * @xdr: target XDR buffer + * @base: starting offset + * @desc: sk_buff copy helper + * @copy_actor: virtual method for copying data + * + */ +ssize_t xdr_partial_copy_from_skb(struct xdr_buf *xdr, unsigned int base, skb_reader_t *desc, skb_read_actor_t copy_actor) +{ + struct page **ppage = xdr->pages; + unsigned int len, pglen = xdr->page_len; + ssize_t copied = 0; + int ret; + + len = xdr->head[0].iov_len; + if (base < len) { + len -= base; + ret = copy_actor(desc, (char *)xdr->head[0].iov_base + base, len); + copied += ret; + if (ret != len || !desc->count) + goto out; + base = 0; + } else + base -= len; + + if (unlikely(pglen == 0)) + goto copy_tail; + if (unlikely(base >= pglen)) { + base -= pglen; + goto copy_tail; + } + if (base || xdr->page_base) { + pglen -= base; + base += xdr->page_base; + ppage += base >> PAGE_CACHE_SHIFT; + base &= ~PAGE_CACHE_MASK; + } + do { + char *kaddr; + + /* ACL likes to be lazy in allocating pages - ACLs + * are small by default but can get huge. */ + if (unlikely(*ppage == NULL)) { + *ppage = alloc_page(GFP_ATOMIC); + if (unlikely(*ppage == NULL)) { + if (copied == 0) + copied = -ENOMEM; + goto out; + } + } + + len = PAGE_CACHE_SIZE; + kaddr = kmap_atomic(*ppage, KM_SKB_SUNRPC_DATA); + if (base) { + len -= base; + if (pglen < len) + len = pglen; + ret = copy_actor(desc, kaddr + base, len); + base = 0; + } else { + if (pglen < len) + len = pglen; + ret = copy_actor(desc, kaddr, len); + } + flush_dcache_page(*ppage); + kunmap_atomic(kaddr, KM_SKB_SUNRPC_DATA); + copied += ret; + if (ret != len || !desc->count) + goto out; + ppage++; + } while ((pglen -= len) != 0); +copy_tail: + len = xdr->tail[0].iov_len; + if (base < len) + copied += copy_actor(desc, (char *)xdr->tail[0].iov_base + base, len - base); +out: + return copied; +} + +/** + * csum_partial_copy_to_xdr - checksum and copy data + * @xdr: target XDR buffer + * @skb: source skb + * + * We have set things up such that we perform the checksum of the UDP + * packet in parallel with the copies into the RPC client iovec. -DaveM + */ +int csum_partial_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb) +{ + skb_reader_t desc; + + desc.skb = skb; + desc.offset = sizeof(struct udphdr); + desc.count = skb->len - desc.offset; + + if (skb->ip_summed == CHECKSUM_UNNECESSARY) + goto no_checksum; + + desc.csum = csum_partial(skb->data, desc.offset, skb->csum); + if (xdr_partial_copy_from_skb(xdr, 0, &desc, skb_read_and_csum_bits) < 0) + return -1; + if (desc.offset != skb->len) { + unsigned int csum2; + csum2 = skb_checksum(skb, desc.offset, skb->len - desc.offset, 0); + desc.csum = csum_block_add(desc.csum, csum2, desc.offset); + } + if (desc.count) + return -1; + if ((unsigned short)csum_fold(desc.csum)) + return -1; + return 0; +no_checksum: + if (xdr_partial_copy_from_skb(xdr, 0, &desc, skb_read_bits) < 0) + return -1; + if (desc.count) + return -1; + return 0; +} diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 30ec3efc48a..130f2b5d93d 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -548,9 +548,6 @@ svc_write_space(struct sock *sk) /* * Receive a datagram from a UDP socket. */ -extern int -csum_partial_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb); - static int svc_udp_recvfrom(struct svc_rqst *rqstp) { diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index fde16f40a58..9cc12aeed22 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -176,81 +176,6 @@ xdr_inline_pages(struct xdr_buf *xdr, unsigned int offset, xdr->buflen += len; } -ssize_t -xdr_partial_copy_from_skb(struct xdr_buf *xdr, unsigned int base, - skb_reader_t *desc, - skb_read_actor_t copy_actor) -{ - struct page **ppage = xdr->pages; - unsigned int len, pglen = xdr->page_len; - ssize_t copied = 0; - int ret; - - len = xdr->head[0].iov_len; - if (base < len) { - len -= base; - ret = copy_actor(desc, (char *)xdr->head[0].iov_base + base, len); - copied += ret; - if (ret != len || !desc->count) - goto out; - base = 0; - } else - base -= len; - - if (pglen == 0) - goto copy_tail; - if (base >= pglen) { - base -= pglen; - goto copy_tail; - } - if (base || xdr->page_base) { - pglen -= base; - base += xdr->page_base; - ppage += base >> PAGE_CACHE_SHIFT; - base &= ~PAGE_CACHE_MASK; - } - do { - char *kaddr; - - /* ACL likes to be lazy in allocating pages - ACLs - * are small by default but can get huge. */ - if (unlikely(*ppage == NULL)) { - *ppage = alloc_page(GFP_ATOMIC); - if (unlikely(*ppage == NULL)) { - if (copied == 0) - copied = -ENOMEM; - goto out; - } - } - - len = PAGE_CACHE_SIZE; - kaddr = kmap_atomic(*ppage, KM_SKB_SUNRPC_DATA); - if (base) { - len -= base; - if (pglen < len) - len = pglen; - ret = copy_actor(desc, kaddr + base, len); - base = 0; - } else { - if (pglen < len) - len = pglen; - ret = copy_actor(desc, kaddr, len); - } - flush_dcache_page(*ppage); - kunmap_atomic(kaddr, KM_SKB_SUNRPC_DATA); - copied += ret; - if (ret != len || !desc->count) - goto out; - ppage++; - } while ((pglen -= len) != 0); -copy_tail: - len = xdr->tail[0].iov_len; - if (base < len) - copied += copy_actor(desc, (char *)xdr->tail[0].iov_base + base, len - base); -out: - return copied; -} - int xdr_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 0e4ffdaa012..67444f494fe 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -691,70 +691,6 @@ xprt_complete_rqst(struct rpc_xprt *xprt, struct rpc_rqst *req, int copied) return; } -static size_t -skb_read_bits(skb_reader_t *desc, void *to, size_t len) -{ - if (len > desc->count) - len = desc->count; - if (skb_copy_bits(desc->skb, desc->offset, to, len)) - return 0; - desc->count -= len; - desc->offset += len; - return len; -} - -static size_t -skb_read_and_csum_bits(skb_reader_t *desc, void *to, size_t len) -{ - unsigned int csum2, pos; - - if (len > desc->count) - len = desc->count; - pos = desc->offset; - csum2 = skb_copy_and_csum_bits(desc->skb, pos, to, len, 0); - desc->csum = csum_block_add(desc->csum, csum2, pos); - desc->count -= len; - desc->offset += len; - return len; -} - -/* - * We have set things up such that we perform the checksum of the UDP - * packet in parallel with the copies into the RPC client iovec. -DaveM - */ -int -csum_partial_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb) -{ - skb_reader_t desc; - - desc.skb = skb; - desc.offset = sizeof(struct udphdr); - desc.count = skb->len - desc.offset; - - if (skb->ip_summed == CHECKSUM_UNNECESSARY) - goto no_checksum; - - desc.csum = csum_partial(skb->data, desc.offset, skb->csum); - if (xdr_partial_copy_from_skb(xdr, 0, &desc, skb_read_and_csum_bits) < 0) - return -1; - if (desc.offset != skb->len) { - unsigned int csum2; - csum2 = skb_checksum(skb, desc.offset, skb->len - desc.offset, 0); - desc.csum = csum_block_add(desc.csum, csum2, desc.offset); - } - if (desc.count) - return -1; - if ((unsigned short)csum_fold(desc.csum)) - return -1; - return 0; -no_checksum: - if (xdr_partial_copy_from_skb(xdr, 0, &desc, skb_read_bits) < 0) - return -1; - if (desc.count) - return -1; - return 0; -} - /* * Input handler for RPC replies. Called from a bottom half and hence * atomic. -- cgit v1.2.3 From a246b0105bbd9a70a698f69baae2042996f2a0e9 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 11 Aug 2005 16:25:23 -0400 Subject: [PATCH] RPC: introduce client-side transport switch Move the bulk of client-side socket-specific code into a separate source file, net/sunrpc/xprtsock.c. Test-plan: Millions of fsx operations. Performance characterization such as "sio" or "iozone". Destructive testing (unplugging the network temporarily, server reboots). Connectathon with v2, v3, and v4. Version: Thu, 11 Aug 2005 16:03:38 -0400 Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/Makefile | 2 +- net/sunrpc/clnt.c | 3 +- net/sunrpc/sysctl.c | 3 + net/sunrpc/xdr.c | 102 +---- net/sunrpc/xprt.c | 916 +++----------------------------------------- net/sunrpc/xprtsock.c | 1021 +++++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 1070 insertions(+), 977 deletions(-) create mode 100644 net/sunrpc/xprtsock.c (limited to 'net/sunrpc') diff --git a/net/sunrpc/Makefile b/net/sunrpc/Makefile index f0a95562717..cdcab9ca4c6 100644 --- a/net/sunrpc/Makefile +++ b/net/sunrpc/Makefile @@ -6,7 +6,7 @@ obj-$(CONFIG_SUNRPC) += sunrpc.o obj-$(CONFIG_SUNRPC_GSS) += auth_gss/ -sunrpc-y := clnt.o xprt.o socklib.o sched.o \ +sunrpc-y := clnt.o xprt.o socklib.o xprtsock.o sched.o \ auth.o auth_null.o auth_unix.o \ svc.o svcsock.o svcauth.o svcauth_unix.o \ pmap_clnt.o timer.o xdr.o \ diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 2d3cf0a52d8..ab50c3c9e6a 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -525,8 +525,7 @@ rpc_setbufsize(struct rpc_clnt *clnt, unsigned int sndsize, unsigned int rcvsize xprt->rcvsize = 0; if (rcvsize) xprt->rcvsize = rcvsize + RPC_SLACK_SPACE; - if (xprt_connected(xprt)) - xprt_sock_setbufsize(xprt); + xprt->ops->set_buffer_size(xprt); } /* diff --git a/net/sunrpc/sysctl.c b/net/sunrpc/sysctl.c index 1b9616a12e2..ef483262f17 100644 --- a/net/sunrpc/sysctl.c +++ b/net/sunrpc/sysctl.c @@ -119,6 +119,9 @@ done: return 0; } +unsigned int xprt_udp_slot_table_entries = RPC_DEF_SLOT_TABLE; +unsigned int xprt_tcp_slot_table_entries = RPC_DEF_SLOT_TABLE; + static unsigned int min_slot_table_size = RPC_MIN_SLOT_TABLE; static unsigned int max_slot_table_size = RPC_MAX_SLOT_TABLE; diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 9cc12aeed22..32df43372ee 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -6,15 +6,12 @@ * Copyright (C) 1995, 1996 Olaf Kirch */ +#include #include -#include #include #include #include #include -#include -#include -#include #include #include @@ -177,103 +174,6 @@ xdr_inline_pages(struct xdr_buf *xdr, unsigned int offset, } -int -xdr_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, - struct xdr_buf *xdr, unsigned int base, int msgflags) -{ - struct page **ppage = xdr->pages; - unsigned int len, pglen = xdr->page_len; - int err, ret = 0; - ssize_t (*sendpage)(struct socket *, struct page *, int, size_t, int); - - len = xdr->head[0].iov_len; - if (base < len || (addr != NULL && base == 0)) { - struct kvec iov = { - .iov_base = xdr->head[0].iov_base + base, - .iov_len = len - base, - }; - struct msghdr msg = { - .msg_name = addr, - .msg_namelen = addrlen, - .msg_flags = msgflags, - }; - if (xdr->len > len) - msg.msg_flags |= MSG_MORE; - - if (iov.iov_len != 0) - err = kernel_sendmsg(sock, &msg, &iov, 1, iov.iov_len); - else - err = kernel_sendmsg(sock, &msg, NULL, 0, 0); - if (ret == 0) - ret = err; - else if (err > 0) - ret += err; - if (err != iov.iov_len) - goto out; - base = 0; - } else - base -= len; - - if (pglen == 0) - goto copy_tail; - if (base >= pglen) { - base -= pglen; - goto copy_tail; - } - if (base || xdr->page_base) { - pglen -= base; - base += xdr->page_base; - ppage += base >> PAGE_CACHE_SHIFT; - base &= ~PAGE_CACHE_MASK; - } - - sendpage = sock->ops->sendpage ? : sock_no_sendpage; - do { - int flags = msgflags; - - len = PAGE_CACHE_SIZE; - if (base) - len -= base; - if (pglen < len) - len = pglen; - - if (pglen != len || xdr->tail[0].iov_len != 0) - flags |= MSG_MORE; - - /* Hmm... We might be dealing with highmem pages */ - if (PageHighMem(*ppage)) - sendpage = sock_no_sendpage; - err = sendpage(sock, *ppage, base, len, flags); - if (ret == 0) - ret = err; - else if (err > 0) - ret += err; - if (err != len) - goto out; - base = 0; - ppage++; - } while ((pglen -= len) != 0); -copy_tail: - len = xdr->tail[0].iov_len; - if (base < len) { - struct kvec iov = { - .iov_base = xdr->tail[0].iov_base + base, - .iov_len = len - base, - }; - struct msghdr msg = { - .msg_flags = msgflags, - }; - err = kernel_sendmsg(sock, &msg, &iov, 1, iov.iov_len); - if (ret == 0) - ret = err; - else if (err > 0) - ret += err; - } -out: - return ret; -} - - /* * Helper routines for doing 'memmove' like operations on a struct xdr_buf * diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 67444f494fe..4342acf4d1c 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -32,37 +32,16 @@ * tasks that rely on callbacks. * * Copyright (C) 1995-1997, Olaf Kirch - * - * TCP callback races fixes (C) 1998 Red Hat Software - * TCP send fixes (C) 1998 Red Hat Software - * TCP NFS related read + write fixes - * (C) 1999 Dave Airlie, University of Limerick, Ireland - * - * Rewrite of larges part of the code in order to stabilize TCP stuff. - * Fix behaviour when socket buffer is full. - * (C) 1999 Trond Myklebust */ +#include + #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include #include #include -#include -#include -#include -#include +#include /* * Local variables @@ -74,64 +53,17 @@ #endif #define XPRT_MAX_BACKOFF (8) -#define XPRT_IDLE_TIMEOUT (5*60*HZ) -#define XPRT_MAX_RESVPORT (800) /* * Local functions */ static void xprt_request_init(struct rpc_task *, struct rpc_xprt *); static inline void do_xprt_reserve(struct rpc_task *); -static void xprt_disconnect(struct rpc_xprt *); static void xprt_connect_status(struct rpc_task *task); -static struct rpc_xprt * xprt_setup(int proto, struct sockaddr_in *ap, - struct rpc_timeout *to); -static struct socket *xprt_create_socket(struct rpc_xprt *, int, int); -static void xprt_bind_socket(struct rpc_xprt *, struct socket *); static int __xprt_get_cong(struct rpc_xprt *, struct rpc_task *); static int xprt_clear_backlog(struct rpc_xprt *xprt); -#ifdef RPC_DEBUG_DATA -/* - * Print the buffer contents (first 128 bytes only--just enough for - * diropres return). - */ -static void -xprt_pktdump(char *msg, u32 *packet, unsigned int count) -{ - u8 *buf = (u8 *) packet; - int j; - - dprintk("RPC: %s\n", msg); - for (j = 0; j < count && j < 128; j += 4) { - if (!(j & 31)) { - if (j) - dprintk("\n"); - dprintk("0x%04x ", j); - } - dprintk("%02x%02x%02x%02x ", - buf[j], buf[j+1], buf[j+2], buf[j+3]); - } - dprintk("\n"); -} -#else -static inline void -xprt_pktdump(char *msg, u32 *packet, unsigned int count) -{ - /* NOP */ -} -#endif - -/* - * Look up RPC transport given an INET socket - */ -static inline struct rpc_xprt * -xprt_from_sock(struct sock *sk) -{ - return (struct rpc_xprt *) sk->sk_user_data; -} - /* * Serialize write access to sockets, in order to prevent different * requests from interfering with each other. @@ -234,62 +166,6 @@ xprt_release_write(struct rpc_xprt *xprt, struct rpc_task *task) spin_unlock_bh(&xprt->sock_lock); } -/* - * Write data to socket. - */ -static inline int -xprt_sendmsg(struct rpc_xprt *xprt, struct rpc_rqst *req) -{ - struct socket *sock = xprt->sock; - struct xdr_buf *xdr = &req->rq_snd_buf; - struct sockaddr *addr = NULL; - int addrlen = 0; - unsigned int skip; - int result; - - if (!sock) - return -ENOTCONN; - - xprt_pktdump("packet data:", - req->rq_svec->iov_base, - req->rq_svec->iov_len); - - /* For UDP, we need to provide an address */ - if (!xprt->stream) { - addr = (struct sockaddr *) &xprt->addr; - addrlen = sizeof(xprt->addr); - } - /* Dont repeat bytes */ - skip = req->rq_bytes_sent; - - clear_bit(SOCK_ASYNC_NOSPACE, &sock->flags); - result = xdr_sendpages(sock, addr, addrlen, xdr, skip, MSG_DONTWAIT); - - dprintk("RPC: xprt_sendmsg(%d) = %d\n", xdr->len - skip, result); - - if (result >= 0) - return result; - - switch (result) { - case -ECONNREFUSED: - /* When the server has died, an ICMP port unreachable message - * prompts ECONNREFUSED. - */ - case -EAGAIN: - break; - case -ECONNRESET: - case -ENOTCONN: - case -EPIPE: - /* connection broken */ - if (xprt->stream) - result = -ENOTCONN; - break; - default: - printk(KERN_NOTICE "RPC: sendmsg returned error %d\n", -result); - } - return result; -} - /* * Van Jacobson congestion avoidance. Check if the congestion window * overflowed. Put the task to sleep if this is the case. @@ -405,48 +281,20 @@ int xprt_adjust_timeout(struct rpc_rqst *req) return status; } -/* - * Close down a transport socket - */ -static void -xprt_close(struct rpc_xprt *xprt) -{ - struct socket *sock = xprt->sock; - struct sock *sk = xprt->inet; - - if (!sk) - return; - - write_lock_bh(&sk->sk_callback_lock); - xprt->inet = NULL; - xprt->sock = NULL; - - sk->sk_user_data = NULL; - sk->sk_data_ready = xprt->old_data_ready; - sk->sk_state_change = xprt->old_state_change; - sk->sk_write_space = xprt->old_write_space; - write_unlock_bh(&sk->sk_callback_lock); - - sk->sk_no_check = 0; - - sock_release(sock); -} - static void xprt_socket_autoclose(void *args) { struct rpc_xprt *xprt = (struct rpc_xprt *)args; xprt_disconnect(xprt); - xprt_close(xprt); + xprt->ops->close(xprt); xprt_release_write(xprt, NULL); } /* * Mark a transport as disconnected */ -static void -xprt_disconnect(struct rpc_xprt *xprt) +void xprt_disconnect(struct rpc_xprt *xprt) { dprintk("RPC: disconnected transport %p\n", xprt); spin_lock_bh(&xprt->sock_lock); @@ -479,57 +327,6 @@ out_abort: spin_unlock(&xprt->sock_lock); } -static void xprt_socket_connect(void *args) -{ - struct rpc_xprt *xprt = (struct rpc_xprt *)args; - struct socket *sock = xprt->sock; - int status = -EIO; - - if (xprt->shutdown || xprt->addr.sin_port == 0) - goto out; - - /* - * Start by resetting any existing state - */ - xprt_close(xprt); - sock = xprt_create_socket(xprt, xprt->prot, xprt->resvport); - if (sock == NULL) { - /* couldn't create socket or bind to reserved port; - * this is likely a permanent error, so cause an abort */ - goto out; - } - xprt_bind_socket(xprt, sock); - xprt_sock_setbufsize(xprt); - - status = 0; - if (!xprt->stream) - goto out; - - /* - * Tell the socket layer to start connecting... - */ - status = sock->ops->connect(sock, (struct sockaddr *) &xprt->addr, - sizeof(xprt->addr), O_NONBLOCK); - dprintk("RPC: %p connect status %d connected %d sock state %d\n", - xprt, -status, xprt_connected(xprt), sock->sk->sk_state); - if (status < 0) { - switch (status) { - case -EINPROGRESS: - case -EALREADY: - goto out_clear; - } - } -out: - if (status < 0) - rpc_wake_up_status(&xprt->pending, status); - else - rpc_wake_up(&xprt->pending); -out_clear: - smp_mb__before_clear_bit(); - clear_bit(XPRT_CONNECTING, &xprt->sockstate); - smp_mb__after_clear_bit(); -} - /* * Attempt to connect a TCP socket. * @@ -552,30 +349,16 @@ void xprt_connect(struct rpc_task *task) if (!xprt_lock_write(xprt, task)) return; if (xprt_connected(xprt)) - goto out_write; + xprt_release_write(xprt, task); + else { + if (task->tk_rqstp) + task->tk_rqstp->rq_bytes_sent = 0; - if (task->tk_rqstp) - task->tk_rqstp->rq_bytes_sent = 0; - - task->tk_timeout = RPC_CONNECT_TIMEOUT; - rpc_sleep_on(&xprt->pending, task, xprt_connect_status, NULL); - if (!test_and_set_bit(XPRT_CONNECTING, &xprt->sockstate)) { - /* Note: if we are here due to a dropped connection - * we delay reconnecting by RPC_REESTABLISH_TIMEOUT/HZ - * seconds - */ - if (xprt->sock != NULL) - schedule_delayed_work(&xprt->sock_connect, - RPC_REESTABLISH_TIMEOUT); - else { - schedule_work(&xprt->sock_connect); - if (!RPC_IS_ASYNC(task)) - flush_scheduled_work(); - } + task->tk_timeout = RPC_CONNECT_TIMEOUT; + rpc_sleep_on(&xprt->pending, task, xprt_connect_status, NULL); + xprt->ops->connect(task); } return; - out_write: - xprt_release_write(xprt, task); } /* @@ -624,8 +407,7 @@ xprt_connect_status(struct rpc_task *task) /* * Look up the RPC request corresponding to a reply, and then lock it. */ -static inline struct rpc_rqst * -xprt_lookup_rqst(struct rpc_xprt *xprt, u32 xid) +struct rpc_rqst *xprt_lookup_rqst(struct rpc_xprt *xprt, u32 xid) { struct list_head *pos; struct rpc_rqst *req = NULL; @@ -644,8 +426,7 @@ xprt_lookup_rqst(struct rpc_xprt *xprt, u32 xid) * Complete reply received. * The TCP code relies on us to remove the request from xprt->pending. */ -static void -xprt_complete_rqst(struct rpc_xprt *xprt, struct rpc_rqst *req, int copied) +void xprt_complete_rqst(struct rpc_xprt *xprt, struct rpc_rqst *req, int copied) { struct rpc_task *task = req->rq_task; struct rpc_clnt *clnt = task->tk_client; @@ -691,409 +472,6 @@ xprt_complete_rqst(struct rpc_xprt *xprt, struct rpc_rqst *req, int copied) return; } -/* - * Input handler for RPC replies. Called from a bottom half and hence - * atomic. - */ -static void -udp_data_ready(struct sock *sk, int len) -{ - struct rpc_task *task; - struct rpc_xprt *xprt; - struct rpc_rqst *rovr; - struct sk_buff *skb; - int err, repsize, copied; - u32 _xid, *xp; - - read_lock(&sk->sk_callback_lock); - dprintk("RPC: udp_data_ready...\n"); - if (!(xprt = xprt_from_sock(sk))) { - printk("RPC: udp_data_ready request not found!\n"); - goto out; - } - - dprintk("RPC: udp_data_ready client %p\n", xprt); - - if ((skb = skb_recv_datagram(sk, 0, 1, &err)) == NULL) - goto out; - - if (xprt->shutdown) - goto dropit; - - repsize = skb->len - sizeof(struct udphdr); - if (repsize < 4) { - printk("RPC: impossible RPC reply size %d!\n", repsize); - goto dropit; - } - - /* Copy the XID from the skb... */ - xp = skb_header_pointer(skb, sizeof(struct udphdr), - sizeof(_xid), &_xid); - if (xp == NULL) - goto dropit; - - /* Look up and lock the request corresponding to the given XID */ - spin_lock(&xprt->sock_lock); - rovr = xprt_lookup_rqst(xprt, *xp); - if (!rovr) - goto out_unlock; - task = rovr->rq_task; - - dprintk("RPC: %4d received reply\n", task->tk_pid); - - if ((copied = rovr->rq_private_buf.buflen) > repsize) - copied = repsize; - - /* Suck it into the iovec, verify checksum if not done by hw. */ - if (csum_partial_copy_to_xdr(&rovr->rq_private_buf, skb)) - goto out_unlock; - - /* Something worked... */ - dst_confirm(skb->dst); - - xprt_complete_rqst(xprt, rovr, copied); - - out_unlock: - spin_unlock(&xprt->sock_lock); - dropit: - skb_free_datagram(sk, skb); - out: - read_unlock(&sk->sk_callback_lock); -} - -/* - * Copy from an skb into memory and shrink the skb. - */ -static inline size_t -tcp_copy_data(skb_reader_t *desc, void *p, size_t len) -{ - if (len > desc->count) - len = desc->count; - if (skb_copy_bits(desc->skb, desc->offset, p, len)) { - dprintk("RPC: failed to copy %zu bytes from skb. %zu bytes remain\n", - len, desc->count); - return 0; - } - desc->offset += len; - desc->count -= len; - dprintk("RPC: copied %zu bytes from skb. %zu bytes remain\n", - len, desc->count); - return len; -} - -/* - * TCP read fragment marker - */ -static inline void -tcp_read_fraghdr(struct rpc_xprt *xprt, skb_reader_t *desc) -{ - size_t len, used; - char *p; - - p = ((char *) &xprt->tcp_recm) + xprt->tcp_offset; - len = sizeof(xprt->tcp_recm) - xprt->tcp_offset; - used = tcp_copy_data(desc, p, len); - xprt->tcp_offset += used; - if (used != len) - return; - xprt->tcp_reclen = ntohl(xprt->tcp_recm); - if (xprt->tcp_reclen & 0x80000000) - xprt->tcp_flags |= XPRT_LAST_FRAG; - else - xprt->tcp_flags &= ~XPRT_LAST_FRAG; - xprt->tcp_reclen &= 0x7fffffff; - xprt->tcp_flags &= ~XPRT_COPY_RECM; - xprt->tcp_offset = 0; - /* Sanity check of the record length */ - if (xprt->tcp_reclen < 4) { - printk(KERN_ERR "RPC: Invalid TCP record fragment length\n"); - xprt_disconnect(xprt); - } - dprintk("RPC: reading TCP record fragment of length %d\n", - xprt->tcp_reclen); -} - -static void -tcp_check_recm(struct rpc_xprt *xprt) -{ - dprintk("RPC: xprt = %p, tcp_copied = %lu, tcp_offset = %u, tcp_reclen = %u, tcp_flags = %lx\n", - xprt, xprt->tcp_copied, xprt->tcp_offset, xprt->tcp_reclen, xprt->tcp_flags); - if (xprt->tcp_offset == xprt->tcp_reclen) { - xprt->tcp_flags |= XPRT_COPY_RECM; - xprt->tcp_offset = 0; - if (xprt->tcp_flags & XPRT_LAST_FRAG) { - xprt->tcp_flags &= ~XPRT_COPY_DATA; - xprt->tcp_flags |= XPRT_COPY_XID; - xprt->tcp_copied = 0; - } - } -} - -/* - * TCP read xid - */ -static inline void -tcp_read_xid(struct rpc_xprt *xprt, skb_reader_t *desc) -{ - size_t len, used; - char *p; - - len = sizeof(xprt->tcp_xid) - xprt->tcp_offset; - dprintk("RPC: reading XID (%Zu bytes)\n", len); - p = ((char *) &xprt->tcp_xid) + xprt->tcp_offset; - used = tcp_copy_data(desc, p, len); - xprt->tcp_offset += used; - if (used != len) - return; - xprt->tcp_flags &= ~XPRT_COPY_XID; - xprt->tcp_flags |= XPRT_COPY_DATA; - xprt->tcp_copied = 4; - dprintk("RPC: reading reply for XID %08x\n", - ntohl(xprt->tcp_xid)); - tcp_check_recm(xprt); -} - -/* - * TCP read and complete request - */ -static inline void -tcp_read_request(struct rpc_xprt *xprt, skb_reader_t *desc) -{ - struct rpc_rqst *req; - struct xdr_buf *rcvbuf; - size_t len; - ssize_t r; - - /* Find and lock the request corresponding to this xid */ - spin_lock(&xprt->sock_lock); - req = xprt_lookup_rqst(xprt, xprt->tcp_xid); - if (!req) { - xprt->tcp_flags &= ~XPRT_COPY_DATA; - dprintk("RPC: XID %08x request not found!\n", - ntohl(xprt->tcp_xid)); - spin_unlock(&xprt->sock_lock); - return; - } - - rcvbuf = &req->rq_private_buf; - len = desc->count; - if (len > xprt->tcp_reclen - xprt->tcp_offset) { - skb_reader_t my_desc; - - len = xprt->tcp_reclen - xprt->tcp_offset; - memcpy(&my_desc, desc, sizeof(my_desc)); - my_desc.count = len; - r = xdr_partial_copy_from_skb(rcvbuf, xprt->tcp_copied, - &my_desc, tcp_copy_data); - desc->count -= r; - desc->offset += r; - } else - r = xdr_partial_copy_from_skb(rcvbuf, xprt->tcp_copied, - desc, tcp_copy_data); - - if (r > 0) { - xprt->tcp_copied += r; - xprt->tcp_offset += r; - } - if (r != len) { - /* Error when copying to the receive buffer, - * usually because we weren't able to allocate - * additional buffer pages. All we can do now - * is turn off XPRT_COPY_DATA, so the request - * will not receive any additional updates, - * and time out. - * Any remaining data from this record will - * be discarded. - */ - xprt->tcp_flags &= ~XPRT_COPY_DATA; - dprintk("RPC: XID %08x truncated request\n", - ntohl(xprt->tcp_xid)); - dprintk("RPC: xprt = %p, tcp_copied = %lu, tcp_offset = %u, tcp_reclen = %u\n", - xprt, xprt->tcp_copied, xprt->tcp_offset, xprt->tcp_reclen); - goto out; - } - - dprintk("RPC: XID %08x read %Zd bytes\n", - ntohl(xprt->tcp_xid), r); - dprintk("RPC: xprt = %p, tcp_copied = %lu, tcp_offset = %u, tcp_reclen = %u\n", - xprt, xprt->tcp_copied, xprt->tcp_offset, xprt->tcp_reclen); - - if (xprt->tcp_copied == req->rq_private_buf.buflen) - xprt->tcp_flags &= ~XPRT_COPY_DATA; - else if (xprt->tcp_offset == xprt->tcp_reclen) { - if (xprt->tcp_flags & XPRT_LAST_FRAG) - xprt->tcp_flags &= ~XPRT_COPY_DATA; - } - -out: - if (!(xprt->tcp_flags & XPRT_COPY_DATA)) { - dprintk("RPC: %4d received reply complete\n", - req->rq_task->tk_pid); - xprt_complete_rqst(xprt, req, xprt->tcp_copied); - } - spin_unlock(&xprt->sock_lock); - tcp_check_recm(xprt); -} - -/* - * TCP discard extra bytes from a short read - */ -static inline void -tcp_read_discard(struct rpc_xprt *xprt, skb_reader_t *desc) -{ - size_t len; - - len = xprt->tcp_reclen - xprt->tcp_offset; - if (len > desc->count) - len = desc->count; - desc->count -= len; - desc->offset += len; - xprt->tcp_offset += len; - dprintk("RPC: discarded %Zu bytes\n", len); - tcp_check_recm(xprt); -} - -/* - * TCP record receive routine - * We first have to grab the record marker, then the XID, then the data. - */ -static int -tcp_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, - unsigned int offset, size_t len) -{ - struct rpc_xprt *xprt = rd_desc->arg.data; - skb_reader_t desc = { - .skb = skb, - .offset = offset, - .count = len, - .csum = 0 - }; - - dprintk("RPC: tcp_data_recv\n"); - do { - /* Read in a new fragment marker if necessary */ - /* Can we ever really expect to get completely empty fragments? */ - if (xprt->tcp_flags & XPRT_COPY_RECM) { - tcp_read_fraghdr(xprt, &desc); - continue; - } - /* Read in the xid if necessary */ - if (xprt->tcp_flags & XPRT_COPY_XID) { - tcp_read_xid(xprt, &desc); - continue; - } - /* Read in the request data */ - if (xprt->tcp_flags & XPRT_COPY_DATA) { - tcp_read_request(xprt, &desc); - continue; - } - /* Skip over any trailing bytes on short reads */ - tcp_read_discard(xprt, &desc); - } while (desc.count); - dprintk("RPC: tcp_data_recv done\n"); - return len - desc.count; -} - -static void tcp_data_ready(struct sock *sk, int bytes) -{ - struct rpc_xprt *xprt; - read_descriptor_t rd_desc; - - read_lock(&sk->sk_callback_lock); - dprintk("RPC: tcp_data_ready...\n"); - if (!(xprt = xprt_from_sock(sk))) { - printk("RPC: tcp_data_ready socket info not found!\n"); - goto out; - } - if (xprt->shutdown) - goto out; - - /* We use rd_desc to pass struct xprt to tcp_data_recv */ - rd_desc.arg.data = xprt; - rd_desc.count = 65536; - tcp_read_sock(sk, &rd_desc, tcp_data_recv); -out: - read_unlock(&sk->sk_callback_lock); -} - -static void -tcp_state_change(struct sock *sk) -{ - struct rpc_xprt *xprt; - - read_lock(&sk->sk_callback_lock); - if (!(xprt = xprt_from_sock(sk))) - goto out; - dprintk("RPC: tcp_state_change client %p...\n", xprt); - dprintk("RPC: state %x conn %d dead %d zapped %d\n", - sk->sk_state, xprt_connected(xprt), - sock_flag(sk, SOCK_DEAD), - sock_flag(sk, SOCK_ZAPPED)); - - switch (sk->sk_state) { - case TCP_ESTABLISHED: - spin_lock_bh(&xprt->sock_lock); - if (!xprt_test_and_set_connected(xprt)) { - /* Reset TCP record info */ - xprt->tcp_offset = 0; - xprt->tcp_reclen = 0; - xprt->tcp_copied = 0; - xprt->tcp_flags = XPRT_COPY_RECM | XPRT_COPY_XID; - rpc_wake_up(&xprt->pending); - } - spin_unlock_bh(&xprt->sock_lock); - break; - case TCP_SYN_SENT: - case TCP_SYN_RECV: - break; - default: - xprt_disconnect(xprt); - break; - } - out: - read_unlock(&sk->sk_callback_lock); -} - -/* - * Called when more output buffer space is available for this socket. - * We try not to wake our writers until they can make "significant" - * progress, otherwise we'll waste resources thrashing sock_sendmsg - * with a bunch of small requests. - */ -static void -xprt_write_space(struct sock *sk) -{ - struct rpc_xprt *xprt; - struct socket *sock; - - read_lock(&sk->sk_callback_lock); - if (!(xprt = xprt_from_sock(sk)) || !(sock = sk->sk_socket)) - goto out; - if (xprt->shutdown) - goto out; - - /* Wait until we have enough socket memory */ - if (xprt->stream) { - /* from net/core/stream.c:sk_stream_write_space */ - if (sk_stream_wspace(sk) < sk_stream_min_wspace(sk)) - goto out; - } else { - /* from net/core/sock.c:sock_def_write_space */ - if (!sock_writeable(sk)) - goto out; - } - - if (!test_and_clear_bit(SOCK_NOSPACE, &sock->flags)) - goto out; - - spin_lock_bh(&xprt->sock_lock); - if (xprt->snd_task) - rpc_wake_up_task(xprt->snd_task); - spin_unlock_bh(&xprt->sock_lock); -out: - read_unlock(&sk->sk_callback_lock); -} - /* * RPC receive timeout handler. */ @@ -1161,19 +539,10 @@ xprt_transmit(struct rpc_task *task) struct rpc_clnt *clnt = task->tk_client; struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; - int status, retry = 0; - + int status; dprintk("RPC: %4d xprt_transmit(%u)\n", task->tk_pid, req->rq_slen); - /* set up everything as needed. */ - /* Write the record marker */ - if (xprt->stream) { - u32 *marker = req->rq_svec[0].iov_base; - - *marker = htonl(0x80000000|(req->rq_slen-sizeof(*marker))); - } - smp_rmb(); if (!req->rq_received) { if (list_empty(&req->rq_list)) { @@ -1191,41 +560,9 @@ xprt_transmit(struct rpc_task *task) } else if (!req->rq_bytes_sent) return; - /* Continue transmitting the packet/record. We must be careful - * to cope with writespace callbacks arriving _after_ we have - * called xprt_sendmsg(). - */ - while (1) { - req->rq_xtime = jiffies; - status = xprt_sendmsg(xprt, req); - - if (status < 0) - break; - - if (xprt->stream) { - req->rq_bytes_sent += status; - - /* If we've sent the entire packet, immediately - * reset the count of bytes sent. */ - if (req->rq_bytes_sent >= req->rq_slen) { - req->rq_bytes_sent = 0; - goto out_receive; - } - } else { - if (status >= req->rq_slen) - goto out_receive; - status = -EAGAIN; - break; - } - - dprintk("RPC: %4d xmit incomplete (%d left of %d)\n", - task->tk_pid, req->rq_slen - req->rq_bytes_sent, - req->rq_slen); - - status = -EAGAIN; - if (retry++ > 50) - break; - } + status = xprt->ops->send_request(task); + if (!status) + goto out_receive; /* Note: at this point, task->tk_sleeping has not yet been set, * hence there is no danger of the waking up task being put on @@ -1234,26 +571,10 @@ xprt_transmit(struct rpc_task *task) task->tk_status = status; switch (status) { - case -EAGAIN: - if (test_bit(SOCK_ASYNC_NOSPACE, &xprt->sock->flags)) { - /* Protect against races with xprt_write_space */ - spin_lock_bh(&xprt->sock_lock); - /* Don't race with disconnect */ - if (!xprt_connected(xprt)) - task->tk_status = -ENOTCONN; - else if (test_bit(SOCK_NOSPACE, &xprt->sock->flags)) { - task->tk_timeout = req->rq_timeout; - rpc_sleep_on(&xprt->pending, task, NULL, NULL); - } - spin_unlock_bh(&xprt->sock_lock); - return; - } - /* Keep holding the socket if it is blocked */ - rpc_delay(task, HZ>>4); - return; case -ECONNREFUSED: task->tk_timeout = RPC_REESTABLISH_TIMEOUT; rpc_sleep_on(&xprt->sending, task, NULL, NULL); + case -EAGAIN: case -ENOTCONN: return; default: @@ -1367,7 +688,8 @@ xprt_release(struct rpc_task *task) list_del(&req->rq_list); xprt->last_used = jiffies; if (list_empty(&xprt->recv) && !xprt->shutdown) - mod_timer(&xprt->timer, xprt->last_used + XPRT_IDLE_TIMEOUT); + mod_timer(&xprt->timer, + xprt->last_used + RPC_IDLE_DISCONNECT_TIMEOUT); spin_unlock_bh(&xprt->sock_lock); task->tk_rqstp = NULL; memset(req, 0, sizeof(*req)); /* mark unused */ @@ -1380,18 +702,6 @@ xprt_release(struct rpc_task *task) spin_unlock(&xprt->xprt_lock); } -/* - * Set default timeout parameters - */ -static void -xprt_default_timeout(struct rpc_timeout *to, int proto) -{ - if (proto == IPPROTO_UDP) - xprt_set_timeout(to, 5, 5 * HZ); - else - xprt_set_timeout(to, 2, 60 * HZ); -} - /* * Set constant timeout */ @@ -1405,68 +715,51 @@ xprt_set_timeout(struct rpc_timeout *to, unsigned int retr, unsigned long incr) to->to_exponential = 0; } -unsigned int xprt_udp_slot_table_entries = RPC_DEF_SLOT_TABLE; -unsigned int xprt_tcp_slot_table_entries = RPC_DEF_SLOT_TABLE; - /* * Initialize an RPC client */ static struct rpc_xprt * xprt_setup(int proto, struct sockaddr_in *ap, struct rpc_timeout *to) { + int result; struct rpc_xprt *xprt; - unsigned int entries; - size_t slot_table_size; struct rpc_rqst *req; - dprintk("RPC: setting up %s transport...\n", - proto == IPPROTO_UDP? "UDP" : "TCP"); - - entries = (proto == IPPROTO_TCP)? - xprt_tcp_slot_table_entries : xprt_udp_slot_table_entries; - if ((xprt = kmalloc(sizeof(struct rpc_xprt), GFP_KERNEL)) == NULL) return ERR_PTR(-ENOMEM); memset(xprt, 0, sizeof(*xprt)); /* Nnnngh! */ - xprt->max_reqs = entries; - slot_table_size = entries * sizeof(xprt->slot[0]); - xprt->slot = kmalloc(slot_table_size, GFP_KERNEL); - if (xprt->slot == NULL) { - kfree(xprt); - return ERR_PTR(-ENOMEM); - } - memset(xprt->slot, 0, slot_table_size); xprt->addr = *ap; - xprt->prot = proto; - xprt->stream = (proto == IPPROTO_TCP)? 1 : 0; - if (xprt->stream) { - xprt->cwnd = RPC_MAXCWND(xprt); - xprt->nocong = 1; - xprt->max_payload = (1U << 31) - 1; - } else { - xprt->cwnd = RPC_INITCWND; - xprt->max_payload = (1U << 16) - (MAX_HEADER << 3); + + switch (proto) { + case IPPROTO_UDP: + result = xs_setup_udp(xprt, to); + break; + case IPPROTO_TCP: + result = xs_setup_tcp(xprt, to); + break; + default: + printk(KERN_ERR "RPC: unrecognized transport protocol: %d\n", + proto); + result = -EIO; + break; + } + if (result) { + kfree(xprt); + return ERR_PTR(result); } + spin_lock_init(&xprt->sock_lock); spin_lock_init(&xprt->xprt_lock); init_waitqueue_head(&xprt->cong_wait); INIT_LIST_HEAD(&xprt->free); INIT_LIST_HEAD(&xprt->recv); - INIT_WORK(&xprt->sock_connect, xprt_socket_connect, xprt); INIT_WORK(&xprt->task_cleanup, xprt_socket_autoclose, xprt); init_timer(&xprt->timer); xprt->timer.function = xprt_init_autodisconnect; xprt->timer.data = (unsigned long) xprt; xprt->last_used = jiffies; - xprt->port = XPRT_MAX_RESVPORT; - - /* Set timeout parameters */ - if (to) { - xprt->timeout = *to; - } else - xprt_default_timeout(&xprt->timeout, xprt->prot); rpc_init_wait_queue(&xprt->pending, "xprt_pending"); rpc_init_wait_queue(&xprt->sending, "xprt_sending"); @@ -1474,134 +767,17 @@ xprt_setup(int proto, struct sockaddr_in *ap, struct rpc_timeout *to) rpc_init_priority_wait_queue(&xprt->backlog, "xprt_backlog"); /* initialize free list */ - for (req = &xprt->slot[entries-1]; req >= &xprt->slot[0]; req--) + for (req = &xprt->slot[xprt->max_reqs-1]; req >= &xprt->slot[0]; req--) list_add(&req->rq_list, &xprt->free); xprt_init_xid(xprt); - /* Check whether we want to use a reserved port */ - xprt->resvport = capable(CAP_NET_BIND_SERVICE) ? 1 : 0; - dprintk("RPC: created transport %p with %u slots\n", xprt, xprt->max_reqs); return xprt; } -/* - * Bind to a reserved port - */ -static inline int xprt_bindresvport(struct rpc_xprt *xprt, struct socket *sock) -{ - struct sockaddr_in myaddr = { - .sin_family = AF_INET, - }; - int err, port; - - /* Were we already bound to a given port? Try to reuse it */ - port = xprt->port; - do { - myaddr.sin_port = htons(port); - err = sock->ops->bind(sock, (struct sockaddr *) &myaddr, - sizeof(myaddr)); - if (err == 0) { - xprt->port = port; - return 0; - } - if (--port == 0) - port = XPRT_MAX_RESVPORT; - } while (err == -EADDRINUSE && port != xprt->port); - - printk("RPC: Can't bind to reserved port (%d).\n", -err); - return err; -} - -static void -xprt_bind_socket(struct rpc_xprt *xprt, struct socket *sock) -{ - struct sock *sk = sock->sk; - - if (xprt->inet) - return; - - write_lock_bh(&sk->sk_callback_lock); - sk->sk_user_data = xprt; - xprt->old_data_ready = sk->sk_data_ready; - xprt->old_state_change = sk->sk_state_change; - xprt->old_write_space = sk->sk_write_space; - if (xprt->prot == IPPROTO_UDP) { - sk->sk_data_ready = udp_data_ready; - sk->sk_no_check = UDP_CSUM_NORCV; - xprt_set_connected(xprt); - } else { - tcp_sk(sk)->nonagle = 1; /* disable Nagle's algorithm */ - sk->sk_data_ready = tcp_data_ready; - sk->sk_state_change = tcp_state_change; - xprt_clear_connected(xprt); - } - sk->sk_write_space = xprt_write_space; - - /* Reset to new socket */ - xprt->sock = sock; - xprt->inet = sk; - write_unlock_bh(&sk->sk_callback_lock); - - return; -} - -/* - * Set socket buffer length - */ -void -xprt_sock_setbufsize(struct rpc_xprt *xprt) -{ - struct sock *sk = xprt->inet; - - if (xprt->stream) - return; - if (xprt->rcvsize) { - sk->sk_userlocks |= SOCK_RCVBUF_LOCK; - sk->sk_rcvbuf = xprt->rcvsize * xprt->max_reqs * 2; - } - if (xprt->sndsize) { - sk->sk_userlocks |= SOCK_SNDBUF_LOCK; - sk->sk_sndbuf = xprt->sndsize * xprt->max_reqs * 2; - sk->sk_write_space(sk); - } -} - -/* - * Datastream sockets are created here, but xprt_connect will create - * and connect stream sockets. - */ -static struct socket * xprt_create_socket(struct rpc_xprt *xprt, int proto, int resvport) -{ - struct socket *sock; - int type, err; - - dprintk("RPC: xprt_create_socket(%s %d)\n", - (proto == IPPROTO_UDP)? "udp" : "tcp", proto); - - type = (proto == IPPROTO_UDP)? SOCK_DGRAM : SOCK_STREAM; - - if ((err = sock_create_kern(PF_INET, type, proto, &sock)) < 0) { - printk("RPC: can't create socket (%d).\n", -err); - return NULL; - } - - /* If the caller has the capability, bind to a reserved port */ - if (resvport && xprt_bindresvport(xprt, sock) < 0) { - printk("RPC: can't bind to reserved port.\n"); - goto failed; - } - - return sock; - -failed: - sock_release(sock); - return NULL; -} - /* * Create an RPC client transport given the protocol and peer address. */ @@ -1631,10 +807,6 @@ xprt_shutdown(struct rpc_xprt *xprt) rpc_wake_up(&xprt->backlog); wake_up(&xprt->cong_wait); del_timer_sync(&xprt->timer); - - /* synchronously wait for connect worker to finish */ - cancel_delayed_work(&xprt->sock_connect); - flush_scheduled_work(); } /* @@ -1655,9 +827,7 @@ xprt_destroy(struct rpc_xprt *xprt) { dprintk("RPC: destroying transport %p\n", xprt); xprt_shutdown(xprt); - xprt_disconnect(xprt); - xprt_close(xprt); - kfree(xprt->slot); + xprt->ops->destroy(xprt); kfree(xprt); return 0; diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c new file mode 100644 index 00000000000..fa1180ac482 --- /dev/null +++ b/net/sunrpc/xprtsock.c @@ -0,0 +1,1021 @@ +/* + * linux/net/sunrpc/xprtsock.c + * + * Client-side transport implementation for sockets. + * + * TCP callback races fixes (C) 1998 Red Hat Software + * TCP send fixes (C) 1998 Red Hat Software + * TCP NFS related read + write fixes + * (C) 1999 Dave Airlie, University of Limerick, Ireland + * + * Rewrite of larges part of the code in order to stabilize TCP stuff. + * Fix behaviour when socket buffer is full. + * (C) 1999 Trond Myklebust + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#ifdef RPC_DEBUG +# undef RPC_DEBUG_DATA +# define RPCDBG_FACILITY RPCDBG_XPRT +#endif + +#define XPRT_MAX_RESVPORT (800) + +#ifdef RPC_DEBUG_DATA +/* + * Print the buffer contents (first 128 bytes only--just enough for + * diropres return). + */ +static void +xprt_pktdump(char *msg, u32 *packet, unsigned int count) +{ + u8 *buf = (u8 *) packet; + int j; + + dprintk("RPC: %s\n", msg); + for (j = 0; j < count && j < 128; j += 4) { + if (!(j & 31)) { + if (j) + dprintk("\n"); + dprintk("0x%04x ", j); + } + dprintk("%02x%02x%02x%02x ", + buf[j], buf[j+1], buf[j+2], buf[j+3]); + } + dprintk("\n"); +} +#else +static inline void +xprt_pktdump(char *msg, u32 *packet, unsigned int count) +{ + /* NOP */ +} +#endif + +/* + * Look up RPC transport given an INET socket + */ +static inline struct rpc_xprt * +xprt_from_sock(struct sock *sk) +{ + return (struct rpc_xprt *) sk->sk_user_data; +} + +static int +xdr_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, + struct xdr_buf *xdr, unsigned int base, int msgflags) +{ + struct page **ppage = xdr->pages; + unsigned int len, pglen = xdr->page_len; + int err, ret = 0; + ssize_t (*sendpage)(struct socket *, struct page *, int, size_t, int); + + len = xdr->head[0].iov_len; + if (base < len || (addr != NULL && base == 0)) { + struct kvec iov = { + .iov_base = xdr->head[0].iov_base + base, + .iov_len = len - base, + }; + struct msghdr msg = { + .msg_name = addr, + .msg_namelen = addrlen, + .msg_flags = msgflags, + }; + if (xdr->len > len) + msg.msg_flags |= MSG_MORE; + + if (iov.iov_len != 0) + err = kernel_sendmsg(sock, &msg, &iov, 1, iov.iov_len); + else + err = kernel_sendmsg(sock, &msg, NULL, 0, 0); + if (ret == 0) + ret = err; + else if (err > 0) + ret += err; + if (err != iov.iov_len) + goto out; + base = 0; + } else + base -= len; + + if (pglen == 0) + goto copy_tail; + if (base >= pglen) { + base -= pglen; + goto copy_tail; + } + if (base || xdr->page_base) { + pglen -= base; + base += xdr->page_base; + ppage += base >> PAGE_CACHE_SHIFT; + base &= ~PAGE_CACHE_MASK; + } + + sendpage = sock->ops->sendpage ? : sock_no_sendpage; + do { + int flags = msgflags; + + len = PAGE_CACHE_SIZE; + if (base) + len -= base; + if (pglen < len) + len = pglen; + + if (pglen != len || xdr->tail[0].iov_len != 0) + flags |= MSG_MORE; + + /* Hmm... We might be dealing with highmem pages */ + if (PageHighMem(*ppage)) + sendpage = sock_no_sendpage; + err = sendpage(sock, *ppage, base, len, flags); + if (ret == 0) + ret = err; + else if (err > 0) + ret += err; + if (err != len) + goto out; + base = 0; + ppage++; + } while ((pglen -= len) != 0); +copy_tail: + len = xdr->tail[0].iov_len; + if (base < len) { + struct kvec iov = { + .iov_base = xdr->tail[0].iov_base + base, + .iov_len = len - base, + }; + struct msghdr msg = { + .msg_flags = msgflags, + }; + err = kernel_sendmsg(sock, &msg, &iov, 1, iov.iov_len); + if (ret == 0) + ret = err; + else if (err > 0) + ret += err; + } +out: + return ret; +} + +/* + * Write data to socket. + */ +static inline int +xprt_sendmsg(struct rpc_xprt *xprt, struct rpc_rqst *req) +{ + struct socket *sock = xprt->sock; + struct xdr_buf *xdr = &req->rq_snd_buf; + struct sockaddr *addr = NULL; + int addrlen = 0; + unsigned int skip; + int result; + + if (!sock) + return -ENOTCONN; + + xprt_pktdump("packet data:", + req->rq_svec->iov_base, + req->rq_svec->iov_len); + + /* For UDP, we need to provide an address */ + if (!xprt->stream) { + addr = (struct sockaddr *) &xprt->addr; + addrlen = sizeof(xprt->addr); + } + /* Dont repeat bytes */ + skip = req->rq_bytes_sent; + + clear_bit(SOCK_ASYNC_NOSPACE, &sock->flags); + result = xdr_sendpages(sock, addr, addrlen, xdr, skip, MSG_DONTWAIT); + + dprintk("RPC: xprt_sendmsg(%d) = %d\n", xdr->len - skip, result); + + if (result >= 0) + return result; + + switch (result) { + case -ECONNREFUSED: + /* When the server has died, an ICMP port unreachable message + * prompts ECONNREFUSED. + */ + case -EAGAIN: + break; + case -ECONNRESET: + case -ENOTCONN: + case -EPIPE: + /* connection broken */ + if (xprt->stream) + result = -ENOTCONN; + break; + default: + printk(KERN_NOTICE "RPC: sendmsg returned error %d\n", -result); + } + return result; +} + +static int +xprt_send_request(struct rpc_task *task) +{ + struct rpc_rqst *req = task->tk_rqstp; + struct rpc_xprt *xprt = req->rq_xprt; + int status, retry = 0; + + /* set up everything as needed. */ + /* Write the record marker */ + if (xprt->stream) { + u32 *marker = req->rq_svec[0].iov_base; + + *marker = htonl(0x80000000|(req->rq_slen-sizeof(*marker))); + } + + /* Continue transmitting the packet/record. We must be careful + * to cope with writespace callbacks arriving _after_ we have + * called xprt_sendmsg(). + */ + while (1) { + req->rq_xtime = jiffies; + status = xprt_sendmsg(xprt, req); + + if (status < 0) + break; + + if (xprt->stream) { + req->rq_bytes_sent += status; + + /* If we've sent the entire packet, immediately + * reset the count of bytes sent. */ + if (req->rq_bytes_sent >= req->rq_slen) { + req->rq_bytes_sent = 0; + return 0; + } + } else { + if (status >= req->rq_slen) + return 0; + status = -EAGAIN; + break; + } + + dprintk("RPC: %4d xmit incomplete (%d left of %d)\n", + task->tk_pid, req->rq_slen - req->rq_bytes_sent, + req->rq_slen); + + status = -EAGAIN; + if (retry++ > 50) + break; + } + + if (status == -EAGAIN) { + if (test_bit(SOCK_ASYNC_NOSPACE, &xprt->sock->flags)) { + /* Protect against races with xprt_write_space */ + spin_lock_bh(&xprt->sock_lock); + /* Don't race with disconnect */ + if (!xprt_connected(xprt)) + task->tk_status = -ENOTCONN; + else if (test_bit(SOCK_NOSPACE, &xprt->sock->flags)) { + task->tk_timeout = req->rq_timeout; + rpc_sleep_on(&xprt->pending, task, NULL, NULL); + } + spin_unlock_bh(&xprt->sock_lock); + return status; + } + /* Keep holding the socket if it is blocked */ + rpc_delay(task, HZ>>4); + } + return status; +} + +/* + * Close down a transport socket + */ +static void +xprt_close(struct rpc_xprt *xprt) +{ + struct socket *sock = xprt->sock; + struct sock *sk = xprt->inet; + + if (!sk) + return; + + write_lock_bh(&sk->sk_callback_lock); + xprt->inet = NULL; + xprt->sock = NULL; + + sk->sk_user_data = NULL; + sk->sk_data_ready = xprt->old_data_ready; + sk->sk_state_change = xprt->old_state_change; + sk->sk_write_space = xprt->old_write_space; + write_unlock_bh(&sk->sk_callback_lock); + + sk->sk_no_check = 0; + + sock_release(sock); +} + +static void xprt_socket_destroy(struct rpc_xprt *xprt) +{ + cancel_delayed_work(&xprt->sock_connect); + flush_scheduled_work(); + + xprt_disconnect(xprt); + xprt_close(xprt); + kfree(xprt->slot); +} + +/* + * Input handler for RPC replies. Called from a bottom half and hence + * atomic. + */ +static void +udp_data_ready(struct sock *sk, int len) +{ + struct rpc_task *task; + struct rpc_xprt *xprt; + struct rpc_rqst *rovr; + struct sk_buff *skb; + int err, repsize, copied; + u32 _xid, *xp; + + read_lock(&sk->sk_callback_lock); + dprintk("RPC: udp_data_ready...\n"); + if (!(xprt = xprt_from_sock(sk))) { + printk("RPC: udp_data_ready request not found!\n"); + goto out; + } + + dprintk("RPC: udp_data_ready client %p\n", xprt); + + if ((skb = skb_recv_datagram(sk, 0, 1, &err)) == NULL) + goto out; + + if (xprt->shutdown) + goto dropit; + + repsize = skb->len - sizeof(struct udphdr); + if (repsize < 4) { + printk("RPC: impossible RPC reply size %d!\n", repsize); + goto dropit; + } + + /* Copy the XID from the skb... */ + xp = skb_header_pointer(skb, sizeof(struct udphdr), + sizeof(_xid), &_xid); + if (xp == NULL) + goto dropit; + + /* Look up and lock the request corresponding to the given XID */ + spin_lock(&xprt->sock_lock); + rovr = xprt_lookup_rqst(xprt, *xp); + if (!rovr) + goto out_unlock; + task = rovr->rq_task; + + dprintk("RPC: %4d received reply\n", task->tk_pid); + + if ((copied = rovr->rq_private_buf.buflen) > repsize) + copied = repsize; + + /* Suck it into the iovec, verify checksum if not done by hw. */ + if (csum_partial_copy_to_xdr(&rovr->rq_private_buf, skb)) + goto out_unlock; + + /* Something worked... */ + dst_confirm(skb->dst); + + xprt_complete_rqst(xprt, rovr, copied); + + out_unlock: + spin_unlock(&xprt->sock_lock); + dropit: + skb_free_datagram(sk, skb); + out: + read_unlock(&sk->sk_callback_lock); +} + +/* + * Copy from an skb into memory and shrink the skb. + */ +static inline size_t +tcp_copy_data(skb_reader_t *desc, void *p, size_t len) +{ + if (len > desc->count) + len = desc->count; + if (skb_copy_bits(desc->skb, desc->offset, p, len)) { + dprintk("RPC: failed to copy %zu bytes from skb. %zu bytes remain\n", + len, desc->count); + return 0; + } + desc->offset += len; + desc->count -= len; + dprintk("RPC: copied %zu bytes from skb. %zu bytes remain\n", + len, desc->count); + return len; +} + +/* + * TCP read fragment marker + */ +static inline void +tcp_read_fraghdr(struct rpc_xprt *xprt, skb_reader_t *desc) +{ + size_t len, used; + char *p; + + p = ((char *) &xprt->tcp_recm) + xprt->tcp_offset; + len = sizeof(xprt->tcp_recm) - xprt->tcp_offset; + used = tcp_copy_data(desc, p, len); + xprt->tcp_offset += used; + if (used != len) + return; + xprt->tcp_reclen = ntohl(xprt->tcp_recm); + if (xprt->tcp_reclen & 0x80000000) + xprt->tcp_flags |= XPRT_LAST_FRAG; + else + xprt->tcp_flags &= ~XPRT_LAST_FRAG; + xprt->tcp_reclen &= 0x7fffffff; + xprt->tcp_flags &= ~XPRT_COPY_RECM; + xprt->tcp_offset = 0; + /* Sanity check of the record length */ + if (xprt->tcp_reclen < 4) { + printk(KERN_ERR "RPC: Invalid TCP record fragment length\n"); + xprt_disconnect(xprt); + } + dprintk("RPC: reading TCP record fragment of length %d\n", + xprt->tcp_reclen); +} + +static void +tcp_check_recm(struct rpc_xprt *xprt) +{ + dprintk("RPC: xprt = %p, tcp_copied = %lu, tcp_offset = %u, tcp_reclen = %u, tcp_flags = %lx\n", + xprt, xprt->tcp_copied, xprt->tcp_offset, xprt->tcp_reclen, xprt->tcp_flags); + if (xprt->tcp_offset == xprt->tcp_reclen) { + xprt->tcp_flags |= XPRT_COPY_RECM; + xprt->tcp_offset = 0; + if (xprt->tcp_flags & XPRT_LAST_FRAG) { + xprt->tcp_flags &= ~XPRT_COPY_DATA; + xprt->tcp_flags |= XPRT_COPY_XID; + xprt->tcp_copied = 0; + } + } +} + +/* + * TCP read xid + */ +static inline void +tcp_read_xid(struct rpc_xprt *xprt, skb_reader_t *desc) +{ + size_t len, used; + char *p; + + len = sizeof(xprt->tcp_xid) - xprt->tcp_offset; + dprintk("RPC: reading XID (%Zu bytes)\n", len); + p = ((char *) &xprt->tcp_xid) + xprt->tcp_offset; + used = tcp_copy_data(desc, p, len); + xprt->tcp_offset += used; + if (used != len) + return; + xprt->tcp_flags &= ~XPRT_COPY_XID; + xprt->tcp_flags |= XPRT_COPY_DATA; + xprt->tcp_copied = 4; + dprintk("RPC: reading reply for XID %08x\n", + ntohl(xprt->tcp_xid)); + tcp_check_recm(xprt); +} + +/* + * TCP read and complete request + */ +static inline void +tcp_read_request(struct rpc_xprt *xprt, skb_reader_t *desc) +{ + struct rpc_rqst *req; + struct xdr_buf *rcvbuf; + size_t len; + ssize_t r; + + /* Find and lock the request corresponding to this xid */ + spin_lock(&xprt->sock_lock); + req = xprt_lookup_rqst(xprt, xprt->tcp_xid); + if (!req) { + xprt->tcp_flags &= ~XPRT_COPY_DATA; + dprintk("RPC: XID %08x request not found!\n", + ntohl(xprt->tcp_xid)); + spin_unlock(&xprt->sock_lock); + return; + } + + rcvbuf = &req->rq_private_buf; + len = desc->count; + if (len > xprt->tcp_reclen - xprt->tcp_offset) { + skb_reader_t my_desc; + + len = xprt->tcp_reclen - xprt->tcp_offset; + memcpy(&my_desc, desc, sizeof(my_desc)); + my_desc.count = len; + r = xdr_partial_copy_from_skb(rcvbuf, xprt->tcp_copied, + &my_desc, tcp_copy_data); + desc->count -= r; + desc->offset += r; + } else + r = xdr_partial_copy_from_skb(rcvbuf, xprt->tcp_copied, + desc, tcp_copy_data); + + if (r > 0) { + xprt->tcp_copied += r; + xprt->tcp_offset += r; + } + if (r != len) { + /* Error when copying to the receive buffer, + * usually because we weren't able to allocate + * additional buffer pages. All we can do now + * is turn off XPRT_COPY_DATA, so the request + * will not receive any additional updates, + * and time out. + * Any remaining data from this record will + * be discarded. + */ + xprt->tcp_flags &= ~XPRT_COPY_DATA; + dprintk("RPC: XID %08x truncated request\n", + ntohl(xprt->tcp_xid)); + dprintk("RPC: xprt = %p, tcp_copied = %lu, tcp_offset = %u, tcp_reclen = %u\n", + xprt, xprt->tcp_copied, xprt->tcp_offset, xprt->tcp_reclen); + goto out; + } + + dprintk("RPC: XID %08x read %Zd bytes\n", + ntohl(xprt->tcp_xid), r); + dprintk("RPC: xprt = %p, tcp_copied = %lu, tcp_offset = %u, tcp_reclen = %u\n", + xprt, xprt->tcp_copied, xprt->tcp_offset, xprt->tcp_reclen); + + if (xprt->tcp_copied == req->rq_private_buf.buflen) + xprt->tcp_flags &= ~XPRT_COPY_DATA; + else if (xprt->tcp_offset == xprt->tcp_reclen) { + if (xprt->tcp_flags & XPRT_LAST_FRAG) + xprt->tcp_flags &= ~XPRT_COPY_DATA; + } + +out: + if (!(xprt->tcp_flags & XPRT_COPY_DATA)) { + dprintk("RPC: %4d received reply complete\n", + req->rq_task->tk_pid); + xprt_complete_rqst(xprt, req, xprt->tcp_copied); + } + spin_unlock(&xprt->sock_lock); + tcp_check_recm(xprt); +} + +/* + * TCP discard extra bytes from a short read + */ +static inline void +tcp_read_discard(struct rpc_xprt *xprt, skb_reader_t *desc) +{ + size_t len; + + len = xprt->tcp_reclen - xprt->tcp_offset; + if (len > desc->count) + len = desc->count; + desc->count -= len; + desc->offset += len; + xprt->tcp_offset += len; + dprintk("RPC: discarded %Zu bytes\n", len); + tcp_check_recm(xprt); +} + +/* + * TCP record receive routine + * We first have to grab the record marker, then the XID, then the data. + */ +static int +tcp_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, + unsigned int offset, size_t len) +{ + struct rpc_xprt *xprt = rd_desc->arg.data; + skb_reader_t desc = { + .skb = skb, + .offset = offset, + .count = len, + .csum = 0 + }; + + dprintk("RPC: tcp_data_recv\n"); + do { + /* Read in a new fragment marker if necessary */ + /* Can we ever really expect to get completely empty fragments? */ + if (xprt->tcp_flags & XPRT_COPY_RECM) { + tcp_read_fraghdr(xprt, &desc); + continue; + } + /* Read in the xid if necessary */ + if (xprt->tcp_flags & XPRT_COPY_XID) { + tcp_read_xid(xprt, &desc); + continue; + } + /* Read in the request data */ + if (xprt->tcp_flags & XPRT_COPY_DATA) { + tcp_read_request(xprt, &desc); + continue; + } + /* Skip over any trailing bytes on short reads */ + tcp_read_discard(xprt, &desc); + } while (desc.count); + dprintk("RPC: tcp_data_recv done\n"); + return len - desc.count; +} + +static void tcp_data_ready(struct sock *sk, int bytes) +{ + struct rpc_xprt *xprt; + read_descriptor_t rd_desc; + + read_lock(&sk->sk_callback_lock); + dprintk("RPC: tcp_data_ready...\n"); + if (!(xprt = xprt_from_sock(sk))) { + printk("RPC: tcp_data_ready socket info not found!\n"); + goto out; + } + if (xprt->shutdown) + goto out; + + /* We use rd_desc to pass struct xprt to tcp_data_recv */ + rd_desc.arg.data = xprt; + rd_desc.count = 65536; + tcp_read_sock(sk, &rd_desc, tcp_data_recv); +out: + read_unlock(&sk->sk_callback_lock); +} + +static void +tcp_state_change(struct sock *sk) +{ + struct rpc_xprt *xprt; + + read_lock(&sk->sk_callback_lock); + if (!(xprt = xprt_from_sock(sk))) + goto out; + dprintk("RPC: tcp_state_change client %p...\n", xprt); + dprintk("RPC: state %x conn %d dead %d zapped %d\n", + sk->sk_state, xprt_connected(xprt), + sock_flag(sk, SOCK_DEAD), + sock_flag(sk, SOCK_ZAPPED)); + + switch (sk->sk_state) { + case TCP_ESTABLISHED: + spin_lock_bh(&xprt->sock_lock); + if (!xprt_test_and_set_connected(xprt)) { + /* Reset TCP record info */ + xprt->tcp_offset = 0; + xprt->tcp_reclen = 0; + xprt->tcp_copied = 0; + xprt->tcp_flags = XPRT_COPY_RECM | XPRT_COPY_XID; + rpc_wake_up(&xprt->pending); + } + spin_unlock_bh(&xprt->sock_lock); + break; + case TCP_SYN_SENT: + case TCP_SYN_RECV: + break; + default: + xprt_disconnect(xprt); + break; + } + out: + read_unlock(&sk->sk_callback_lock); +} + +/* + * Called when more output buffer space is available for this socket. + * We try not to wake our writers until they can make "significant" + * progress, otherwise we'll waste resources thrashing sock_sendmsg + * with a bunch of small requests. + */ +static void +xprt_write_space(struct sock *sk) +{ + struct rpc_xprt *xprt; + struct socket *sock; + + read_lock(&sk->sk_callback_lock); + if (!(xprt = xprt_from_sock(sk)) || !(sock = sk->sk_socket)) + goto out; + if (xprt->shutdown) + goto out; + + /* Wait until we have enough socket memory */ + if (xprt->stream) { + /* from net/core/stream.c:sk_stream_write_space */ + if (sk_stream_wspace(sk) < sk_stream_min_wspace(sk)) + goto out; + } else { + /* from net/core/sock.c:sock_def_write_space */ + if (!sock_writeable(sk)) + goto out; + } + + if (!test_and_clear_bit(SOCK_NOSPACE, &sock->flags)) + goto out; + + spin_lock_bh(&xprt->sock_lock); + if (xprt->snd_task) + rpc_wake_up_task(xprt->snd_task); + spin_unlock_bh(&xprt->sock_lock); +out: + read_unlock(&sk->sk_callback_lock); +} + +/* + * Set socket buffer length + */ +static void +xprt_sock_setbufsize(struct rpc_xprt *xprt) +{ + struct sock *sk = xprt->inet; + + if (xprt->stream) + return; + if (xprt->rcvsize) { + sk->sk_userlocks |= SOCK_RCVBUF_LOCK; + sk->sk_rcvbuf = xprt->rcvsize * xprt->max_reqs * 2; + } + if (xprt->sndsize) { + sk->sk_userlocks |= SOCK_SNDBUF_LOCK; + sk->sk_sndbuf = xprt->sndsize * xprt->max_reqs * 2; + sk->sk_write_space(sk); + } +} + +/* + * Bind to a reserved port + */ +static inline int xprt_bindresvport(struct rpc_xprt *xprt, struct socket *sock) +{ + struct sockaddr_in myaddr = { + .sin_family = AF_INET, + }; + int err, port; + + /* Were we already bound to a given port? Try to reuse it */ + port = xprt->port; + do { + myaddr.sin_port = htons(port); + err = sock->ops->bind(sock, (struct sockaddr *) &myaddr, + sizeof(myaddr)); + if (err == 0) { + xprt->port = port; + return 0; + } + if (--port == 0) + port = XPRT_MAX_RESVPORT; + } while (err == -EADDRINUSE && port != xprt->port); + + printk("RPC: Can't bind to reserved port (%d).\n", -err); + return err; +} + +static void +xprt_bind_socket(struct rpc_xprt *xprt, struct socket *sock) +{ + struct sock *sk = sock->sk; + + if (xprt->inet) + return; + + write_lock_bh(&sk->sk_callback_lock); + sk->sk_user_data = xprt; + xprt->old_data_ready = sk->sk_data_ready; + xprt->old_state_change = sk->sk_state_change; + xprt->old_write_space = sk->sk_write_space; + if (xprt->prot == IPPROTO_UDP) { + sk->sk_data_ready = udp_data_ready; + sk->sk_no_check = UDP_CSUM_NORCV; + xprt_set_connected(xprt); + } else { + tcp_sk(sk)->nonagle = 1; /* disable Nagle's algorithm */ + sk->sk_data_ready = tcp_data_ready; + sk->sk_state_change = tcp_state_change; + xprt_clear_connected(xprt); + } + sk->sk_write_space = xprt_write_space; + + /* Reset to new socket */ + xprt->sock = sock; + xprt->inet = sk; + write_unlock_bh(&sk->sk_callback_lock); + + return; +} + +/* + * Datastream sockets are created here, but xprt_connect will create + * and connect stream sockets. + */ +static struct socket * xprt_create_socket(struct rpc_xprt *xprt, int proto, int resvport) +{ + struct socket *sock; + int type, err; + + dprintk("RPC: xprt_create_socket(%s %d)\n", + (proto == IPPROTO_UDP)? "udp" : "tcp", proto); + + type = (proto == IPPROTO_UDP)? SOCK_DGRAM : SOCK_STREAM; + + if ((err = sock_create_kern(PF_INET, type, proto, &sock)) < 0) { + printk("RPC: can't create socket (%d).\n", -err); + return NULL; + } + + /* If the caller has the capability, bind to a reserved port */ + if (resvport && xprt_bindresvport(xprt, sock) < 0) { + printk("RPC: can't bind to reserved port.\n"); + goto failed; + } + + return sock; + +failed: + sock_release(sock); + return NULL; +} + +static void xprt_socket_connect(void *args) +{ + struct rpc_xprt *xprt = (struct rpc_xprt *)args; + struct socket *sock = xprt->sock; + int status = -EIO; + + if (xprt->shutdown || xprt->addr.sin_port == 0) + goto out; + + /* + * Start by resetting any existing state + */ + xprt_close(xprt); + sock = xprt_create_socket(xprt, xprt->prot, xprt->resvport); + if (sock == NULL) { + /* couldn't create socket or bind to reserved port; + * this is likely a permanent error, so cause an abort */ + goto out; + } + xprt_bind_socket(xprt, sock); + xprt_sock_setbufsize(xprt); + + status = 0; + if (!xprt->stream) + goto out; + + /* + * Tell the socket layer to start connecting... + */ + status = sock->ops->connect(sock, (struct sockaddr *) &xprt->addr, + sizeof(xprt->addr), O_NONBLOCK); + dprintk("RPC: %p connect status %d connected %d sock state %d\n", + xprt, -status, xprt_connected(xprt), sock->sk->sk_state); + if (status < 0) { + switch (status) { + case -EINPROGRESS: + case -EALREADY: + goto out_clear; + } + } +out: + if (status < 0) + rpc_wake_up_status(&xprt->pending, status); + else + rpc_wake_up(&xprt->pending); +out_clear: + smp_mb__before_clear_bit(); + clear_bit(XPRT_CONNECTING, &xprt->sockstate); + smp_mb__after_clear_bit(); +} + +static void +xprt_connect_sock(struct rpc_task *task) +{ + struct rpc_xprt *xprt = task->tk_xprt; + + if (!test_and_set_bit(XPRT_CONNECTING, &xprt->sockstate)) { + /* Note: if we are here due to a dropped connection + * we delay reconnecting by RPC_REESTABLISH_TIMEOUT/HZ + * seconds + */ + if (xprt->sock != NULL) + schedule_delayed_work(&xprt->sock_connect, + RPC_REESTABLISH_TIMEOUT); + else { + schedule_work(&xprt->sock_connect); + /* flush_scheduled_work can sleep... */ + if (!RPC_IS_ASYNC(task)) + flush_scheduled_work(); + } + } +} + +/* + * Set default timeout parameters + */ +static void +xprt_default_timeout(struct rpc_timeout *to, int proto) +{ + if (proto == IPPROTO_UDP) + xprt_set_timeout(to, 5, 5 * HZ); + else + xprt_set_timeout(to, 2, 60 * HZ); +} + +static struct rpc_xprt_ops xprt_socket_ops = { + .set_buffer_size = xprt_sock_setbufsize, + .connect = xprt_connect_sock, + .send_request = xprt_send_request, + .close = xprt_close, + .destroy = xprt_socket_destroy, +}; + +extern unsigned int xprt_udp_slot_table_entries; +extern unsigned int xprt_tcp_slot_table_entries; + +int xs_setup_udp(struct rpc_xprt *xprt, struct rpc_timeout *to) +{ + size_t slot_table_size; + + dprintk("RPC: setting up udp-ipv4 transport...\n"); + + xprt->max_reqs = xprt_udp_slot_table_entries; + slot_table_size = xprt->max_reqs * sizeof(xprt->slot[0]); + xprt->slot = kmalloc(slot_table_size, GFP_KERNEL); + if (xprt->slot == NULL) + return -ENOMEM; + memset(xprt->slot, 0, slot_table_size); + + xprt->prot = IPPROTO_UDP; + xprt->port = XPRT_MAX_RESVPORT; + xprt->stream = 0; + xprt->nocong = 0; + xprt->cwnd = RPC_INITCWND; + xprt->resvport = capable(CAP_NET_BIND_SERVICE) ? 1 : 0; + /* XXX: header size can vary due to auth type, IPv6, etc. */ + xprt->max_payload = (1U << 16) - (MAX_HEADER << 3); + + INIT_WORK(&xprt->sock_connect, xprt_socket_connect, xprt); + + xprt->ops = &xprt_socket_ops; + + if (to) + xprt->timeout = *to; + else + xprt_default_timeout(to, xprt->prot); + + return 0; +} + +int xs_setup_tcp(struct rpc_xprt *xprt, struct rpc_timeout *to) +{ + size_t slot_table_size; + + dprintk("RPC: setting up tcp-ipv4 transport...\n"); + + xprt->max_reqs = xprt_tcp_slot_table_entries; + slot_table_size = xprt->max_reqs * sizeof(xprt->slot[0]); + xprt->slot = kmalloc(slot_table_size, GFP_KERNEL); + if (xprt->slot == NULL) + return -ENOMEM; + memset(xprt->slot, 0, slot_table_size); + + xprt->prot = IPPROTO_TCP; + xprt->port = XPRT_MAX_RESVPORT; + xprt->stream = 1; + xprt->nocong = 1; + xprt->cwnd = RPC_MAXCWND(xprt); + xprt->resvport = capable(CAP_NET_BIND_SERVICE) ? 1 : 0; + xprt->max_payload = (1U << 31) - 1; + + INIT_WORK(&xprt->sock_connect, xprt_socket_connect, xprt); + + xprt->ops = &xprt_socket_ops; + + if (to) + xprt->timeout = *to; + else + xprt_default_timeout(to, xprt->prot); + + return 0; +} -- cgit v1.2.3 From 9903cd1c27a1f30e8efea75e125be3b2002f7cb9 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 11 Aug 2005 16:25:26 -0400 Subject: [PATCH] RPC: transport switch function naming Introduce block header comments and a function naming convention to the socket transport implementation. Provide a debug setting for transports that is separate from RPCDBG_XPRT. Eliminate xprt_default_timeout(). Provide block comments for exposed interfaces in xprt.c, and eliminate the useless obvious comments. Convert printk's to dprintk's. Test-plan: Compile kernel with CONFIG_NFS enabled. Version: Thu, 11 Aug 2005 16:04:04 -0400 Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/xprt.c | 147 ++++++++-------- net/sunrpc/xprtsock.c | 464 ++++++++++++++++++++++++++------------------------ 2 files changed, 310 insertions(+), 301 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 4342acf4d1c..589195e630e 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -227,9 +227,6 @@ xprt_adjust_cwnd(struct rpc_xprt *xprt, int result) xprt->cwnd = cwnd; } -/* - * Reset the major timeout value - */ static void xprt_reset_majortimeo(struct rpc_rqst *req) { struct rpc_timeout *to = &req->rq_xprt->timeout; @@ -244,8 +241,10 @@ static void xprt_reset_majortimeo(struct rpc_rqst *req) req->rq_majortimeo += jiffies; } -/* - * Adjust timeout values etc for next retransmit +/** + * xprt_adjust_timeout - adjust timeout values for next retransmit + * @req: RPC request containing parameters to use for the adjustment + * */ int xprt_adjust_timeout(struct rpc_rqst *req) { @@ -291,8 +290,10 @@ xprt_socket_autoclose(void *args) xprt_release_write(xprt, NULL); } -/* - * Mark a transport as disconnected +/** + * xprt_disconnect - mark a transport as disconnected + * @xprt: transport to flag for disconnect + * */ void xprt_disconnect(struct rpc_xprt *xprt) { @@ -303,9 +304,6 @@ void xprt_disconnect(struct rpc_xprt *xprt) spin_unlock_bh(&xprt->sock_lock); } -/* - * Used to allow disconnection when we've been idle - */ static void xprt_init_autodisconnect(unsigned long data) { @@ -327,8 +325,9 @@ out_abort: spin_unlock(&xprt->sock_lock); } -/* - * Attempt to connect a TCP socket. +/** + * xprt_connect - schedule a transport connect operation + * @task: RPC task that is requesting the connect * */ void xprt_connect(struct rpc_task *task) @@ -361,11 +360,7 @@ void xprt_connect(struct rpc_task *task) return; } -/* - * We arrive here when awoken from waiting on connection establishment. - */ -static void -xprt_connect_status(struct rpc_task *task) +static void xprt_connect_status(struct rpc_task *task) { struct rpc_xprt *xprt = task->tk_xprt; @@ -404,8 +399,11 @@ xprt_connect_status(struct rpc_task *task) } } -/* - * Look up the RPC request corresponding to a reply, and then lock it. +/** + * xprt_lookup_rqst - find an RPC request corresponding to an XID + * @xprt: transport on which the original request was transmitted + * @xid: RPC XID of incoming reply + * */ struct rpc_rqst *xprt_lookup_rqst(struct rpc_xprt *xprt, u32 xid) { @@ -422,9 +420,12 @@ struct rpc_rqst *xprt_lookup_rqst(struct rpc_xprt *xprt, u32 xid) return req; } -/* - * Complete reply received. - * The TCP code relies on us to remove the request from xprt->pending. +/** + * xprt_complete_rqst - called when reply processing is complete + * @xprt: controlling transport + * @req: RPC request that just completed + * @copied: actual number of bytes received from the transport + * */ void xprt_complete_rqst(struct rpc_xprt *xprt, struct rpc_rqst *req, int copied) { @@ -498,12 +499,12 @@ out: spin_unlock(&xprt->sock_lock); } -/* - * Place the actual RPC call. - * We have to copy the iovec because sendmsg fiddles with its contents. +/** + * xprt_prepare_transmit - reserve the transport before sending a request + * @task: RPC task about to send a request + * */ -int -xprt_prepare_transmit(struct rpc_task *task) +int xprt_prepare_transmit(struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; @@ -533,8 +534,13 @@ out_unlock: return err; } -void -xprt_transmit(struct rpc_task *task) +/** + * xprt_transmit - send an RPC request on a transport + * @task: controlling RPC task + * + * We have to copy the iovec because sendmsg fiddles with its contents. + */ +void xprt_transmit(struct rpc_task *task) { struct rpc_clnt *clnt = task->tk_client; struct rpc_rqst *req = task->tk_rqstp; @@ -604,11 +610,7 @@ xprt_transmit(struct rpc_task *task) spin_unlock_bh(&xprt->sock_lock); } -/* - * Reserve an RPC call slot. - */ -static inline void -do_xprt_reserve(struct rpc_task *task) +static inline void do_xprt_reserve(struct rpc_task *task) { struct rpc_xprt *xprt = task->tk_xprt; @@ -628,8 +630,14 @@ do_xprt_reserve(struct rpc_task *task) rpc_sleep_on(&xprt->backlog, task, NULL, NULL); } -void -xprt_reserve(struct rpc_task *task) +/** + * xprt_reserve - allocate an RPC request slot + * @task: RPC task requesting a slot allocation + * + * If no more slots are available, place the task on the transport's + * backlog queue. + */ +void xprt_reserve(struct rpc_task *task) { struct rpc_xprt *xprt = task->tk_xprt; @@ -641,9 +649,6 @@ xprt_reserve(struct rpc_task *task) } } -/* - * Allocate a 'unique' XID - */ static inline u32 xprt_alloc_xid(struct rpc_xprt *xprt) { return xprt->xid++; @@ -654,11 +659,7 @@ static inline void xprt_init_xid(struct rpc_xprt *xprt) get_random_bytes(&xprt->xid, sizeof(xprt->xid)); } -/* - * Initialize RPC request - */ -static void -xprt_request_init(struct rpc_task *task, struct rpc_xprt *xprt) +static void xprt_request_init(struct rpc_task *task, struct rpc_xprt *xprt) { struct rpc_rqst *req = task->tk_rqstp; @@ -670,11 +671,12 @@ xprt_request_init(struct rpc_task *task, struct rpc_xprt *xprt) req, ntohl(req->rq_xid)); } -/* - * Release an RPC call slot +/** + * xprt_release - release an RPC request slot + * @task: task which is finished with the slot + * */ -void -xprt_release(struct rpc_task *task) +void xprt_release(struct rpc_task *task) { struct rpc_xprt *xprt = task->tk_xprt; struct rpc_rqst *req; @@ -702,11 +704,14 @@ xprt_release(struct rpc_task *task) spin_unlock(&xprt->xprt_lock); } -/* - * Set constant timeout +/** + * xprt_set_timeout - set constant RPC timeout + * @to: RPC timeout parameters to set up + * @retr: number of retries + * @incr: amount of increase after each retry + * */ -void -xprt_set_timeout(struct rpc_timeout *to, unsigned int retr, unsigned long incr) +void xprt_set_timeout(struct rpc_timeout *to, unsigned int retr, unsigned long incr) { to->to_initval = to->to_increment = incr; @@ -715,11 +720,7 @@ xprt_set_timeout(struct rpc_timeout *to, unsigned int retr, unsigned long incr) to->to_exponential = 0; } -/* - * Initialize an RPC client - */ -static struct rpc_xprt * -xprt_setup(int proto, struct sockaddr_in *ap, struct rpc_timeout *to) +static struct rpc_xprt *xprt_setup(int proto, struct sockaddr_in *ap, struct rpc_timeout *to) { int result; struct rpc_xprt *xprt; @@ -778,11 +779,14 @@ xprt_setup(int proto, struct sockaddr_in *ap, struct rpc_timeout *to) return xprt; } -/* - * Create an RPC client transport given the protocol and peer address. +/** + * xprt_create_proto - create an RPC client transport + * @proto: requested transport protocol + * @sap: remote peer's address + * @to: timeout parameters for new transport + * */ -struct rpc_xprt * -xprt_create_proto(int proto, struct sockaddr_in *sap, struct rpc_timeout *to) +struct rpc_xprt *xprt_create_proto(int proto, struct sockaddr_in *sap, struct rpc_timeout *to) { struct rpc_xprt *xprt; @@ -794,11 +798,7 @@ xprt_create_proto(int proto, struct sockaddr_in *sap, struct rpc_timeout *to) return xprt; } -/* - * Prepare for transport shutdown. - */ -static void -xprt_shutdown(struct rpc_xprt *xprt) +static void xprt_shutdown(struct rpc_xprt *xprt) { xprt->shutdown = 1; rpc_wake_up(&xprt->sending); @@ -809,21 +809,18 @@ xprt_shutdown(struct rpc_xprt *xprt) del_timer_sync(&xprt->timer); } -/* - * Clear the xprt backlog queue - */ -static int -xprt_clear_backlog(struct rpc_xprt *xprt) { +static int xprt_clear_backlog(struct rpc_xprt *xprt) { rpc_wake_up_next(&xprt->backlog); wake_up(&xprt->cong_wait); return 1; } -/* - * Destroy an RPC transport, killing off all requests. +/** + * xprt_destroy - destroy an RPC transport, killing off all requests. + * @xprt: transport to destroy + * */ -int -xprt_destroy(struct rpc_xprt *xprt) +int xprt_destroy(struct rpc_xprt *xprt) { dprintk("RPC: destroying transport %p\n", xprt); xprt_shutdown(xprt); diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index fa1180ac482..80222de3afa 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -33,23 +33,21 @@ #include #include +/* + * Maximum port number to use when requesting a reserved port. + */ +#define XS_MAX_RESVPORT (800U) + #ifdef RPC_DEBUG # undef RPC_DEBUG_DATA -# define RPCDBG_FACILITY RPCDBG_XPRT +# define RPCDBG_FACILITY RPCDBG_TRANS #endif -#define XPRT_MAX_RESVPORT (800) - #ifdef RPC_DEBUG_DATA -/* - * Print the buffer contents (first 128 bytes only--just enough for - * diropres return). - */ -static void -xprt_pktdump(char *msg, u32 *packet, unsigned int count) +static void xs_pktdump(char *msg, u32 *packet, unsigned int count) { - u8 *buf = (u8 *) packet; - int j; + u8 *buf = (u8 *) packet; + int j; dprintk("RPC: %s\n", msg); for (j = 0; j < count && j < 128; j += 4) { @@ -64,25 +62,22 @@ xprt_pktdump(char *msg, u32 *packet, unsigned int count) dprintk("\n"); } #else -static inline void -xprt_pktdump(char *msg, u32 *packet, unsigned int count) +static inline void xs_pktdump(char *msg, u32 *packet, unsigned int count) { /* NOP */ } #endif -/* - * Look up RPC transport given an INET socket +/** + * xs_sendpages - write pages directly to a socket + * @sock: socket to send on + * @addr: UDP only -- address of destination + * @addrlen: UDP only -- length of destination address + * @xdr: buffer containing this request + * @base: starting position in the buffer + * */ -static inline struct rpc_xprt * -xprt_from_sock(struct sock *sk) -{ - return (struct rpc_xprt *) sk->sk_user_data; -} - -static int -xdr_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, - struct xdr_buf *xdr, unsigned int base, int msgflags) +static int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, struct xdr_buf *xdr, unsigned int base, int msgflags) { struct page **ppage = xdr->pages; unsigned int len, pglen = xdr->page_len; @@ -125,7 +120,7 @@ xdr_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, } if (base || xdr->page_base) { pglen -= base; - base += xdr->page_base; + base += xdr->page_base; ppage += base >> PAGE_CACHE_SHIFT; base &= ~PAGE_CACHE_MASK; } @@ -176,23 +171,25 @@ out: return ret; } -/* - * Write data to socket. +/** + * xs_sendmsg - write an RPC request to a socket + * @xprt: generic transport + * @req: the RPC request to write + * */ -static inline int -xprt_sendmsg(struct rpc_xprt *xprt, struct rpc_rqst *req) +static int xs_sendmsg(struct rpc_xprt *xprt, struct rpc_rqst *req) { - struct socket *sock = xprt->sock; - struct xdr_buf *xdr = &req->rq_snd_buf; + struct socket *sock = xprt->sock; + struct xdr_buf *xdr = &req->rq_snd_buf; struct sockaddr *addr = NULL; int addrlen = 0; - unsigned int skip; - int result; + unsigned int skip; + int result; if (!sock) return -ENOTCONN; - xprt_pktdump("packet data:", + xs_pktdump("packet data:", req->rq_svec->iov_base, req->rq_svec->iov_len); @@ -201,13 +198,13 @@ xprt_sendmsg(struct rpc_xprt *xprt, struct rpc_rqst *req) addr = (struct sockaddr *) &xprt->addr; addrlen = sizeof(xprt->addr); } - /* Dont repeat bytes */ + /* Don't repeat bytes */ skip = req->rq_bytes_sent; clear_bit(SOCK_ASYNC_NOSPACE, &sock->flags); - result = xdr_sendpages(sock, addr, addrlen, xdr, skip, MSG_DONTWAIT); + result = xs_sendpages(sock, addr, addrlen, xdr, skip, MSG_DONTWAIT); - dprintk("RPC: xprt_sendmsg(%d) = %d\n", xdr->len - skip, result); + dprintk("RPC: xs_sendmsg(%d) = %d\n", xdr->len - skip, result); if (result >= 0) return result; @@ -215,8 +212,7 @@ xprt_sendmsg(struct rpc_xprt *xprt, struct rpc_rqst *req) switch (result) { case -ECONNREFUSED: /* When the server has died, an ICMP port unreachable message - * prompts ECONNREFUSED. - */ + * prompts ECONNREFUSED. */ case -EAGAIN: break; case -ECONNRESET: @@ -227,13 +223,25 @@ xprt_sendmsg(struct rpc_xprt *xprt, struct rpc_rqst *req) result = -ENOTCONN; break; default: - printk(KERN_NOTICE "RPC: sendmsg returned error %d\n", -result); + break; } return result; } -static int -xprt_send_request(struct rpc_task *task) +/** + * xs_send_request - write an RPC request to a socket + * @task: address of RPC task that manages the state of an RPC request + * + * Return values: + * 0: The request has been sent + * EAGAIN: The socket was blocked, please call again later to + * complete the request + * other: Some other error occured, the request was not sent + * + * XXX: In the case of soft timeouts, should we eventually give up + * if the socket is not able to make progress? + */ +static int xs_send_request(struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; @@ -242,18 +250,18 @@ xprt_send_request(struct rpc_task *task) /* set up everything as needed. */ /* Write the record marker */ if (xprt->stream) { - u32 *marker = req->rq_svec[0].iov_base; + u32 *marker = req->rq_svec[0].iov_base; *marker = htonl(0x80000000|(req->rq_slen-sizeof(*marker))); } /* Continue transmitting the packet/record. We must be careful * to cope with writespace callbacks arriving _after_ we have - * called xprt_sendmsg(). + * called sendmsg(). */ while (1) { req->rq_xtime = jiffies; - status = xprt_sendmsg(xprt, req); + status = xs_sendmsg(xprt, req); if (status < 0) break; @@ -285,7 +293,7 @@ xprt_send_request(struct rpc_task *task) if (status == -EAGAIN) { if (test_bit(SOCK_ASYNC_NOSPACE, &xprt->sock->flags)) { - /* Protect against races with xprt_write_space */ + /* Protect against races with xs_write_space */ spin_lock_bh(&xprt->sock_lock); /* Don't race with disconnect */ if (!xprt_connected(xprt)) @@ -303,65 +311,77 @@ xprt_send_request(struct rpc_task *task) return status; } -/* - * Close down a transport socket +/** + * xs_close - close a socket + * @xprt: transport + * */ -static void -xprt_close(struct rpc_xprt *xprt) +static void xs_close(struct rpc_xprt *xprt) { - struct socket *sock = xprt->sock; - struct sock *sk = xprt->inet; + struct socket *sock = xprt->sock; + struct sock *sk = xprt->inet; if (!sk) return; + dprintk("RPC: xs_close xprt %p\n", xprt); + write_lock_bh(&sk->sk_callback_lock); xprt->inet = NULL; xprt->sock = NULL; - sk->sk_user_data = NULL; - sk->sk_data_ready = xprt->old_data_ready; + sk->sk_user_data = NULL; + sk->sk_data_ready = xprt->old_data_ready; sk->sk_state_change = xprt->old_state_change; - sk->sk_write_space = xprt->old_write_space; + sk->sk_write_space = xprt->old_write_space; write_unlock_bh(&sk->sk_callback_lock); - sk->sk_no_check = 0; + sk->sk_no_check = 0; sock_release(sock); } -static void xprt_socket_destroy(struct rpc_xprt *xprt) +/** + * xs_destroy - prepare to shutdown a transport + * @xprt: doomed transport + * + */ +static void xs_destroy(struct rpc_xprt *xprt) { + dprintk("RPC: xs_destroy xprt %p\n", xprt); + cancel_delayed_work(&xprt->sock_connect); flush_scheduled_work(); xprt_disconnect(xprt); - xprt_close(xprt); + xs_close(xprt); kfree(xprt->slot); } -/* - * Input handler for RPC replies. Called from a bottom half and hence - * atomic. +static inline struct rpc_xprt *xprt_from_sock(struct sock *sk) +{ + return (struct rpc_xprt *) sk->sk_user_data; +} + +/** + * xs_udp_data_ready - "data ready" callback for UDP sockets + * @sk: socket with data to read + * @len: how much data to read + * */ -static void -udp_data_ready(struct sock *sk, int len) +static void xs_udp_data_ready(struct sock *sk, int len) { - struct rpc_task *task; - struct rpc_xprt *xprt; + struct rpc_task *task; + struct rpc_xprt *xprt; struct rpc_rqst *rovr; - struct sk_buff *skb; + struct sk_buff *skb; int err, repsize, copied; u32 _xid, *xp; read_lock(&sk->sk_callback_lock); - dprintk("RPC: udp_data_ready...\n"); - if (!(xprt = xprt_from_sock(sk))) { - printk("RPC: udp_data_ready request not found!\n"); + dprintk("RPC: xs_udp_data_ready...\n"); + if (!(xprt = xprt_from_sock(sk))) goto out; - } - - dprintk("RPC: udp_data_ready client %p\n", xprt); if ((skb = skb_recv_datagram(sk, 0, 1, &err)) == NULL) goto out; @@ -371,7 +391,7 @@ udp_data_ready(struct sock *sk, int len) repsize = skb->len - sizeof(struct udphdr); if (repsize < 4) { - printk("RPC: impossible RPC reply size %d!\n", repsize); + dprintk("RPC: impossible RPC reply size %d!\n", repsize); goto dropit; } @@ -410,11 +430,7 @@ udp_data_ready(struct sock *sk, int len) read_unlock(&sk->sk_callback_lock); } -/* - * Copy from an skb into memory and shrink the skb. - */ -static inline size_t -tcp_copy_data(skb_reader_t *desc, void *p, size_t len) +static inline size_t xs_tcp_copy_data(skb_reader_t *desc, void *p, size_t len) { if (len > desc->count) len = desc->count; @@ -430,18 +446,14 @@ tcp_copy_data(skb_reader_t *desc, void *p, size_t len) return len; } -/* - * TCP read fragment marker - */ -static inline void -tcp_read_fraghdr(struct rpc_xprt *xprt, skb_reader_t *desc) +static inline void xs_tcp_read_fraghdr(struct rpc_xprt *xprt, skb_reader_t *desc) { size_t len, used; char *p; p = ((char *) &xprt->tcp_recm) + xprt->tcp_offset; len = sizeof(xprt->tcp_recm) - xprt->tcp_offset; - used = tcp_copy_data(desc, p, len); + used = xs_tcp_copy_data(desc, p, len); xprt->tcp_offset += used; if (used != len) return; @@ -455,15 +467,15 @@ tcp_read_fraghdr(struct rpc_xprt *xprt, skb_reader_t *desc) xprt->tcp_offset = 0; /* Sanity check of the record length */ if (xprt->tcp_reclen < 4) { - printk(KERN_ERR "RPC: Invalid TCP record fragment length\n"); + dprintk("RPC: invalid TCP record fragment length\n"); xprt_disconnect(xprt); + return; } dprintk("RPC: reading TCP record fragment of length %d\n", xprt->tcp_reclen); } -static void -tcp_check_recm(struct rpc_xprt *xprt) +static void xs_tcp_check_recm(struct rpc_xprt *xprt) { dprintk("RPC: xprt = %p, tcp_copied = %lu, tcp_offset = %u, tcp_reclen = %u, tcp_flags = %lx\n", xprt, xprt->tcp_copied, xprt->tcp_offset, xprt->tcp_reclen, xprt->tcp_flags); @@ -478,11 +490,7 @@ tcp_check_recm(struct rpc_xprt *xprt) } } -/* - * TCP read xid - */ -static inline void -tcp_read_xid(struct rpc_xprt *xprt, skb_reader_t *desc) +static inline void xs_tcp_read_xid(struct rpc_xprt *xprt, skb_reader_t *desc) { size_t len, used; char *p; @@ -490,7 +498,7 @@ tcp_read_xid(struct rpc_xprt *xprt, skb_reader_t *desc) len = sizeof(xprt->tcp_xid) - xprt->tcp_offset; dprintk("RPC: reading XID (%Zu bytes)\n", len); p = ((char *) &xprt->tcp_xid) + xprt->tcp_offset; - used = tcp_copy_data(desc, p, len); + used = xs_tcp_copy_data(desc, p, len); xprt->tcp_offset += used; if (used != len) return; @@ -499,14 +507,10 @@ tcp_read_xid(struct rpc_xprt *xprt, skb_reader_t *desc) xprt->tcp_copied = 4; dprintk("RPC: reading reply for XID %08x\n", ntohl(xprt->tcp_xid)); - tcp_check_recm(xprt); + xs_tcp_check_recm(xprt); } -/* - * TCP read and complete request - */ -static inline void -tcp_read_request(struct rpc_xprt *xprt, skb_reader_t *desc) +static inline void xs_tcp_read_request(struct rpc_xprt *xprt, skb_reader_t *desc) { struct rpc_rqst *req; struct xdr_buf *rcvbuf; @@ -533,12 +537,12 @@ tcp_read_request(struct rpc_xprt *xprt, skb_reader_t *desc) memcpy(&my_desc, desc, sizeof(my_desc)); my_desc.count = len; r = xdr_partial_copy_from_skb(rcvbuf, xprt->tcp_copied, - &my_desc, tcp_copy_data); + &my_desc, xs_tcp_copy_data); desc->count -= r; desc->offset += r; } else r = xdr_partial_copy_from_skb(rcvbuf, xprt->tcp_copied, - desc, tcp_copy_data); + desc, xs_tcp_copy_data); if (r > 0) { xprt->tcp_copied += r; @@ -581,14 +585,10 @@ out: xprt_complete_rqst(xprt, req, xprt->tcp_copied); } spin_unlock(&xprt->sock_lock); - tcp_check_recm(xprt); + xs_tcp_check_recm(xprt); } -/* - * TCP discard extra bytes from a short read - */ -static inline void -tcp_read_discard(struct rpc_xprt *xprt, skb_reader_t *desc) +static inline void xs_tcp_read_discard(struct rpc_xprt *xprt, skb_reader_t *desc) { size_t len; @@ -599,16 +599,10 @@ tcp_read_discard(struct rpc_xprt *xprt, skb_reader_t *desc) desc->offset += len; xprt->tcp_offset += len; dprintk("RPC: discarded %Zu bytes\n", len); - tcp_check_recm(xprt); + xs_tcp_check_recm(xprt); } -/* - * TCP record receive routine - * We first have to grab the record marker, then the XID, then the data. - */ -static int -tcp_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, - unsigned int offset, size_t len) +static int xs_tcp_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, unsigned int offset, size_t len) { struct rpc_xprt *xprt = rd_desc->arg.data; skb_reader_t desc = { @@ -616,64 +610,72 @@ tcp_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, .offset = offset, .count = len, .csum = 0 - }; + }; - dprintk("RPC: tcp_data_recv\n"); + dprintk("RPC: xs_tcp_data_recv started\n"); do { /* Read in a new fragment marker if necessary */ /* Can we ever really expect to get completely empty fragments? */ if (xprt->tcp_flags & XPRT_COPY_RECM) { - tcp_read_fraghdr(xprt, &desc); + xs_tcp_read_fraghdr(xprt, &desc); continue; } /* Read in the xid if necessary */ if (xprt->tcp_flags & XPRT_COPY_XID) { - tcp_read_xid(xprt, &desc); + xs_tcp_read_xid(xprt, &desc); continue; } /* Read in the request data */ if (xprt->tcp_flags & XPRT_COPY_DATA) { - tcp_read_request(xprt, &desc); + xs_tcp_read_request(xprt, &desc); continue; } /* Skip over any trailing bytes on short reads */ - tcp_read_discard(xprt, &desc); + xs_tcp_read_discard(xprt, &desc); } while (desc.count); - dprintk("RPC: tcp_data_recv done\n"); + dprintk("RPC: xs_tcp_data_recv done\n"); return len - desc.count; } -static void tcp_data_ready(struct sock *sk, int bytes) +/** + * xs_tcp_data_ready - "data ready" callback for TCP sockets + * @sk: socket with data to read + * @bytes: how much data to read + * + */ +static void xs_tcp_data_ready(struct sock *sk, int bytes) { struct rpc_xprt *xprt; read_descriptor_t rd_desc; read_lock(&sk->sk_callback_lock); - dprintk("RPC: tcp_data_ready...\n"); - if (!(xprt = xprt_from_sock(sk))) { - printk("RPC: tcp_data_ready socket info not found!\n"); + dprintk("RPC: xs_tcp_data_ready...\n"); + if (!(xprt = xprt_from_sock(sk))) goto out; - } if (xprt->shutdown) goto out; - /* We use rd_desc to pass struct xprt to tcp_data_recv */ + /* We use rd_desc to pass struct xprt to xs_tcp_data_recv */ rd_desc.arg.data = xprt; rd_desc.count = 65536; - tcp_read_sock(sk, &rd_desc, tcp_data_recv); + tcp_read_sock(sk, &rd_desc, xs_tcp_data_recv); out: read_unlock(&sk->sk_callback_lock); } -static void -tcp_state_change(struct sock *sk) +/** + * xs_tcp_state_change - callback to handle TCP socket state changes + * @sk: socket whose state has changed + * + */ +static void xs_tcp_state_change(struct sock *sk) { - struct rpc_xprt *xprt; + struct rpc_xprt *xprt; read_lock(&sk->sk_callback_lock); if (!(xprt = xprt_from_sock(sk))) goto out; - dprintk("RPC: tcp_state_change client %p...\n", xprt); + dprintk("RPC: xs_tcp_state_change client %p...\n", xprt); dprintk("RPC: state %x conn %d dead %d zapped %d\n", sk->sk_state, xprt_connected(xprt), sock_flag(sk, SOCK_DEAD), @@ -703,17 +705,20 @@ tcp_state_change(struct sock *sk) read_unlock(&sk->sk_callback_lock); } -/* +/** + * xs_write_space - callback invoked when socket buffer space becomes + * available + * @sk: socket whose state has changed + * * Called when more output buffer space is available for this socket. * We try not to wake our writers until they can make "significant" * progress, otherwise we'll waste resources thrashing sock_sendmsg * with a bunch of small requests. */ -static void -xprt_write_space(struct sock *sk) +static void xs_write_space(struct sock *sk) { - struct rpc_xprt *xprt; - struct socket *sock; + struct rpc_xprt *xprt; + struct socket *sock; read_lock(&sk->sk_callback_lock); if (!(xprt = xprt_from_sock(sk)) || !(sock = sk->sk_socket)) @@ -743,11 +748,15 @@ out: read_unlock(&sk->sk_callback_lock); } -/* - * Set socket buffer length +/** + * xs_set_buffer_size - set send and receive limits + * @xprt: generic transport + * + * Set socket send and receive limits based on the + * sndsize and rcvsize fields in the generic transport + * structure. This applies only to UDP sockets. */ -static void -xprt_sock_setbufsize(struct rpc_xprt *xprt) +static void xs_set_buffer_size(struct rpc_xprt *xprt) { struct sock *sk = xprt->inet; @@ -764,15 +773,12 @@ xprt_sock_setbufsize(struct rpc_xprt *xprt) } } -/* - * Bind to a reserved port - */ -static inline int xprt_bindresvport(struct rpc_xprt *xprt, struct socket *sock) +static int xs_bindresvport(struct rpc_xprt *xprt, struct socket *sock) { struct sockaddr_in myaddr = { .sin_family = AF_INET, }; - int err, port; + int err, port; /* Were we already bound to a given port? Try to reuse it */ port = xprt->port; @@ -782,20 +788,47 @@ static inline int xprt_bindresvport(struct rpc_xprt *xprt, struct socket *sock) sizeof(myaddr)); if (err == 0) { xprt->port = port; + dprintk("RPC: xs_bindresvport bound to port %u\n", + port); return 0; } if (--port == 0) - port = XPRT_MAX_RESVPORT; + port = XS_MAX_RESVPORT; } while (err == -EADDRINUSE && port != xprt->port); - printk("RPC: Can't bind to reserved port (%d).\n", -err); + dprintk("RPC: can't bind to reserved port (%d).\n", -err); return err; } -static void -xprt_bind_socket(struct rpc_xprt *xprt, struct socket *sock) +static struct socket *xs_create(struct rpc_xprt *xprt, int proto, int resvport) { - struct sock *sk = sock->sk; + struct socket *sock; + int type, err; + + dprintk("RPC: xs_create(%s %d)\n", + (proto == IPPROTO_UDP)? "udp" : "tcp", proto); + + type = (proto == IPPROTO_UDP)? SOCK_DGRAM : SOCK_STREAM; + + if ((err = sock_create_kern(PF_INET, type, proto, &sock)) < 0) { + dprintk("RPC: can't create socket (%d).\n", -err); + return NULL; + } + + /* If the caller has the capability, bind to a reserved port */ + if (resvport && xs_bindresvport(xprt, sock) < 0) + goto failed; + + return sock; + +failed: + sock_release(sock); + return NULL; +} + +static void xs_bind(struct rpc_xprt *xprt, struct socket *sock) +{ + struct sock *sk = sock->sk; if (xprt->inet) return; @@ -806,16 +839,16 @@ xprt_bind_socket(struct rpc_xprt *xprt, struct socket *sock) xprt->old_state_change = sk->sk_state_change; xprt->old_write_space = sk->sk_write_space; if (xprt->prot == IPPROTO_UDP) { - sk->sk_data_ready = udp_data_ready; + sk->sk_data_ready = xs_udp_data_ready; sk->sk_no_check = UDP_CSUM_NORCV; xprt_set_connected(xprt); } else { tcp_sk(sk)->nonagle = 1; /* disable Nagle's algorithm */ - sk->sk_data_ready = tcp_data_ready; - sk->sk_state_change = tcp_state_change; + sk->sk_data_ready = xs_tcp_data_ready; + sk->sk_state_change = xs_tcp_state_change; xprt_clear_connected(xprt); } - sk->sk_write_space = xprt_write_space; + sk->sk_write_space = xs_write_space; /* Reset to new socket */ xprt->sock = sock; @@ -825,39 +858,13 @@ xprt_bind_socket(struct rpc_xprt *xprt, struct socket *sock) return; } -/* - * Datastream sockets are created here, but xprt_connect will create - * and connect stream sockets. +/** + * xs_connect_worker - try to connect a socket to a remote endpoint + * @args: RPC transport to connect + * + * Invoked by a work queue tasklet. */ -static struct socket * xprt_create_socket(struct rpc_xprt *xprt, int proto, int resvport) -{ - struct socket *sock; - int type, err; - - dprintk("RPC: xprt_create_socket(%s %d)\n", - (proto == IPPROTO_UDP)? "udp" : "tcp", proto); - - type = (proto == IPPROTO_UDP)? SOCK_DGRAM : SOCK_STREAM; - - if ((err = sock_create_kern(PF_INET, type, proto, &sock)) < 0) { - printk("RPC: can't create socket (%d).\n", -err); - return NULL; - } - - /* If the caller has the capability, bind to a reserved port */ - if (resvport && xprt_bindresvport(xprt, sock) < 0) { - printk("RPC: can't bind to reserved port.\n"); - goto failed; - } - - return sock; - -failed: - sock_release(sock); - return NULL; -} - -static void xprt_socket_connect(void *args) +static void xs_connect_worker(void *args) { struct rpc_xprt *xprt = (struct rpc_xprt *)args; struct socket *sock = xprt->sock; @@ -866,18 +873,20 @@ static void xprt_socket_connect(void *args) if (xprt->shutdown || xprt->addr.sin_port == 0) goto out; + dprintk("RPC: xs_connect_worker xprt %p\n", xprt); + /* * Start by resetting any existing state */ - xprt_close(xprt); - sock = xprt_create_socket(xprt, xprt->prot, xprt->resvport); + xs_close(xprt); + sock = xs_create(xprt, xprt->prot, xprt->resvport); if (sock == NULL) { /* couldn't create socket or bind to reserved port; * this is likely a permanent error, so cause an abort */ goto out; } - xprt_bind_socket(xprt, sock); - xprt_sock_setbufsize(xprt); + xs_bind(xprt, sock); + xs_set_buffer_size(xprt); status = 0; if (!xprt->stream) @@ -908,20 +917,23 @@ out_clear: smp_mb__after_clear_bit(); } -static void -xprt_connect_sock(struct rpc_task *task) +/** + * xs_connect - connect a socket to a remote endpoint + * @task: address of RPC task that manages state of connect request + * + * TCP: If the remote end dropped the connection, delay reconnecting. + */ +static void xs_connect(struct rpc_task *task) { struct rpc_xprt *xprt = task->tk_xprt; if (!test_and_set_bit(XPRT_CONNECTING, &xprt->sockstate)) { - /* Note: if we are here due to a dropped connection - * we delay reconnecting by RPC_REESTABLISH_TIMEOUT/HZ - * seconds - */ - if (xprt->sock != NULL) + if (xprt->sock != NULL) { + dprintk("RPC: xs_connect delayed xprt %p\n", xprt); schedule_delayed_work(&xprt->sock_connect, RPC_REESTABLISH_TIMEOUT); - else { + } else { + dprintk("RPC: xs_connect scheduled xprt %p\n", xprt); schedule_work(&xprt->sock_connect); /* flush_scheduled_work can sleep... */ if (!RPC_IS_ASYNC(task)) @@ -930,29 +942,23 @@ xprt_connect_sock(struct rpc_task *task) } } -/* - * Set default timeout parameters - */ -static void -xprt_default_timeout(struct rpc_timeout *to, int proto) -{ - if (proto == IPPROTO_UDP) - xprt_set_timeout(to, 5, 5 * HZ); - else - xprt_set_timeout(to, 2, 60 * HZ); -} - -static struct rpc_xprt_ops xprt_socket_ops = { - .set_buffer_size = xprt_sock_setbufsize, - .connect = xprt_connect_sock, - .send_request = xprt_send_request, - .close = xprt_close, - .destroy = xprt_socket_destroy, +static struct rpc_xprt_ops xs_ops = { + .set_buffer_size = xs_set_buffer_size, + .connect = xs_connect, + .send_request = xs_send_request, + .close = xs_close, + .destroy = xs_destroy, }; extern unsigned int xprt_udp_slot_table_entries; extern unsigned int xprt_tcp_slot_table_entries; +/** + * xs_setup_udp - Set up transport to use a UDP socket + * @xprt: transport to set up + * @to: timeout parameters + * + */ int xs_setup_udp(struct rpc_xprt *xprt, struct rpc_timeout *to) { size_t slot_table_size; @@ -967,7 +973,7 @@ int xs_setup_udp(struct rpc_xprt *xprt, struct rpc_timeout *to) memset(xprt->slot, 0, slot_table_size); xprt->prot = IPPROTO_UDP; - xprt->port = XPRT_MAX_RESVPORT; + xprt->port = XS_MAX_RESVPORT; xprt->stream = 0; xprt->nocong = 0; xprt->cwnd = RPC_INITCWND; @@ -975,18 +981,24 @@ int xs_setup_udp(struct rpc_xprt *xprt, struct rpc_timeout *to) /* XXX: header size can vary due to auth type, IPv6, etc. */ xprt->max_payload = (1U << 16) - (MAX_HEADER << 3); - INIT_WORK(&xprt->sock_connect, xprt_socket_connect, xprt); + INIT_WORK(&xprt->sock_connect, xs_connect_worker, xprt); - xprt->ops = &xprt_socket_ops; + xprt->ops = &xs_ops; if (to) xprt->timeout = *to; else - xprt_default_timeout(to, xprt->prot); + xprt_set_timeout(&xprt->timeout, 5, 5 * HZ); return 0; } +/** + * xs_setup_tcp - Set up transport to use a TCP socket + * @xprt: transport to set up + * @to: timeout parameters + * + */ int xs_setup_tcp(struct rpc_xprt *xprt, struct rpc_timeout *to) { size_t slot_table_size; @@ -1001,21 +1013,21 @@ int xs_setup_tcp(struct rpc_xprt *xprt, struct rpc_timeout *to) memset(xprt->slot, 0, slot_table_size); xprt->prot = IPPROTO_TCP; - xprt->port = XPRT_MAX_RESVPORT; + xprt->port = XS_MAX_RESVPORT; xprt->stream = 1; xprt->nocong = 1; xprt->cwnd = RPC_MAXCWND(xprt); xprt->resvport = capable(CAP_NET_BIND_SERVICE) ? 1 : 0; xprt->max_payload = (1U << 31) - 1; - INIT_WORK(&xprt->sock_connect, xprt_socket_connect, xprt); + INIT_WORK(&xprt->sock_connect, xs_connect_worker, xprt); - xprt->ops = &xprt_socket_ops; + xprt->ops = &xs_ops; if (to) xprt->timeout = *to; else - xprt_default_timeout(to, xprt->prot); + xprt_set_timeout(&xprt->timeout, 2, 60 * HZ); return 0; } -- cgit v1.2.3 From b4b5cc85ed4ecbe4adbfbc4df028850de67a9f09 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 11 Aug 2005 16:25:29 -0400 Subject: [PATCH] RPC: Reduce stack utilization in xs_sendpages Reduce stack utilization of the RPC socket transport's send path. A couple of unlikely()s are added to ensure the compiler places the tail processing at the end of the csect. Test-plan: Millions of fsx operations. Performance characterization such as "sio" or "iozone". Version: Thu, 11 Aug 2005 16:04:30 -0400 Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/xprtsock.c | 73 ++++++++++++++++++++++++++++++--------------------- 1 file changed, 43 insertions(+), 30 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 80222de3afa..a5a04203a6b 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -68,6 +68,41 @@ static inline void xs_pktdump(char *msg, u32 *packet, unsigned int count) } #endif +#define XS_SENDMSG_FLAGS (MSG_DONTWAIT | MSG_NOSIGNAL) + +static inline int xs_send_head(struct socket *sock, struct sockaddr *addr, int addrlen, struct xdr_buf *xdr, unsigned int base, unsigned int len) +{ + struct kvec iov = { + .iov_base = xdr->head[0].iov_base + base, + .iov_len = len - base, + }; + struct msghdr msg = { + .msg_name = addr, + .msg_namelen = addrlen, + .msg_flags = XS_SENDMSG_FLAGS, + }; + + if (xdr->len > len) + msg.msg_flags |= MSG_MORE; + + if (likely(iov.iov_len)) + return kernel_sendmsg(sock, &msg, &iov, 1, iov.iov_len); + return kernel_sendmsg(sock, &msg, NULL, 0, 0); +} + +static int xs_send_tail(struct socket *sock, struct xdr_buf *xdr, unsigned int base, unsigned int len) +{ + struct kvec iov = { + .iov_base = xdr->tail[0].iov_base + base, + .iov_len = len - base, + }; + struct msghdr msg = { + .msg_flags = XS_SENDMSG_FLAGS, + }; + + return kernel_sendmsg(sock, &msg, &iov, 1, iov.iov_len); +} + /** * xs_sendpages - write pages directly to a socket * @sock: socket to send on @@ -77,7 +112,7 @@ static inline void xs_pktdump(char *msg, u32 *packet, unsigned int count) * @base: starting position in the buffer * */ -static int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, struct xdr_buf *xdr, unsigned int base, int msgflags) +static int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, struct xdr_buf *xdr, unsigned int base) { struct page **ppage = xdr->pages; unsigned int len, pglen = xdr->page_len; @@ -86,35 +121,20 @@ static int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, len = xdr->head[0].iov_len; if (base < len || (addr != NULL && base == 0)) { - struct kvec iov = { - .iov_base = xdr->head[0].iov_base + base, - .iov_len = len - base, - }; - struct msghdr msg = { - .msg_name = addr, - .msg_namelen = addrlen, - .msg_flags = msgflags, - }; - if (xdr->len > len) - msg.msg_flags |= MSG_MORE; - - if (iov.iov_len != 0) - err = kernel_sendmsg(sock, &msg, &iov, 1, iov.iov_len); - else - err = kernel_sendmsg(sock, &msg, NULL, 0, 0); + err = xs_send_head(sock, addr, addrlen, xdr, base, len); if (ret == 0) ret = err; else if (err > 0) ret += err; - if (err != iov.iov_len) + if (err != (len - base)) goto out; base = 0; } else base -= len; - if (pglen == 0) + if (unlikely(pglen == 0)) goto copy_tail; - if (base >= pglen) { + if (unlikely(base >= pglen)) { base -= pglen; goto copy_tail; } @@ -127,7 +147,7 @@ static int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, sendpage = sock->ops->sendpage ? : sock_no_sendpage; do { - int flags = msgflags; + int flags = XS_SENDMSG_FLAGS; len = PAGE_CACHE_SIZE; if (base) @@ -154,14 +174,7 @@ static int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, copy_tail: len = xdr->tail[0].iov_len; if (base < len) { - struct kvec iov = { - .iov_base = xdr->tail[0].iov_base + base, - .iov_len = len - base, - }; - struct msghdr msg = { - .msg_flags = msgflags, - }; - err = kernel_sendmsg(sock, &msg, &iov, 1, iov.iov_len); + err = xs_send_tail(sock, xdr, base, len); if (ret == 0) ret = err; else if (err > 0) @@ -202,7 +215,7 @@ static int xs_sendmsg(struct rpc_xprt *xprt, struct rpc_rqst *req) skip = req->rq_bytes_sent; clear_bit(SOCK_ASYNC_NOSPACE, &sock->flags); - result = xs_sendpages(sock, addr, addrlen, xdr, skip, MSG_DONTWAIT); + result = xs_sendpages(sock, addr, addrlen, xdr, skip); dprintk("RPC: xs_sendmsg(%d) = %d\n", xdr->len - skip, result); -- cgit v1.2.3 From 4a0f8c04f2ece949d54a0c4fd7490259cf23a58a Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 11 Aug 2005 16:25:32 -0400 Subject: [PATCH] RPC: Rename sock_lock Clean-up: replace a name reference to sockets in the generic parts of the RPC client by renaming sock_lock in the rpc_xprt structure. Test-plan: Compile kernel with CONFIG_NFS enabled. Version: Thu, 11 Aug 2005 16:05:00 -0400 Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/xprt.c | 44 ++++++++++++++++++++++---------------------- net/sunrpc/xprtsock.c | 22 +++++++++++----------- 2 files changed, 33 insertions(+), 33 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 589195e630e..1f0da8c1a3b 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -106,9 +106,9 @@ xprt_lock_write(struct rpc_xprt *xprt, struct rpc_task *task) { int retval; - spin_lock_bh(&xprt->sock_lock); + spin_lock_bh(&xprt->transport_lock); retval = __xprt_lock_write(xprt, task); - spin_unlock_bh(&xprt->sock_lock); + spin_unlock_bh(&xprt->transport_lock); return retval; } @@ -161,9 +161,9 @@ __xprt_release_write(struct rpc_xprt *xprt, struct rpc_task *task) static inline void xprt_release_write(struct rpc_xprt *xprt, struct rpc_task *task) { - spin_lock_bh(&xprt->sock_lock); + spin_lock_bh(&xprt->transport_lock); __xprt_release_write(xprt, task); - spin_unlock_bh(&xprt->sock_lock); + spin_unlock_bh(&xprt->transport_lock); } /* @@ -266,9 +266,9 @@ int xprt_adjust_timeout(struct rpc_rqst *req) req->rq_retries = 0; xprt_reset_majortimeo(req); /* Reset the RTT counters == "slow start" */ - spin_lock_bh(&xprt->sock_lock); + spin_lock_bh(&xprt->transport_lock); rpc_init_rtt(req->rq_task->tk_client->cl_rtt, to->to_initval); - spin_unlock_bh(&xprt->sock_lock); + spin_unlock_bh(&xprt->transport_lock); pprintk("RPC: %lu timeout\n", jiffies); status = -ETIMEDOUT; } @@ -298,10 +298,10 @@ xprt_socket_autoclose(void *args) void xprt_disconnect(struct rpc_xprt *xprt) { dprintk("RPC: disconnected transport %p\n", xprt); - spin_lock_bh(&xprt->sock_lock); + spin_lock_bh(&xprt->transport_lock); xprt_clear_connected(xprt); rpc_wake_up_status(&xprt->pending, -ENOTCONN); - spin_unlock_bh(&xprt->sock_lock); + spin_unlock_bh(&xprt->transport_lock); } static void @@ -309,12 +309,12 @@ xprt_init_autodisconnect(unsigned long data) { struct rpc_xprt *xprt = (struct rpc_xprt *)data; - spin_lock(&xprt->sock_lock); + spin_lock(&xprt->transport_lock); if (!list_empty(&xprt->recv) || xprt->shutdown) goto out_abort; if (test_and_set_bit(XPRT_LOCKED, &xprt->sockstate)) goto out_abort; - spin_unlock(&xprt->sock_lock); + spin_unlock(&xprt->transport_lock); /* Let keventd close the socket */ if (test_bit(XPRT_CONNECTING, &xprt->sockstate) != 0) xprt_release_write(xprt, NULL); @@ -322,7 +322,7 @@ xprt_init_autodisconnect(unsigned long data) schedule_work(&xprt->task_cleanup); return; out_abort: - spin_unlock(&xprt->sock_lock); + spin_unlock(&xprt->transport_lock); } /** @@ -482,7 +482,7 @@ xprt_timer(struct rpc_task *task) struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; - spin_lock(&xprt->sock_lock); + spin_lock(&xprt->transport_lock); if (req->rq_received) goto out; @@ -496,7 +496,7 @@ xprt_timer(struct rpc_task *task) out: task->tk_timeout = 0; rpc_wake_up_task(task); - spin_unlock(&xprt->sock_lock); + spin_unlock(&xprt->transport_lock); } /** @@ -515,7 +515,7 @@ int xprt_prepare_transmit(struct rpc_task *task) if (xprt->shutdown) return -EIO; - spin_lock_bh(&xprt->sock_lock); + spin_lock_bh(&xprt->transport_lock); if (req->rq_received && !req->rq_bytes_sent) { err = req->rq_received; goto out_unlock; @@ -530,7 +530,7 @@ int xprt_prepare_transmit(struct rpc_task *task) goto out_unlock; } out_unlock: - spin_unlock_bh(&xprt->sock_lock); + spin_unlock_bh(&xprt->transport_lock); return err; } @@ -552,13 +552,13 @@ void xprt_transmit(struct rpc_task *task) smp_rmb(); if (!req->rq_received) { if (list_empty(&req->rq_list)) { - spin_lock_bh(&xprt->sock_lock); + spin_lock_bh(&xprt->transport_lock); /* Update the softirq receive buffer */ memcpy(&req->rq_private_buf, &req->rq_rcv_buf, sizeof(req->rq_private_buf)); /* Add request to the receive list */ list_add_tail(&req->rq_list, &xprt->recv); - spin_unlock_bh(&xprt->sock_lock); + spin_unlock_bh(&xprt->transport_lock); xprt_reset_majortimeo(req); /* Turn off autodisconnect */ del_singleshot_timer_sync(&xprt->timer); @@ -592,7 +592,7 @@ void xprt_transmit(struct rpc_task *task) out_receive: dprintk("RPC: %4d xmit complete\n", task->tk_pid); /* Set the task's receive timeout value */ - spin_lock_bh(&xprt->sock_lock); + spin_lock_bh(&xprt->transport_lock); if (!xprt->nocong) { int timer = task->tk_msg.rpc_proc->p_timer; task->tk_timeout = rpc_calc_rto(clnt->cl_rtt, timer); @@ -607,7 +607,7 @@ void xprt_transmit(struct rpc_task *task) else if (!req->rq_received) rpc_sleep_on(&xprt->pending, task, NULL, xprt_timer); __xprt_release_write(xprt, task); - spin_unlock_bh(&xprt->sock_lock); + spin_unlock_bh(&xprt->transport_lock); } static inline void do_xprt_reserve(struct rpc_task *task) @@ -683,7 +683,7 @@ void xprt_release(struct rpc_task *task) if (!(req = task->tk_rqstp)) return; - spin_lock_bh(&xprt->sock_lock); + spin_lock_bh(&xprt->transport_lock); __xprt_release_write(xprt, task); __xprt_put_cong(xprt, req); if (!list_empty(&req->rq_list)) @@ -692,7 +692,7 @@ void xprt_release(struct rpc_task *task) if (list_empty(&xprt->recv) && !xprt->shutdown) mod_timer(&xprt->timer, xprt->last_used + RPC_IDLE_DISCONNECT_TIMEOUT); - spin_unlock_bh(&xprt->sock_lock); + spin_unlock_bh(&xprt->transport_lock); task->tk_rqstp = NULL; memset(req, 0, sizeof(*req)); /* mark unused */ @@ -750,7 +750,7 @@ static struct rpc_xprt *xprt_setup(int proto, struct sockaddr_in *ap, struct rpc return ERR_PTR(result); } - spin_lock_init(&xprt->sock_lock); + spin_lock_init(&xprt->transport_lock); spin_lock_init(&xprt->xprt_lock); init_waitqueue_head(&xprt->cong_wait); diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index a5a04203a6b..bc90caab608 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -307,7 +307,7 @@ static int xs_send_request(struct rpc_task *task) if (status == -EAGAIN) { if (test_bit(SOCK_ASYNC_NOSPACE, &xprt->sock->flags)) { /* Protect against races with xs_write_space */ - spin_lock_bh(&xprt->sock_lock); + spin_lock_bh(&xprt->transport_lock); /* Don't race with disconnect */ if (!xprt_connected(xprt)) task->tk_status = -ENOTCONN; @@ -315,7 +315,7 @@ static int xs_send_request(struct rpc_task *task) task->tk_timeout = req->rq_timeout; rpc_sleep_on(&xprt->pending, task, NULL, NULL); } - spin_unlock_bh(&xprt->sock_lock); + spin_unlock_bh(&xprt->transport_lock); return status; } /* Keep holding the socket if it is blocked */ @@ -415,7 +415,7 @@ static void xs_udp_data_ready(struct sock *sk, int len) goto dropit; /* Look up and lock the request corresponding to the given XID */ - spin_lock(&xprt->sock_lock); + spin_lock(&xprt->transport_lock); rovr = xprt_lookup_rqst(xprt, *xp); if (!rovr) goto out_unlock; @@ -436,7 +436,7 @@ static void xs_udp_data_ready(struct sock *sk, int len) xprt_complete_rqst(xprt, rovr, copied); out_unlock: - spin_unlock(&xprt->sock_lock); + spin_unlock(&xprt->transport_lock); dropit: skb_free_datagram(sk, skb); out: @@ -531,13 +531,13 @@ static inline void xs_tcp_read_request(struct rpc_xprt *xprt, skb_reader_t *desc ssize_t r; /* Find and lock the request corresponding to this xid */ - spin_lock(&xprt->sock_lock); + spin_lock(&xprt->transport_lock); req = xprt_lookup_rqst(xprt, xprt->tcp_xid); if (!req) { xprt->tcp_flags &= ~XPRT_COPY_DATA; dprintk("RPC: XID %08x request not found!\n", ntohl(xprt->tcp_xid)); - spin_unlock(&xprt->sock_lock); + spin_unlock(&xprt->transport_lock); return; } @@ -597,7 +597,7 @@ out: req->rq_task->tk_pid); xprt_complete_rqst(xprt, req, xprt->tcp_copied); } - spin_unlock(&xprt->sock_lock); + spin_unlock(&xprt->transport_lock); xs_tcp_check_recm(xprt); } @@ -696,7 +696,7 @@ static void xs_tcp_state_change(struct sock *sk) switch (sk->sk_state) { case TCP_ESTABLISHED: - spin_lock_bh(&xprt->sock_lock); + spin_lock_bh(&xprt->transport_lock); if (!xprt_test_and_set_connected(xprt)) { /* Reset TCP record info */ xprt->tcp_offset = 0; @@ -705,7 +705,7 @@ static void xs_tcp_state_change(struct sock *sk) xprt->tcp_flags = XPRT_COPY_RECM | XPRT_COPY_XID; rpc_wake_up(&xprt->pending); } - spin_unlock_bh(&xprt->sock_lock); + spin_unlock_bh(&xprt->transport_lock); break; case TCP_SYN_SENT: case TCP_SYN_RECV: @@ -753,10 +753,10 @@ static void xs_write_space(struct sock *sk) if (!test_and_clear_bit(SOCK_NOSPACE, &sock->flags)) goto out; - spin_lock_bh(&xprt->sock_lock); + spin_lock_bh(&xprt->transport_lock); if (xprt->snd_task) rpc_wake_up_task(xprt->snd_task); - spin_unlock_bh(&xprt->sock_lock); + spin_unlock_bh(&xprt->transport_lock); out: read_unlock(&sk->sk_callback_lock); } -- cgit v1.2.3 From 5dc07727f86b25851e95193a0c484ea21b531c47 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 11 Aug 2005 16:25:35 -0400 Subject: [PATCH] RPC: Rename xprt_lock Clean-up: Replace the xprt_lock with something more aptly named. This lock single-threads the XID and request slot reservation process. Test-plan: Compile kernel with CONFIG_NFS enabled. Version: Thu, 11 Aug 2005 16:05:26 -0400 Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/xprt.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 1f0da8c1a3b..9c45c522e3e 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -643,9 +643,9 @@ void xprt_reserve(struct rpc_task *task) task->tk_status = -EIO; if (!xprt->shutdown) { - spin_lock(&xprt->xprt_lock); + spin_lock(&xprt->reserve_lock); do_xprt_reserve(task); - spin_unlock(&xprt->xprt_lock); + spin_unlock(&xprt->reserve_lock); } } @@ -698,10 +698,10 @@ void xprt_release(struct rpc_task *task) dprintk("RPC: %4d release request %p\n", task->tk_pid, req); - spin_lock(&xprt->xprt_lock); + spin_lock(&xprt->reserve_lock); list_add(&req->rq_list, &xprt->free); xprt_clear_backlog(xprt); - spin_unlock(&xprt->xprt_lock); + spin_unlock(&xprt->reserve_lock); } /** @@ -751,7 +751,7 @@ static struct rpc_xprt *xprt_setup(int proto, struct sockaddr_in *ap, struct rpc } spin_lock_init(&xprt->transport_lock); - spin_lock_init(&xprt->xprt_lock); + spin_lock_init(&xprt->reserve_lock); init_waitqueue_head(&xprt->cong_wait); INIT_LIST_HEAD(&xprt->free); -- cgit v1.2.3 From 2226feb6bcd0e5e117a9be3ea3dd3ffc14f3e41e Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 11 Aug 2005 16:25:38 -0400 Subject: [PATCH] RPC: rename the sockstate field Clean-up: get rid of a name reference to sockets in the generic parts of the RPC client by renaming the sockstate field in the rpc_xprt structure. Test-plan: Compile kernel with CONFIG_NFS enabled. Version: Thu, 11 Aug 2005 16:05:53 -0400 Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/xprt.c | 14 +++++++------- net/sunrpc/xprtsock.c | 6 ++---- 2 files changed, 9 insertions(+), 11 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 9c45c522e3e..57c5e77b155 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -74,7 +74,7 @@ __xprt_lock_write(struct rpc_xprt *xprt, struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; - if (test_and_set_bit(XPRT_LOCKED, &xprt->sockstate)) { + if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) { if (task == xprt->snd_task) return 1; goto out_sleep; @@ -88,7 +88,7 @@ __xprt_lock_write(struct rpc_xprt *xprt, struct rpc_task *task) return 1; } smp_mb__before_clear_bit(); - clear_bit(XPRT_LOCKED, &xprt->sockstate); + clear_bit(XPRT_LOCKED, &xprt->state); smp_mb__after_clear_bit(); out_sleep: dprintk("RPC: %4d failed to lock socket %p\n", task->tk_pid, xprt); @@ -118,7 +118,7 @@ __xprt_lock_write_next(struct rpc_xprt *xprt) { struct rpc_task *task; - if (test_and_set_bit(XPRT_LOCKED, &xprt->sockstate)) + if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) return; if (!xprt->nocong && RPCXPRT_CONGESTED(xprt)) goto out_unlock; @@ -139,7 +139,7 @@ __xprt_lock_write_next(struct rpc_xprt *xprt) } out_unlock: smp_mb__before_clear_bit(); - clear_bit(XPRT_LOCKED, &xprt->sockstate); + clear_bit(XPRT_LOCKED, &xprt->state); smp_mb__after_clear_bit(); } @@ -152,7 +152,7 @@ __xprt_release_write(struct rpc_xprt *xprt, struct rpc_task *task) if (xprt->snd_task == task) { xprt->snd_task = NULL; smp_mb__before_clear_bit(); - clear_bit(XPRT_LOCKED, &xprt->sockstate); + clear_bit(XPRT_LOCKED, &xprt->state); smp_mb__after_clear_bit(); __xprt_lock_write_next(xprt); } @@ -312,11 +312,11 @@ xprt_init_autodisconnect(unsigned long data) spin_lock(&xprt->transport_lock); if (!list_empty(&xprt->recv) || xprt->shutdown) goto out_abort; - if (test_and_set_bit(XPRT_LOCKED, &xprt->sockstate)) + if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) goto out_abort; spin_unlock(&xprt->transport_lock); /* Let keventd close the socket */ - if (test_bit(XPRT_CONNECTING, &xprt->sockstate) != 0) + if (xprt_connecting(xprt)) xprt_release_write(xprt, NULL); else schedule_work(&xprt->task_cleanup); diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index bc90caab608..76a33b54f43 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -925,9 +925,7 @@ out: else rpc_wake_up(&xprt->pending); out_clear: - smp_mb__before_clear_bit(); - clear_bit(XPRT_CONNECTING, &xprt->sockstate); - smp_mb__after_clear_bit(); + xprt_clear_connecting(xprt); } /** @@ -940,7 +938,7 @@ static void xs_connect(struct rpc_task *task) { struct rpc_xprt *xprt = task->tk_xprt; - if (!test_and_set_bit(XPRT_CONNECTING, &xprt->sockstate)) { + if (!xprt_test_and_set_connecting(xprt)) { if (xprt->sock != NULL) { dprintk("RPC: xs_connect delayed xprt %p\n", xprt); schedule_delayed_work(&xprt->sock_connect, -- cgit v1.2.3 From 86b9f57dfdf455763d2be73a742a9a88bb664173 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 11 Aug 2005 16:25:41 -0400 Subject: [PATCH] RPC: Eliminate socket.h includes in RPC client Clean-up: get rid of unnecessary socket.h and in.h includes in the generic parts of the RPC client. Test-plan: Compile kernel with CONFIG_NFS enabled. Version: Thu, 11 Aug 2005 16:06:23 -0400 Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/auth.c | 1 - net/sunrpc/auth_gss/auth_gss.c | 2 -- net/sunrpc/auth_gss/gss_krb5_mech.c | 1 - net/sunrpc/auth_gss/gss_mech_switch.c | 1 - net/sunrpc/auth_null.c | 2 -- net/sunrpc/auth_unix.c | 2 -- net/sunrpc/clnt.c | 1 - net/sunrpc/sunrpc_syms.c | 1 - 8 files changed, 11 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index 505e2d4b3d6..a415d99c394 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -11,7 +11,6 @@ #include #include #include -#include #include #include diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 2f7b867161d..53a030acdf7 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -42,8 +42,6 @@ #include #include #include -#include -#include #include #include #include diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c index 606a8a82caf..462c5b86b07 100644 --- a/net/sunrpc/auth_gss/gss_krb5_mech.c +++ b/net/sunrpc/auth_gss/gss_krb5_mech.c @@ -39,7 +39,6 @@ #include #include #include -#include #include #include #include diff --git a/net/sunrpc/auth_gss/gss_mech_switch.c b/net/sunrpc/auth_gss/gss_mech_switch.c index 9dfb68377d6..58aeaddd8c7 100644 --- a/net/sunrpc/auth_gss/gss_mech_switch.c +++ b/net/sunrpc/auth_gss/gss_mech_switch.c @@ -35,7 +35,6 @@ #include #include -#include #include #include #include diff --git a/net/sunrpc/auth_null.c b/net/sunrpc/auth_null.c index 9b72d3abf82..f56767aaa92 100644 --- a/net/sunrpc/auth_null.c +++ b/net/sunrpc/auth_null.c @@ -7,9 +7,7 @@ */ #include -#include #include -#include #include #include #include diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c index 4ff297a9b15..890fb5ea0dc 100644 --- a/net/sunrpc/auth_unix.c +++ b/net/sunrpc/auth_unix.c @@ -9,8 +9,6 @@ #include #include #include -#include -#include #include #include diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index ab50c3c9e6a..0d1b010a4a0 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -27,7 +27,6 @@ #include #include #include -#include #include #include diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c index ed48ff022d3..2387e7b823f 100644 --- a/net/sunrpc/sunrpc_syms.c +++ b/net/sunrpc/sunrpc_syms.c @@ -10,7 +10,6 @@ #include #include -#include #include #include #include -- cgit v1.2.3 From 44fbac2288dfed6f1963ac00bf922c3bcd779cd1 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 11 Aug 2005 16:25:44 -0400 Subject: [PATCH] RPC: Add helper for waking tasks pending on a transport Clean-up: remove only reference to xprt->pending from the socket transport implementation. This makes a cleaner interface for other transport implementations as well. Test-plan: Compile kernel with CONFIG_NFS enabled. Version: Thu, 11 Aug 2005 16:06:52 -0400 Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/xprt.c | 18 ++++++++++++++++-- net/sunrpc/xprtsock.c | 7 ++----- 2 files changed, 18 insertions(+), 7 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 57c5e77b155..2f9cd468b95 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -227,6 +227,20 @@ xprt_adjust_cwnd(struct rpc_xprt *xprt, int result) xprt->cwnd = cwnd; } +/** + * xprt_wake_pending_tasks - wake all tasks on a transport's pending queue + * @xprt: transport with waiting tasks + * @status: result code to plant in each task before waking it + * + */ +void xprt_wake_pending_tasks(struct rpc_xprt *xprt, int status) +{ + if (status < 0) + rpc_wake_up_status(&xprt->pending, status); + else + rpc_wake_up(&xprt->pending); +} + static void xprt_reset_majortimeo(struct rpc_rqst *req) { struct rpc_timeout *to = &req->rq_xprt->timeout; @@ -300,7 +314,7 @@ void xprt_disconnect(struct rpc_xprt *xprt) dprintk("RPC: disconnected transport %p\n", xprt); spin_lock_bh(&xprt->transport_lock); xprt_clear_connected(xprt); - rpc_wake_up_status(&xprt->pending, -ENOTCONN); + xprt_wake_pending_tasks(xprt, -ENOTCONN); spin_unlock_bh(&xprt->transport_lock); } @@ -803,7 +817,7 @@ static void xprt_shutdown(struct rpc_xprt *xprt) xprt->shutdown = 1; rpc_wake_up(&xprt->sending); rpc_wake_up(&xprt->resend); - rpc_wake_up(&xprt->pending); + xprt_wake_pending_tasks(xprt, -EIO); rpc_wake_up(&xprt->backlog); wake_up(&xprt->cong_wait); del_timer_sync(&xprt->timer); diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 76a33b54f43..182da2edf61 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -703,7 +703,7 @@ static void xs_tcp_state_change(struct sock *sk) xprt->tcp_reclen = 0; xprt->tcp_copied = 0; xprt->tcp_flags = XPRT_COPY_RECM | XPRT_COPY_XID; - rpc_wake_up(&xprt->pending); + xprt_wake_pending_tasks(xprt, 0); } spin_unlock_bh(&xprt->transport_lock); break; @@ -920,10 +920,7 @@ static void xs_connect_worker(void *args) } } out: - if (status < 0) - rpc_wake_up_status(&xprt->pending, status); - else - rpc_wake_up(&xprt->pending); + xprt_wake_pending_tasks(xprt, status); out_clear: xprt_clear_connecting(xprt); } -- cgit v1.2.3 From 55aa4f58aa43dc9a51fb80010630d94b96053a2e Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 11 Aug 2005 16:25:47 -0400 Subject: [PATCH] RPC: client-side transport switch cleanup Clean-up: change some comments to reflect the realities of the new RPC transport switch mechanism. Get rid of unused xprt_receive() prototype. Also, organize function prototypes in xprt.h by usage and scope. Test-plan: Compile kernel with CONFIG_NFS enabled. Version: Thu, 11 Aug 2005 16:07:21 -0400 Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/clnt.c | 2 +- net/sunrpc/xprt.c | 26 ++++++++++++-------------- net/sunrpc/xprtsock.c | 12 +++++++----- 3 files changed, 20 insertions(+), 20 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 0d1b010a4a0..4677959d283 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1,5 +1,5 @@ /* - * linux/net/sunrpc/rpcclnt.c + * linux/net/sunrpc/clnt.c * * This file contains the high-level RPC interface. * It is modeled as a finite state machine to support both synchronous diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 2f9cd468b95..247fa1ec870 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -10,12 +10,12 @@ * one is available. Otherwise, it sleeps on the backlog queue * (xprt_reserve). * - Next, the caller puts together the RPC message, stuffs it into - * the request struct, and calls xprt_call(). - * - xprt_call transmits the message and installs the caller on the - * socket's wait list. At the same time, it installs a timer that + * the request struct, and calls xprt_transmit(). + * - xprt_transmit sends the message and installs the caller on the + * transport's wait list. At the same time, it installs a timer that * is run after the packet's timeout has expired. * - When a packet arrives, the data_ready handler walks the list of - * pending requests for that socket. If a matching XID is found, the + * pending requests for that transport. If a matching XID is found, the * caller is woken up, and the timer removed. * - When no reply arrives within the timeout interval, the timer is * fired by the kernel and runs xprt_timer(). It either adjusts the @@ -32,6 +32,8 @@ * tasks that rely on callbacks. * * Copyright (C) 1995-1997, Olaf Kirch + * + * Transport switch API copyright (C) 2005, Chuck Lever */ #include @@ -52,8 +54,6 @@ # define RPCDBG_FACILITY RPCDBG_XPRT #endif -#define XPRT_MAX_BACKOFF (8) - /* * Local functions */ @@ -65,9 +65,9 @@ static int __xprt_get_cong(struct rpc_xprt *, struct rpc_task *); static int xprt_clear_backlog(struct rpc_xprt *xprt); /* - * Serialize write access to sockets, in order to prevent different + * Serialize write access to transports, in order to prevent different * requests from interfering with each other. - * Also prevents TCP socket connects from colliding with writes. + * Also prevents transport connects from colliding with writes. */ static int __xprt_lock_write(struct rpc_xprt *xprt, struct rpc_task *task) @@ -91,7 +91,7 @@ __xprt_lock_write(struct rpc_xprt *xprt, struct rpc_task *task) clear_bit(XPRT_LOCKED, &xprt->state); smp_mb__after_clear_bit(); out_sleep: - dprintk("RPC: %4d failed to lock socket %p\n", task->tk_pid, xprt); + dprintk("RPC: %4d failed to lock transport %p\n", task->tk_pid, xprt); task->tk_timeout = 0; task->tk_status = -EAGAIN; if (req && req->rq_ntrans) @@ -144,7 +144,7 @@ out_unlock: } /* - * Releases the socket for use by other requests. + * Releases the transport for use by other requests. */ static void __xprt_release_write(struct rpc_xprt *xprt, struct rpc_task *task) @@ -294,8 +294,7 @@ int xprt_adjust_timeout(struct rpc_rqst *req) return status; } -static void -xprt_socket_autoclose(void *args) +static void xprt_autoclose(void *args) { struct rpc_xprt *xprt = (struct rpc_xprt *)args; @@ -329,7 +328,6 @@ xprt_init_autodisconnect(unsigned long data) if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) goto out_abort; spin_unlock(&xprt->transport_lock); - /* Let keventd close the socket */ if (xprt_connecting(xprt)) xprt_release_write(xprt, NULL); else @@ -770,7 +768,7 @@ static struct rpc_xprt *xprt_setup(int proto, struct sockaddr_in *ap, struct rpc INIT_LIST_HEAD(&xprt->free); INIT_LIST_HEAD(&xprt->recv); - INIT_WORK(&xprt->task_cleanup, xprt_socket_autoclose, xprt); + INIT_WORK(&xprt->task_cleanup, xprt_autoclose, xprt); init_timer(&xprt->timer); xprt->timer.function = xprt_init_autodisconnect; xprt->timer.data = (unsigned long) xprt; diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 182da2edf61..7f0b9f7f167 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -11,6 +11,8 @@ * Rewrite of larges part of the code in order to stabilize TCP stuff. * Fix behaviour when socket buffer is full. * (C) 1999 Trond Myklebust + * + * IP socket transport implementation, (C) 2005 Chuck Lever */ #include @@ -363,7 +365,7 @@ static void xs_destroy(struct rpc_xprt *xprt) { dprintk("RPC: xs_destroy xprt %p\n", xprt); - cancel_delayed_work(&xprt->sock_connect); + cancel_delayed_work(&xprt->connect_worker); flush_scheduled_work(); xprt_disconnect(xprt); @@ -938,11 +940,11 @@ static void xs_connect(struct rpc_task *task) if (!xprt_test_and_set_connecting(xprt)) { if (xprt->sock != NULL) { dprintk("RPC: xs_connect delayed xprt %p\n", xprt); - schedule_delayed_work(&xprt->sock_connect, + schedule_delayed_work(&xprt->connect_worker, RPC_REESTABLISH_TIMEOUT); } else { dprintk("RPC: xs_connect scheduled xprt %p\n", xprt); - schedule_work(&xprt->sock_connect); + schedule_work(&xprt->connect_worker); /* flush_scheduled_work can sleep... */ if (!RPC_IS_ASYNC(task)) flush_scheduled_work(); @@ -989,7 +991,7 @@ int xs_setup_udp(struct rpc_xprt *xprt, struct rpc_timeout *to) /* XXX: header size can vary due to auth type, IPv6, etc. */ xprt->max_payload = (1U << 16) - (MAX_HEADER << 3); - INIT_WORK(&xprt->sock_connect, xs_connect_worker, xprt); + INIT_WORK(&xprt->connect_worker, xs_connect_worker, xprt); xprt->ops = &xs_ops; @@ -1028,7 +1030,7 @@ int xs_setup_tcp(struct rpc_xprt *xprt, struct rpc_timeout *to) xprt->resvport = capable(CAP_NET_BIND_SERVICE) ? 1 : 0; xprt->max_payload = (1U << 31) - 1; - INIT_WORK(&xprt->sock_connect, xs_connect_worker, xprt); + INIT_WORK(&xprt->connect_worker, xs_connect_worker, xprt); xprt->ops = &xs_ops; -- cgit v1.2.3 From c7b2cae8a634015b72941ba2fc6c4bc9b8d3a129 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 11 Aug 2005 16:25:50 -0400 Subject: [PATCH] RPC: separate TCP and UDP write space callbacks Split the socket write space callback function into a TCP version and UDP version, eliminating one dependence on the "xprt->stream" variable. Keep the common pieces of this path in xprt.c so other transports can use it too. Test-plan: Write-intensive workload on a single mount point. Version: Thu, 11 Aug 2005 16:07:51 -0400 Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/xprt.c | 34 +++++++++++++++++++++ net/sunrpc/xprtsock.c | 84 ++++++++++++++++++++++++++++++++------------------- 2 files changed, 87 insertions(+), 31 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 247fa1ec870..31ef7dc7eed 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -241,6 +241,40 @@ void xprt_wake_pending_tasks(struct rpc_xprt *xprt, int status) rpc_wake_up(&xprt->pending); } +/** + * xprt_wait_for_buffer_space - wait for transport output buffer to clear + * @task: task to be put to sleep + * + */ +void xprt_wait_for_buffer_space(struct rpc_task *task) +{ + struct rpc_rqst *req = task->tk_rqstp; + struct rpc_xprt *xprt = req->rq_xprt; + + task->tk_timeout = req->rq_timeout; + rpc_sleep_on(&xprt->pending, task, NULL, NULL); +} + +/** + * xprt_write_space - wake the task waiting for transport output buffer space + * @xprt: transport with waiting tasks + * + * Can be called in a soft IRQ context, so xprt_write_space never sleeps. + */ +void xprt_write_space(struct rpc_xprt *xprt) +{ + if (unlikely(xprt->shutdown)) + return; + + spin_lock_bh(&xprt->transport_lock); + if (xprt->snd_task) { + dprintk("RPC: write space: waking waiting task on xprt %p\n", + xprt); + rpc_wake_up_task(xprt->snd_task); + } + spin_unlock_bh(&xprt->transport_lock); +} + static void xprt_reset_majortimeo(struct rpc_rqst *req) { struct rpc_timeout *to = &req->rq_xprt->timeout; diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 7f0b9f7f167..70a772d7a79 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -308,15 +308,13 @@ static int xs_send_request(struct rpc_task *task) if (status == -EAGAIN) { if (test_bit(SOCK_ASYNC_NOSPACE, &xprt->sock->flags)) { - /* Protect against races with xs_write_space */ + /* Protect against races with write_space */ spin_lock_bh(&xprt->transport_lock); /* Don't race with disconnect */ if (!xprt_connected(xprt)) task->tk_status = -ENOTCONN; - else if (test_bit(SOCK_NOSPACE, &xprt->sock->flags)) { - task->tk_timeout = req->rq_timeout; - rpc_sleep_on(&xprt->pending, task, NULL, NULL); - } + else if (test_bit(SOCK_NOSPACE, &xprt->sock->flags)) + xprt_wait_for_buffer_space(task); spin_unlock_bh(&xprt->transport_lock); return status; } @@ -721,45 +719,68 @@ static void xs_tcp_state_change(struct sock *sk) } /** - * xs_write_space - callback invoked when socket buffer space becomes - * available + * xs_udp_write_space - callback invoked when socket buffer space + * becomes available * @sk: socket whose state has changed * * Called when more output buffer space is available for this socket. * We try not to wake our writers until they can make "significant" - * progress, otherwise we'll waste resources thrashing sock_sendmsg + * progress, otherwise we'll waste resources thrashing kernel_sendmsg * with a bunch of small requests. */ -static void xs_write_space(struct sock *sk) +static void xs_udp_write_space(struct sock *sk) { - struct rpc_xprt *xprt; - struct socket *sock; - read_lock(&sk->sk_callback_lock); - if (!(xprt = xprt_from_sock(sk)) || !(sock = sk->sk_socket)) - goto out; - if (xprt->shutdown) - goto out; - /* Wait until we have enough socket memory */ - if (xprt->stream) { - /* from net/core/stream.c:sk_stream_write_space */ - if (sk_stream_wspace(sk) < sk_stream_min_wspace(sk)) + /* from net/core/sock.c:sock_def_write_space */ + if (sock_writeable(sk)) { + struct socket *sock; + struct rpc_xprt *xprt; + + if (unlikely(!(sock = sk->sk_socket))) goto out; - } else { - /* from net/core/sock.c:sock_def_write_space */ - if (!sock_writeable(sk)) + if (unlikely(!(xprt = xprt_from_sock(sk)))) + goto out; + if (unlikely(!test_and_clear_bit(SOCK_NOSPACE, &sock->flags))) goto out; + + xprt_write_space(xprt); } - if (!test_and_clear_bit(SOCK_NOSPACE, &sock->flags)) - goto out; + out: + read_unlock(&sk->sk_callback_lock); +} - spin_lock_bh(&xprt->transport_lock); - if (xprt->snd_task) - rpc_wake_up_task(xprt->snd_task); - spin_unlock_bh(&xprt->transport_lock); -out: +/** + * xs_tcp_write_space - callback invoked when socket buffer space + * becomes available + * @sk: socket whose state has changed + * + * Called when more output buffer space is available for this socket. + * We try not to wake our writers until they can make "significant" + * progress, otherwise we'll waste resources thrashing kernel_sendmsg + * with a bunch of small requests. + */ +static void xs_tcp_write_space(struct sock *sk) +{ + read_lock(&sk->sk_callback_lock); + + /* from net/core/stream.c:sk_stream_write_space */ + if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) { + struct socket *sock; + struct rpc_xprt *xprt; + + if (unlikely(!(sock = sk->sk_socket))) + goto out; + if (unlikely(!(xprt = xprt_from_sock(sk)))) + goto out; + if (unlikely(!test_and_clear_bit(SOCK_NOSPACE, &sock->flags))) + goto out; + + xprt_write_space(xprt); + } + + out: read_unlock(&sk->sk_callback_lock); } @@ -855,15 +876,16 @@ static void xs_bind(struct rpc_xprt *xprt, struct socket *sock) xprt->old_write_space = sk->sk_write_space; if (xprt->prot == IPPROTO_UDP) { sk->sk_data_ready = xs_udp_data_ready; + sk->sk_write_space = xs_udp_write_space; sk->sk_no_check = UDP_CSUM_NORCV; xprt_set_connected(xprt); } else { tcp_sk(sk)->nonagle = 1; /* disable Nagle's algorithm */ sk->sk_data_ready = xs_tcp_data_ready; sk->sk_state_change = xs_tcp_state_change; + sk->sk_write_space = xs_tcp_write_space; xprt_clear_connected(xprt); } - sk->sk_write_space = xs_write_space; /* Reset to new socket */ xprt->sock = sock; -- cgit v1.2.3 From b0d93ad511ce2f37823a07c7a3258117a431f5fb Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 11 Aug 2005 16:25:53 -0400 Subject: [PATCH] RPC: separate TCP and UDP transport connection logic Create separate connection worker functions for managing UDP and TCP transport sockets. This eliminates several dependencies on "xprt->stream". Test-plan: Destructive testing (unplugging the network temporarily). Connectathon with v2, v3, and v4. Version: Thu, 11 Aug 2005 16:08:18 -0400 Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/xprtsock.c | 164 ++++++++++++++++++++++++++++---------------------- 1 file changed, 91 insertions(+), 73 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 70a772d7a79..f91529787b9 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -836,102 +836,118 @@ static int xs_bindresvport(struct rpc_xprt *xprt, struct socket *sock) return err; } -static struct socket *xs_create(struct rpc_xprt *xprt, int proto, int resvport) +/** + * xs_udp_connect_worker - set up a UDP socket + * @args: RPC transport to connect + * + * Invoked by a work queue tasklet. + */ +static void xs_udp_connect_worker(void *args) { - struct socket *sock; - int type, err; - - dprintk("RPC: xs_create(%s %d)\n", - (proto == IPPROTO_UDP)? "udp" : "tcp", proto); + struct rpc_xprt *xprt = (struct rpc_xprt *) args; + struct socket *sock = xprt->sock; + int err, status = -EIO; - type = (proto == IPPROTO_UDP)? SOCK_DGRAM : SOCK_STREAM; + if (xprt->shutdown || xprt->addr.sin_port == 0) + goto out; - if ((err = sock_create_kern(PF_INET, type, proto, &sock)) < 0) { - dprintk("RPC: can't create socket (%d).\n", -err); - return NULL; - } + dprintk("RPC: xs_udp_connect_worker for xprt %p\n", xprt); - /* If the caller has the capability, bind to a reserved port */ - if (resvport && xs_bindresvport(xprt, sock) < 0) - goto failed; + /* Start by resetting any existing state */ + xs_close(xprt); - return sock; + if ((err = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock)) < 0) { + dprintk("RPC: can't create UDP transport socket (%d).\n", -err); + goto out; + } -failed: - sock_release(sock); - return NULL; -} + if (xprt->resvport && xs_bindresvport(xprt, sock) < 0) { + sock_release(sock); + goto out; + } -static void xs_bind(struct rpc_xprt *xprt, struct socket *sock) -{ - struct sock *sk = sock->sk; + if (!xprt->inet) { + struct sock *sk = sock->sk; - if (xprt->inet) - return; + write_lock_bh(&sk->sk_callback_lock); - write_lock_bh(&sk->sk_callback_lock); - sk->sk_user_data = xprt; - xprt->old_data_ready = sk->sk_data_ready; - xprt->old_state_change = sk->sk_state_change; - xprt->old_write_space = sk->sk_write_space; - if (xprt->prot == IPPROTO_UDP) { + sk->sk_user_data = xprt; + xprt->old_data_ready = sk->sk_data_ready; + xprt->old_state_change = sk->sk_state_change; + xprt->old_write_space = sk->sk_write_space; sk->sk_data_ready = xs_udp_data_ready; sk->sk_write_space = xs_udp_write_space; sk->sk_no_check = UDP_CSUM_NORCV; + xprt_set_connected(xprt); - } else { - tcp_sk(sk)->nonagle = 1; /* disable Nagle's algorithm */ - sk->sk_data_ready = xs_tcp_data_ready; - sk->sk_state_change = xs_tcp_state_change; - sk->sk_write_space = xs_tcp_write_space; - xprt_clear_connected(xprt); - } - /* Reset to new socket */ - xprt->sock = sock; - xprt->inet = sk; - write_unlock_bh(&sk->sk_callback_lock); + /* Reset to new socket */ + xprt->sock = sock; + xprt->inet = sk; - return; + write_unlock_bh(&sk->sk_callback_lock); + } + xs_set_buffer_size(xprt); + status = 0; +out: + xprt_wake_pending_tasks(xprt, status); + xprt_clear_connecting(xprt); } /** - * xs_connect_worker - try to connect a socket to a remote endpoint + * xs_tcp_connect_worker - connect a TCP socket to a remote endpoint * @args: RPC transport to connect * * Invoked by a work queue tasklet. */ -static void xs_connect_worker(void *args) +static void xs_tcp_connect_worker(void *args) { struct rpc_xprt *xprt = (struct rpc_xprt *)args; struct socket *sock = xprt->sock; - int status = -EIO; + int err, status = -EIO; if (xprt->shutdown || xprt->addr.sin_port == 0) goto out; - dprintk("RPC: xs_connect_worker xprt %p\n", xprt); + dprintk("RPC: xs_tcp_connect_worker for xprt %p\n", xprt); - /* - * Start by resetting any existing state - */ + /* Start by resetting any existing socket state */ xs_close(xprt); - sock = xs_create(xprt, xprt->prot, xprt->resvport); - if (sock == NULL) { - /* couldn't create socket or bind to reserved port; - * this is likely a permanent error, so cause an abort */ + + if ((err = sock_create_kern(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock)) < 0) { + dprintk("RPC: can't create TCP transport socket (%d).\n", -err); goto out; } - xs_bind(xprt, sock); - xs_set_buffer_size(xprt); - status = 0; - if (!xprt->stream) + if (xprt->resvport && xs_bindresvport(xprt, sock) < 0) { + sock_release(sock); goto out; + } - /* - * Tell the socket layer to start connecting... - */ + if (!xprt->inet) { + struct sock *sk = sock->sk; + + write_lock_bh(&sk->sk_callback_lock); + + sk->sk_user_data = xprt; + xprt->old_data_ready = sk->sk_data_ready; + xprt->old_state_change = sk->sk_state_change; + xprt->old_write_space = sk->sk_write_space; + sk->sk_data_ready = xs_tcp_data_ready; + sk->sk_state_change = xs_tcp_state_change; + sk->sk_write_space = xs_tcp_write_space; + tcp_sk(sk)->nonagle = 1; + + xprt_clear_connected(xprt); + + /* Reset to new socket */ + xprt->sock = sock; + xprt->inet = sk; + + write_unlock_bh(&sk->sk_callback_lock); + } + + /* Tell the socket layer to start connecting... */ status = sock->ops->connect(sock, (struct sockaddr *) &xprt->addr, sizeof(xprt->addr), O_NONBLOCK); dprintk("RPC: %p connect status %d connected %d sock state %d\n", @@ -959,18 +975,20 @@ static void xs_connect(struct rpc_task *task) { struct rpc_xprt *xprt = task->tk_xprt; - if (!xprt_test_and_set_connecting(xprt)) { - if (xprt->sock != NULL) { - dprintk("RPC: xs_connect delayed xprt %p\n", xprt); - schedule_delayed_work(&xprt->connect_worker, + if (xprt_test_and_set_connecting(xprt)) + return; + + if (xprt->sock != NULL) { + dprintk("RPC: xs_connect delayed xprt %p\n", xprt); + schedule_delayed_work(&xprt->connect_worker, RPC_REESTABLISH_TIMEOUT); - } else { - dprintk("RPC: xs_connect scheduled xprt %p\n", xprt); - schedule_work(&xprt->connect_worker); - /* flush_scheduled_work can sleep... */ - if (!RPC_IS_ASYNC(task)) - flush_scheduled_work(); - } + } else { + dprintk("RPC: xs_connect scheduled xprt %p\n", xprt); + schedule_work(&xprt->connect_worker); + + /* flush_scheduled_work can sleep... */ + if (!RPC_IS_ASYNC(task)) + flush_scheduled_work(); } } @@ -1013,7 +1031,7 @@ int xs_setup_udp(struct rpc_xprt *xprt, struct rpc_timeout *to) /* XXX: header size can vary due to auth type, IPv6, etc. */ xprt->max_payload = (1U << 16) - (MAX_HEADER << 3); - INIT_WORK(&xprt->connect_worker, xs_connect_worker, xprt); + INIT_WORK(&xprt->connect_worker, xs_udp_connect_worker, xprt); xprt->ops = &xs_ops; @@ -1052,7 +1070,7 @@ int xs_setup_tcp(struct rpc_xprt *xprt, struct rpc_timeout *to) xprt->resvport = capable(CAP_NET_BIND_SERVICE) ? 1 : 0; xprt->max_payload = (1U << 31) - 1; - INIT_WORK(&xprt->connect_worker, xs_connect_worker, xprt); + INIT_WORK(&xprt->connect_worker, xs_tcp_connect_worker, xprt); xprt->ops = &xs_ops; -- cgit v1.2.3 From 262965f53defd312a294b45366ea17907b6a616b Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 11 Aug 2005 16:25:56 -0400 Subject: [PATCH] RPC: separate TCP and UDP socket write paths Split the RPC client's main socket write path into a TCP version and a UDP version to eliminate another dependency on the "xprt->stream" variable. Compiler optimization removes unneeded code from xs_sendpages, as this function is now called with some constant arguments. We can now cleanly perform transport protocol-specific return code testing and error recovery in each path. Test-plan: Millions of fsx operations. Performance characterization such as "sio" or "iozone". Examine oprofile results for any changes before and after this patch is applied. Version: Thu, 11 Aug 2005 16:08:46 -0400 Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/xprtsock.c | 215 ++++++++++++++++++++++++++++++-------------------- 1 file changed, 128 insertions(+), 87 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index f91529787b9..57988300640 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -40,6 +40,12 @@ */ #define XS_MAX_RESVPORT (800U) +/* + * How many times to try sending a request on a socket before waiting + * for the socket buffer to clear. + */ +#define XS_SENDMSG_RETRY (10U) + #ifdef RPC_DEBUG # undef RPC_DEBUG_DATA # define RPCDBG_FACILITY RPCDBG_TRANS @@ -114,13 +120,18 @@ static int xs_send_tail(struct socket *sock, struct xdr_buf *xdr, unsigned int b * @base: starting position in the buffer * */ -static int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, struct xdr_buf *xdr, unsigned int base) +static inline int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, struct xdr_buf *xdr, unsigned int base) { struct page **ppage = xdr->pages; unsigned int len, pglen = xdr->page_len; int err, ret = 0; ssize_t (*sendpage)(struct socket *, struct page *, int, size_t, int); + if (unlikely(!sock)) + return -ENOTCONN; + + clear_bit(SOCK_ASYNC_NOSPACE, &sock->flags); + len = xdr->head[0].iov_len; if (base < len || (addr != NULL && base == 0)) { err = xs_send_head(sock, addr, addrlen, xdr, base, len); @@ -187,140 +198,162 @@ out: } /** - * xs_sendmsg - write an RPC request to a socket - * @xprt: generic transport - * @req: the RPC request to write + * xs_nospace - place task on wait queue if transmit was incomplete + * @task: task to put to sleep * */ -static int xs_sendmsg(struct rpc_xprt *xprt, struct rpc_rqst *req) +static void xs_nospace(struct rpc_task *task) { - struct socket *sock = xprt->sock; - struct xdr_buf *xdr = &req->rq_snd_buf; - struct sockaddr *addr = NULL; - int addrlen = 0; - unsigned int skip; - int result; + struct rpc_rqst *req = task->tk_rqstp; + struct rpc_xprt *xprt = req->rq_xprt; - if (!sock) - return -ENOTCONN; + dprintk("RPC: %4d xmit incomplete (%u left of %u)\n", + task->tk_pid, req->rq_slen - req->rq_bytes_sent, + req->rq_slen); + + if (test_bit(SOCK_ASYNC_NOSPACE, &xprt->sock->flags)) { + /* Protect against races with write_space */ + spin_lock_bh(&xprt->transport_lock); + + /* Don't race with disconnect */ + if (!xprt_connected(xprt)) + task->tk_status = -ENOTCONN; + else if (test_bit(SOCK_NOSPACE, &xprt->sock->flags)) + xprt_wait_for_buffer_space(task); + + spin_unlock_bh(&xprt->transport_lock); + } else + /* Keep holding the socket if it is blocked */ + rpc_delay(task, HZ>>4); +} + +/** + * xs_udp_send_request - write an RPC request to a UDP socket + * @task: address of RPC task that manages the state of an RPC request + * + * Return values: + * 0: The request has been sent + * EAGAIN: The socket was blocked, please call again later to + * complete the request + * ENOTCONN: Caller needs to invoke connect logic then call again + * other: Some other error occured, the request was not sent + */ +static int xs_udp_send_request(struct rpc_task *task) +{ + struct rpc_rqst *req = task->tk_rqstp; + struct rpc_xprt *xprt = req->rq_xprt; + struct xdr_buf *xdr = &req->rq_snd_buf; + int status; xs_pktdump("packet data:", req->rq_svec->iov_base, req->rq_svec->iov_len); - /* For UDP, we need to provide an address */ - if (!xprt->stream) { - addr = (struct sockaddr *) &xprt->addr; - addrlen = sizeof(xprt->addr); - } - /* Don't repeat bytes */ - skip = req->rq_bytes_sent; + req->rq_xtime = jiffies; + status = xs_sendpages(xprt->sock, (struct sockaddr *) &xprt->addr, + sizeof(xprt->addr), xdr, req->rq_bytes_sent); - clear_bit(SOCK_ASYNC_NOSPACE, &sock->flags); - result = xs_sendpages(sock, addr, addrlen, xdr, skip); + dprintk("RPC: xs_udp_send_request(%u) = %d\n", + xdr->len - req->rq_bytes_sent, status); - dprintk("RPC: xs_sendmsg(%d) = %d\n", xdr->len - skip, result); + if (likely(status >= (int) req->rq_slen)) + return 0; - if (result >= 0) - return result; + /* Still some bytes left; set up for a retry later. */ + if (status > 0) + status = -EAGAIN; - switch (result) { + switch (status) { + case -ENETUNREACH: + case -EPIPE: case -ECONNREFUSED: /* When the server has died, an ICMP port unreachable message * prompts ECONNREFUSED. */ - case -EAGAIN: break; - case -ECONNRESET: - case -ENOTCONN: - case -EPIPE: - /* connection broken */ - if (xprt->stream) - result = -ENOTCONN; + case -EAGAIN: + xs_nospace(task); break; default: + dprintk("RPC: sendmsg returned unrecognized error %d\n", + -status); break; } - return result; + + return status; } /** - * xs_send_request - write an RPC request to a socket + * xs_tcp_send_request - write an RPC request to a TCP socket * @task: address of RPC task that manages the state of an RPC request * * Return values: - * 0: The request has been sent - * EAGAIN: The socket was blocked, please call again later to - * complete the request - * other: Some other error occured, the request was not sent + * 0: The request has been sent + * EAGAIN: The socket was blocked, please call again later to + * complete the request + * ENOTCONN: Caller needs to invoke connect logic then call again + * other: Some other error occured, the request was not sent * * XXX: In the case of soft timeouts, should we eventually give up - * if the socket is not able to make progress? + * if sendmsg is not able to make progress? */ -static int xs_send_request(struct rpc_task *task) +static int xs_tcp_send_request(struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; + struct xdr_buf *xdr = &req->rq_snd_buf; + u32 *marker = req->rq_svec[0].iov_base; int status, retry = 0; - /* set up everything as needed. */ /* Write the record marker */ - if (xprt->stream) { - u32 *marker = req->rq_svec[0].iov_base; + *marker = htonl(0x80000000|(req->rq_slen-sizeof(*marker))); - *marker = htonl(0x80000000|(req->rq_slen-sizeof(*marker))); - } + xs_pktdump("packet data:", + req->rq_svec->iov_base, + req->rq_svec->iov_len); /* Continue transmitting the packet/record. We must be careful * to cope with writespace callbacks arriving _after_ we have - * called sendmsg(). - */ + * called sendmsg(). */ while (1) { req->rq_xtime = jiffies; - status = xs_sendmsg(xprt, req); + status = xs_sendpages(xprt->sock, NULL, 0, xdr, + req->rq_bytes_sent); - if (status < 0) - break; + dprintk("RPC: xs_tcp_send_request(%u) = %d\n", + xdr->len - req->rq_bytes_sent, status); - if (xprt->stream) { - req->rq_bytes_sent += status; - - /* If we've sent the entire packet, immediately - * reset the count of bytes sent. */ - if (req->rq_bytes_sent >= req->rq_slen) { - req->rq_bytes_sent = 0; - return 0; - } - } else { - if (status >= req->rq_slen) - return 0; - status = -EAGAIN; + if (unlikely(status < 0)) break; - } - dprintk("RPC: %4d xmit incomplete (%d left of %d)\n", - task->tk_pid, req->rq_slen - req->rq_bytes_sent, - req->rq_slen); + /* If we've sent the entire packet, immediately + * reset the count of bytes sent. */ + req->rq_bytes_sent += status; + if (likely(req->rq_bytes_sent >= req->rq_slen)) { + req->rq_bytes_sent = 0; + return 0; + } status = -EAGAIN; - if (retry++ > 50) + if (retry++ > XS_SENDMSG_RETRY) break; } - if (status == -EAGAIN) { - if (test_bit(SOCK_ASYNC_NOSPACE, &xprt->sock->flags)) { - /* Protect against races with write_space */ - spin_lock_bh(&xprt->transport_lock); - /* Don't race with disconnect */ - if (!xprt_connected(xprt)) - task->tk_status = -ENOTCONN; - else if (test_bit(SOCK_NOSPACE, &xprt->sock->flags)) - xprt_wait_for_buffer_space(task); - spin_unlock_bh(&xprt->transport_lock); - return status; - } - /* Keep holding the socket if it is blocked */ - rpc_delay(task, HZ>>4); + switch (status) { + case -EAGAIN: + xs_nospace(task); + break; + case -ECONNREFUSED: + case -ECONNRESET: + case -ENOTCONN: + case -EPIPE: + status = -ENOTCONN; + break; + default: + dprintk("RPC: sendmsg returned unrecognized error %d\n", + -status); + break; } + return status; } @@ -992,10 +1025,18 @@ static void xs_connect(struct rpc_task *task) } } -static struct rpc_xprt_ops xs_ops = { +static struct rpc_xprt_ops xs_udp_ops = { + .set_buffer_size = xs_set_buffer_size, + .connect = xs_connect, + .send_request = xs_udp_send_request, + .close = xs_close, + .destroy = xs_destroy, +}; + +static struct rpc_xprt_ops xs_tcp_ops = { .set_buffer_size = xs_set_buffer_size, .connect = xs_connect, - .send_request = xs_send_request, + .send_request = xs_tcp_send_request, .close = xs_close, .destroy = xs_destroy, }; @@ -1033,7 +1074,7 @@ int xs_setup_udp(struct rpc_xprt *xprt, struct rpc_timeout *to) INIT_WORK(&xprt->connect_worker, xs_udp_connect_worker, xprt); - xprt->ops = &xs_ops; + xprt->ops = &xs_udp_ops; if (to) xprt->timeout = *to; @@ -1072,7 +1113,7 @@ int xs_setup_tcp(struct rpc_xprt *xprt, struct rpc_timeout *to) INIT_WORK(&xprt->connect_worker, xs_tcp_connect_worker, xprt); - xprt->ops = &xs_ops; + xprt->ops = &xs_tcp_ops; if (to) xprt->timeout = *to; -- cgit v1.2.3 From 808012fbb23a52ec59352445d2076d175ad4ab26 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 25 Aug 2005 16:25:49 -0700 Subject: [PATCH] RPC: skip over transport-specific heads automatically Add a generic mechanism for skipping over transport-specific headers when constructing an RPC request. This removes another "xprt->stream" dependency. Test-plan: Write-intensive workload on a single mount point (try both UDP and TCP). Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/auth_gss/auth_gss.c | 6 ++---- net/sunrpc/clnt.c | 5 ++--- net/sunrpc/xprtsock.c | 24 +++++++++++++++++------- 3 files changed, 21 insertions(+), 14 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 53a030acdf7..d2b08f16c25 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -844,10 +844,8 @@ gss_marshal(struct rpc_task *task, u32 *p) /* We compute the checksum for the verifier over the xdr-encoded bytes * starting with the xid and ending at the end of the credential: */ - iov.iov_base = req->rq_snd_buf.head[0].iov_base; - if (task->tk_client->cl_xprt->stream) - /* See clnt.c:call_header() */ - iov.iov_base += 4; + iov.iov_base = xprt_skip_transport_header(task->tk_xprt, + req->rq_snd_buf.head[0].iov_base); iov.iov_len = (u8 *)p - (u8 *)iov.iov_base; xdr_buf_from_iov(&iov, &verf_buf); diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 4677959d283..cc1b773a79d 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1075,13 +1075,12 @@ static u32 * call_header(struct rpc_task *task) { struct rpc_clnt *clnt = task->tk_client; - struct rpc_xprt *xprt = clnt->cl_xprt; struct rpc_rqst *req = task->tk_rqstp; u32 *p = req->rq_svec[0].iov_base; /* FIXME: check buffer size? */ - if (xprt->stream) - *p++ = 0; /* fill in later */ + + p = xprt_skip_transport_header(task->tk_xprt, p); *p++ = req->rq_xid; /* XID */ *p++ = htonl(RPC_CALL); /* CALL */ *p++ = htonl(RPC_VERSION); /* RPC version */ diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 57988300640..aaf053b1a0c 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -282,6 +282,13 @@ static int xs_udp_send_request(struct rpc_task *task) return status; } +static inline void xs_encode_tcp_record_marker(struct xdr_buf *buf) +{ + u32 reclen = buf->len - sizeof(rpc_fraghdr); + rpc_fraghdr *base = buf->head[0].iov_base; + *base = htonl(RPC_LAST_STREAM_FRAGMENT | reclen); +} + /** * xs_tcp_send_request - write an RPC request to a TCP socket * @task: address of RPC task that manages the state of an RPC request @@ -301,11 +308,9 @@ static int xs_tcp_send_request(struct rpc_task *task) struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; struct xdr_buf *xdr = &req->rq_snd_buf; - u32 *marker = req->rq_svec[0].iov_base; int status, retry = 0; - /* Write the record marker */ - *marker = htonl(0x80000000|(req->rq_slen-sizeof(*marker))); + xs_encode_tcp_record_marker(&req->rq_snd_buf); xs_pktdump("packet data:", req->rq_svec->iov_base, @@ -503,16 +508,19 @@ static inline void xs_tcp_read_fraghdr(struct rpc_xprt *xprt, skb_reader_t *desc xprt->tcp_offset += used; if (used != len) return; + xprt->tcp_reclen = ntohl(xprt->tcp_recm); - if (xprt->tcp_reclen & 0x80000000) + if (xprt->tcp_reclen & RPC_LAST_STREAM_FRAGMENT) xprt->tcp_flags |= XPRT_LAST_FRAG; else xprt->tcp_flags &= ~XPRT_LAST_FRAG; - xprt->tcp_reclen &= 0x7fffffff; + xprt->tcp_reclen &= RPC_FRAGMENT_SIZE_MASK; + xprt->tcp_flags &= ~XPRT_COPY_RECM; xprt->tcp_offset = 0; + /* Sanity check of the record length */ - if (xprt->tcp_reclen < 4) { + if (unlikely(xprt->tcp_reclen < 4)) { dprintk("RPC: invalid TCP record fragment length\n"); xprt_disconnect(xprt); return; @@ -1065,6 +1073,7 @@ int xs_setup_udp(struct rpc_xprt *xprt, struct rpc_timeout *to) xprt->prot = IPPROTO_UDP; xprt->port = XS_MAX_RESVPORT; + xprt->tsh_size = 0; xprt->stream = 0; xprt->nocong = 0; xprt->cwnd = RPC_INITCWND; @@ -1105,11 +1114,12 @@ int xs_setup_tcp(struct rpc_xprt *xprt, struct rpc_timeout *to) xprt->prot = IPPROTO_TCP; xprt->port = XS_MAX_RESVPORT; + xprt->tsh_size = sizeof(rpc_fraghdr) / sizeof(u32); xprt->stream = 1; xprt->nocong = 1; xprt->cwnd = RPC_MAXCWND(xprt); xprt->resvport = capable(CAP_NET_BIND_SERVICE) ? 1 : 0; - xprt->max_payload = (1U << 31) - 1; + xprt->max_payload = RPC_MAX_FRAGMENT_SIZE; INIT_WORK(&xprt->connect_worker, xs_tcp_connect_worker, xprt); -- cgit v1.2.3 From 43118c29dea2b23798bd42a147015cceee7fa885 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 25 Aug 2005 16:25:49 -0700 Subject: [PATCH] RPC: get rid of xprt->stream Now we can fix up the last few places that use the "xprt->stream" variable, and get rid of it from the rpc_xprt structure. Test-plan: Destructive testing (unplugging the network temporarily). Connectathon with UDP and TCP. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/xprt.c | 3 +-- net/sunrpc/xprtsock.c | 28 ++++++++++++++++++---------- 2 files changed, 19 insertions(+), 12 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 31ef7dc7eed..43fef762644 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -630,8 +630,7 @@ void xprt_transmit(struct rpc_task *task) case -ENOTCONN: return; default: - if (xprt->stream) - xprt_disconnect(xprt); + break; } xprt_release_write(xprt, task); return; diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index aaf053b1a0c..5bb6fed3df3 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -356,6 +356,7 @@ static int xs_tcp_send_request(struct rpc_task *task) default: dprintk("RPC: sendmsg returned unrecognized error %d\n", -status); + xprt_disconnect(xprt); break; } @@ -826,19 +827,17 @@ static void xs_tcp_write_space(struct sock *sk) } /** - * xs_set_buffer_size - set send and receive limits + * xs_udp_set_buffer_size - set send and receive limits * @xprt: generic transport * * Set socket send and receive limits based on the * sndsize and rcvsize fields in the generic transport - * structure. This applies only to UDP sockets. + * structure. */ -static void xs_set_buffer_size(struct rpc_xprt *xprt) +static void xs_udp_set_buffer_size(struct rpc_xprt *xprt) { struct sock *sk = xprt->inet; - if (xprt->stream) - return; if (xprt->rcvsize) { sk->sk_userlocks |= SOCK_RCVBUF_LOCK; sk->sk_rcvbuf = xprt->rcvsize * xprt->max_reqs * 2; @@ -850,6 +849,17 @@ static void xs_set_buffer_size(struct rpc_xprt *xprt) } } +/** + * xs_tcp_set_buffer_size - set send and receive limits + * @xprt: generic transport + * + * Nothing to do for TCP. + */ +static void xs_tcp_set_buffer_size(struct rpc_xprt *xprt) +{ + return; +} + static int xs_bindresvport(struct rpc_xprt *xprt, struct socket *sock) { struct sockaddr_in myaddr = { @@ -928,7 +938,7 @@ static void xs_udp_connect_worker(void *args) write_unlock_bh(&sk->sk_callback_lock); } - xs_set_buffer_size(xprt); + xs_udp_set_buffer_size(xprt); status = 0; out: xprt_wake_pending_tasks(xprt, status); @@ -1034,7 +1044,7 @@ static void xs_connect(struct rpc_task *task) } static struct rpc_xprt_ops xs_udp_ops = { - .set_buffer_size = xs_set_buffer_size, + .set_buffer_size = xs_udp_set_buffer_size, .connect = xs_connect, .send_request = xs_udp_send_request, .close = xs_close, @@ -1042,7 +1052,7 @@ static struct rpc_xprt_ops xs_udp_ops = { }; static struct rpc_xprt_ops xs_tcp_ops = { - .set_buffer_size = xs_set_buffer_size, + .set_buffer_size = xs_tcp_set_buffer_size, .connect = xs_connect, .send_request = xs_tcp_send_request, .close = xs_close, @@ -1074,7 +1084,6 @@ int xs_setup_udp(struct rpc_xprt *xprt, struct rpc_timeout *to) xprt->prot = IPPROTO_UDP; xprt->port = XS_MAX_RESVPORT; xprt->tsh_size = 0; - xprt->stream = 0; xprt->nocong = 0; xprt->cwnd = RPC_INITCWND; xprt->resvport = capable(CAP_NET_BIND_SERVICE) ? 1 : 0; @@ -1115,7 +1124,6 @@ int xs_setup_tcp(struct rpc_xprt *xprt, struct rpc_timeout *to) xprt->prot = IPPROTO_TCP; xprt->port = XS_MAX_RESVPORT; xprt->tsh_size = sizeof(rpc_fraghdr) / sizeof(u32); - xprt->stream = 1; xprt->nocong = 1; xprt->cwnd = RPC_MAXCWND(xprt); xprt->resvport = capable(CAP_NET_BIND_SERVICE) ? 1 : 0; -- cgit v1.2.3 From fe3aca290f17ae4978bd73d02aa4029f1c9c024c Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 25 Aug 2005 16:25:50 -0700 Subject: [PATCH] RPC: add API to set transport-specific timeouts Prepare the way to remove the "xprt->nocong" variable by adding a callout to the RPC client transport switch API to handle setting RPC retransmit timeouts. Add a pair of generic helper functions that provide the ability to set a simple fixed timeout, or to set a timeout based on the state of a round- trip estimator. Test-plan: Use WAN simulation to cause sporadic bursty packet loss. Look for significant regression in performance or client stability. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/xprt.c | 67 ++++++++++++++++++++++++++++++++++----------------- net/sunrpc/xprtsock.c | 2 ++ 2 files changed, 47 insertions(+), 22 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 43fef762644..1ac2fbe0510 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -275,6 +275,38 @@ void xprt_write_space(struct rpc_xprt *xprt) spin_unlock_bh(&xprt->transport_lock); } +/** + * xprt_set_retrans_timeout_def - set a request's retransmit timeout + * @task: task whose timeout is to be set + * + * Set a request's retransmit timeout based on the transport's + * default timeout parameters. Used by transports that don't adjust + * the retransmit timeout based on round-trip time estimation. + */ +void xprt_set_retrans_timeout_def(struct rpc_task *task) +{ + task->tk_timeout = task->tk_rqstp->rq_timeout; +} + +/* + * xprt_set_retrans_timeout_rtt - set a request's retransmit timeout + * @task: task whose timeout is to be set + * + * Set a request's retransmit timeout using the RTT estimator. + */ +void xprt_set_retrans_timeout_rtt(struct rpc_task *task) +{ + int timer = task->tk_msg.rpc_proc->p_timer; + struct rpc_rtt *rtt = task->tk_client->cl_rtt; + struct rpc_rqst *req = task->tk_rqstp; + unsigned long max_timeout = req->rq_xprt->timeout.to_maxval; + + task->tk_timeout = rpc_calc_rto(rtt, timer); + task->tk_timeout <<= rpc_ntimeo(rtt, timer) + req->rq_retries; + if (task->tk_timeout > max_timeout || task->tk_timeout == 0) + task->tk_timeout = max_timeout; +} + static void xprt_reset_majortimeo(struct rpc_rqst *req) { struct rpc_timeout *to = &req->rq_xprt->timeout; @@ -588,7 +620,6 @@ out_unlock: */ void xprt_transmit(struct rpc_task *task) { - struct rpc_clnt *clnt = task->tk_client; struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; int status; @@ -613,8 +644,19 @@ void xprt_transmit(struct rpc_task *task) return; status = xprt->ops->send_request(task); - if (!status) - goto out_receive; + if (status == 0) { + dprintk("RPC: %4d xmit complete\n", task->tk_pid); + spin_lock_bh(&xprt->transport_lock); + xprt->ops->set_retrans_timeout(task); + /* Don't race with disconnect */ + if (!xprt_connected(xprt)) + task->tk_status = -ENOTCONN; + else if (!req->rq_received) + rpc_sleep_on(&xprt->pending, task, NULL, xprt_timer); + __xprt_release_write(xprt, task); + spin_unlock_bh(&xprt->transport_lock); + return; + } /* Note: at this point, task->tk_sleeping has not yet been set, * hence there is no danger of the waking up task being put on @@ -634,25 +676,6 @@ void xprt_transmit(struct rpc_task *task) } xprt_release_write(xprt, task); return; - out_receive: - dprintk("RPC: %4d xmit complete\n", task->tk_pid); - /* Set the task's receive timeout value */ - spin_lock_bh(&xprt->transport_lock); - if (!xprt->nocong) { - int timer = task->tk_msg.rpc_proc->p_timer; - task->tk_timeout = rpc_calc_rto(clnt->cl_rtt, timer); - task->tk_timeout <<= rpc_ntimeo(clnt->cl_rtt, timer) + req->rq_retries; - if (task->tk_timeout > xprt->timeout.to_maxval || task->tk_timeout == 0) - task->tk_timeout = xprt->timeout.to_maxval; - } else - task->tk_timeout = req->rq_timeout; - /* Don't race with disconnect */ - if (!xprt_connected(xprt)) - task->tk_status = -ENOTCONN; - else if (!req->rq_received) - rpc_sleep_on(&xprt->pending, task, NULL, xprt_timer); - __xprt_release_write(xprt, task); - spin_unlock_bh(&xprt->transport_lock); } static inline void do_xprt_reserve(struct rpc_task *task) diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 5bb6fed3df3..79433ffd1df 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1047,6 +1047,7 @@ static struct rpc_xprt_ops xs_udp_ops = { .set_buffer_size = xs_udp_set_buffer_size, .connect = xs_connect, .send_request = xs_udp_send_request, + .set_retrans_timeout = xprt_set_retrans_timeout_rtt, .close = xs_close, .destroy = xs_destroy, }; @@ -1055,6 +1056,7 @@ static struct rpc_xprt_ops xs_tcp_ops = { .set_buffer_size = xs_tcp_set_buffer_size, .connect = xs_connect, .send_request = xs_tcp_send_request, + .set_retrans_timeout = xprt_set_retrans_timeout_def, .close = xs_close, .destroy = xs_destroy, }; -- cgit v1.2.3 From 12a804698b29d040b7cdd92e8a44b0e75164dae9 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 25 Aug 2005 16:25:51 -0700 Subject: [PATCH] RPC: expose API for serializing access to RPC transports The next several patches introduce an API that allows transports to choose whether the RPC client provides congestion control or whether the transport itself provides it. The first method we abstract is the one that serializes access to the RPC transport to prevent the bytes from different requests from mingling together. This method provides proper request serialization and the opportunity to prevent new requests from being started because the transport is congested. The normal situation is for the transport to handle congestion control itself. Although NFS over UDP was first, it has been recognized after years of experience that having the transport provide congestion control is much better than doing it in the RPC client. Thus TCP, and probably every future transport implementation, will use the default method, xprt_lock_write, provided in xprt.c, which does not provide any kind of congestion control. UDP can continue using the xprt.c-provided Van Jacobson congestion avoidance implementation. Test-plan: Use WAN simulation to cause sporadic bursty packet loss. Look for significant regression in performance or client stability. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/xprt.c | 64 +++++++++++++++++++++++++++++++++++++++++---------- net/sunrpc/xprtsock.c | 2 ++ 2 files changed, 54 insertions(+), 12 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 1ac2fbe0510..2d1e8b83dd6 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -64,14 +64,56 @@ static int __xprt_get_cong(struct rpc_xprt *, struct rpc_task *); static int xprt_clear_backlog(struct rpc_xprt *xprt); +/** + * xprt_reserve_xprt - serialize write access to transports + * @task: task that is requesting access to the transport + * + * This prevents mixing the payload of separate requests, and prevents + * transport connects from colliding with writes. No congestion control + * is provided. + */ +int xprt_reserve_xprt(struct rpc_task *task) +{ + struct rpc_xprt *xprt = task->tk_xprt; + struct rpc_rqst *req = task->tk_rqstp; + + if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) { + if (task == xprt->snd_task) + return 1; + if (task == NULL) + return 0; + goto out_sleep; + } + xprt->snd_task = task; + if (req) { + req->rq_bytes_sent = 0; + req->rq_ntrans++; + } + return 1; + +out_sleep: + dprintk("RPC: %4d failed to lock transport %p\n", + task->tk_pid, xprt); + task->tk_timeout = 0; + task->tk_status = -EAGAIN; + if (req && req->rq_ntrans) + rpc_sleep_on(&xprt->resend, task, NULL, NULL); + else + rpc_sleep_on(&xprt->sending, task, NULL, NULL); + return 0; +} + /* - * Serialize write access to transports, in order to prevent different - * requests from interfering with each other. - * Also prevents transport connects from colliding with writes. + * xprt_reserve_xprt_cong - serialize write access to transports + * @task: task that is requesting access to the transport + * + * Same as xprt_reserve_xprt, but Van Jacobson congestion control is + * integrated into the decision of whether a request is allowed to be + * woken up and given access to the transport. */ -static int -__xprt_lock_write(struct rpc_xprt *xprt, struct rpc_task *task) +int xprt_reserve_xprt_cong(struct rpc_task *task) { + struct rpc_xprt *xprt = task->tk_xprt; struct rpc_rqst *req = task->tk_rqstp; if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) { @@ -79,7 +121,7 @@ __xprt_lock_write(struct rpc_xprt *xprt, struct rpc_task *task) return 1; goto out_sleep; } - if (xprt->nocong || __xprt_get_cong(xprt, task)) { + if (__xprt_get_cong(xprt, task)) { xprt->snd_task = task; if (req) { req->rq_bytes_sent = 0; @@ -101,20 +143,18 @@ out_sleep: return 0; } -static inline int -xprt_lock_write(struct rpc_xprt *xprt, struct rpc_task *task) +static inline int xprt_lock_write(struct rpc_xprt *xprt, struct rpc_task *task) { int retval; spin_lock_bh(&xprt->transport_lock); - retval = __xprt_lock_write(xprt, task); + retval = xprt->ops->reserve_xprt(task); spin_unlock_bh(&xprt->transport_lock); return retval; } -static void -__xprt_lock_write_next(struct rpc_xprt *xprt) +static void __xprt_lock_write_next(struct rpc_xprt *xprt) { struct rpc_task *task; @@ -598,7 +638,7 @@ int xprt_prepare_transmit(struct rpc_task *task) err = req->rq_received; goto out_unlock; } - if (!__xprt_lock_write(xprt, task)) { + if (!xprt->ops->reserve_xprt(task)) { err = -EAGAIN; goto out_unlock; } diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 79433ffd1df..fc4fbe8ea34 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1045,6 +1045,7 @@ static void xs_connect(struct rpc_task *task) static struct rpc_xprt_ops xs_udp_ops = { .set_buffer_size = xs_udp_set_buffer_size, + .reserve_xprt = xprt_reserve_xprt_cong, .connect = xs_connect, .send_request = xs_udp_send_request, .set_retrans_timeout = xprt_set_retrans_timeout_rtt, @@ -1054,6 +1055,7 @@ static struct rpc_xprt_ops xs_udp_ops = { static struct rpc_xprt_ops xs_tcp_ops = { .set_buffer_size = xs_tcp_set_buffer_size, + .reserve_xprt = xprt_reserve_xprt, .connect = xs_connect, .send_request = xs_tcp_send_request, .set_retrans_timeout = xprt_set_retrans_timeout_def, -- cgit v1.2.3 From 49e9a89086b3cae784a4868ca852863e4f4ea3fe Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 25 Aug 2005 16:25:51 -0700 Subject: [PATCH] RPC: expose API for serializing access to RPC transports The next method we abstract is the one that releases a transport, allowing another task to have access to the transport. Again, one generic version of this is provided for transports that don't need the RPC client to perform congestion control, and one version is for transports that can use the original Van Jacobson implementation in xprt.c. Test-plan: Use WAN simulation to cause sporadic bursty packet loss. Look for significant regression in performance or client stability. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/xprt.c | 77 +++++++++++++++++++++++++++++++++++++++++---------- net/sunrpc/xprtsock.c | 2 ++ 2 files changed, 65 insertions(+), 14 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 2d1e8b83dd6..e92ea99dd31 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -153,14 +153,42 @@ static inline int xprt_lock_write(struct rpc_xprt *xprt, struct rpc_task *task) return retval; } - static void __xprt_lock_write_next(struct rpc_xprt *xprt) +{ + struct rpc_task *task; + struct rpc_rqst *req; + + if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) + return; + + task = rpc_wake_up_next(&xprt->resend); + if (!task) { + task = rpc_wake_up_next(&xprt->sending); + if (!task) + goto out_unlock; + } + + req = task->tk_rqstp; + xprt->snd_task = task; + if (req) { + req->rq_bytes_sent = 0; + req->rq_ntrans++; + } + return; + +out_unlock: + smp_mb__before_clear_bit(); + clear_bit(XPRT_LOCKED, &xprt->state); + smp_mb__after_clear_bit(); +} + +static void __xprt_lock_write_next_cong(struct rpc_xprt *xprt) { struct rpc_task *task; if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) return; - if (!xprt->nocong && RPCXPRT_CONGESTED(xprt)) + if (RPCXPRT_CONGESTED(xprt)) goto out_unlock; task = rpc_wake_up_next(&xprt->resend); if (!task) { @@ -168,7 +196,7 @@ static void __xprt_lock_write_next(struct rpc_xprt *xprt) if (!task) goto out_unlock; } - if (xprt->nocong || __xprt_get_cong(xprt, task)) { + if (__xprt_get_cong(xprt, task)) { struct rpc_rqst *req = task->tk_rqstp; xprt->snd_task = task; if (req) { @@ -183,11 +211,14 @@ out_unlock: smp_mb__after_clear_bit(); } -/* - * Releases the transport for use by other requests. +/** + * xprt_release_xprt - allow other requests to use a transport + * @xprt: transport with other tasks potentially waiting + * @task: task that is releasing access to the transport + * + * Note that "task" can be NULL. No congestion control is provided. */ -static void -__xprt_release_write(struct rpc_xprt *xprt, struct rpc_task *task) +void xprt_release_xprt(struct rpc_xprt *xprt, struct rpc_task *task) { if (xprt->snd_task == task) { xprt->snd_task = NULL; @@ -198,11 +229,29 @@ __xprt_release_write(struct rpc_xprt *xprt, struct rpc_task *task) } } -static inline void -xprt_release_write(struct rpc_xprt *xprt, struct rpc_task *task) +/** + * xprt_release_xprt_cong - allow other requests to use a transport + * @xprt: transport with other tasks potentially waiting + * @task: task that is releasing access to the transport + * + * Note that "task" can be NULL. Another task is awoken to use the + * transport if the transport's congestion window allows it. + */ +void xprt_release_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task) +{ + if (xprt->snd_task == task) { + xprt->snd_task = NULL; + smp_mb__before_clear_bit(); + clear_bit(XPRT_LOCKED, &xprt->state); + smp_mb__after_clear_bit(); + __xprt_lock_write_next_cong(xprt); + } +} + +static inline void xprt_release_write(struct rpc_xprt *xprt, struct rpc_task *task) { spin_lock_bh(&xprt->transport_lock); - __xprt_release_write(xprt, task); + xprt->ops->release_xprt(xprt, task); spin_unlock_bh(&xprt->transport_lock); } @@ -237,7 +286,7 @@ __xprt_put_cong(struct rpc_xprt *xprt, struct rpc_rqst *req) return; req->rq_cong = 0; xprt->cong -= RPC_CWNDSCALE; - __xprt_lock_write_next(xprt); + __xprt_lock_write_next_cong(xprt); } /* @@ -256,7 +305,7 @@ xprt_adjust_cwnd(struct rpc_xprt *xprt, int result) cwnd += (RPC_CWNDSCALE * RPC_CWNDSCALE + (cwnd >> 1)) / cwnd; if (cwnd > RPC_MAXCWND(xprt)) cwnd = RPC_MAXCWND(xprt); - __xprt_lock_write_next(xprt); + __xprt_lock_write_next_cong(xprt); } else if (result == -ETIMEDOUT) { cwnd >>= 1; if (cwnd < RPC_CWNDSCALE) @@ -693,7 +742,7 @@ void xprt_transmit(struct rpc_task *task) task->tk_status = -ENOTCONN; else if (!req->rq_received) rpc_sleep_on(&xprt->pending, task, NULL, xprt_timer); - __xprt_release_write(xprt, task); + xprt->ops->release_xprt(xprt, task); spin_unlock_bh(&xprt->transport_lock); return; } @@ -792,7 +841,7 @@ void xprt_release(struct rpc_task *task) if (!(req = task->tk_rqstp)) return; spin_lock_bh(&xprt->transport_lock); - __xprt_release_write(xprt, task); + xprt->ops->release_xprt(xprt, task); __xprt_put_cong(xprt, req); if (!list_empty(&req->rq_list)) list_del(&req->rq_list); diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index fc4fbe8ea34..8589c1ad55e 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1046,6 +1046,7 @@ static void xs_connect(struct rpc_task *task) static struct rpc_xprt_ops xs_udp_ops = { .set_buffer_size = xs_udp_set_buffer_size, .reserve_xprt = xprt_reserve_xprt_cong, + .release_xprt = xprt_release_xprt_cong, .connect = xs_connect, .send_request = xs_udp_send_request, .set_retrans_timeout = xprt_set_retrans_timeout_rtt, @@ -1056,6 +1057,7 @@ static struct rpc_xprt_ops xs_udp_ops = { static struct rpc_xprt_ops xs_tcp_ops = { .set_buffer_size = xs_tcp_set_buffer_size, .reserve_xprt = xprt_reserve_xprt, + .release_xprt = xprt_release_xprt, .connect = xs_connect, .send_request = xs_tcp_send_request, .set_retrans_timeout = xprt_set_retrans_timeout_def, -- cgit v1.2.3 From 46c0ee8bc4ad3743de05e8b8b20201df44dcb6d3 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 25 Aug 2005 16:25:52 -0700 Subject: [PATCH] RPC: separate xprt_timer implementations Allow transports to hook the retransmit timer interrupt. Some transports calculate their congestion window here so that a retransmit timeout has immediate effect on the congestion window. Test-plan: Use WAN simulation to cause sporadic bursty packet loss. Look for significant regression in performance or client stability. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/xprt.c | 45 ++++++++++++++++++++------------------------- net/sunrpc/xprtsock.c | 12 ++++++++++++ 2 files changed, 32 insertions(+), 25 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index e92ea99dd31..ffc595592af 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -289,16 +289,19 @@ __xprt_put_cong(struct rpc_xprt *xprt, struct rpc_rqst *req) __xprt_lock_write_next_cong(xprt); } -/* - * Adjust RPC congestion window +/** + * xprt_adjust_cwnd - adjust transport congestion window + * @task: recently completed RPC request used to adjust window + * @result: result code of completed RPC request + * * We use a time-smoothed congestion estimator to avoid heavy oscillation. */ -static void -xprt_adjust_cwnd(struct rpc_xprt *xprt, int result) +void xprt_adjust_cwnd(struct rpc_task *task, int result) { - unsigned long cwnd; + struct rpc_rqst *req = task->tk_rqstp; + struct rpc_xprt *xprt = task->tk_xprt; + unsigned long cwnd = xprt->cwnd; - cwnd = xprt->cwnd; if (result >= 0 && cwnd <= xprt->cong) { /* The (cwnd >> 1) term makes sure * the result gets rounded properly. */ @@ -314,6 +317,7 @@ xprt_adjust_cwnd(struct rpc_xprt *xprt, int result) dprintk("RPC: cong %ld, cwnd was %ld, now %ld\n", xprt->cong, xprt->cwnd, cwnd); xprt->cwnd = cwnd; + __xprt_put_cong(xprt, req); } /** @@ -602,8 +606,7 @@ void xprt_complete_rqst(struct rpc_xprt *xprt, struct rpc_rqst *req, int copied) /* Adjust congestion window */ if (!xprt->nocong) { unsigned timer = task->tk_msg.rpc_proc->p_timer; - xprt_adjust_cwnd(xprt, copied); - __xprt_put_cong(xprt, req); + xprt_adjust_cwnd(task, copied); if (timer) { if (req->rq_ntrans == 1) rpc_update_rtt(clnt->cl_rtt, timer, @@ -640,27 +643,19 @@ void xprt_complete_rqst(struct rpc_xprt *xprt, struct rpc_rqst *req, int copied) return; } -/* - * RPC receive timeout handler. - */ -static void -xprt_timer(struct rpc_task *task) +static void xprt_timer(struct rpc_task *task) { - struct rpc_rqst *req = task->tk_rqstp; + struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; - spin_lock(&xprt->transport_lock); - if (req->rq_received) - goto out; - - xprt_adjust_cwnd(req->rq_xprt, -ETIMEDOUT); - __xprt_put_cong(xprt, req); + dprintk("RPC: %4d xprt_timer\n", task->tk_pid); - dprintk("RPC: %4d xprt_timer (%s request)\n", - task->tk_pid, req ? "pending" : "backlogged"); - - task->tk_status = -ETIMEDOUT; -out: + spin_lock(&xprt->transport_lock); + if (!req->rq_received) { + if (xprt->ops->timer) + xprt->ops->timer(task); + task->tk_status = -ETIMEDOUT; + } task->tk_timeout = 0; rpc_wake_up_task(task); spin_unlock(&xprt->transport_lock); diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 8589c1ad55e..c3658ff027a 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -860,6 +860,17 @@ static void xs_tcp_set_buffer_size(struct rpc_xprt *xprt) return; } +/** + * xs_udp_timer - called when a retransmit timeout occurs on a UDP transport + * @task: task that timed out + * + * Adjust the congestion window after a retransmit timeout has occurred. + */ +static void xs_udp_timer(struct rpc_task *task) +{ + xprt_adjust_cwnd(task, -ETIMEDOUT); +} + static int xs_bindresvport(struct rpc_xprt *xprt, struct socket *sock) { struct sockaddr_in myaddr = { @@ -1050,6 +1061,7 @@ static struct rpc_xprt_ops xs_udp_ops = { .connect = xs_connect, .send_request = xs_udp_send_request, .set_retrans_timeout = xprt_set_retrans_timeout_rtt, + .timer = xs_udp_timer, .close = xs_close, .destroy = xs_destroy, }; -- cgit v1.2.3 From 1570c1e41eabf6b7031f3e4322a2cf1cbe319fee Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 25 Aug 2005 16:25:52 -0700 Subject: [PATCH] RPC: add generic interface for adjusting the congestion window A new interface that allows transports to adjust their congestion window using the Van Jacobson implementation in xprt.c is provided. Test-plan: Use WAN simulation to cause sporadic bursty packet loss. Look for significant regression in performance or client stability. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/xprt.c | 66 ++++++++++++++++++++------------------------------- net/sunrpc/xprtsock.c | 13 ++++------ 2 files changed, 31 insertions(+), 48 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index ffc595592af..707806fe1a2 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -591,56 +591,42 @@ struct rpc_rqst *xprt_lookup_rqst(struct rpc_xprt *xprt, u32 xid) return req; } +/** + * xprt_update_rtt - update an RPC client's RTT state after receiving a reply + * @task: RPC request that recently completed + * + */ +void xprt_update_rtt(struct rpc_task *task) +{ + struct rpc_rqst *req = task->tk_rqstp; + struct rpc_rtt *rtt = task->tk_client->cl_rtt; + unsigned timer = task->tk_msg.rpc_proc->p_timer; + + if (timer) { + if (req->rq_ntrans == 1) + rpc_update_rtt(rtt, timer, + (long)jiffies - req->rq_xtime); + rpc_set_timeo(rtt, timer, req->rq_ntrans - 1); + } +} + /** * xprt_complete_rqst - called when reply processing is complete - * @xprt: controlling transport - * @req: RPC request that just completed + * @task: RPC request that recently completed * @copied: actual number of bytes received from the transport * + * Caller holds transport lock. */ -void xprt_complete_rqst(struct rpc_xprt *xprt, struct rpc_rqst *req, int copied) -{ - struct rpc_task *task = req->rq_task; - struct rpc_clnt *clnt = task->tk_client; - - /* Adjust congestion window */ - if (!xprt->nocong) { - unsigned timer = task->tk_msg.rpc_proc->p_timer; - xprt_adjust_cwnd(task, copied); - if (timer) { - if (req->rq_ntrans == 1) - rpc_update_rtt(clnt->cl_rtt, timer, - (long)jiffies - req->rq_xtime); - rpc_set_timeo(clnt->cl_rtt, timer, req->rq_ntrans - 1); - } - } +void xprt_complete_rqst(struct rpc_task *task, int copied) +{ + struct rpc_rqst *req = task->tk_rqstp; -#ifdef RPC_PROFILE - /* Profile only reads for now */ - if (copied > 1024) { - static unsigned long nextstat; - static unsigned long pkt_rtt, pkt_len, pkt_cnt; - - pkt_cnt++; - pkt_len += req->rq_slen + copied; - pkt_rtt += jiffies - req->rq_xtime; - if (time_before(nextstat, jiffies)) { - printk("RPC: %lu %ld cwnd\n", jiffies, xprt->cwnd); - printk("RPC: %ld %ld %ld %ld stat\n", - jiffies, pkt_cnt, pkt_len, pkt_rtt); - pkt_rtt = pkt_len = pkt_cnt = 0; - nextstat = jiffies + 5 * HZ; - } - } -#endif + dprintk("RPC: %5u xid %08x complete (%d bytes received)\n", + task->tk_pid, ntohl(req->rq_xid), copied); - dprintk("RPC: %4d has input (%d bytes)\n", task->tk_pid, copied); list_del_init(&req->rq_list); req->rq_received = req->rq_private_buf.len = copied; - - /* ... and wake up the process. */ rpc_wake_up_task(task); - return; } static void xprt_timer(struct rpc_task *task) diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index c3658ff027a..980f26504f4 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -460,8 +460,6 @@ static void xs_udp_data_ready(struct sock *sk, int len) goto out_unlock; task = rovr->rq_task; - dprintk("RPC: %4d received reply\n", task->tk_pid); - if ((copied = rovr->rq_private_buf.buflen) > repsize) copied = repsize; @@ -472,7 +470,9 @@ static void xs_udp_data_ready(struct sock *sk, int len) /* Something worked... */ dst_confirm(skb->dst); - xprt_complete_rqst(xprt, rovr, copied); + xprt_adjust_cwnd(task, copied); + xprt_update_rtt(task); + xprt_complete_rqst(task, copied); out_unlock: spin_unlock(&xprt->transport_lock); @@ -634,11 +634,8 @@ static inline void xs_tcp_read_request(struct rpc_xprt *xprt, skb_reader_t *desc } out: - if (!(xprt->tcp_flags & XPRT_COPY_DATA)) { - dprintk("RPC: %4d received reply complete\n", - req->rq_task->tk_pid); - xprt_complete_rqst(xprt, req, xprt->tcp_copied); - } + if (!(xprt->tcp_flags & XPRT_COPY_DATA)) + xprt_complete_rqst(req->rq_task, xprt->tcp_copied); spin_unlock(&xprt->transport_lock); xs_tcp_check_recm(xprt); } -- cgit v1.2.3 From a58dd398f5db4f73d5c581069fd70a4304cc4f0a Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 25 Aug 2005 16:25:53 -0700 Subject: [PATCH] RPC: add a release_rqst callout to the RPC transport switch The final place where congestion control state is adjusted is in xprt_release, where each request is finally released. Add a callout there to allow transports to perform additional processing when a request is about to be released. Test-plan: Use WAN simulation to cause sporadic bursty packet loss. Look for significant regression in performance or client stability. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/xprt.c | 14 +++++++++++++- net/sunrpc/xprtsock.c | 1 + 2 files changed, 14 insertions(+), 1 deletion(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 707806fe1a2..e8d11bd6158 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -289,6 +289,17 @@ __xprt_put_cong(struct rpc_xprt *xprt, struct rpc_rqst *req) __xprt_lock_write_next_cong(xprt); } +/** + * xprt_release_rqst_cong - housekeeping when request is complete + * @task: RPC request that recently completed + * + * Useful for transports that require congestion control. + */ +void xprt_release_rqst_cong(struct rpc_task *task) +{ + __xprt_put_cong(task->tk_xprt, task->tk_rqstp); +} + /** * xprt_adjust_cwnd - adjust transport congestion window * @task: recently completed RPC request used to adjust window @@ -823,7 +834,8 @@ void xprt_release(struct rpc_task *task) return; spin_lock_bh(&xprt->transport_lock); xprt->ops->release_xprt(xprt, task); - __xprt_put_cong(xprt, req); + if (xprt->ops->release_request) + xprt->ops->release_request(task); if (!list_empty(&req->rq_list)) list_del(&req->rq_list); xprt->last_used = jiffies; diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 980f26504f4..6c2f5dcea41 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1059,6 +1059,7 @@ static struct rpc_xprt_ops xs_udp_ops = { .send_request = xs_udp_send_request, .set_retrans_timeout = xprt_set_retrans_timeout_rtt, .timer = xs_udp_timer, + .release_request = xprt_release_rqst_cong, .close = xs_close, .destroy = xs_destroy, }; -- cgit v1.2.3 From ed63c003701a314c4893c11eceb9d68f8f46c662 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 25 Aug 2005 16:25:53 -0700 Subject: [PATCH] RPC: remove xprt->nocong Get rid of the "xprt->nocong" variable. Test-plan: Use WAN simulation to cause sporadic bursty packet loss with UDP mounts. Look for significant regression in performance or client stability. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/xprtsock.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 6c2f5dcea41..7e5e020fe78 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1100,7 +1100,6 @@ int xs_setup_udp(struct rpc_xprt *xprt, struct rpc_timeout *to) xprt->prot = IPPROTO_UDP; xprt->port = XS_MAX_RESVPORT; xprt->tsh_size = 0; - xprt->nocong = 0; xprt->cwnd = RPC_INITCWND; xprt->resvport = capable(CAP_NET_BIND_SERVICE) ? 1 : 0; /* XXX: header size can vary due to auth type, IPv6, etc. */ @@ -1140,7 +1139,6 @@ int xs_setup_tcp(struct rpc_xprt *xprt, struct rpc_timeout *to) xprt->prot = IPPROTO_TCP; xprt->port = XS_MAX_RESVPORT; xprt->tsh_size = sizeof(rpc_fraghdr) / sizeof(u32); - xprt->nocong = 1; xprt->cwnd = RPC_MAXCWND(xprt); xprt->resvport = capable(CAP_NET_BIND_SERVICE) ? 1 : 0; xprt->max_payload = RPC_MAX_FRAGMENT_SIZE; -- cgit v1.2.3 From 555ee3af161b037865793bd4bebc06b58daafde6 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 25 Aug 2005 16:25:54 -0700 Subject: [PATCH] RPC: clean up after nocong was removed Clean-up: Move some macros that are specific to the Van Jacobson implementation into xprt.c. Get rid of the cong_wait field in rpc_xprt, which is no longer used. Get rid of xprt_clear_backlog. Test-plan: Compile with CONFIG_NFS enabled. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/xprt.c | 29 +++++++++++++++++++---------- net/sunrpc/xprtsock.c | 2 -- 2 files changed, 19 insertions(+), 12 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index e8d11bd6158..0458319a1bd 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -62,7 +62,23 @@ static inline void do_xprt_reserve(struct rpc_task *); static void xprt_connect_status(struct rpc_task *task); static int __xprt_get_cong(struct rpc_xprt *, struct rpc_task *); -static int xprt_clear_backlog(struct rpc_xprt *xprt); +/* + * The transport code maintains an estimate on the maximum number of out- + * standing RPC requests, using a smoothed version of the congestion + * avoidance implemented in 44BSD. This is basically the Van Jacobson + * congestion algorithm: If a retransmit occurs, the congestion window is + * halved; otherwise, it is incremented by 1/cwnd when + * + * - a reply is received and + * - a full number of requests are outstanding and + * - the congestion window hasn't been updated recently. + */ +#define RPC_CWNDSHIFT (8U) +#define RPC_CWNDSCALE (1U << RPC_CWNDSHIFT) +#define RPC_INITCWND RPC_CWNDSCALE +#define RPC_MAXCWND(xprt) ((xprt)->max_reqs << RPC_CWNDSHIFT) + +#define RPCXPRT_CONGESTED(xprt) ((xprt)->cong >= (xprt)->cwnd) /** * xprt_reserve_xprt - serialize write access to transports @@ -850,7 +866,7 @@ void xprt_release(struct rpc_task *task) spin_lock(&xprt->reserve_lock); list_add(&req->rq_list, &xprt->free); - xprt_clear_backlog(xprt); + rpc_wake_up_next(&xprt->backlog); spin_unlock(&xprt->reserve_lock); } @@ -902,7 +918,6 @@ static struct rpc_xprt *xprt_setup(int proto, struct sockaddr_in *ap, struct rpc spin_lock_init(&xprt->transport_lock); spin_lock_init(&xprt->reserve_lock); - init_waitqueue_head(&xprt->cong_wait); INIT_LIST_HEAD(&xprt->free); INIT_LIST_HEAD(&xprt->recv); @@ -911,6 +926,7 @@ static struct rpc_xprt *xprt_setup(int proto, struct sockaddr_in *ap, struct rpc xprt->timer.function = xprt_init_autodisconnect; xprt->timer.data = (unsigned long) xprt; xprt->last_used = jiffies; + xprt->cwnd = RPC_INITCWND; rpc_init_wait_queue(&xprt->pending, "xprt_pending"); rpc_init_wait_queue(&xprt->sending, "xprt_sending"); @@ -955,16 +971,9 @@ static void xprt_shutdown(struct rpc_xprt *xprt) rpc_wake_up(&xprt->resend); xprt_wake_pending_tasks(xprt, -EIO); rpc_wake_up(&xprt->backlog); - wake_up(&xprt->cong_wait); del_timer_sync(&xprt->timer); } -static int xprt_clear_backlog(struct rpc_xprt *xprt) { - rpc_wake_up_next(&xprt->backlog); - wake_up(&xprt->cong_wait); - return 1; -} - /** * xprt_destroy - destroy an RPC transport, killing off all requests. * @xprt: transport to destroy diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 7e5e020fe78..26402c063f0 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1100,7 +1100,6 @@ int xs_setup_udp(struct rpc_xprt *xprt, struct rpc_timeout *to) xprt->prot = IPPROTO_UDP; xprt->port = XS_MAX_RESVPORT; xprt->tsh_size = 0; - xprt->cwnd = RPC_INITCWND; xprt->resvport = capable(CAP_NET_BIND_SERVICE) ? 1 : 0; /* XXX: header size can vary due to auth type, IPv6, etc. */ xprt->max_payload = (1U << 16) - (MAX_HEADER << 3); @@ -1139,7 +1138,6 @@ int xs_setup_tcp(struct rpc_xprt *xprt, struct rpc_timeout *to) xprt->prot = IPPROTO_TCP; xprt->port = XS_MAX_RESVPORT; xprt->tsh_size = sizeof(rpc_fraghdr) / sizeof(u32); - xprt->cwnd = RPC_MAXCWND(xprt); xprt->resvport = capable(CAP_NET_BIND_SERVICE) ? 1 : 0; xprt->max_payload = RPC_MAX_FRAGMENT_SIZE; -- cgit v1.2.3 From 529b33c6db0120126b1381faa51406dc463acdc9 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 25 Aug 2005 16:25:54 -0700 Subject: [PATCH] RPC: allow RPC client's port range to be adjustable Select an RPC client source port between 650 and 1023 instead of between 1 and 800. The old range conflicts with a number of network services. Provide sysctls to allow admins to select a different port range. Note that this doesn't affect user-level RPC library behavior, which still uses 1 to 800. Based on a suggestion by Olaf Kirch . Test-plan: Repeated mount and unmount. Destructive testing. Idle timeouts. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/sysctl.c | 29 +++++++++++++++++++++++++++++ net/sunrpc/xprtsock.c | 23 ++++++++--------------- 2 files changed, 37 insertions(+), 15 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/sysctl.c b/net/sunrpc/sysctl.c index ef483262f17..d0c9f460e41 100644 --- a/net/sunrpc/sysctl.c +++ b/net/sunrpc/sysctl.c @@ -121,9 +121,16 @@ done: unsigned int xprt_udp_slot_table_entries = RPC_DEF_SLOT_TABLE; unsigned int xprt_tcp_slot_table_entries = RPC_DEF_SLOT_TABLE; +unsigned int xprt_min_resvport = RPC_DEF_MIN_RESVPORT; +EXPORT_SYMBOL(xprt_min_resvport); +unsigned int xprt_max_resvport = RPC_DEF_MAX_RESVPORT; +EXPORT_SYMBOL(xprt_max_resvport); + static unsigned int min_slot_table_size = RPC_MIN_SLOT_TABLE; static unsigned int max_slot_table_size = RPC_MAX_SLOT_TABLE; +static unsigned int xprt_min_resvport_limit = RPC_MIN_RESVPORT; +static unsigned int xprt_max_resvport_limit = RPC_MAX_RESVPORT; static ctl_table debug_table[] = { { @@ -180,6 +187,28 @@ static ctl_table debug_table[] = { .extra1 = &min_slot_table_size, .extra2 = &max_slot_table_size }, + { + .ctl_name = CTL_MIN_RESVPORT, + .procname = "min_resvport", + .data = &xprt_min_resvport, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &xprt_min_resvport_limit, + .extra2 = &xprt_max_resvport_limit + }, + { + .ctl_name = CTL_MAX_RESVPORT, + .procname = "max_resvport", + .data = &xprt_max_resvport, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &xprt_min_resvport_limit, + .extra2 = &xprt_max_resvport_limit + }, { .ctl_name = 0 } }; diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 26402c063f0..62c2e7caa34 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -35,11 +35,6 @@ #include #include -/* - * Maximum port number to use when requesting a reserved port. - */ -#define XS_MAX_RESVPORT (800U) - /* * How many times to try sending a request on a socket before waiting * for the socket buffer to clear. @@ -873,10 +868,9 @@ static int xs_bindresvport(struct rpc_xprt *xprt, struct socket *sock) struct sockaddr_in myaddr = { .sin_family = AF_INET, }; - int err, port; + int err; + unsigned short port = xprt->port; - /* Were we already bound to a given port? Try to reuse it */ - port = xprt->port; do { myaddr.sin_port = htons(port); err = sock->ops->bind(sock, (struct sockaddr *) &myaddr, @@ -887,8 +881,10 @@ static int xs_bindresvport(struct rpc_xprt *xprt, struct socket *sock) port); return 0; } - if (--port == 0) - port = XS_MAX_RESVPORT; + if (port <= xprt_min_resvport) + port = xprt_max_resvport; + else + port--; } while (err == -EADDRINUSE && port != xprt->port); dprintk("RPC: can't bind to reserved port (%d).\n", -err); @@ -1075,9 +1071,6 @@ static struct rpc_xprt_ops xs_tcp_ops = { .destroy = xs_destroy, }; -extern unsigned int xprt_udp_slot_table_entries; -extern unsigned int xprt_tcp_slot_table_entries; - /** * xs_setup_udp - Set up transport to use a UDP socket * @xprt: transport to set up @@ -1098,7 +1091,7 @@ int xs_setup_udp(struct rpc_xprt *xprt, struct rpc_timeout *to) memset(xprt->slot, 0, slot_table_size); xprt->prot = IPPROTO_UDP; - xprt->port = XS_MAX_RESVPORT; + xprt->port = xprt_max_resvport; xprt->tsh_size = 0; xprt->resvport = capable(CAP_NET_BIND_SERVICE) ? 1 : 0; /* XXX: header size can vary due to auth type, IPv6, etc. */ @@ -1136,7 +1129,7 @@ int xs_setup_tcp(struct rpc_xprt *xprt, struct rpc_timeout *to) memset(xprt->slot, 0, slot_table_size); xprt->prot = IPPROTO_TCP; - xprt->port = XS_MAX_RESVPORT; + xprt->port = xprt_max_resvport; xprt->tsh_size = sizeof(rpc_fraghdr) / sizeof(u32); xprt->resvport = capable(CAP_NET_BIND_SERVICE) ? 1 : 0; xprt->max_payload = RPC_MAX_FRAGMENT_SIZE; -- cgit v1.2.3 From 3167e12c0c424f3c323944701615343022d86418 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 25 Aug 2005 16:25:55 -0700 Subject: [PATCH] RPC: make sure to get the same local port number when reconnecting Implement a best practice: if the remote end drops our connection, try to reconnect using the same port number. This is important because the NFS server's Duplicate Reply Cache often hashes on the source port number. If the client reuses the port number when it reconnects, the server's DRC will be more effective. Based on suggestions by Mike Eisler, Olaf Kirch, and Alexey Kuznetsky. Test-plan: Destructive testing. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/xprtsock.c | 65 +++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 53 insertions(+), 12 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 62c2e7caa34..88ac71fcd33 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -362,6 +362,8 @@ static int xs_tcp_send_request(struct rpc_task *task) * xs_close - close a socket * @xprt: transport * + * This is used when all requests are complete; ie, no DRC state remains + * on the server we want to save. */ static void xs_close(struct rpc_xprt *xprt) { @@ -949,6 +951,30 @@ out: xprt_clear_connecting(xprt); } +/* + * We need to preserve the port number so the reply cache on the server can + * find our cached RPC replies when we get around to reconnecting. + */ +static void xs_tcp_reuse_connection(struct rpc_xprt *xprt) +{ + int result; + struct socket *sock = xprt->sock; + struct sockaddr any; + + dprintk("RPC: disconnecting xprt %p to reuse port\n", xprt); + + /* + * Disconnect the transport socket by doing a connect operation + * with AF_UNSPEC. This should return immediately... + */ + memset(&any, 0, sizeof(any)); + any.sa_family = AF_UNSPEC; + result = sock->ops->connect(sock, &any, sizeof(any), 0); + if (result) + dprintk("RPC: AF_UNSPEC connect return code %d\n", + result); +} + /** * xs_tcp_connect_worker - connect a TCP socket to a remote endpoint * @args: RPC transport to connect @@ -966,18 +992,20 @@ static void xs_tcp_connect_worker(void *args) dprintk("RPC: xs_tcp_connect_worker for xprt %p\n", xprt); - /* Start by resetting any existing socket state */ - xs_close(xprt); - - if ((err = sock_create_kern(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock)) < 0) { - dprintk("RPC: can't create TCP transport socket (%d).\n", -err); - goto out; - } + if (!xprt->sock) { + /* start from scratch */ + if ((err = sock_create_kern(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock)) < 0) { + dprintk("RPC: can't create TCP transport socket (%d).\n", -err); + goto out; + } - if (xprt->resvport && xs_bindresvport(xprt, sock) < 0) { - sock_release(sock); - goto out; - } + if (xprt->resvport && xs_bindresvport(xprt, sock) < 0) { + sock_release(sock); + goto out; + } + } else + /* "close" the socket, preserving the local port */ + xs_tcp_reuse_connection(xprt); if (!xprt->inet) { struct sock *sk = sock->sk; @@ -991,7 +1019,12 @@ static void xs_tcp_connect_worker(void *args) sk->sk_data_ready = xs_tcp_data_ready; sk->sk_state_change = xs_tcp_state_change; sk->sk_write_space = xs_tcp_write_space; - tcp_sk(sk)->nonagle = 1; + + /* socket options */ + sk->sk_userlocks |= SOCK_BINDPORT_LOCK; + sock_reset_flag(sk, SOCK_LINGER); + tcp_sk(sk)->linger2 = 0; + tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF; xprt_clear_connected(xprt); @@ -1012,6 +1045,14 @@ static void xs_tcp_connect_worker(void *args) case -EINPROGRESS: case -EALREADY: goto out_clear; + case -ECONNREFUSED: + case -ECONNRESET: + /* retry with existing socket, after a delay */ + break; + default: + /* get rid of existing socket, and retry */ + xs_close(xprt); + break; } } out: -- cgit v1.2.3 From 03bf4b707eee06706c9db343dd5c905b7ee47ed2 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 25 Aug 2005 16:25:55 -0700 Subject: [PATCH] RPC: parametrize various transport connect timeouts Each transport implementation can now set unique bind, connect, reestablishment, and idle timeout values. These are variables, allowing the values to be modified dynamically. This permits exponential backoff of any of these values, for instance. As an example, we implement exponential backoff for the connection reestablishment timeout. Test-plan: Destructive testing (unplugging the network temporarily). Connectathon with UDP and TCP. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/clnt.c | 2 +- net/sunrpc/xprt.c | 5 ++-- net/sunrpc/xprtsock.c | 68 +++++++++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 69 insertions(+), 6 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index cc1b773a79d..24b44e73f39 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -740,7 +740,7 @@ call_bind(struct rpc_task *task) task->tk_action = call_connect; if (!clnt->cl_port) { task->tk_action = call_bind_status; - task->tk_timeout = RPC_CONNECT_TIMEOUT; + task->tk_timeout = task->tk_xprt->bind_timeout; rpc_getport(task, clnt); } } diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 0458319a1bd..215be0d0ef6 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -551,7 +551,7 @@ void xprt_connect(struct rpc_task *task) if (task->tk_rqstp) task->tk_rqstp->rq_bytes_sent = 0; - task->tk_timeout = RPC_CONNECT_TIMEOUT; + task->tk_timeout = xprt->connect_timeout; rpc_sleep_on(&xprt->pending, task, xprt_connect_status, NULL); xprt->ops->connect(task); } @@ -763,7 +763,6 @@ void xprt_transmit(struct rpc_task *task) switch (status) { case -ECONNREFUSED: - task->tk_timeout = RPC_REESTABLISH_TIMEOUT; rpc_sleep_on(&xprt->sending, task, NULL, NULL); case -EAGAIN: case -ENOTCONN: @@ -857,7 +856,7 @@ void xprt_release(struct rpc_task *task) xprt->last_used = jiffies; if (list_empty(&xprt->recv) && !xprt->shutdown) mod_timer(&xprt->timer, - xprt->last_used + RPC_IDLE_DISCONNECT_TIMEOUT); + xprt->last_used + xprt->idle_timeout); spin_unlock_bh(&xprt->transport_lock); task->tk_rqstp = NULL; memset(req, 0, sizeof(*req)); /* mark unused */ diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 88ac71fcd33..06c2d95484e 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -41,6 +41,50 @@ */ #define XS_SENDMSG_RETRY (10U) +/* + * Time out for an RPC UDP socket connect. UDP socket connects are + * synchronous, but we set a timeout anyway in case of resource + * exhaustion on the local host. + */ +#define XS_UDP_CONN_TO (5U * HZ) + +/* + * Wait duration for an RPC TCP connection to be established. Solaris + * NFS over TCP uses 60 seconds, for example, which is in line with how + * long a server takes to reboot. + */ +#define XS_TCP_CONN_TO (60U * HZ) + +/* + * Wait duration for a reply from the RPC portmapper. + */ +#define XS_BIND_TO (60U * HZ) + +/* + * Delay if a UDP socket connect error occurs. This is most likely some + * kind of resource problem on the local host. + */ +#define XS_UDP_REEST_TO (2U * HZ) + +/* + * The reestablish timeout allows clients to delay for a bit before attempting + * to reconnect to a server that just dropped our connection. + * + * We implement an exponential backoff when trying to reestablish a TCP + * transport connection with the server. Some servers like to drop a TCP + * connection when they are overworked, so we start with a short timeout and + * increase over time if the server is down or not responding. + */ +#define XS_TCP_INIT_REEST_TO (3U * HZ) +#define XS_TCP_MAX_REEST_TO (5U * 60 * HZ) + +/* + * TCP idle timeout; client drops the transport socket if it is idle + * for this long. Note that we also timeout UDP sockets to prevent + * holding port numbers when there is no RPC traffic. + */ +#define XS_IDLE_DISC_TO (5U * 60 * HZ) + #ifdef RPC_DEBUG # undef RPC_DEBUG_DATA # define RPCDBG_FACILITY RPCDBG_TRANS @@ -739,6 +783,7 @@ static void xs_tcp_state_change(struct sock *sk) xprt->tcp_reclen = 0; xprt->tcp_copied = 0; xprt->tcp_flags = XPRT_COPY_RECM | XPRT_COPY_XID; + xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO; xprt_wake_pending_tasks(xprt, 0); } spin_unlock_bh(&xprt->transport_lock); @@ -1066,6 +1111,13 @@ out_clear: * @task: address of RPC task that manages state of connect request * * TCP: If the remote end dropped the connection, delay reconnecting. + * + * UDP socket connects are synchronous, but we use a work queue anyway + * to guarantee that even unprivileged user processes can set up a + * socket on a privileged port. + * + * If a UDP socket connect fails, the delay behavior here prevents + * retry floods (hard mounts). */ static void xs_connect(struct rpc_task *task) { @@ -1075,9 +1127,13 @@ static void xs_connect(struct rpc_task *task) return; if (xprt->sock != NULL) { - dprintk("RPC: xs_connect delayed xprt %p\n", xprt); + dprintk("RPC: xs_connect delayed xprt %p for %lu seconds\n", + xprt, xprt->reestablish_timeout / HZ); schedule_delayed_work(&xprt->connect_worker, - RPC_REESTABLISH_TIMEOUT); + xprt->reestablish_timeout); + xprt->reestablish_timeout <<= 1; + if (xprt->reestablish_timeout > XS_TCP_MAX_REEST_TO) + xprt->reestablish_timeout = XS_TCP_MAX_REEST_TO; } else { dprintk("RPC: xs_connect scheduled xprt %p\n", xprt); schedule_work(&xprt->connect_worker); @@ -1139,6 +1195,10 @@ int xs_setup_udp(struct rpc_xprt *xprt, struct rpc_timeout *to) xprt->max_payload = (1U << 16) - (MAX_HEADER << 3); INIT_WORK(&xprt->connect_worker, xs_udp_connect_worker, xprt); + xprt->bind_timeout = XS_BIND_TO; + xprt->connect_timeout = XS_UDP_CONN_TO; + xprt->reestablish_timeout = XS_UDP_REEST_TO; + xprt->idle_timeout = XS_IDLE_DISC_TO; xprt->ops = &xs_udp_ops; @@ -1176,6 +1236,10 @@ int xs_setup_tcp(struct rpc_xprt *xprt, struct rpc_timeout *to) xprt->max_payload = RPC_MAX_FRAGMENT_SIZE; INIT_WORK(&xprt->connect_worker, xs_tcp_connect_worker, xprt); + xprt->bind_timeout = XS_BIND_TO; + xprt->connect_timeout = XS_TCP_CONN_TO; + xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO; + xprt->idle_timeout = XS_IDLE_DISC_TO; xprt->ops = &xs_tcp_ops; -- cgit v1.2.3 From 470056c288334eb0b37be26c9ff8aee37ed1cc7a Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 25 Aug 2005 16:25:56 -0700 Subject: [PATCH] RPC: rationalize set_buffer_size In fact, ->set_buffer_size should be completely functionless for non-UDP. Test-plan: Check socket buffer size on UDP sockets over time. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/clnt.c | 10 ++-------- net/sunrpc/xprtsock.c | 30 +++++++++++++++--------------- 2 files changed, 17 insertions(+), 23 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 24b44e73f39..5a8f01d726e 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -517,14 +517,8 @@ void rpc_setbufsize(struct rpc_clnt *clnt, unsigned int sndsize, unsigned int rcvsize) { struct rpc_xprt *xprt = clnt->cl_xprt; - - xprt->sndsize = 0; - if (sndsize) - xprt->sndsize = sndsize + RPC_SLACK_SPACE; - xprt->rcvsize = 0; - if (rcvsize) - xprt->rcvsize = rcvsize + RPC_SLACK_SPACE; - xprt->ops->set_buffer_size(xprt); + if (xprt->ops->set_buffer_size) + xprt->ops->set_buffer_size(xprt, sndsize, rcvsize); } /* diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 06c2d95484e..2e1529217e6 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -865,15 +865,7 @@ static void xs_tcp_write_space(struct sock *sk) read_unlock(&sk->sk_callback_lock); } -/** - * xs_udp_set_buffer_size - set send and receive limits - * @xprt: generic transport - * - * Set socket send and receive limits based on the - * sndsize and rcvsize fields in the generic transport - * structure. - */ -static void xs_udp_set_buffer_size(struct rpc_xprt *xprt) +static void xs_udp_do_set_buffer_size(struct rpc_xprt *xprt) { struct sock *sk = xprt->inet; @@ -889,14 +881,23 @@ static void xs_udp_set_buffer_size(struct rpc_xprt *xprt) } /** - * xs_tcp_set_buffer_size - set send and receive limits + * xs_udp_set_buffer_size - set send and receive limits * @xprt: generic transport + * @sndsize: requested size of send buffer, in bytes + * @rcvsize: requested size of receive buffer, in bytes * - * Nothing to do for TCP. + * Set socket send and receive buffer size limits. */ -static void xs_tcp_set_buffer_size(struct rpc_xprt *xprt) +static void xs_udp_set_buffer_size(struct rpc_xprt *xprt, size_t sndsize, size_t rcvsize) { - return; + xprt->sndsize = 0; + if (sndsize) + xprt->sndsize = sndsize + 1024; + xprt->rcvsize = 0; + if (rcvsize) + xprt->rcvsize = rcvsize + 1024; + + xs_udp_do_set_buffer_size(xprt); } /** @@ -989,7 +990,7 @@ static void xs_udp_connect_worker(void *args) write_unlock_bh(&sk->sk_callback_lock); } - xs_udp_set_buffer_size(xprt); + xs_udp_do_set_buffer_size(xprt); status = 0; out: xprt_wake_pending_tasks(xprt, status); @@ -1158,7 +1159,6 @@ static struct rpc_xprt_ops xs_udp_ops = { }; static struct rpc_xprt_ops xs_tcp_ops = { - .set_buffer_size = xs_tcp_set_buffer_size, .reserve_xprt = xprt_reserve_xprt, .release_xprt = xprt_release_xprt, .connect = xs_connect, -- cgit v1.2.3 From 278c995c8a153bb2a9bc427e931cfb9c8034c9d7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 24 Jul 2005 23:53:01 +0100 Subject: [PATCH] RPC,NFS: new rpc_pipefs patch Currently rpc_mkdir/rpc_rmdir and rpc_mkpipe/mk_unlink have an API that's a little unfortunate. They take a path relative to the rpc_pipefs root and thus need to perform a full lookup. If you look at debugfs or usbfs they always store the dentry for directories they created and thus can pass in a dentry + single pathname component pair into their equivalents of the above functions. And in fact rpc_pipefs actually stores a dentry for all but one component so this change not only simplifies the core rpc_pipe code but also the callers. Unfortuntately this code path is only used by the NFS4 idmapper and AUTH_GSSAPI for which I don't have a test enviroment. Could someone give it a spin? It's the last bit needed before we can rework the lookup_hash API Signed-off-by: Christoph Hellwig Signed-off-by: Trond Myklebust --- net/sunrpc/auth_gss/auth_gss.c | 9 +- net/sunrpc/clnt.c | 53 +++++--- net/sunrpc/rpc_pipe.c | 268 +++++++++++++++-------------------------- 3 files changed, 133 insertions(+), 197 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index d2b08f16c25..bd2555139fa 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -87,7 +87,6 @@ struct gss_auth { struct list_head upcalls; struct rpc_clnt *client; struct dentry *dentry; - char path[48]; spinlock_t lock; }; @@ -690,10 +689,8 @@ gss_create(struct rpc_clnt *clnt, rpc_authflavor_t flavor) if (err) goto err_put_mech; - snprintf(gss_auth->path, sizeof(gss_auth->path), "%s/%s", - clnt->cl_pathname, - gss_auth->mech->gm_name); - gss_auth->dentry = rpc_mkpipe(gss_auth->path, clnt, &gss_upcall_ops, RPC_PIPE_WAIT_FOR_OPEN); + gss_auth->dentry = rpc_mkpipe(clnt->cl_dentry, gss_auth->mech->gm_name, + clnt, &gss_upcall_ops, RPC_PIPE_WAIT_FOR_OPEN); if (IS_ERR(gss_auth->dentry)) { err = PTR_ERR(gss_auth->dentry); goto err_put_mech; @@ -718,7 +715,7 @@ gss_destroy(struct rpc_auth *auth) auth, auth->au_flavor); gss_auth = container_of(auth, struct gss_auth, rpc_auth); - rpc_unlink(gss_auth->path); + rpc_unlink(gss_auth->dentry); gss_mech_put(gss_auth->mech); rpcauth_free_credcache(auth); diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 5a8f01d726e..63bf591310e 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -67,26 +67,42 @@ static u32 * call_verify(struct rpc_task *task); static int rpc_setup_pipedir(struct rpc_clnt *clnt, char *dir_name) { - static uint32_t clntid; + static unsigned int clntid; + char name[128]; int error; if (dir_name == NULL) return 0; - for (;;) { - snprintf(clnt->cl_pathname, sizeof(clnt->cl_pathname), - "%s/clnt%x", dir_name, - (unsigned int)clntid++); - clnt->cl_pathname[sizeof(clnt->cl_pathname) - 1] = '\0'; - clnt->cl_dentry = rpc_mkdir(clnt->cl_pathname, clnt); - if (!IS_ERR(clnt->cl_dentry)) - return 0; + + retry_parent: + clnt->__cl_parent_dentry = rpc_mkdir(NULL, dir_name, NULL); + if (IS_ERR(clnt->__cl_parent_dentry)) { + error = PTR_ERR(clnt->__cl_parent_dentry); + if (error == -EEXIST) + goto retry_parent; /* XXX(hch): WTF? */ + + printk(KERN_INFO "RPC: Couldn't create pipefs entry %s, error %d\n", + dir_name, error); + return error; + } + + + retry_child: + snprintf(name, sizeof(name), "clnt%x", clntid++); + name[sizeof(name) - 1] = '\0'; + + clnt->cl_dentry = rpc_mkdir(clnt->__cl_parent_dentry, name, clnt); + if (IS_ERR(clnt->cl_dentry)) { error = PTR_ERR(clnt->cl_dentry); - if (error != -EEXIST) { - printk(KERN_INFO "RPC: Couldn't create pipefs entry %s, error %d\n", - clnt->cl_pathname, error); - return error; - } + if (error == -EEXIST) + goto retry_child; + printk(KERN_INFO "RPC: Couldn't create pipefs entry %s, error %d\n", + name, error); + rpc_rmdir(clnt->__cl_parent_dentry); + return error; } + + return 0; } /* @@ -174,7 +190,8 @@ rpc_new_client(struct rpc_xprt *xprt, char *servname, return clnt; out_no_auth: - rpc_rmdir(clnt->cl_pathname); + rpc_rmdir(clnt->cl_dentry); + rpc_rmdir(clnt->__cl_parent_dentry); out_no_path: if (clnt->cl_server != clnt->cl_inline_name) kfree(clnt->cl_server); @@ -302,8 +319,10 @@ rpc_destroy_client(struct rpc_clnt *clnt) rpc_destroy_client(clnt->cl_parent); goto out_free; } - if (clnt->cl_pathname[0]) - rpc_rmdir(clnt->cl_pathname); + if (clnt->cl_dentry) + rpc_rmdir(clnt->cl_dentry); + if (clnt->__cl_parent_dentry) + rpc_rmdir(clnt->__cl_parent_dentry); if (clnt->cl_xprt) { xprt_destroy(clnt->cl_xprt); clnt->cl_xprt = NULL; diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index ded6c63f11e..b382809726d 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -414,38 +414,6 @@ rpc_put_mount(void) simple_release_fs(&rpc_mount, &rpc_mount_count); } -static int -rpc_lookup_parent(char *path, struct nameidata *nd) -{ - if (path[0] == '\0') - return -ENOENT; - if (rpc_get_mount()) { - printk(KERN_WARNING "%s: %s failed to mount " - "pseudofilesystem \n", __FILE__, __FUNCTION__); - return -ENODEV; - } - nd->mnt = mntget(rpc_mount); - nd->dentry = dget(rpc_mount->mnt_root); - nd->last_type = LAST_ROOT; - nd->flags = LOOKUP_PARENT; - nd->depth = 0; - - if (path_walk(path, nd)) { - printk(KERN_WARNING "%s: %s failed to find path %s\n", - __FILE__, __FUNCTION__, path); - rpc_put_mount(); - return -ENOENT; - } - return 0; -} - -static void -rpc_release_path(struct nameidata *nd) -{ - path_release(nd); - rpc_put_mount(); -} - static struct inode * rpc_get_inode(struct super_block *sb, int mode) { @@ -550,197 +518,149 @@ out_bad: return -ENOMEM; } -static int -__rpc_mkdir(struct inode *dir, struct dentry *dentry) +struct dentry * +rpc_mkdir(struct dentry *parent, char *name, struct rpc_clnt *rpc_client) { + struct inode *dir; + struct dentry *dentry; struct inode *inode; - - inode = rpc_get_inode(dir->i_sb, S_IFDIR | S_IRUSR | S_IXUSR); - if (!inode) - goto out_err; - inode->i_ino = iunique(dir->i_sb, 100); - d_instantiate(dentry, inode); - dir->i_nlink++; - inode_dir_notify(dir, DN_CREATE); - rpc_get_mount(); - return 0; -out_err: - printk(KERN_WARNING "%s: %s failed to allocate inode for dentry %s\n", - __FILE__, __FUNCTION__, dentry->d_name.name); - return -ENOMEM; -} - -static int -__rpc_rmdir(struct inode *dir, struct dentry *dentry) -{ int error; - shrink_dcache_parent(dentry); - if (dentry->d_inode) { - rpc_close_pipes(dentry->d_inode); - rpc_inode_setowner(dentry->d_inode, NULL); - } - if ((error = simple_rmdir(dir, dentry)) != 0) - return error; - if (!error) { - inode_dir_notify(dir, DN_DELETE); - d_drop(dentry); - rpc_put_mount(); - } - return 0; -} + if (!parent) + parent = rpc_mount->mnt_root; -static struct dentry * -rpc_lookup_negative(char *path, struct nameidata *nd) -{ - struct dentry *dentry; - struct inode *dir; - int error; - - if ((error = rpc_lookup_parent(path, nd)) != 0) + dir = parent->d_inode; + + error = rpc_get_mount(); + if (error) return ERR_PTR(error); - dir = nd->dentry->d_inode; + down(&dir->i_sem); - dentry = lookup_hash(&nd->last, nd->dentry); + dentry = lookup_one_len(name, parent, strlen(name)); if (IS_ERR(dentry)) - goto out_err; + goto out_unlock; if (dentry->d_inode) { - dput(dentry); dentry = ERR_PTR(-EEXIST); - goto out_err; + goto out_dput; } - return dentry; -out_err: - up(&dir->i_sem); - rpc_release_path(nd); - return dentry; -} + inode = rpc_get_inode(dir->i_sb, S_IFDIR | S_IRUSR | S_IXUSR); + if (!inode) + goto out_dput; + inode->i_ino = iunique(dir->i_sb, 100); + dir->i_nlink++; + RPC_I(dentry->d_inode)->private = rpc_client; -struct dentry * -rpc_mkdir(char *path, struct rpc_clnt *rpc_client) -{ - struct nameidata nd; - struct dentry *dentry; - struct inode *dir; - int error; + d_instantiate(dentry, inode); + dget(dentry); + up(&dir->i_sem); + + inode_dir_notify(dir, DN_CREATE); - dentry = rpc_lookup_negative(path, &nd); - if (IS_ERR(dentry)) - return dentry; - dir = nd.dentry->d_inode; - if ((error = __rpc_mkdir(dir, dentry)) != 0) - goto err_dput; - RPC_I(dentry->d_inode)->private = rpc_client; error = rpc_populate(dentry, authfiles, RPCAUTH_info, RPCAUTH_EOF); if (error) - goto err_depopulate; -out: - up(&dir->i_sem); - rpc_release_path(&nd); + goto out_depopulate; + return dentry; -err_depopulate: - rpc_depopulate(dentry); - __rpc_rmdir(dir, dentry); -err_dput: + + out_depopulate: + rpc_rmdir(dentry); + out_dput: dput(dentry); - printk(KERN_WARNING "%s: %s() failed to create directory %s (errno = %d)\n", - __FILE__, __FUNCTION__, path, error); - dentry = ERR_PTR(error); - goto out; + out_unlock: + up(&dir->i_sem); + rpc_put_mount(); + return dentry; } -int -rpc_rmdir(char *path) +void +rpc_rmdir(struct dentry *dentry) { - struct nameidata nd; - struct dentry *dentry; - struct inode *dir; - int error; + struct dentry *parent = dentry->d_parent; - if ((error = rpc_lookup_parent(path, &nd)) != 0) - return error; - dir = nd.dentry->d_inode; - down(&dir->i_sem); - dentry = lookup_hash(&nd.last, nd.dentry); - if (IS_ERR(dentry)) { - error = PTR_ERR(dentry); - goto out_release; - } rpc_depopulate(dentry); - error = __rpc_rmdir(dir, dentry); - dput(dentry); -out_release: - up(&dir->i_sem); - rpc_release_path(&nd); - return error; + + down(&parent->d_inode->i_sem); + if (dentry->d_inode) { + rpc_close_pipes(dentry->d_inode); + rpc_inode_setowner(dentry->d_inode, NULL); + simple_rmdir(parent->d_inode, dentry); + } + up(&parent->d_inode->i_sem); + + inode_dir_notify(parent->d_inode, DN_DELETE); + rpc_put_mount(); } struct dentry * -rpc_mkpipe(char *path, void *private, struct rpc_pipe_ops *ops, int flags) +rpc_mkpipe(struct dentry *parent, char *name, void *private, + struct rpc_pipe_ops *ops, int flags) { - struct nameidata nd; + struct inode *dir = parent->d_inode; struct dentry *dentry; - struct inode *dir, *inode; + struct inode *inode; struct rpc_inode *rpci; + int error; + + error = rpc_get_mount(); + if (error) + return ERR_PTR(error); - dentry = rpc_lookup_negative(path, &nd); + down(&parent->d_inode->i_sem); + dentry = lookup_one_len(name, parent, strlen(name)); if (IS_ERR(dentry)) - return dentry; - dir = nd.dentry->d_inode; - inode = rpc_get_inode(dir->i_sb, S_IFSOCK | S_IRUSR | S_IWUSR); - if (!inode) - goto err_dput; + goto out_unlock; + if (dentry->d_inode) { + dentry = ERR_PTR(-EEXIST); + goto out_dput; + } + + inode = rpc_get_inode(parent->d_inode->i_sb, + S_IFSOCK | S_IRUSR | S_IWUSR); + if (!inode) { + dentry = ERR_PTR(-ENOMEM); + goto out_dput; + } + inode->i_ino = iunique(dir->i_sb, 100); inode->i_fop = &rpc_pipe_fops; - d_instantiate(dentry, inode); + rpci = RPC_I(inode); rpci->private = private; rpci->flags = flags; rpci->ops = ops; + + d_instantiate(dentry, inode); + dget(dentry); + up(&parent->d_inode->i_sem); + inode_dir_notify(dir, DN_CREATE); -out: - up(&dir->i_sem); - rpc_release_path(&nd); return dentry; -err_dput: + + out_dput: dput(dentry); - dentry = ERR_PTR(-ENOMEM); - printk(KERN_WARNING "%s: %s() failed to create pipe %s (errno = %d)\n", - __FILE__, __FUNCTION__, path, -ENOMEM); - goto out; + out_unlock: + up(&parent->d_inode->i_sem); + rpc_put_mount(); + return dentry; } -int -rpc_unlink(char *path) +void +rpc_unlink(struct dentry *dentry) { - struct nameidata nd; - struct dentry *dentry; - struct inode *dir; - int error; + struct dentry *parent = dentry->d_parent; - if ((error = rpc_lookup_parent(path, &nd)) != 0) - return error; - dir = nd.dentry->d_inode; - down(&dir->i_sem); - dentry = lookup_hash(&nd.last, nd.dentry); - if (IS_ERR(dentry)) { - error = PTR_ERR(dentry); - goto out_release; - } - d_drop(dentry); + down(&parent->d_inode->i_sem); if (dentry->d_inode) { rpc_close_pipes(dentry->d_inode); rpc_inode_setowner(dentry->d_inode, NULL); - error = simple_unlink(dir, dentry); + simple_unlink(parent->d_inode, dentry); } - dput(dentry); - inode_dir_notify(dir, DN_DELETE); -out_release: - up(&dir->i_sem); - rpc_release_path(&nd); - return error; + up(&parent->d_inode->i_sem); + + inode_dir_notify(parent->d_inode, DN_DELETE); + rpc_put_mount(); } /* -- cgit v1.2.3 From f134585a7343d71f9be7f0cf97e2145f21dd10c6 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 23 Sep 2005 11:08:25 -0400 Subject: Revert "[PATCH] RPC,NFS: new rpc_pipefs patch" This reverts 17f4e6febca160a9f9dd4bdece9784577a2f4524 commit. --- net/sunrpc/auth_gss/auth_gss.c | 9 +- net/sunrpc/clnt.c | 53 +++----- net/sunrpc/rpc_pipe.c | 268 ++++++++++++++++++++++++++--------------- 3 files changed, 197 insertions(+), 133 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index bd2555139fa..d2b08f16c25 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -87,6 +87,7 @@ struct gss_auth { struct list_head upcalls; struct rpc_clnt *client; struct dentry *dentry; + char path[48]; spinlock_t lock; }; @@ -689,8 +690,10 @@ gss_create(struct rpc_clnt *clnt, rpc_authflavor_t flavor) if (err) goto err_put_mech; - gss_auth->dentry = rpc_mkpipe(clnt->cl_dentry, gss_auth->mech->gm_name, - clnt, &gss_upcall_ops, RPC_PIPE_WAIT_FOR_OPEN); + snprintf(gss_auth->path, sizeof(gss_auth->path), "%s/%s", + clnt->cl_pathname, + gss_auth->mech->gm_name); + gss_auth->dentry = rpc_mkpipe(gss_auth->path, clnt, &gss_upcall_ops, RPC_PIPE_WAIT_FOR_OPEN); if (IS_ERR(gss_auth->dentry)) { err = PTR_ERR(gss_auth->dentry); goto err_put_mech; @@ -715,7 +718,7 @@ gss_destroy(struct rpc_auth *auth) auth, auth->au_flavor); gss_auth = container_of(auth, struct gss_auth, rpc_auth); - rpc_unlink(gss_auth->dentry); + rpc_unlink(gss_auth->path); gss_mech_put(gss_auth->mech); rpcauth_free_credcache(auth); diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 63bf591310e..5a8f01d726e 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -67,42 +67,26 @@ static u32 * call_verify(struct rpc_task *task); static int rpc_setup_pipedir(struct rpc_clnt *clnt, char *dir_name) { - static unsigned int clntid; - char name[128]; + static uint32_t clntid; int error; if (dir_name == NULL) return 0; - - retry_parent: - clnt->__cl_parent_dentry = rpc_mkdir(NULL, dir_name, NULL); - if (IS_ERR(clnt->__cl_parent_dentry)) { - error = PTR_ERR(clnt->__cl_parent_dentry); - if (error == -EEXIST) - goto retry_parent; /* XXX(hch): WTF? */ - - printk(KERN_INFO "RPC: Couldn't create pipefs entry %s, error %d\n", - dir_name, error); - return error; - } - - - retry_child: - snprintf(name, sizeof(name), "clnt%x", clntid++); - name[sizeof(name) - 1] = '\0'; - - clnt->cl_dentry = rpc_mkdir(clnt->__cl_parent_dentry, name, clnt); - if (IS_ERR(clnt->cl_dentry)) { + for (;;) { + snprintf(clnt->cl_pathname, sizeof(clnt->cl_pathname), + "%s/clnt%x", dir_name, + (unsigned int)clntid++); + clnt->cl_pathname[sizeof(clnt->cl_pathname) - 1] = '\0'; + clnt->cl_dentry = rpc_mkdir(clnt->cl_pathname, clnt); + if (!IS_ERR(clnt->cl_dentry)) + return 0; error = PTR_ERR(clnt->cl_dentry); - if (error == -EEXIST) - goto retry_child; - printk(KERN_INFO "RPC: Couldn't create pipefs entry %s, error %d\n", - name, error); - rpc_rmdir(clnt->__cl_parent_dentry); - return error; + if (error != -EEXIST) { + printk(KERN_INFO "RPC: Couldn't create pipefs entry %s, error %d\n", + clnt->cl_pathname, error); + return error; + } } - - return 0; } /* @@ -190,8 +174,7 @@ rpc_new_client(struct rpc_xprt *xprt, char *servname, return clnt; out_no_auth: - rpc_rmdir(clnt->cl_dentry); - rpc_rmdir(clnt->__cl_parent_dentry); + rpc_rmdir(clnt->cl_pathname); out_no_path: if (clnt->cl_server != clnt->cl_inline_name) kfree(clnt->cl_server); @@ -319,10 +302,8 @@ rpc_destroy_client(struct rpc_clnt *clnt) rpc_destroy_client(clnt->cl_parent); goto out_free; } - if (clnt->cl_dentry) - rpc_rmdir(clnt->cl_dentry); - if (clnt->__cl_parent_dentry) - rpc_rmdir(clnt->__cl_parent_dentry); + if (clnt->cl_pathname[0]) + rpc_rmdir(clnt->cl_pathname); if (clnt->cl_xprt) { xprt_destroy(clnt->cl_xprt); clnt->cl_xprt = NULL; diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index b382809726d..ded6c63f11e 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -414,6 +414,38 @@ rpc_put_mount(void) simple_release_fs(&rpc_mount, &rpc_mount_count); } +static int +rpc_lookup_parent(char *path, struct nameidata *nd) +{ + if (path[0] == '\0') + return -ENOENT; + if (rpc_get_mount()) { + printk(KERN_WARNING "%s: %s failed to mount " + "pseudofilesystem \n", __FILE__, __FUNCTION__); + return -ENODEV; + } + nd->mnt = mntget(rpc_mount); + nd->dentry = dget(rpc_mount->mnt_root); + nd->last_type = LAST_ROOT; + nd->flags = LOOKUP_PARENT; + nd->depth = 0; + + if (path_walk(path, nd)) { + printk(KERN_WARNING "%s: %s failed to find path %s\n", + __FILE__, __FUNCTION__, path); + rpc_put_mount(); + return -ENOENT; + } + return 0; +} + +static void +rpc_release_path(struct nameidata *nd) +{ + path_release(nd); + rpc_put_mount(); +} + static struct inode * rpc_get_inode(struct super_block *sb, int mode) { @@ -518,149 +550,197 @@ out_bad: return -ENOMEM; } -struct dentry * -rpc_mkdir(struct dentry *parent, char *name, struct rpc_clnt *rpc_client) +static int +__rpc_mkdir(struct inode *dir, struct dentry *dentry) { - struct inode *dir; - struct dentry *dentry; struct inode *inode; + + inode = rpc_get_inode(dir->i_sb, S_IFDIR | S_IRUSR | S_IXUSR); + if (!inode) + goto out_err; + inode->i_ino = iunique(dir->i_sb, 100); + d_instantiate(dentry, inode); + dir->i_nlink++; + inode_dir_notify(dir, DN_CREATE); + rpc_get_mount(); + return 0; +out_err: + printk(KERN_WARNING "%s: %s failed to allocate inode for dentry %s\n", + __FILE__, __FUNCTION__, dentry->d_name.name); + return -ENOMEM; +} + +static int +__rpc_rmdir(struct inode *dir, struct dentry *dentry) +{ int error; - if (!parent) - parent = rpc_mount->mnt_root; + shrink_dcache_parent(dentry); + if (dentry->d_inode) { + rpc_close_pipes(dentry->d_inode); + rpc_inode_setowner(dentry->d_inode, NULL); + } + if ((error = simple_rmdir(dir, dentry)) != 0) + return error; + if (!error) { + inode_dir_notify(dir, DN_DELETE); + d_drop(dentry); + rpc_put_mount(); + } + return 0; +} - dir = parent->d_inode; - - error = rpc_get_mount(); - if (error) - return ERR_PTR(error); +static struct dentry * +rpc_lookup_negative(char *path, struct nameidata *nd) +{ + struct dentry *dentry; + struct inode *dir; + int error; + if ((error = rpc_lookup_parent(path, nd)) != 0) + return ERR_PTR(error); + dir = nd->dentry->d_inode; down(&dir->i_sem); - dentry = lookup_one_len(name, parent, strlen(name)); + dentry = lookup_hash(&nd->last, nd->dentry); if (IS_ERR(dentry)) - goto out_unlock; + goto out_err; if (dentry->d_inode) { + dput(dentry); dentry = ERR_PTR(-EEXIST); - goto out_dput; + goto out_err; } - - inode = rpc_get_inode(dir->i_sb, S_IFDIR | S_IRUSR | S_IXUSR); - if (!inode) - goto out_dput; - inode->i_ino = iunique(dir->i_sb, 100); - dir->i_nlink++; - RPC_I(dentry->d_inode)->private = rpc_client; - - d_instantiate(dentry, inode); - dget(dentry); + return dentry; +out_err: up(&dir->i_sem); + rpc_release_path(nd); + return dentry; +} - inode_dir_notify(dir, DN_CREATE); +struct dentry * +rpc_mkdir(char *path, struct rpc_clnt *rpc_client) +{ + struct nameidata nd; + struct dentry *dentry; + struct inode *dir; + int error; + + dentry = rpc_lookup_negative(path, &nd); + if (IS_ERR(dentry)) + return dentry; + dir = nd.dentry->d_inode; + if ((error = __rpc_mkdir(dir, dentry)) != 0) + goto err_dput; + RPC_I(dentry->d_inode)->private = rpc_client; error = rpc_populate(dentry, authfiles, RPCAUTH_info, RPCAUTH_EOF); if (error) - goto out_depopulate; - - return dentry; - - out_depopulate: - rpc_rmdir(dentry); - out_dput: - dput(dentry); - out_unlock: + goto err_depopulate; +out: up(&dir->i_sem); - rpc_put_mount(); + rpc_release_path(&nd); return dentry; +err_depopulate: + rpc_depopulate(dentry); + __rpc_rmdir(dir, dentry); +err_dput: + dput(dentry); + printk(KERN_WARNING "%s: %s() failed to create directory %s (errno = %d)\n", + __FILE__, __FUNCTION__, path, error); + dentry = ERR_PTR(error); + goto out; } -void -rpc_rmdir(struct dentry *dentry) +int +rpc_rmdir(char *path) { - struct dentry *parent = dentry->d_parent; - - rpc_depopulate(dentry); + struct nameidata nd; + struct dentry *dentry; + struct inode *dir; + int error; - down(&parent->d_inode->i_sem); - if (dentry->d_inode) { - rpc_close_pipes(dentry->d_inode); - rpc_inode_setowner(dentry->d_inode, NULL); - simple_rmdir(parent->d_inode, dentry); + if ((error = rpc_lookup_parent(path, &nd)) != 0) + return error; + dir = nd.dentry->d_inode; + down(&dir->i_sem); + dentry = lookup_hash(&nd.last, nd.dentry); + if (IS_ERR(dentry)) { + error = PTR_ERR(dentry); + goto out_release; } - up(&parent->d_inode->i_sem); - - inode_dir_notify(parent->d_inode, DN_DELETE); - rpc_put_mount(); + rpc_depopulate(dentry); + error = __rpc_rmdir(dir, dentry); + dput(dentry); +out_release: + up(&dir->i_sem); + rpc_release_path(&nd); + return error; } struct dentry * -rpc_mkpipe(struct dentry *parent, char *name, void *private, - struct rpc_pipe_ops *ops, int flags) +rpc_mkpipe(char *path, void *private, struct rpc_pipe_ops *ops, int flags) { - struct inode *dir = parent->d_inode; + struct nameidata nd; struct dentry *dentry; - struct inode *inode; + struct inode *dir, *inode; struct rpc_inode *rpci; - int error; - - error = rpc_get_mount(); - if (error) - return ERR_PTR(error); - down(&parent->d_inode->i_sem); - dentry = lookup_one_len(name, parent, strlen(name)); + dentry = rpc_lookup_negative(path, &nd); if (IS_ERR(dentry)) - goto out_unlock; - if (dentry->d_inode) { - dentry = ERR_PTR(-EEXIST); - goto out_dput; - } - - inode = rpc_get_inode(parent->d_inode->i_sb, - S_IFSOCK | S_IRUSR | S_IWUSR); - if (!inode) { - dentry = ERR_PTR(-ENOMEM); - goto out_dput; - } - + return dentry; + dir = nd.dentry->d_inode; + inode = rpc_get_inode(dir->i_sb, S_IFSOCK | S_IRUSR | S_IWUSR); + if (!inode) + goto err_dput; inode->i_ino = iunique(dir->i_sb, 100); inode->i_fop = &rpc_pipe_fops; - + d_instantiate(dentry, inode); rpci = RPC_I(inode); rpci->private = private; rpci->flags = flags; rpci->ops = ops; - - d_instantiate(dentry, inode); - dget(dentry); - up(&parent->d_inode->i_sem); - inode_dir_notify(dir, DN_CREATE); +out: + up(&dir->i_sem); + rpc_release_path(&nd); return dentry; - - out_dput: +err_dput: dput(dentry); - out_unlock: - up(&parent->d_inode->i_sem); - rpc_put_mount(); - return dentry; + dentry = ERR_PTR(-ENOMEM); + printk(KERN_WARNING "%s: %s() failed to create pipe %s (errno = %d)\n", + __FILE__, __FUNCTION__, path, -ENOMEM); + goto out; } -void -rpc_unlink(struct dentry *dentry) +int +rpc_unlink(char *path) { - struct dentry *parent = dentry->d_parent; + struct nameidata nd; + struct dentry *dentry; + struct inode *dir; + int error; - down(&parent->d_inode->i_sem); + if ((error = rpc_lookup_parent(path, &nd)) != 0) + return error; + dir = nd.dentry->d_inode; + down(&dir->i_sem); + dentry = lookup_hash(&nd.last, nd.dentry); + if (IS_ERR(dentry)) { + error = PTR_ERR(dentry); + goto out_release; + } + d_drop(dentry); if (dentry->d_inode) { rpc_close_pipes(dentry->d_inode); rpc_inode_setowner(dentry->d_inode, NULL); - simple_unlink(parent->d_inode, dentry); + error = simple_unlink(dir, dentry); } - up(&parent->d_inode->i_sem); - - inode_dir_notify(parent->d_inode, DN_DELETE); - rpc_put_mount(); + dput(dentry); + inode_dir_notify(dir, DN_DELETE); +out_release: + up(&dir->i_sem); + rpc_release_path(&nd); + return error; } /* -- cgit v1.2.3 From 6cd7525a00f3b926e8bd2e402954ed3e09a8e924 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 22 Sep 2005 21:24:59 -0400 Subject: SUNRPC: fix bug in patch "portmapper doesn't need a reserved port" The in-kernel portmapper does in fact need a reserved port when registering new services, but not when performing bind queries. Ensure that we distinguish between the two cases. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/pmap_clnt.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/pmap_clnt.c b/net/sunrpc/pmap_clnt.c index d8e3f220002..a398575f94b 100644 --- a/net/sunrpc/pmap_clnt.c +++ b/net/sunrpc/pmap_clnt.c @@ -26,7 +26,7 @@ #define PMAP_GETPORT 3 static struct rpc_procinfo pmap_procedures[]; -static struct rpc_clnt * pmap_create(char *, struct sockaddr_in *, int); +static struct rpc_clnt * pmap_create(char *, struct sockaddr_in *, int, int); static void pmap_getport_done(struct rpc_task *); static struct rpc_program pmap_program; static DEFINE_SPINLOCK(pmap_lock); @@ -65,7 +65,7 @@ rpc_getport(struct rpc_task *task, struct rpc_clnt *clnt) map->pm_binding = 1; spin_unlock(&pmap_lock); - pmap_clnt = pmap_create(clnt->cl_server, sap, map->pm_prot); + pmap_clnt = pmap_create(clnt->cl_server, sap, map->pm_prot, 0); if (IS_ERR(pmap_clnt)) { task->tk_status = PTR_ERR(pmap_clnt); goto bailout; @@ -112,7 +112,7 @@ rpc_getport_external(struct sockaddr_in *sin, __u32 prog, __u32 vers, int prot) NIPQUAD(sin->sin_addr.s_addr), prog, vers, prot); sprintf(hostname, "%u.%u.%u.%u", NIPQUAD(sin->sin_addr.s_addr)); - pmap_clnt = pmap_create(hostname, sin, prot); + pmap_clnt = pmap_create(hostname, sin, prot, 0); if (IS_ERR(pmap_clnt)) return PTR_ERR(pmap_clnt); @@ -171,7 +171,7 @@ rpc_register(u32 prog, u32 vers, int prot, unsigned short port, int *okay) sin.sin_family = AF_INET; sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK); - pmap_clnt = pmap_create("localhost", &sin, IPPROTO_UDP); + pmap_clnt = pmap_create("localhost", &sin, IPPROTO_UDP, 1); if (IS_ERR(pmap_clnt)) { error = PTR_ERR(pmap_clnt); dprintk("RPC: couldn't create pmap client. Error = %d\n", error); @@ -198,7 +198,7 @@ rpc_register(u32 prog, u32 vers, int prot, unsigned short port, int *okay) } static struct rpc_clnt * -pmap_create(char *hostname, struct sockaddr_in *srvaddr, int proto) +pmap_create(char *hostname, struct sockaddr_in *srvaddr, int proto, int privileged) { struct rpc_xprt *xprt; struct rpc_clnt *clnt; @@ -208,7 +208,8 @@ pmap_create(char *hostname, struct sockaddr_in *srvaddr, int proto) if (IS_ERR(xprt)) return (struct rpc_clnt *)xprt; xprt->addr.sin_port = htons(RPC_PMAP_PORT); - xprt->resvport = 0; + if (!privileged) + xprt->resvport = 0; /* printk("pmap: create clnt\n"); */ clnt = rpc_new_client(xprt, hostname, -- cgit v1.2.3 From dd13a285b79ba77416b96ee10f49097f4aaf48c5 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 4 Oct 2005 22:44:45 -0700 Subject: [RPC]: fix sparse gfp nocast warnings Fix nocast sparse warnings: net/rxrpc/call.c:2013:25: warning: implicit cast to nocast type net/rxrpc/connection.c:538:46: warning: implicit cast to nocast type net/sunrpc/sched.c:730:36: warning: implicit cast to nocast type net/sunrpc/sched.c:734:56: warning: implicit cast to nocast type Signed-off-by: Randy Dunlap Signed-off-by: David S. Miller --- net/sunrpc/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index f3104035e35..ade730eaf40 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -719,7 +719,7 @@ static void rpc_async_schedule(void *arg) void * rpc_malloc(struct rpc_task *task, size_t size) { - int gfp; + unsigned int __nocast gfp; if (task->tk_flags & RPC_TASK_SWAPPER) gfp = GFP_ATOMIC; -- cgit v1.2.3 From dd0fc66fb33cd610bc1a5db8a5e232d34879b4d7 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 7 Oct 2005 07:46:04 +0100 Subject: [PATCH] gfp flags annotations - part 1 - added typedef unsigned int __nocast gfp_t; - replaced __nocast uses for gfp flags with gfp_t - it gives exactly the same warnings as far as sparse is concerned, doesn't change generated code (from gcc point of view we replaced unsigned int with typedef) and documents what's going on far better. Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- net/sunrpc/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index ade730eaf40..54e60a65750 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -719,7 +719,7 @@ static void rpc_async_schedule(void *arg) void * rpc_malloc(struct rpc_task *task, size_t size) { - unsigned int __nocast gfp; + gfp_t gfp; if (task->tk_flags & RPC_TASK_SWAPPER) gfp = GFP_ATOMIC; -- cgit v1.2.3 From ea635a517e350eb03ab5f01618417f31b82a9a4d Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 6 Oct 2005 23:12:58 -0400 Subject: SUNRPC: Retry rpcbind requests if the server's portmapper isn't up After a server crash/reboot, rebinding should always retry, otherwise requests on "hard" mounts will fail when they shouldn't. Test plan: Run a lock-intensive workload against a server while rebooting the server repeatedly. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/clnt.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 5a8f01d726e..a5f7029b1da 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -759,7 +759,8 @@ call_bind_status(struct rpc_task *task) case -EACCES: dprintk("RPC: %4d remote rpcbind: RPC program/version unavailable\n", task->tk_pid); - break; + rpc_delay(task, 3*HZ); + goto retry_bind; case -ETIMEDOUT: dprintk("RPC: %4d rpcbind request timed out\n", task->tk_pid); -- cgit v1.2.3 From 5e5ce5be6f0161d2a069a4f8a1154fe639c5c02f Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 18 Oct 2005 14:20:11 -0700 Subject: RPC: allow call_encode() to delay transmission of an RPC call. Currently, call_encode will cause the entire RPC call to abort if it returns an error. This is unnecessarily rigid, and gets in the way of attempts to allow the NFSv4 layer to order RPC calls that carry sequence ids. Signed-off-by: Trond Myklebust --- net/sunrpc/clnt.c | 23 ++++++++++++----------- net/sunrpc/xprt.c | 8 ++++++++ 2 files changed, 20 insertions(+), 11 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index a5f7029b1da..53427405632 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -678,13 +678,11 @@ call_allocate(struct rpc_task *task) static void call_encode(struct rpc_task *task) { - struct rpc_clnt *clnt = task->tk_client; struct rpc_rqst *req = task->tk_rqstp; struct xdr_buf *sndbuf = &req->rq_snd_buf; struct xdr_buf *rcvbuf = &req->rq_rcv_buf; unsigned int bufsiz; kxdrproc_t encode; - int status; u32 *p; dprintk("RPC: %4d call_encode (status %d)\n", @@ -712,12 +710,9 @@ call_encode(struct rpc_task *task) rpc_exit(task, -EIO); return; } - if (encode && (status = rpcauth_wrap_req(task, encode, req, p, - task->tk_msg.rpc_argp)) < 0) { - printk(KERN_WARNING "%s: can't encode arguments: %d\n", - clnt->cl_protname, -status); - rpc_exit(task, status); - } + if (encode != NULL) + task->tk_status = rpcauth_wrap_req(task, encode, req, p, + task->tk_msg.rpc_argp); } /* @@ -865,10 +860,12 @@ call_transmit(struct rpc_task *task) if (task->tk_status != 0) return; /* Encode here so that rpcsec_gss can use correct sequence number. */ - if (!task->tk_rqstp->rq_bytes_sent) + if (task->tk_rqstp->rq_bytes_sent == 0) { call_encode(task); - if (task->tk_status < 0) - return; + /* Did the encode result in an error condition? */ + if (task->tk_status != 0) + goto out_nosend; + } xprt_transmit(task); if (task->tk_status < 0) return; @@ -876,6 +873,10 @@ call_transmit(struct rpc_task *task) task->tk_action = NULL; rpc_wake_up_task(task); } + return; +out_nosend: + /* release socket write lock before attempting to handle error */ + xprt_abort_transmit(task); } /* diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 215be0d0ef6..1ba55dc38b7 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -709,6 +709,14 @@ out_unlock: return err; } +void +xprt_abort_transmit(struct rpc_task *task) +{ + struct rpc_xprt *xprt = task->tk_xprt; + + xprt_release_write(xprt, task); +} + /** * xprt_transmit - send an RPC request on a transport * @task: controlling RPC task -- cgit v1.2.3 From 747c5534c9a6da4aa87e7cdc2209ea98ea27f381 Mon Sep 17 00:00:00 2001 From: Steve Dickson Date: Tue, 18 Oct 2005 23:19:40 -0700 Subject: RPC: stops the release_pipe() funtion from being called twice This patch stops the release_pipe() funtion from being called twice by invalidating the ops pointer in the rpc_inode when rpc_pipe_release() is called. Signed-off-by: Steve Dickson Signed-off-by: Trond Myklebust --- net/sunrpc/rpc_pipe.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net/sunrpc') diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index ded6c63f11e..649d609e7d9 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -177,6 +177,8 @@ rpc_pipe_release(struct inode *inode, struct file *filp) __rpc_purge_upcall(inode, -EPIPE); if (rpci->ops->release_pipe) rpci->ops->release_pipe(inode); + if (!rpci->nreaders && !rpci->nwriters) + rpci->ops = NULL; out: up(&inode->i_sem); return 0; -- cgit v1.2.3 From 293f1eb551a77fe5c8956a559a3c0baea95cd9bc Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 13 Oct 2005 16:54:37 -0400 Subject: SUNRPC: Add support for privacy to generic gss-api code. Add support for privacy to generic gss-api code. This is dead code until we have both a mechanism that supports privacy and code in the client or server that uses it. Signed-off-by: J. Bruce Fields Signed-off-by: Trond Myklebust --- net/sunrpc/auth_gss/gss_mech_switch.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'net/sunrpc') diff --git a/net/sunrpc/auth_gss/gss_mech_switch.c b/net/sunrpc/auth_gss/gss_mech_switch.c index 58aeaddd8c7..06d97cb3481 100644 --- a/net/sunrpc/auth_gss/gss_mech_switch.c +++ b/net/sunrpc/auth_gss/gss_mech_switch.c @@ -276,6 +276,28 @@ gss_verify_mic(struct gss_ctx *context_handle, qstate); } +u32 +gss_wrap(struct gss_ctx *ctx_id, + u32 qop, + int offset, + struct xdr_buf *buf, + struct page **inpages) +{ + return ctx_id->mech_type->gm_ops + ->gss_wrap(ctx_id, qop, offset, buf, inpages); +} + +u32 +gss_unwrap(struct gss_ctx *ctx_id, + u32 *qop, + int offset, + struct xdr_buf *buf) +{ + return ctx_id->mech_type->gm_ops + ->gss_unwrap(ctx_id, qop, offset, buf); +} + + /* gss_delete_sec_context: free all resources associated with context_handle. * Note this differs from the RFC 2744-specified prototype in that we don't * bother returning an output token, since it would never be used anyway. */ -- cgit v1.2.3 From ead5e1c26fdcd969cf40c49cb0589d56879d240d Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 13 Oct 2005 16:54:43 -0400 Subject: SUNRPC: Provide a callback to allow free pages allocated during xdr encoding For privacy, we need to allocate pages to store the encrypted data (passed in pages can't be used without the risk of corrupting data in the page cache). So we need a way to free that memory after the request has been transmitted. Signed-off-by: J. Bruce Fields Signed-off-by: Trond Myklebust --- net/sunrpc/xprt.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net/sunrpc') diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 1ba55dc38b7..6dda3860351 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -839,6 +839,7 @@ static void xprt_request_init(struct rpc_task *task, struct rpc_xprt *xprt) req->rq_task = task; req->rq_xprt = xprt; req->rq_xid = xprt_alloc_xid(xprt); + req->rq_release_snd_buf = NULL; dprintk("RPC: %4d reserved req %p xid %08x\n", task->tk_pid, req, ntohl(req->rq_xid)); } @@ -867,6 +868,8 @@ void xprt_release(struct rpc_task *task) xprt->last_used + xprt->idle_timeout); spin_unlock_bh(&xprt->transport_lock); task->tk_rqstp = NULL; + if (req->rq_release_snd_buf) + req->rq_release_snd_buf(req); memset(req, 0, sizeof(*req)); /* mark unused */ dprintk("RPC: %4d release request %p\n", task->tk_pid, req); -- cgit v1.2.3 From f3680312a737355ddf35c1b68af25e384d7ef0a8 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 13 Oct 2005 16:54:48 -0400 Subject: SUNRPC: Retry wrap in case of memory allocation failure. For privacy we need to allocate extra pages to hold encrypted page data when wrapping requests. This allocation may fail, and we handle that case by waiting and retrying. Signed-off-by: J. Bruce Fields Signed-off-by: Trond Myklebust --- net/sunrpc/clnt.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 53427405632..702ede309b0 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -710,9 +710,16 @@ call_encode(struct rpc_task *task) rpc_exit(task, -EIO); return; } - if (encode != NULL) - task->tk_status = rpcauth_wrap_req(task, encode, req, p, - task->tk_msg.rpc_argp); + if (encode == NULL) + return; + + task->tk_status = rpcauth_wrap_req(task, encode, req, p, + task->tk_msg.rpc_argp); + if (task->tk_status == -ENOMEM) { + /* XXX: Is this sane? */ + rpc_delay(task, 3*HZ); + task->tk_status = -EAGAIN; + } } /* -- cgit v1.2.3 From 24b2605becc10ca63c4c30808fa59a8abbf68727 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 13 Oct 2005 16:54:53 -0400 Subject: RPCSEC_GSS: cleanup au_rslack calculation Various xdr encode routines use au_rslack to guess where the reply argument will end up, so we can set up the xdr_buf to recieve data into the right place for zero copy. Currently we calculate the au_rslack estimate when we check the verifier. Normally this only depends on the verifier size. In the integrity case we add a few bytes to allow for a length and sequence number. It's a bit simpler to calculate only the verifier size when we check the verifier, and delay the full calculation till we unwrap. Signed-off-by: J. Bruce Fields Signed-off-by: Trond Myklebust --- net/sunrpc/auth_gss/auth_gss.c | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index d2b08f16c25..dc95b797ca6 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -886,8 +886,6 @@ static u32 * gss_validate(struct rpc_task *task, u32 *p) { struct rpc_cred *cred = task->tk_msg.rpc_cred; - struct gss_cred *gss_cred = container_of(cred, struct gss_cred, - gc_base); struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred); u32 seq, qop_state; struct kvec iov; @@ -915,18 +913,9 @@ gss_validate(struct rpc_task *task, u32 *p) cred->cr_flags &= ~RPCAUTH_CRED_UPTODATE; if (maj_stat) goto out_bad; - switch (gss_cred->gc_service) { - case RPC_GSS_SVC_NONE: - /* verifier data, flavor, length: */ - task->tk_auth->au_rslack = XDR_QUADLEN(len) + 2; - break; - case RPC_GSS_SVC_INTEGRITY: - /* verifier data, flavor, length, length, sequence number: */ - task->tk_auth->au_rslack = XDR_QUADLEN(len) + 4; - break; - case RPC_GSS_SVC_PRIVACY: - goto out_bad; - } + /* We leave it to unwrap to calculate au_rslack. For now we just + * calculate the length of the verifier: */ + task->tk_auth->au_verfsize = XDR_QUADLEN(len) + 2; gss_put_ctx(ctx); dprintk("RPC: %4u GSS gss_validate: gss_verify_mic succeeded.\n", task->tk_pid); @@ -1067,6 +1056,7 @@ gss_unwrap_resp(struct rpc_task *task, struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base); struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred); + u32 *savedp = p; int status = -EIO; if (ctx->gc_proc != RPC_GSS_PROC_DATA) @@ -1082,6 +1072,8 @@ gss_unwrap_resp(struct rpc_task *task, case RPC_GSS_SVC_PRIVACY: break; } + /* take into account extra slack for integrity and privacy cases: */ + task->tk_auth->au_rslack = task->tk_auth->au_verfsize + (p - savedp); out_decode: status = decode(rqstp, p, obj); out: -- cgit v1.2.3 From 2d2da60c63b67174add32f06e8d54c3a0c5cd9cf Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 13 Oct 2005 16:54:58 -0400 Subject: RPCSEC_GSS: client-side privacy support Add the code to the client side to handle privacy. This is dead code until we actually add privacy support to krb5. Signed-off-by: J. Bruce Fields Signed-off-by: Trond Myklebust --- net/sunrpc/auth_gss/auth_gss.c | 149 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 148 insertions(+), 1 deletion(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index dc95b797ca6..5e4872058ec 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include #include @@ -975,6 +976,114 @@ gss_wrap_req_integ(struct rpc_cred *cred, struct gss_cl_ctx *ctx, return 0; } +static void +priv_release_snd_buf(struct rpc_rqst *rqstp) +{ + int i; + + for (i=0; i < rqstp->rq_enc_pages_num; i++) + __free_page(rqstp->rq_enc_pages[i]); + kfree(rqstp->rq_enc_pages); +} + +static int +alloc_enc_pages(struct rpc_rqst *rqstp) +{ + struct xdr_buf *snd_buf = &rqstp->rq_snd_buf; + int first, last, i; + + if (snd_buf->page_len == 0) { + rqstp->rq_enc_pages_num = 0; + return 0; + } + + first = snd_buf->page_base >> PAGE_CACHE_SHIFT; + last = (snd_buf->page_base + snd_buf->page_len - 1) >> PAGE_CACHE_SHIFT; + rqstp->rq_enc_pages_num = last - first + 1 + 1; + rqstp->rq_enc_pages + = kmalloc(rqstp->rq_enc_pages_num * sizeof(struct page *), + GFP_NOFS); + if (!rqstp->rq_enc_pages) + goto out; + for (i=0; i < rqstp->rq_enc_pages_num; i++) { + rqstp->rq_enc_pages[i] = alloc_page(GFP_NOFS); + if (rqstp->rq_enc_pages[i] == NULL) + goto out_free; + } + rqstp->rq_release_snd_buf = priv_release_snd_buf; + return 0; +out_free: + for (i--; i >= 0; i--) { + __free_page(rqstp->rq_enc_pages[i]); + } +out: + return -EAGAIN; +} + +static inline int +gss_wrap_req_priv(struct rpc_cred *cred, struct gss_cl_ctx *ctx, + kxdrproc_t encode, struct rpc_rqst *rqstp, u32 *p, void *obj) +{ + struct xdr_buf *snd_buf = &rqstp->rq_snd_buf; + u32 offset; + u32 maj_stat; + int status; + u32 *opaque_len; + struct page **inpages; + int first; + int pad; + struct kvec *iov; + char *tmp; + + opaque_len = p++; + offset = (u8 *)p - (u8 *)snd_buf->head[0].iov_base; + *p++ = htonl(rqstp->rq_seqno); + + status = encode(rqstp, p, obj); + if (status) + return status; + + status = alloc_enc_pages(rqstp); + if (status) + return status; + first = snd_buf->page_base >> PAGE_CACHE_SHIFT; + inpages = snd_buf->pages + first; + snd_buf->pages = rqstp->rq_enc_pages; + snd_buf->page_base -= first << PAGE_CACHE_SHIFT; + /* Give the tail its own page, in case we need extra space in the + * head when wrapping: */ + if (snd_buf->page_len || snd_buf->tail[0].iov_len) { + tmp = page_address(rqstp->rq_enc_pages[rqstp->rq_enc_pages_num - 1]); + memcpy(tmp, snd_buf->tail[0].iov_base, snd_buf->tail[0].iov_len); + snd_buf->tail[0].iov_base = tmp; + } + maj_stat = gss_wrap(ctx->gc_gss_ctx, GSS_C_QOP_DEFAULT, offset, + snd_buf, inpages); + /* RPC_SLACK_SPACE should prevent this ever happening: */ + BUG_ON(snd_buf->len > snd_buf->buflen); + status = -EIO; + /* We're assuming that when GSS_S_CONTEXT_EXPIRED, the encryption was + * done anyway, so it's safe to put the request on the wire: */ + if (maj_stat == GSS_S_CONTEXT_EXPIRED) + cred->cr_flags &= ~RPCAUTH_CRED_UPTODATE; + else if (maj_stat) + return status; + + *opaque_len = htonl(snd_buf->len - offset); + /* guess whether we're in the head or the tail: */ + if (snd_buf->page_len || snd_buf->tail[0].iov_len) + iov = snd_buf->tail; + else + iov = snd_buf->head; + p = iov->iov_base + iov->iov_len; + pad = 3 - ((snd_buf->len - offset - 1) & 3); + memset(p, 0, pad); + iov->iov_len += pad; + snd_buf->len += pad; + + return 0; +} + static int gss_wrap_req(struct rpc_task *task, kxdrproc_t encode, void *rqstp, u32 *p, void *obj) @@ -1002,6 +1111,8 @@ gss_wrap_req(struct rpc_task *task, rqstp, p, obj); break; case RPC_GSS_SVC_PRIVACY: + status = gss_wrap_req_priv(cred, ctx, encode, + rqstp, p, obj); break; } out: @@ -1048,6 +1159,36 @@ gss_unwrap_resp_integ(struct rpc_cred *cred, struct gss_cl_ctx *ctx, return 0; } +static inline int +gss_unwrap_resp_priv(struct rpc_cred *cred, struct gss_cl_ctx *ctx, + struct rpc_rqst *rqstp, u32 **p) +{ + struct xdr_buf *rcv_buf = &rqstp->rq_rcv_buf; + u32 offset; + u32 opaque_len; + u32 maj_stat; + int status = -EIO; + + opaque_len = ntohl(*(*p)++); + offset = (u8 *)(*p) - (u8 *)rcv_buf->head[0].iov_base; + if (offset + opaque_len > rcv_buf->len) + return status; + /* remove padding: */ + rcv_buf->len = offset + opaque_len; + + maj_stat = gss_unwrap(ctx->gc_gss_ctx, NULL, + offset, rcv_buf); + if (maj_stat == GSS_S_CONTEXT_EXPIRED) + cred->cr_flags &= ~RPCAUTH_CRED_UPTODATE; + if (maj_stat != GSS_S_COMPLETE) + return status; + if (ntohl(*(*p)++) != rqstp->rq_seqno) + return status; + + return 0; +} + + static int gss_unwrap_resp(struct rpc_task *task, kxdrproc_t decode, void *rqstp, u32 *p, void *obj) @@ -1057,6 +1198,8 @@ gss_unwrap_resp(struct rpc_task *task, gc_base); struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred); u32 *savedp = p; + struct kvec *head = ((struct rpc_rqst *)rqstp)->rq_rcv_buf.head; + int savedlen = head->iov_len; int status = -EIO; if (ctx->gc_proc != RPC_GSS_PROC_DATA) @@ -1070,10 +1213,14 @@ gss_unwrap_resp(struct rpc_task *task, goto out; break; case RPC_GSS_SVC_PRIVACY: + status = gss_unwrap_resp_priv(cred, ctx, rqstp, &p); + if (status) + goto out; break; } /* take into account extra slack for integrity and privacy cases: */ - task->tk_auth->au_rslack = task->tk_auth->au_verfsize + (p - savedp); + task->tk_auth->au_rslack = task->tk_auth->au_verfsize + (p - savedp) + + (savedlen - head->iov_len); out_decode: status = decode(rqstp, p, obj); out: -- cgit v1.2.3 From f7b3af64c653c73feb060a9f94f2df9ab4bba4c3 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 13 Oct 2005 16:55:03 -0400 Subject: RPCSEC_GSS: Simplify rpcsec_gss crypto code Factor out some code that will be shared by privacy crypto routines Signed-off-by: J. Bruce Fields Signed-off-by: Trond Myklebust --- net/sunrpc/auth_gss/gss_krb5_crypto.c | 106 ++++++++++++++++++++++++---------- 1 file changed, 77 insertions(+), 29 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/auth_gss/gss_krb5_crypto.c b/net/sunrpc/auth_gss/gss_krb5_crypto.c index ee6ae74cd1b..2baf93f8b8f 100644 --- a/net/sunrpc/auth_gss/gss_krb5_crypto.c +++ b/net/sunrpc/auth_gss/gss_krb5_crypto.c @@ -139,6 +139,82 @@ buf_to_sg(struct scatterlist *sg, char *ptr, int len) { sg->length = len; } +static int +process_xdr_buf(struct xdr_buf *buf, int offset, int len, + int (*actor)(struct scatterlist *, void *), void *data) +{ + int i, page_len, thislen, page_offset, ret = 0; + struct scatterlist sg[1]; + + if (offset >= buf->head[0].iov_len) { + offset -= buf->head[0].iov_len; + } else { + thislen = buf->head[0].iov_len - offset; + if (thislen > len) + thislen = len; + buf_to_sg(sg, buf->head[0].iov_base + offset, thislen); + ret = actor(sg, data); + if (ret) + goto out; + offset = 0; + len -= thislen; + } + if (len == 0) + goto out; + + if (offset >= buf->page_len) { + offset -= buf->page_len; + } else { + page_len = buf->page_len - offset; + if (page_len > len) + page_len = len; + len -= page_len; + page_offset = (offset + buf->page_base) & (PAGE_CACHE_SIZE - 1); + i = (offset + buf->page_base) >> PAGE_CACHE_SHIFT; + thislen = PAGE_CACHE_SIZE - page_offset; + do { + if (thislen > page_len) + thislen = page_len; + sg->page = buf->pages[i]; + sg->offset = page_offset; + sg->length = thislen; + ret = actor(sg, data); + if (ret) + goto out; + page_len -= thislen; + i++; + page_offset = 0; + thislen = PAGE_CACHE_SIZE; + } while (page_len != 0); + offset = 0; + } + if (len == 0) + goto out; + + if (offset < buf->tail[0].iov_len) { + thislen = buf->tail[0].iov_len - offset; + if (thislen > len) + thislen = len; + buf_to_sg(sg, buf->tail[0].iov_base + offset, thislen); + ret = actor(sg, data); + len -= thislen; + } + if (len != 0) + ret = -EINVAL; +out: + return ret; +} + +static int +checksummer(struct scatterlist *sg, void *data) +{ + struct crypto_tfm *tfm = (struct crypto_tfm *)data; + + crypto_digest_update(tfm, sg, 1); + + return 0; +} + /* checksum the plaintext data and hdrlen bytes of the token header */ s32 make_checksum(s32 cksumtype, char *header, int hdrlen, struct xdr_buf *body, @@ -148,8 +224,6 @@ make_checksum(s32 cksumtype, char *header, int hdrlen, struct xdr_buf *body, struct crypto_tfm *tfm = NULL; /* XXX add to ctx? */ struct scatterlist sg[1]; u32 code = GSS_S_FAILURE; - int len, thislen, offset; - int i; switch (cksumtype) { case CKSUMTYPE_RSA_MD5: @@ -169,33 +243,7 @@ make_checksum(s32 cksumtype, char *header, int hdrlen, struct xdr_buf *body, crypto_digest_init(tfm); buf_to_sg(sg, header, hdrlen); crypto_digest_update(tfm, sg, 1); - if (body->head[0].iov_len) { - buf_to_sg(sg, body->head[0].iov_base, body->head[0].iov_len); - crypto_digest_update(tfm, sg, 1); - } - - len = body->page_len; - if (len != 0) { - offset = body->page_base & (PAGE_CACHE_SIZE - 1); - i = body->page_base >> PAGE_CACHE_SHIFT; - thislen = PAGE_CACHE_SIZE - offset; - do { - if (thislen > len) - thislen = len; - sg->page = body->pages[i]; - sg->offset = offset; - sg->length = thislen; - crypto_digest_update(tfm, sg, 1); - len -= thislen; - i++; - offset = 0; - thislen = PAGE_CACHE_SIZE; - } while(len != 0); - } - if (body->tail[0].iov_len) { - buf_to_sg(sg, body->tail[0].iov_base, body->tail[0].iov_len); - crypto_digest_update(tfm, sg, 1); - } + process_xdr_buf(body, 0, body->len, checksummer, tfm); crypto_digest_final(tfm, cksum->data); code = 0; out: -- cgit v1.2.3 From bfa91516b57483fc9c81d8d90325fd2c3c16ac48 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 13 Oct 2005 16:55:08 -0400 Subject: RPCSEC_GSS: krb5 pre-privacy cleanup The code this was originally derived from processed wrap and mic tokens using the same functions. This required some contortions, and more would be required with the addition of xdr_buf's, so it's better to separate out the two code paths. In preparation for adding privacy support, remove the last vestiges of the old wrap token code. Signed-off-by: J. Bruce Fields Signed-off-by: Trond Myklebust --- net/sunrpc/auth_gss/gss_krb5_mech.c | 5 ++--- net/sunrpc/auth_gss/gss_krb5_seal.c | 38 ++++++----------------------------- net/sunrpc/auth_gss/gss_krb5_unseal.c | 30 ++++++--------------------- 3 files changed, 14 insertions(+), 59 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c index 462c5b86b07..8b9066fdfda 100644 --- a/net/sunrpc/auth_gss/gss_krb5_mech.c +++ b/net/sunrpc/auth_gss/gss_krb5_mech.c @@ -199,8 +199,7 @@ gss_verify_mic_kerberos(struct gss_ctx *ctx, int qop_state; struct krb5_ctx *kctx = ctx->internal_ctx_id; - maj_stat = krb5_read_token(kctx, mic_token, message, &qop_state, - KG_TOK_MIC_MSG); + maj_stat = krb5_read_token(kctx, mic_token, message, &qop_state); if (!maj_stat && qop_state) *qstate = qop_state; @@ -216,7 +215,7 @@ gss_get_mic_kerberos(struct gss_ctx *ctx, u32 err = 0; struct krb5_ctx *kctx = ctx->internal_ctx_id; - err = krb5_make_token(kctx, qop, message, mic_token, KG_TOK_MIC_MSG); + err = krb5_make_token(kctx, qop, message, mic_token); dprintk("RPC: gss_get_mic_kerberos returning %d\n",err); diff --git a/net/sunrpc/auth_gss/gss_krb5_seal.c b/net/sunrpc/auth_gss/gss_krb5_seal.c index afeeb8715a7..2511834e6e5 100644 --- a/net/sunrpc/auth_gss/gss_krb5_seal.c +++ b/net/sunrpc/auth_gss/gss_krb5_seal.c @@ -70,22 +70,12 @@ # define RPCDBG_FACILITY RPCDBG_AUTH #endif -static inline int -gss_krb5_padding(int blocksize, int length) { - /* Most of the code is block-size independent but in practice we - * use only 8: */ - BUG_ON(blocksize != 8); - return 8 - (length & 7); -} - u32 krb5_make_token(struct krb5_ctx *ctx, int qop_req, - struct xdr_buf *text, struct xdr_netobj *token, - int toktype) + struct xdr_buf *text, struct xdr_netobj *token) { s32 checksum_type; struct xdr_netobj md5cksum = {.len = 0, .data = NULL}; - int blocksize = 0, tmsglen; unsigned char *ptr, *krb5_hdr, *msg_start; s32 now; @@ -111,21 +101,13 @@ krb5_make_token(struct krb5_ctx *ctx, int qop_req, goto out_err; } - if (toktype == KG_TOK_WRAP_MSG) { - blocksize = crypto_tfm_alg_blocksize(ctx->enc); - tmsglen = blocksize + text->len - + gss_krb5_padding(blocksize, blocksize + text->len); - } else { - tmsglen = 0; - } - - token->len = g_token_size(&ctx->mech_used, 22 + tmsglen); + token->len = g_token_size(&ctx->mech_used, 22); ptr = token->data; - g_make_token_header(&ctx->mech_used, 22 + tmsglen, &ptr); + g_make_token_header(&ctx->mech_used, 22, &ptr); - *ptr++ = (unsigned char) ((toktype>>8)&0xff); - *ptr++ = (unsigned char) (toktype&0xff); + *ptr++ = (unsigned char) ((KG_TOK_MIC_MSG>>8)&0xff); + *ptr++ = (unsigned char) (KG_TOK_MIC_MSG&0xff); /* ptr now at byte 2 of header described in rfc 1964, section 1.2.1: */ krb5_hdr = ptr - 2; @@ -133,17 +115,9 @@ krb5_make_token(struct krb5_ctx *ctx, int qop_req, *(u16 *)(krb5_hdr + 2) = htons(ctx->signalg); memset(krb5_hdr + 4, 0xff, 4); - if (toktype == KG_TOK_WRAP_MSG) - *(u16 *)(krb5_hdr + 4) = htons(ctx->sealalg); - if (toktype == KG_TOK_WRAP_MSG) { - /* XXX removing support for now */ + if (make_checksum(checksum_type, krb5_hdr, 8, text, &md5cksum)) goto out_err; - } else { /* Sign only. */ - if (make_checksum(checksum_type, krb5_hdr, 8, text, - &md5cksum)) - goto out_err; - } switch (ctx->signalg) { case SGN_ALG_DES_MAC_MD5: diff --git a/net/sunrpc/auth_gss/gss_krb5_unseal.c b/net/sunrpc/auth_gss/gss_krb5_unseal.c index 8767fc53183..19eba3df660 100644 --- a/net/sunrpc/auth_gss/gss_krb5_unseal.c +++ b/net/sunrpc/auth_gss/gss_krb5_unseal.c @@ -68,20 +68,13 @@ #endif -/* message_buffer is an input if toktype is MIC and an output if it is WRAP: - * If toktype is MIC: read_token is a mic token, and message_buffer is the - * data that the mic was supposedly taken over. - * If toktype is WRAP: read_token is a wrap token, and message_buffer is used - * to return the decrypted data. - */ +/* read_token is a mic token, and message_buffer is the data that the mic was + * supposedly taken over. */ -/* XXX will need to change prototype and/or just split into a separate function - * when we add privacy (because read_token will be in pages too). */ u32 krb5_read_token(struct krb5_ctx *ctx, struct xdr_netobj *read_token, - struct xdr_buf *message_buffer, - int *qop_state, int toktype) + struct xdr_buf *message_buffer, int *qop_state) { int signalg; int sealalg; @@ -100,16 +93,12 @@ krb5_read_token(struct krb5_ctx *ctx, read_token->len)) goto out; - if ((*ptr++ != ((toktype>>8)&0xff)) || (*ptr++ != (toktype&0xff))) + if ((*ptr++ != ((KG_TOK_MIC_MSG>>8)&0xff)) || + (*ptr++ != ( KG_TOK_MIC_MSG &0xff)) ) goto out; /* XXX sanity-check bodysize?? */ - if (toktype == KG_TOK_WRAP_MSG) { - /* XXX gone */ - goto out; - } - /* get the sign and seal algorithms */ signalg = ptr[0] + (ptr[1] << 8); @@ -120,14 +109,7 @@ krb5_read_token(struct krb5_ctx *ctx, if ((ptr[4] != 0xff) || (ptr[5] != 0xff)) goto out; - if (((toktype != KG_TOK_WRAP_MSG) && (sealalg != 0xffff)) || - ((toktype == KG_TOK_WRAP_MSG) && (sealalg == 0xffff))) - goto out; - - /* in the current spec, there is only one valid seal algorithm per - key type, so a simple comparison is ok */ - - if ((toktype == KG_TOK_WRAP_MSG) && !(sealalg == ctx->sealalg)) + if (sealalg != 0xffff) goto out; /* there are several mappings of seal algorithms to sign algorithms, -- cgit v1.2.3 From 14ae162c24d985593d5b19437d7f3d8fd0062b59 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 13 Oct 2005 16:55:13 -0400 Subject: RPCSEC_GSS: Add support for privacy to krb5 rpcsec_gss mechanism. Add support for privacy to the krb5 rpcsec_gss mechanism. Signed-off-by: J. Bruce Fields Signed-off-by: Trond Myklebust --- net/sunrpc/auth_gss/Makefile | 2 +- net/sunrpc/auth_gss/gss_krb5_crypto.c | 156 +++++++++++++- net/sunrpc/auth_gss/gss_krb5_mech.c | 7 + net/sunrpc/auth_gss/gss_krb5_seal.c | 4 +- net/sunrpc/auth_gss/gss_krb5_unseal.c | 2 +- net/sunrpc/auth_gss/gss_krb5_wrap.c | 370 ++++++++++++++++++++++++++++++++++ 6 files changed, 535 insertions(+), 6 deletions(-) create mode 100644 net/sunrpc/auth_gss/gss_krb5_wrap.c (limited to 'net/sunrpc') diff --git a/net/sunrpc/auth_gss/Makefile b/net/sunrpc/auth_gss/Makefile index fe1b874084b..f3431a7e33d 100644 --- a/net/sunrpc/auth_gss/Makefile +++ b/net/sunrpc/auth_gss/Makefile @@ -10,7 +10,7 @@ auth_rpcgss-objs := auth_gss.o gss_generic_token.o \ obj-$(CONFIG_RPCSEC_GSS_KRB5) += rpcsec_gss_krb5.o rpcsec_gss_krb5-objs := gss_krb5_mech.o gss_krb5_seal.o gss_krb5_unseal.o \ - gss_krb5_seqnum.o + gss_krb5_seqnum.o gss_krb5_wrap.o obj-$(CONFIG_RPCSEC_GSS_SPKM3) += rpcsec_gss_spkm3.o diff --git a/net/sunrpc/auth_gss/gss_krb5_crypto.c b/net/sunrpc/auth_gss/gss_krb5_crypto.c index 2baf93f8b8f..3f3d5437f02 100644 --- a/net/sunrpc/auth_gss/gss_krb5_crypto.c +++ b/net/sunrpc/auth_gss/gss_krb5_crypto.c @@ -218,7 +218,7 @@ checksummer(struct scatterlist *sg, void *data) /* checksum the plaintext data and hdrlen bytes of the token header */ s32 make_checksum(s32 cksumtype, char *header, int hdrlen, struct xdr_buf *body, - struct xdr_netobj *cksum) + int body_offset, struct xdr_netobj *cksum) { char *cksumname; struct crypto_tfm *tfm = NULL; /* XXX add to ctx? */ @@ -243,7 +243,8 @@ make_checksum(s32 cksumtype, char *header, int hdrlen, struct xdr_buf *body, crypto_digest_init(tfm); buf_to_sg(sg, header, hdrlen); crypto_digest_update(tfm, sg, 1); - process_xdr_buf(body, 0, body->len, checksummer, tfm); + process_xdr_buf(body, body_offset, body->len - body_offset, + checksummer, tfm); crypto_digest_final(tfm, cksum->data); code = 0; out: @@ -252,3 +253,154 @@ out: } EXPORT_SYMBOL(make_checksum); + +struct encryptor_desc { + u8 iv[8]; /* XXX hard-coded blocksize */ + struct crypto_tfm *tfm; + int pos; + struct xdr_buf *outbuf; + struct page **pages; + struct scatterlist infrags[4]; + struct scatterlist outfrags[4]; + int fragno; + int fraglen; +}; + +static int +encryptor(struct scatterlist *sg, void *data) +{ + struct encryptor_desc *desc = data; + struct xdr_buf *outbuf = desc->outbuf; + struct page *in_page; + int thislen = desc->fraglen + sg->length; + int fraglen, ret; + int page_pos; + + /* Worst case is 4 fragments: head, end of page 1, start + * of page 2, tail. Anything more is a bug. */ + BUG_ON(desc->fragno > 3); + desc->infrags[desc->fragno] = *sg; + desc->outfrags[desc->fragno] = *sg; + + page_pos = desc->pos - outbuf->head[0].iov_len; + if (page_pos >= 0 && page_pos < outbuf->page_len) { + /* pages are not in place: */ + int i = (page_pos + outbuf->page_base) >> PAGE_CACHE_SHIFT; + in_page = desc->pages[i]; + } else { + in_page = sg->page; + } + desc->infrags[desc->fragno].page = in_page; + desc->fragno++; + desc->fraglen += sg->length; + desc->pos += sg->length; + + fraglen = thislen & 7; /* XXX hardcoded blocksize */ + thislen -= fraglen; + + if (thislen == 0) + return 0; + + ret = crypto_cipher_encrypt_iv(desc->tfm, desc->outfrags, desc->infrags, + thislen, desc->iv); + if (ret) + return ret; + if (fraglen) { + desc->outfrags[0].page = sg->page; + desc->outfrags[0].offset = sg->offset + sg->length - fraglen; + desc->outfrags[0].length = fraglen; + desc->infrags[0] = desc->outfrags[0]; + desc->infrags[0].page = in_page; + desc->fragno = 1; + desc->fraglen = fraglen; + } else { + desc->fragno = 0; + desc->fraglen = 0; + } + return 0; +} + +int +gss_encrypt_xdr_buf(struct crypto_tfm *tfm, struct xdr_buf *buf, int offset, + struct page **pages) +{ + int ret; + struct encryptor_desc desc; + + BUG_ON((buf->len - offset) % crypto_tfm_alg_blocksize(tfm) != 0); + + memset(desc.iv, 0, sizeof(desc.iv)); + desc.tfm = tfm; + desc.pos = offset; + desc.outbuf = buf; + desc.pages = pages; + desc.fragno = 0; + desc.fraglen = 0; + + ret = process_xdr_buf(buf, offset, buf->len - offset, encryptor, &desc); + return ret; +} + +EXPORT_SYMBOL(gss_encrypt_xdr_buf); + +struct decryptor_desc { + u8 iv[8]; /* XXX hard-coded blocksize */ + struct crypto_tfm *tfm; + struct scatterlist frags[4]; + int fragno; + int fraglen; +}; + +static int +decryptor(struct scatterlist *sg, void *data) +{ + struct decryptor_desc *desc = data; + int thislen = desc->fraglen + sg->length; + int fraglen, ret; + + /* Worst case is 4 fragments: head, end of page 1, start + * of page 2, tail. Anything more is a bug. */ + BUG_ON(desc->fragno > 3); + desc->frags[desc->fragno] = *sg; + desc->fragno++; + desc->fraglen += sg->length; + + fraglen = thislen & 7; /* XXX hardcoded blocksize */ + thislen -= fraglen; + + if (thislen == 0) + return 0; + + ret = crypto_cipher_decrypt_iv(desc->tfm, desc->frags, desc->frags, + thislen, desc->iv); + if (ret) + return ret; + if (fraglen) { + desc->frags[0].page = sg->page; + desc->frags[0].offset = sg->offset + sg->length - fraglen; + desc->frags[0].length = fraglen; + desc->fragno = 1; + desc->fraglen = fraglen; + } else { + desc->fragno = 0; + desc->fraglen = 0; + } + return 0; +} + +int +gss_decrypt_xdr_buf(struct crypto_tfm *tfm, struct xdr_buf *buf, int offset) +{ + struct decryptor_desc desc; + + /* XXXJBF: */ + BUG_ON((buf->len - offset) % crypto_tfm_alg_blocksize(tfm) != 0); + + memset(desc.iv, 0, sizeof(desc.iv)); + desc.tfm = tfm; + desc.fragno = 0; + desc.fraglen = 0; + return process_xdr_buf(buf, offset, buf->len - offset, decryptor, &desc); +} + +EXPORT_SYMBOL(gss_decrypt_xdr_buf); diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c index 8b9066fdfda..37a9ad97ccd 100644 --- a/net/sunrpc/auth_gss/gss_krb5_mech.c +++ b/net/sunrpc/auth_gss/gss_krb5_mech.c @@ -226,6 +226,8 @@ static struct gss_api_ops gss_kerberos_ops = { .gss_import_sec_context = gss_import_sec_context_kerberos, .gss_get_mic = gss_get_mic_kerberos, .gss_verify_mic = gss_verify_mic_kerberos, + .gss_wrap = gss_wrap_kerberos, + .gss_unwrap = gss_unwrap_kerberos, .gss_delete_sec_context = gss_delete_sec_context_kerberos, }; @@ -240,6 +242,11 @@ static struct pf_desc gss_kerberos_pfs[] = { .service = RPC_GSS_SVC_INTEGRITY, .name = "krb5i", }, + [2] = { + .pseudoflavor = RPC_AUTH_GSS_KRB5P, + .service = RPC_GSS_SVC_PRIVACY, + .name = "krb5p", + }, }; static struct gss_api_mech gss_kerberos_mech = { diff --git a/net/sunrpc/auth_gss/gss_krb5_seal.c b/net/sunrpc/auth_gss/gss_krb5_seal.c index 2511834e6e5..fb852d9ab06 100644 --- a/net/sunrpc/auth_gss/gss_krb5_seal.c +++ b/net/sunrpc/auth_gss/gss_krb5_seal.c @@ -116,8 +116,8 @@ krb5_make_token(struct krb5_ctx *ctx, int qop_req, *(u16 *)(krb5_hdr + 2) = htons(ctx->signalg); memset(krb5_hdr + 4, 0xff, 4); - if (make_checksum(checksum_type, krb5_hdr, 8, text, &md5cksum)) - goto out_err; + if (make_checksum(checksum_type, krb5_hdr, 8, text, 0, &md5cksum)) + goto out_err; switch (ctx->signalg) { case SGN_ALG_DES_MAC_MD5: diff --git a/net/sunrpc/auth_gss/gss_krb5_unseal.c b/net/sunrpc/auth_gss/gss_krb5_unseal.c index 19eba3df660..c3d6d1bc100 100644 --- a/net/sunrpc/auth_gss/gss_krb5_unseal.c +++ b/net/sunrpc/auth_gss/gss_krb5_unseal.c @@ -136,7 +136,7 @@ krb5_read_token(struct krb5_ctx *ctx, switch (signalg) { case SGN_ALG_DES_MAC_MD5: ret = make_checksum(checksum_type, ptr - 2, 8, - message_buffer, &md5cksum); + message_buffer, 0, &md5cksum); if (ret) goto out; diff --git a/net/sunrpc/auth_gss/gss_krb5_wrap.c b/net/sunrpc/auth_gss/gss_krb5_wrap.c new file mode 100644 index 00000000000..ddcde6e42b2 --- /dev/null +++ b/net/sunrpc/auth_gss/gss_krb5_wrap.c @@ -0,0 +1,370 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef RPC_DEBUG +# define RPCDBG_FACILITY RPCDBG_AUTH +#endif + +static inline int +gss_krb5_padding(int blocksize, int length) +{ + /* Most of the code is block-size independent but currently we + * use only 8: */ + BUG_ON(blocksize != 8); + return 8 - (length & 7); +} + +static inline void +gss_krb5_add_padding(struct xdr_buf *buf, int offset, int blocksize) +{ + int padding = gss_krb5_padding(blocksize, buf->len - offset); + char *p; + struct kvec *iov; + + if (buf->page_len || buf->tail[0].iov_len) + iov = &buf->tail[0]; + else + iov = &buf->head[0]; + p = iov->iov_base + iov->iov_len; + iov->iov_len += padding; + buf->len += padding; + memset(p, padding, padding); +} + +static inline int +gss_krb5_remove_padding(struct xdr_buf *buf, int blocksize) +{ + u8 *ptr; + u8 pad; + int len = buf->len; + + if (len <= buf->head[0].iov_len) { + pad = *(u8 *)(buf->head[0].iov_base + len - 1); + if (pad > buf->head[0].iov_len) + return -EINVAL; + buf->head[0].iov_len -= pad; + goto out; + } else + len -= buf->head[0].iov_len; + if (len <= buf->page_len) { + int last = (buf->page_base + len - 1) + >>PAGE_CACHE_SHIFT; + int offset = (buf->page_base + len - 1) + & (PAGE_CACHE_SIZE - 1); + ptr = kmap_atomic(buf->pages[last], KM_SKB_SUNRPC_DATA); + pad = *(ptr + offset); + kunmap_atomic(ptr, KM_SKB_SUNRPC_DATA); + goto out; + } else + len -= buf->page_len; + BUG_ON(len > buf->tail[0].iov_len); + pad = *(u8 *)(buf->tail[0].iov_base + len - 1); +out: + /* XXX: NOTE: we do not adjust the page lengths--they represent + * a range of data in the real filesystem page cache, and we need + * to know that range so the xdr code can properly place read data. + * However adjusting the head length, as we do above, is harmless. + * In the case of a request that fits into a single page, the server + * also uses length and head length together to determine the original + * start of the request to copy the request for deferal; so it's + * easier on the server if we adjust head and tail length in tandem. + * It's not really a problem that we don't fool with the page and + * tail lengths, though--at worst badly formed xdr might lead the + * server to attempt to parse the padding. + * XXX: Document all these weird requirements for gss mechanism + * wrap/unwrap functions. */ + if (pad > blocksize) + return -EINVAL; + if (buf->len > pad) + buf->len -= pad; + else + return -EINVAL; + return 0; +} + +static inline void +make_confounder(char *p, int blocksize) +{ + static u64 i = 0; + u64 *q = (u64 *)p; + + /* rfc1964 claims this should be "random". But all that's really + * necessary is that it be unique. And not even that is necessary in + * our case since our "gssapi" implementation exists only to support + * rpcsec_gss, so we know that the only buffers we will ever encrypt + * already begin with a unique sequence number. Just to hedge my bets + * I'll make a half-hearted attempt at something unique, but ensuring + * uniqueness would mean worrying about atomicity and rollover, and I + * don't care enough. */ + + BUG_ON(blocksize != 8); + *q = i++; +} + +/* Assumptions: the head and tail of inbuf are ours to play with. + * The pages, however, may be real pages in the page cache and we replace + * them with scratch pages from **pages before writing to them. */ +/* XXX: obviously the above should be documentation of wrap interface, + * and shouldn't be in this kerberos-specific file. */ + +/* XXX factor out common code with seal/unseal. */ + +u32 +gss_wrap_kerberos(struct gss_ctx *ctx, u32 qop, int offset, + struct xdr_buf *buf, struct page **pages) +{ + struct krb5_ctx *kctx = ctx->internal_ctx_id; + s32 checksum_type; + struct xdr_netobj md5cksum = {.len = 0, .data = NULL}; + int blocksize = 0, plainlen; + unsigned char *ptr, *krb5_hdr, *msg_start; + s32 now; + int headlen; + struct page **tmp_pages; + + dprintk("RPC: gss_wrap_kerberos\n"); + + now = get_seconds(); + + if (qop != 0) + goto out_err; + + switch (kctx->signalg) { + case SGN_ALG_DES_MAC_MD5: + checksum_type = CKSUMTYPE_RSA_MD5; + break; + default: + dprintk("RPC: gss_krb5_seal: kctx->signalg %d not" + " supported\n", kctx->signalg); + goto out_err; + } + if (kctx->sealalg != SEAL_ALG_NONE && kctx->sealalg != SEAL_ALG_DES) { + dprintk("RPC: gss_krb5_seal: kctx->sealalg %d not supported\n", + kctx->sealalg); + goto out_err; + } + + blocksize = crypto_tfm_alg_blocksize(kctx->enc); + gss_krb5_add_padding(buf, offset, blocksize); + BUG_ON((buf->len - offset) % blocksize); + plainlen = blocksize + buf->len - offset; + + headlen = g_token_size(&kctx->mech_used, 22 + plainlen) - + (buf->len - offset); + + ptr = buf->head[0].iov_base + offset; + /* shift data to make room for header. */ + /* XXX Would be cleverer to encrypt while copying. */ + /* XXX bounds checking, slack, etc. */ + memmove(ptr + headlen, ptr, buf->head[0].iov_len - offset); + buf->head[0].iov_len += headlen; + buf->len += headlen; + BUG_ON((buf->len - offset - headlen) % blocksize); + + g_make_token_header(&kctx->mech_used, 22 + plainlen, &ptr); + + + *ptr++ = (unsigned char) ((KG_TOK_WRAP_MSG>>8)&0xff); + *ptr++ = (unsigned char) (KG_TOK_WRAP_MSG&0xff); + + /* ptr now at byte 2 of header described in rfc 1964, section 1.2.1: */ + krb5_hdr = ptr - 2; + msg_start = krb5_hdr + 24; + /* XXXJBF: */ BUG_ON(buf->head[0].iov_base + offset + headlen != msg_start + blocksize); + + *(u16 *)(krb5_hdr + 2) = htons(kctx->signalg); + memset(krb5_hdr + 4, 0xff, 4); + *(u16 *)(krb5_hdr + 4) = htons(kctx->sealalg); + + make_confounder(msg_start, blocksize); + + /* XXXJBF: UGH!: */ + tmp_pages = buf->pages; + buf->pages = pages; + if (make_checksum(checksum_type, krb5_hdr, 8, buf, + offset + headlen - blocksize, &md5cksum)) + goto out_err; + buf->pages = tmp_pages; + + switch (kctx->signalg) { + case SGN_ALG_DES_MAC_MD5: + if (krb5_encrypt(kctx->seq, NULL, md5cksum.data, + md5cksum.data, md5cksum.len)) + goto out_err; + memcpy(krb5_hdr + 16, + md5cksum.data + md5cksum.len - KRB5_CKSUM_LENGTH, + KRB5_CKSUM_LENGTH); + + dprintk("RPC: make_seal_token: cksum data: \n"); + print_hexl((u32 *) (krb5_hdr + 16), KRB5_CKSUM_LENGTH, 0); + break; + default: + BUG(); + } + + kfree(md5cksum.data); + + /* XXX would probably be more efficient to compute checksum + * and encrypt at the same time: */ + if ((krb5_make_seq_num(kctx->seq, kctx->initiate ? 0 : 0xff, + kctx->seq_send, krb5_hdr + 16, krb5_hdr + 8))) + goto out_err; + + if (gss_encrypt_xdr_buf(kctx->enc, buf, offset + headlen - blocksize, + pages)) + goto out_err; + + kctx->seq_send++; + + return ((kctx->endtime < now) ? GSS_S_CONTEXT_EXPIRED : GSS_S_COMPLETE); +out_err: + if (md5cksum.data) kfree(md5cksum.data); + return GSS_S_FAILURE; +} + +u32 +gss_unwrap_kerberos(struct gss_ctx *ctx, u32 *qop, int offset, + struct xdr_buf *buf) +{ + struct krb5_ctx *kctx = ctx->internal_ctx_id; + int signalg; + int sealalg; + s32 checksum_type; + struct xdr_netobj md5cksum = {.len = 0, .data = NULL}; + s32 now; + int direction; + s32 seqnum; + unsigned char *ptr; + int bodysize; + u32 ret = GSS_S_DEFECTIVE_TOKEN; + void *data_start, *orig_start; + int data_len; + int blocksize; + + dprintk("RPC: gss_unwrap_kerberos\n"); + + ptr = (u8 *)buf->head[0].iov_base + offset; + if (g_verify_token_header(&kctx->mech_used, &bodysize, &ptr, + buf->len - offset)) + goto out; + + if ((*ptr++ != ((KG_TOK_WRAP_MSG>>8)&0xff)) || + (*ptr++ != (KG_TOK_WRAP_MSG &0xff)) ) + goto out; + + /* XXX sanity-check bodysize?? */ + + /* get the sign and seal algorithms */ + + signalg = ptr[0] + (ptr[1] << 8); + sealalg = ptr[2] + (ptr[3] << 8); + + /* Sanity checks */ + + if ((ptr[4] != 0xff) || (ptr[5] != 0xff)) + goto out; + + if (sealalg == 0xffff) + goto out; + + /* in the current spec, there is only one valid seal algorithm per + key type, so a simple comparison is ok */ + + if (sealalg != kctx->sealalg) + goto out; + + /* there are several mappings of seal algorithms to sign algorithms, + but few enough that we can try them all. */ + + if ((kctx->sealalg == SEAL_ALG_NONE && signalg > 1) || + (kctx->sealalg == SEAL_ALG_1 && signalg != SGN_ALG_3) || + (kctx->sealalg == SEAL_ALG_DES3KD && + signalg != SGN_ALG_HMAC_SHA1_DES3_KD)) + goto out; + + if (gss_decrypt_xdr_buf(kctx->enc, buf, + ptr + 22 - (unsigned char *)buf->head[0].iov_base)) + goto out; + + /* compute the checksum of the message */ + + /* initialize the the cksum */ + switch (signalg) { + case SGN_ALG_DES_MAC_MD5: + checksum_type = CKSUMTYPE_RSA_MD5; + break; + default: + ret = GSS_S_DEFECTIVE_TOKEN; + goto out; + } + + switch (signalg) { + case SGN_ALG_DES_MAC_MD5: + ret = make_checksum(checksum_type, ptr - 2, 8, buf, + ptr + 22 - (unsigned char *)buf->head[0].iov_base, &md5cksum); + if (ret) + goto out; + + ret = krb5_encrypt(kctx->seq, NULL, md5cksum.data, + md5cksum.data, md5cksum.len); + if (ret) + goto out; + + if (memcmp(md5cksum.data + 8, ptr + 14, 8)) { + ret = GSS_S_BAD_SIG; + goto out; + } + break; + default: + ret = GSS_S_DEFECTIVE_TOKEN; + goto out; + } + + /* it got through unscathed. Make sure the context is unexpired */ + + if (qop) + *qop = GSS_C_QOP_DEFAULT; + + now = get_seconds(); + + ret = GSS_S_CONTEXT_EXPIRED; + if (now > kctx->endtime) + goto out; + + /* do sequencing checks */ + + ret = GSS_S_BAD_SIG; + if ((ret = krb5_get_seq_num(kctx->seq, ptr + 14, ptr + 6, &direction, + &seqnum))) + goto out; + + if ((kctx->initiate && direction != 0xff) || + (!kctx->initiate && direction != 0)) + goto out; + + /* Copy the data back to the right position. XXX: Would probably be + * better to copy and encrypt at the same time. */ + + blocksize = crypto_tfm_alg_blocksize(kctx->enc); + data_start = ptr + 22 + blocksize; + orig_start = buf->head[0].iov_base + offset; + data_len = (buf->head[0].iov_base + buf->head[0].iov_len) - data_start; + memmove(orig_start, data_start, data_len); + buf->head[0].iov_len -= (data_start - orig_start); + buf->len -= (data_start - orig_start); + + ret = GSS_S_DEFECTIVE_TOKEN; + if (gss_krb5_remove_padding(buf, blocksize)) + goto out; + + ret = GSS_S_COMPLETE; +out: + if (md5cksum.data) kfree(md5cksum.data); + return ret; +} -- cgit v1.2.3 From 00fd6e14255fe7a249315746386d640bc4e9e758 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 13 Oct 2005 16:55:18 -0400 Subject: RPCSEC_GSS remove all qop parameters Not only are the qop parameters that are passed around throughout the gssapi unused by any currently implemented mechanism, but there appears to be some doubt as to whether they will ever be used. Let's just kill them off for now. Signed-off-by: J. Bruce Fields Signed-off-by: Trond Myklebust --- net/sunrpc/auth_gss/auth_gss.c | 20 +++++++------------- net/sunrpc/auth_gss/gss_krb5_mech.c | 12 ++++-------- net/sunrpc/auth_gss/gss_krb5_seal.c | 5 +---- net/sunrpc/auth_gss/gss_krb5_unseal.c | 5 +---- net/sunrpc/auth_gss/gss_krb5_wrap.c | 11 ++--------- net/sunrpc/auth_gss/gss_mech_switch.c | 14 ++++---------- net/sunrpc/auth_gss/gss_spkm3_mech.c | 21 ++++++++------------- net/sunrpc/auth_gss/gss_spkm3_seal.c | 4 +--- net/sunrpc/auth_gss/gss_spkm3_unseal.c | 2 +- net/sunrpc/auth_gss/svcauth_gss.c | 9 ++++----- 10 files changed, 33 insertions(+), 70 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 5e4872058ec..f44f46f1d8e 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -854,9 +854,7 @@ gss_marshal(struct rpc_task *task, u32 *p) *p++ = htonl(RPC_AUTH_GSS); mic.data = (u8 *)(p + 1); - maj_stat = gss_get_mic(ctx->gc_gss_ctx, - GSS_C_QOP_DEFAULT, - &verf_buf, &mic); + maj_stat = gss_get_mic(ctx->gc_gss_ctx, &verf_buf, &mic); if (maj_stat == GSS_S_CONTEXT_EXPIRED) { cred->cr_flags &= ~RPCAUTH_CRED_UPTODATE; } else if (maj_stat != 0) { @@ -888,7 +886,7 @@ gss_validate(struct rpc_task *task, u32 *p) { struct rpc_cred *cred = task->tk_msg.rpc_cred; struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred); - u32 seq, qop_state; + u32 seq; struct kvec iov; struct xdr_buf verf_buf; struct xdr_netobj mic; @@ -909,7 +907,7 @@ gss_validate(struct rpc_task *task, u32 *p) mic.data = (u8 *)p; mic.len = len; - maj_stat = gss_verify_mic(ctx->gc_gss_ctx, &verf_buf, &mic, &qop_state); + maj_stat = gss_verify_mic(ctx->gc_gss_ctx, &verf_buf, &mic); if (maj_stat == GSS_S_CONTEXT_EXPIRED) cred->cr_flags &= ~RPCAUTH_CRED_UPTODATE; if (maj_stat) @@ -961,8 +959,7 @@ gss_wrap_req_integ(struct rpc_cred *cred, struct gss_cl_ctx *ctx, p = iov->iov_base + iov->iov_len; mic.data = (u8 *)(p + 1); - maj_stat = gss_get_mic(ctx->gc_gss_ctx, - GSS_C_QOP_DEFAULT, &integ_buf, &mic); + maj_stat = gss_get_mic(ctx->gc_gss_ctx, &integ_buf, &mic); status = -EIO; /* XXX? */ if (maj_stat == GSS_S_CONTEXT_EXPIRED) cred->cr_flags &= ~RPCAUTH_CRED_UPTODATE; @@ -1057,8 +1054,7 @@ gss_wrap_req_priv(struct rpc_cred *cred, struct gss_cl_ctx *ctx, memcpy(tmp, snd_buf->tail[0].iov_base, snd_buf->tail[0].iov_len); snd_buf->tail[0].iov_base = tmp; } - maj_stat = gss_wrap(ctx->gc_gss_ctx, GSS_C_QOP_DEFAULT, offset, - snd_buf, inpages); + maj_stat = gss_wrap(ctx->gc_gss_ctx, offset, snd_buf, inpages); /* RPC_SLACK_SPACE should prevent this ever happening: */ BUG_ON(snd_buf->len > snd_buf->buflen); status = -EIO; @@ -1150,8 +1146,7 @@ gss_unwrap_resp_integ(struct rpc_cred *cred, struct gss_cl_ctx *ctx, if (xdr_buf_read_netobj(rcv_buf, &mic, mic_offset)) return status; - maj_stat = gss_verify_mic(ctx->gc_gss_ctx, &integ_buf, - &mic, NULL); + maj_stat = gss_verify_mic(ctx->gc_gss_ctx, &integ_buf, &mic); if (maj_stat == GSS_S_CONTEXT_EXPIRED) cred->cr_flags &= ~RPCAUTH_CRED_UPTODATE; if (maj_stat != GSS_S_COMPLETE) @@ -1176,8 +1171,7 @@ gss_unwrap_resp_priv(struct rpc_cred *cred, struct gss_cl_ctx *ctx, /* remove padding: */ rcv_buf->len = offset + opaque_len; - maj_stat = gss_unwrap(ctx->gc_gss_ctx, NULL, - offset, rcv_buf); + maj_stat = gss_unwrap(ctx->gc_gss_ctx, offset, rcv_buf); if (maj_stat == GSS_S_CONTEXT_EXPIRED) cred->cr_flags &= ~RPCAUTH_CRED_UPTODATE; if (maj_stat != GSS_S_COMPLETE) diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c index 37a9ad97ccd..9ffac2c50b9 100644 --- a/net/sunrpc/auth_gss/gss_krb5_mech.c +++ b/net/sunrpc/auth_gss/gss_krb5_mech.c @@ -193,15 +193,12 @@ gss_delete_sec_context_kerberos(void *internal_ctx) { static u32 gss_verify_mic_kerberos(struct gss_ctx *ctx, struct xdr_buf *message, - struct xdr_netobj *mic_token, - u32 *qstate) { + struct xdr_netobj *mic_token) +{ u32 maj_stat = 0; - int qop_state; struct krb5_ctx *kctx = ctx->internal_ctx_id; - maj_stat = krb5_read_token(kctx, mic_token, message, &qop_state); - if (!maj_stat && qop_state) - *qstate = qop_state; + maj_stat = krb5_read_token(kctx, mic_token, message); dprintk("RPC: gss_verify_mic_kerberos returning %d\n", maj_stat); return maj_stat; @@ -209,13 +206,12 @@ gss_verify_mic_kerberos(struct gss_ctx *ctx, static u32 gss_get_mic_kerberos(struct gss_ctx *ctx, - u32 qop, struct xdr_buf *message, struct xdr_netobj *mic_token) { u32 err = 0; struct krb5_ctx *kctx = ctx->internal_ctx_id; - err = krb5_make_token(kctx, qop, message, mic_token); + err = krb5_make_token(kctx, message, mic_token); dprintk("RPC: gss_get_mic_kerberos returning %d\n",err); diff --git a/net/sunrpc/auth_gss/gss_krb5_seal.c b/net/sunrpc/auth_gss/gss_krb5_seal.c index fb852d9ab06..15227c727c8 100644 --- a/net/sunrpc/auth_gss/gss_krb5_seal.c +++ b/net/sunrpc/auth_gss/gss_krb5_seal.c @@ -71,7 +71,7 @@ #endif u32 -krb5_make_token(struct krb5_ctx *ctx, int qop_req, +krb5_make_token(struct krb5_ctx *ctx, struct xdr_buf *text, struct xdr_netobj *token) { s32 checksum_type; @@ -83,9 +83,6 @@ krb5_make_token(struct krb5_ctx *ctx, int qop_req, now = get_seconds(); - if (qop_req != 0) - goto out_err; - switch (ctx->signalg) { case SGN_ALG_DES_MAC_MD5: checksum_type = CKSUMTYPE_RSA_MD5; diff --git a/net/sunrpc/auth_gss/gss_krb5_unseal.c b/net/sunrpc/auth_gss/gss_krb5_unseal.c index c3d6d1bc100..bcf978627a7 100644 --- a/net/sunrpc/auth_gss/gss_krb5_unseal.c +++ b/net/sunrpc/auth_gss/gss_krb5_unseal.c @@ -74,7 +74,7 @@ u32 krb5_read_token(struct krb5_ctx *ctx, struct xdr_netobj *read_token, - struct xdr_buf *message_buffer, int *qop_state) + struct xdr_buf *message_buffer) { int signalg; int sealalg; @@ -157,9 +157,6 @@ krb5_read_token(struct krb5_ctx *ctx, /* it got through unscathed. Make sure the context is unexpired */ - if (qop_state) - *qop_state = GSS_C_QOP_DEFAULT; - now = get_seconds(); ret = GSS_S_CONTEXT_EXPIRED; diff --git a/net/sunrpc/auth_gss/gss_krb5_wrap.c b/net/sunrpc/auth_gss/gss_krb5_wrap.c index ddcde6e42b2..af777cf9f25 100644 --- a/net/sunrpc/auth_gss/gss_krb5_wrap.c +++ b/net/sunrpc/auth_gss/gss_krb5_wrap.c @@ -116,7 +116,7 @@ make_confounder(char *p, int blocksize) /* XXX factor out common code with seal/unseal. */ u32 -gss_wrap_kerberos(struct gss_ctx *ctx, u32 qop, int offset, +gss_wrap_kerberos(struct gss_ctx *ctx, int offset, struct xdr_buf *buf, struct page **pages) { struct krb5_ctx *kctx = ctx->internal_ctx_id; @@ -132,9 +132,6 @@ gss_wrap_kerberos(struct gss_ctx *ctx, u32 qop, int offset, now = get_seconds(); - if (qop != 0) - goto out_err; - switch (kctx->signalg) { case SGN_ALG_DES_MAC_MD5: checksum_type = CKSUMTYPE_RSA_MD5; @@ -229,8 +226,7 @@ out_err: } u32 -gss_unwrap_kerberos(struct gss_ctx *ctx, u32 *qop, int offset, - struct xdr_buf *buf) +gss_unwrap_kerberos(struct gss_ctx *ctx, int offset, struct xdr_buf *buf) { struct krb5_ctx *kctx = ctx->internal_ctx_id; int signalg; @@ -328,9 +324,6 @@ gss_unwrap_kerberos(struct gss_ctx *ctx, u32 *qop, int offset, /* it got through unscathed. Make sure the context is unexpired */ - if (qop) - *qop = GSS_C_QOP_DEFAULT; - now = get_seconds(); ret = GSS_S_CONTEXT_EXPIRED; diff --git a/net/sunrpc/auth_gss/gss_mech_switch.c b/net/sunrpc/auth_gss/gss_mech_switch.c index 06d97cb3481..b048bf672da 100644 --- a/net/sunrpc/auth_gss/gss_mech_switch.c +++ b/net/sunrpc/auth_gss/gss_mech_switch.c @@ -250,13 +250,11 @@ gss_import_sec_context(const void *input_token, size_t bufsize, u32 gss_get_mic(struct gss_ctx *context_handle, - u32 qop, struct xdr_buf *message, struct xdr_netobj *mic_token) { return context_handle->mech_type->gm_ops ->gss_get_mic(context_handle, - qop, message, mic_token); } @@ -266,35 +264,31 @@ gss_get_mic(struct gss_ctx *context_handle, u32 gss_verify_mic(struct gss_ctx *context_handle, struct xdr_buf *message, - struct xdr_netobj *mic_token, - u32 *qstate) + struct xdr_netobj *mic_token) { return context_handle->mech_type->gm_ops ->gss_verify_mic(context_handle, message, - mic_token, - qstate); + mic_token); } u32 gss_wrap(struct gss_ctx *ctx_id, - u32 qop, int offset, struct xdr_buf *buf, struct page **inpages) { return ctx_id->mech_type->gm_ops - ->gss_wrap(ctx_id, qop, offset, buf, inpages); + ->gss_wrap(ctx_id, offset, buf, inpages); } u32 gss_unwrap(struct gss_ctx *ctx_id, - u32 *qop, int offset, struct xdr_buf *buf) { return ctx_id->mech_type->gm_ops - ->gss_unwrap(ctx_id, qop, offset, buf); + ->gss_unwrap(ctx_id, offset, buf); } diff --git a/net/sunrpc/auth_gss/gss_spkm3_mech.c b/net/sunrpc/auth_gss/gss_spkm3_mech.c index 6c97d61baa9..39b3edc1469 100644 --- a/net/sunrpc/auth_gss/gss_spkm3_mech.c +++ b/net/sunrpc/auth_gss/gss_spkm3_mech.c @@ -224,18 +224,13 @@ gss_delete_sec_context_spkm3(void *internal_ctx) { static u32 gss_verify_mic_spkm3(struct gss_ctx *ctx, struct xdr_buf *signbuf, - struct xdr_netobj *checksum, - u32 *qstate) { + struct xdr_netobj *checksum) +{ u32 maj_stat = 0; - int qop_state = 0; struct spkm3_ctx *sctx = ctx->internal_ctx_id; dprintk("RPC: gss_verify_mic_spkm3 calling spkm3_read_token\n"); - maj_stat = spkm3_read_token(sctx, checksum, signbuf, &qop_state, - SPKM_MIC_TOK); - - if (!maj_stat && qop_state) - *qstate = qop_state; + maj_stat = spkm3_read_token(sctx, checksum, signbuf, SPKM_MIC_TOK); dprintk("RPC: gss_verify_mic_spkm3 returning %d\n", maj_stat); return maj_stat; @@ -243,15 +238,15 @@ gss_verify_mic_spkm3(struct gss_ctx *ctx, static u32 gss_get_mic_spkm3(struct gss_ctx *ctx, - u32 qop, struct xdr_buf *message_buffer, - struct xdr_netobj *message_token) { + struct xdr_netobj *message_token) +{ u32 err = 0; struct spkm3_ctx *sctx = ctx->internal_ctx_id; dprintk("RPC: gss_get_mic_spkm3\n"); - err = spkm3_make_token(sctx, qop, message_buffer, + err = spkm3_make_token(sctx, message_buffer, message_token, SPKM_MIC_TOK); return err; } @@ -264,8 +259,8 @@ static struct gss_api_ops gss_spkm3_ops = { }; static struct pf_desc gss_spkm3_pfs[] = { - {RPC_AUTH_GSS_SPKM, 0, RPC_GSS_SVC_NONE, "spkm3"}, - {RPC_AUTH_GSS_SPKMI, 0, RPC_GSS_SVC_INTEGRITY, "spkm3i"}, + {RPC_AUTH_GSS_SPKM, RPC_GSS_SVC_NONE, "spkm3"}, + {RPC_AUTH_GSS_SPKMI, RPC_GSS_SVC_INTEGRITY, "spkm3i"}, }; static struct gss_api_mech gss_spkm3_mech = { diff --git a/net/sunrpc/auth_gss/gss_spkm3_seal.c b/net/sunrpc/auth_gss/gss_spkm3_seal.c index 25339868d46..148201e929d 100644 --- a/net/sunrpc/auth_gss/gss_spkm3_seal.c +++ b/net/sunrpc/auth_gss/gss_spkm3_seal.c @@ -51,7 +51,7 @@ */ u32 -spkm3_make_token(struct spkm3_ctx *ctx, int qop_req, +spkm3_make_token(struct spkm3_ctx *ctx, struct xdr_buf * text, struct xdr_netobj * token, int toktype) { @@ -68,8 +68,6 @@ spkm3_make_token(struct spkm3_ctx *ctx, int qop_req, dprintk("RPC: spkm3_make_token\n"); now = jiffies; - if (qop_req != 0) - goto out_err; if (ctx->ctx_id.len != 16) { dprintk("RPC: spkm3_make_token BAD ctx_id.len %d\n", diff --git a/net/sunrpc/auth_gss/gss_spkm3_unseal.c b/net/sunrpc/auth_gss/gss_spkm3_unseal.c index 65ce81bf0bc..c3c0d958610 100644 --- a/net/sunrpc/auth_gss/gss_spkm3_unseal.c +++ b/net/sunrpc/auth_gss/gss_spkm3_unseal.c @@ -52,7 +52,7 @@ u32 spkm3_read_token(struct spkm3_ctx *ctx, struct xdr_netobj *read_token, /* checksum */ struct xdr_buf *message_buffer, /* signbuf */ - int *qop_state, int toktype) + int toktype) { s32 code; struct xdr_netobj wire_cksum = {.len =0, .data = NULL}; diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index e3308195374..e4ada15ed85 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -566,8 +566,7 @@ gss_verify_header(struct svc_rqst *rqstp, struct rsc *rsci, if (rqstp->rq_deferred) /* skip verification of revisited request */ return SVC_OK; - if (gss_verify_mic(ctx_id, &rpchdr, &checksum, NULL) - != GSS_S_COMPLETE) { + if (gss_verify_mic(ctx_id, &rpchdr, &checksum) != GSS_S_COMPLETE) { *authp = rpcsec_gsserr_credproblem; return SVC_DENIED; } @@ -604,7 +603,7 @@ gss_write_verf(struct svc_rqst *rqstp, struct gss_ctx *ctx_id, u32 seq) xdr_buf_from_iov(&iov, &verf_data); p = rqstp->rq_res.head->iov_base + rqstp->rq_res.head->iov_len; mic.data = (u8 *)(p + 1); - maj_stat = gss_get_mic(ctx_id, 0, &verf_data, &mic); + maj_stat = gss_get_mic(ctx_id, &verf_data, &mic); if (maj_stat != GSS_S_COMPLETE) return -1; *p++ = htonl(mic.len); @@ -710,7 +709,7 @@ unwrap_integ_data(struct xdr_buf *buf, u32 seq, struct gss_ctx *ctx) goto out; if (read_bytes_from_xdr_buf(buf, integ_len + 4, mic.data, mic.len)) goto out; - maj_stat = gss_verify_mic(ctx, &integ_buf, &mic, NULL); + maj_stat = gss_verify_mic(ctx, &integ_buf, &mic); if (maj_stat != GSS_S_COMPLETE) goto out; if (ntohl(svc_getu32(&buf->head[0])) != seq) @@ -1012,7 +1011,7 @@ svcauth_gss_release(struct svc_rqst *rqstp) resv = &resbuf->tail[0]; } mic.data = (u8 *)resv->iov_base + resv->iov_len + 4; - if (gss_get_mic(gsd->rsci->mechctx, 0, &integ_buf, &mic)) + if (gss_get_mic(gsd->rsci->mechctx, &integ_buf, &mic)) goto out_err; svc_putu32(resv, htonl(mic.len)); memset(mic.data + mic.len, 0, -- cgit v1.2.3 From a0857d03b21fa54653c9d2fe7a315381176015b4 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 13 Oct 2005 16:55:23 -0400 Subject: RPCSEC_GSS: krb5 cleanup Remove some senseless wrappers. Signed-off-by: J. Bruce Fields Signed-off-by: Trond Myklebust --- net/sunrpc/auth_gss/gss_krb5_mech.c | 28 ---------------------------- net/sunrpc/auth_gss/gss_krb5_seal.c | 5 +++-- net/sunrpc/auth_gss/gss_krb5_unseal.c | 6 +++--- 3 files changed, 6 insertions(+), 33 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c index 9ffac2c50b9..5f1f806a0b1 100644 --- a/net/sunrpc/auth_gss/gss_krb5_mech.c +++ b/net/sunrpc/auth_gss/gss_krb5_mech.c @@ -190,34 +190,6 @@ gss_delete_sec_context_kerberos(void *internal_ctx) { kfree(kctx); } -static u32 -gss_verify_mic_kerberos(struct gss_ctx *ctx, - struct xdr_buf *message, - struct xdr_netobj *mic_token) -{ - u32 maj_stat = 0; - struct krb5_ctx *kctx = ctx->internal_ctx_id; - - maj_stat = krb5_read_token(kctx, mic_token, message); - - dprintk("RPC: gss_verify_mic_kerberos returning %d\n", maj_stat); - return maj_stat; -} - -static u32 -gss_get_mic_kerberos(struct gss_ctx *ctx, - struct xdr_buf *message, - struct xdr_netobj *mic_token) { - u32 err = 0; - struct krb5_ctx *kctx = ctx->internal_ctx_id; - - err = krb5_make_token(kctx, message, mic_token); - - dprintk("RPC: gss_get_mic_kerberos returning %d\n",err); - - return err; -} - static struct gss_api_ops gss_kerberos_ops = { .gss_import_sec_context = gss_import_sec_context_kerberos, .gss_get_mic = gss_get_mic_kerberos, diff --git a/net/sunrpc/auth_gss/gss_krb5_seal.c b/net/sunrpc/auth_gss/gss_krb5_seal.c index 15227c727c8..13f8ae97945 100644 --- a/net/sunrpc/auth_gss/gss_krb5_seal.c +++ b/net/sunrpc/auth_gss/gss_krb5_seal.c @@ -71,9 +71,10 @@ #endif u32 -krb5_make_token(struct krb5_ctx *ctx, - struct xdr_buf *text, struct xdr_netobj *token) +gss_get_mic_kerberos(struct gss_ctx *gss_ctx, struct xdr_buf *text, + struct xdr_netobj *token) { + struct krb5_ctx *ctx = gss_ctx->internal_ctx_id; s32 checksum_type; struct xdr_netobj md5cksum = {.len = 0, .data = NULL}; unsigned char *ptr, *krb5_hdr, *msg_start; diff --git a/net/sunrpc/auth_gss/gss_krb5_unseal.c b/net/sunrpc/auth_gss/gss_krb5_unseal.c index bcf978627a7..2030475d98e 100644 --- a/net/sunrpc/auth_gss/gss_krb5_unseal.c +++ b/net/sunrpc/auth_gss/gss_krb5_unseal.c @@ -72,10 +72,10 @@ * supposedly taken over. */ u32 -krb5_read_token(struct krb5_ctx *ctx, - struct xdr_netobj *read_token, - struct xdr_buf *message_buffer) +gss_verify_mic_kerberos(struct gss_ctx *gss_ctx, + struct xdr_buf *message_buffer, struct xdr_netobj *read_token) { + struct krb5_ctx *ctx = gss_ctx->internal_ctx_id; int signalg; int sealalg; s32 checksum_type; -- cgit v1.2.3 From 4bcde03d41d2264edb4ea3c47cb27da1e2609e48 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 26 Oct 2005 01:59:03 -0700 Subject: [PATCH] svcsock timestamp fix Convert nanoseconds to microseconds correctly. Spotted by Steve Dickson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- net/sunrpc/svcsock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 30ec3efc48a..691dea4a58e 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -587,7 +587,7 @@ svc_udp_recvfrom(struct svc_rqst *rqstp) struct timeval tv; tv.tv_sec = xtime.tv_sec; - tv.tv_usec = xtime.tv_nsec * 1000; + tv.tv_usec = xtime.tv_nsec / NSEC_PER_USEC; skb_set_timestamp(skb, &tv); /* Don't enable netstamp, sunrpc doesn't need that much accuracy */ -- cgit v1.2.3 From 6fa05b17367f04ada501e89d3b9cb56adec0d930 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 27 Oct 2005 19:08:18 -0400 Subject: Revert "RPC: stops the release_pipe() funtion from being called twice" This reverts 747c5534c9a6da4aa87e7cdc2209ea98ea27f381 commit. --- net/sunrpc/rpc_pipe.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index 649d609e7d9..ded6c63f11e 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -177,8 +177,6 @@ rpc_pipe_release(struct inode *inode, struct file *filp) __rpc_purge_upcall(inode, -EPIPE); if (rpci->ops->release_pipe) rpci->ops->release_pipe(inode); - if (!rpci->nreaders && !rpci->nwriters) - rpci->ops = NULL; out: up(&inode->i_sem); return 0; -- cgit v1.2.3 From 6070fe6f82c67021367092855c0812a98becf0fa Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 27 Oct 2005 22:12:46 -0400 Subject: RPC: Ensure that nobody can queue up new upcalls after rpc_close_pipes() Signed-off-by: Trond Myklebust --- net/sunrpc/rpc_pipe.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) (limited to 'net/sunrpc') diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index ded6c63f11e..4f188d0a5d1 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -76,25 +76,35 @@ int rpc_queue_upcall(struct inode *inode, struct rpc_pipe_msg *msg) { struct rpc_inode *rpci = RPC_I(inode); - int res = 0; + int res = -EPIPE; down(&inode->i_sem); + if (rpci->ops == NULL) + goto out; if (rpci->nreaders) { list_add_tail(&msg->list, &rpci->pipe); rpci->pipelen += msg->len; + res = 0; } else if (rpci->flags & RPC_PIPE_WAIT_FOR_OPEN) { if (list_empty(&rpci->pipe)) schedule_delayed_work(&rpci->queue_timeout, RPC_UPCALL_TIMEOUT); list_add_tail(&msg->list, &rpci->pipe); rpci->pipelen += msg->len; - } else - res = -EPIPE; + res = 0; + } +out: up(&inode->i_sem); wake_up(&rpci->waitq); return res; } +static inline void +rpc_inode_setowner(struct inode *inode, void *private) +{ + RPC_I(inode)->private = private; +} + static void rpc_close_pipes(struct inode *inode) { @@ -111,15 +121,10 @@ rpc_close_pipes(struct inode *inode) rpci->ops->release_pipe(inode); rpci->ops = NULL; } + rpc_inode_setowner(inode, NULL); up(&inode->i_sem); } -static inline void -rpc_inode_setowner(struct inode *inode, void *private) -{ - RPC_I(inode)->private = private; -} - static struct inode * rpc_alloc_inode(struct super_block *sb) { @@ -501,7 +506,6 @@ repeat: dentry = dvec[--n]; if (dentry->d_inode) { rpc_close_pipes(dentry->d_inode); - rpc_inode_setowner(dentry->d_inode, NULL); simple_unlink(dir, dentry); } dput(dentry); @@ -576,10 +580,8 @@ __rpc_rmdir(struct inode *dir, struct dentry *dentry) int error; shrink_dcache_parent(dentry); - if (dentry->d_inode) { + if (dentry->d_inode) rpc_close_pipes(dentry->d_inode); - rpc_inode_setowner(dentry->d_inode, NULL); - } if ((error = simple_rmdir(dir, dentry)) != 0) return error; if (!error) { @@ -732,7 +734,6 @@ rpc_unlink(char *path) d_drop(dentry); if (dentry->d_inode) { rpc_close_pipes(dentry->d_inode); - rpc_inode_setowner(dentry->d_inode, NULL); error = simple_unlink(dir, dentry); } dput(dentry); -- cgit v1.2.3