diff options
Diffstat (limited to 'fs')
45 files changed, 1180 insertions, 943 deletions
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index b0a0ae509c0..61c599b4a1e 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -127,12 +127,13 @@ static struct super_block *v9fs_get_sb(struct file_system_type if ((newfid = v9fs_session_init(v9ses, dev_name, data)) < 0) { dprintk(DEBUG_ERROR, "problem initiating session\n"); - kfree(v9ses); - return ERR_PTR(newfid); + sb = ERR_PTR(newfid); + goto out_free_session; } sb = sget(fs_type, NULL, v9fs_set_super, v9ses); - + if (IS_ERR(sb)) + goto out_close_session; v9fs_fill_super(sb, v9ses, flags); inode = v9fs_get_inode(sb, S_IFDIR | mode); @@ -185,6 +186,12 @@ static struct super_block *v9fs_get_sb(struct file_system_type return sb; +out_close_session: + v9fs_session_close(v9ses); +out_free_session: + kfree(v9ses); + return sb; + put_back_sb: /* deactivate_super calls v9fs_kill_super which will frees the rest */ up_write(&sb->s_umount); diff --git a/fs/Kconfig b/fs/Kconfig index e207be68d4c..2524629dc83 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -799,6 +799,7 @@ config PROC_KCORE config PROC_VMCORE bool "/proc/vmcore support (EXPERIMENTAL)" depends on PROC_FS && EXPERIMENTAL && CRASH_DUMP + default y help Exports the dump image of crashed kernel in ELF format. @@ -861,7 +862,7 @@ config RAMFS config CONFIGFS_FS tristate "Userspace-driven configuration filesystem (EXPERIMENTAL)" - depends on EXPERIMENTAL + depends on SYSFS && EXPERIMENTAL help configfs is a ram-based filesystem that provides the converse of sysfs's functionality. Where sysfs is a filesystem-based diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index 8ed9b06a982..5638c8f9362 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -504,7 +504,7 @@ static int populate_groups(struct config_group *group) int ret = 0; int i; - if (group && group->default_groups) { + if (group->default_groups) { /* FYI, we're faking mkdir here * I'm not sure we need this semaphore, as we're called * from our parent's mkdir. That holds our parent's diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 242fe1a66ce..1b4491cdd11 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -599,7 +599,7 @@ sys_epoll_ctl(int epfd, int op, int fd, struct epoll_event __user *event) switch (op) { case EPOLL_CTL_ADD: if (!epi) { - epds.events |= POLLERR | POLLHUP | POLLRDHUP; + epds.events |= POLLERR | POLLHUP; error = ep_insert(ep, &epds, tfile, fd); } else @@ -613,7 +613,7 @@ sys_epoll_ctl(int epfd, int op, int fd, struct epoll_event __user *event) break; case EPOLL_CTL_MOD: if (epi) { - epds.events |= POLLERR | POLLHUP | POLLRDHUP; + epds.events |= POLLERR | POLLHUP; error = ep_modify(ep, epi, &epds); } else error = -ENOENT; diff --git a/fs/exec.c b/fs/exec.c index 0291a68a362..3234a0c32d5 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -678,6 +678,18 @@ static int de_thread(struct task_struct *tsk) while (leader->exit_state != EXIT_ZOMBIE) yield(); + /* + * The only record we have of the real-time age of a + * process, regardless of execs it's done, is start_time. + * All the past CPU time is accumulated in signal_struct + * from sister threads now dead. But in this non-leader + * exec, nothing survives from the original leader thread, + * whose birth marks the true age of this process now. + * When we take on its identity by switching to its PID, we + * also take its birthdate (always earlier than our own). + */ + current->start_time = leader->start_time; + spin_lock(&leader->proc_lock); spin_lock(¤t->proc_lock); proc_dentry1 = proc_pid_unhash(current); @@ -723,7 +735,12 @@ static int de_thread(struct task_struct *tsk) current->parent = current->real_parent = leader->real_parent; leader->parent = leader->real_parent = child_reaper; current->group_leader = current; - leader->group_leader = leader; + leader->group_leader = current; + + /* Reduce leader to a thread */ + detach_pid(leader, PIDTYPE_PGID); + detach_pid(leader, PIDTYPE_SID); + list_del_init(&leader->tasks); add_parent(current); add_parent(leader); diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c index 1041dab6de2..14f5f6ea3e7 100644 --- a/fs/ext3/resize.c +++ b/fs/ext3/resize.c @@ -974,6 +974,7 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es, if (o_blocks_count != le32_to_cpu(es->s_blocks_count)) { ext3_warning(sb, __FUNCTION__, "multiple resizers run on filesystem!"); + unlock_super(sb); err = -EBUSY; goto exit_put; } diff --git a/fs/fifo.c b/fs/fifo.c index 889f722ee36..49035b174b4 100644 --- a/fs/fifo.c +++ b/fs/fifo.c @@ -15,30 +15,35 @@ #include <linux/fs.h> #include <linux/pipe_fs_i.h> -static void wait_for_partner(struct inode* inode, unsigned int* cnt) +static void wait_for_partner(struct inode* inode, unsigned int *cnt) { int cur = *cnt; - while(cur == *cnt) { - pipe_wait(inode); - if(signal_pending(current)) + + while (cur == *cnt) { + pipe_wait(inode->i_pipe); + if (signal_pending(current)) break; } } static void wake_up_partner(struct inode* inode) { - wake_up_interruptible(PIPE_WAIT(*inode)); + wake_up_interruptible(&inode->i_pipe->wait); } static int fifo_open(struct inode *inode, struct file *filp) { + struct pipe_inode_info *pipe; int ret; - mutex_lock(PIPE_MUTEX(*inode)); - if (!inode->i_pipe) { + mutex_lock(&inode->i_mutex); + pipe = inode->i_pipe; + if (!pipe) { ret = -ENOMEM; - if(!pipe_new(inode)) + pipe = alloc_pipe_info(inode); + if (!pipe) goto err_nocleanup; + inode->i_pipe = pipe; } filp->f_version = 0; @@ -53,18 +58,18 @@ static int fifo_open(struct inode *inode, struct file *filp) * opened, even when there is no process writing the FIFO. */ filp->f_op = &read_fifo_fops; - PIPE_RCOUNTER(*inode)++; - if (PIPE_READERS(*inode)++ == 0) + pipe->r_counter++; + if (pipe->readers++ == 0) wake_up_partner(inode); - if (!PIPE_WRITERS(*inode)) { + if (!pipe->writers) { if ((filp->f_flags & O_NONBLOCK)) { /* suppress POLLHUP until we have * seen a writer */ - filp->f_version = PIPE_WCOUNTER(*inode); + filp->f_version = pipe->w_counter; } else { - wait_for_partner(inode, &PIPE_WCOUNTER(*inode)); + wait_for_partner(inode, &pipe->w_counter); if(signal_pending(current)) goto err_rd; } @@ -78,16 +83,16 @@ static int fifo_open(struct inode *inode, struct file *filp) * errno=ENXIO when there is no process reading the FIFO. */ ret = -ENXIO; - if ((filp->f_flags & O_NONBLOCK) && !PIPE_READERS(*inode)) + if ((filp->f_flags & O_NONBLOCK) && !pipe->readers) goto err; filp->f_op = &write_fifo_fops; - PIPE_WCOUNTER(*inode)++; - if (!PIPE_WRITERS(*inode)++) + pipe->w_counter++; + if (!pipe->writers++) wake_up_partner(inode); - if (!PIPE_READERS(*inode)) { - wait_for_partner(inode, &PIPE_RCOUNTER(*inode)); + if (!pipe->readers) { + wait_for_partner(inode, &pipe->r_counter); if (signal_pending(current)) goto err_wr; } @@ -102,11 +107,11 @@ static int fifo_open(struct inode *inode, struct file *filp) */ filp->f_op = &rdwr_fifo_fops; - PIPE_READERS(*inode)++; - PIPE_WRITERS(*inode)++; - PIPE_RCOUNTER(*inode)++; - PIPE_WCOUNTER(*inode)++; - if (PIPE_READERS(*inode) == 1 || PIPE_WRITERS(*inode) == 1) + pipe->readers++; + pipe->writers++; + pipe->r_counter++; + pipe->w_counter++; + if (pipe->readers == 1 || pipe->writers == 1) wake_up_partner(inode); break; @@ -116,27 +121,27 @@ static int fifo_open(struct inode *inode, struct file *filp) } /* Ok! */ - mutex_unlock(PIPE_MUTEX(*inode)); + mutex_unlock(&inode->i_mutex); return 0; err_rd: - if (!--PIPE_READERS(*inode)) - wake_up_interruptible(PIPE_WAIT(*inode)); + if (!--pipe->readers) + wake_up_interruptible(&pipe->wait); ret = -ERESTARTSYS; goto err; err_wr: - if (!--PIPE_WRITERS(*inode)) - wake_up_interruptible(PIPE_WAIT(*inode)); + if (!--pipe->writers) + wake_up_interruptible(&pipe->wait); ret = -ERESTARTSYS; goto err; err: - if (!PIPE_READERS(*inode) && !PIPE_WRITERS(*inode)) + if (!pipe->readers && !pipe->writers) free_pipe_info(inode); err_nocleanup: - mutex_unlock(PIPE_MUTEX(*inode)); + mutex_unlock(&inode->i_mutex); return ret; } diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 23d1f52eb1b..6c740f86066 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1,6 +1,6 @@ /* FUSE: Filesystem in Userspace - Copyright (C) 2001-2005 Miklos Szeredi <miklos@szeredi.hu> + Copyright (C) 2001-2006 Miklos Szeredi <miklos@szeredi.hu> This program can be distributed under the terms of the GNU GPL. See the file COPYING. @@ -23,13 +23,11 @@ static kmem_cache_t *fuse_req_cachep; static struct fuse_conn *fuse_get_conn(struct file *file) { - struct fuse_conn *fc; - spin_lock(&fuse_lock); - fc = file->private_data; - if (fc && !fc->connected) - fc = NULL; - spin_unlock(&fuse_lock); - return fc; + /* + * Lockless access is OK, because file->private data is set + * once during mount and is valid until the file is released. + */ + return file->private_data; } static void fuse_request_init(struct fuse_req *req) @@ -74,10 +72,8 @@ static void restore_sigs(sigset_t *oldset) */ void fuse_reset_request(struct fuse_req *req) { - int preallocated = req->preallocated; BUG_ON(atomic_read(&req->count) != 1); fuse_request_init(req); - req->preallocated = preallocated; } static void __fuse_get_request(struct fuse_req *req) @@ -92,80 +88,52 @@ static void __fuse_put_request(struct fuse_req *req) atomic_dec(&req->count); } -static struct fuse_req *do_get_request(struct fuse_conn *fc) +struct fuse_req *fuse_get_req(struct fuse_conn *fc) { struct fuse_req *req; - - spin_lock(&fuse_lock); - BUG_ON(list_empty(&fc->unused_list)); - req = list_entry(fc->unused_list.next, struct fuse_req, list); - list_del_init(&req->list); - spin_unlock(&fuse_lock); - fuse_request_init(req); - req->preallocated = 1; - req->in.h.uid = current->fsuid; - req->in.h.gid = current->fsgid; - req->in.h.pid = current->pid; - return req; -} - -/* This can return NULL, but only in case it's interrupted by a SIGKILL */ -struct fuse_req *fuse_get_request(struct fuse_conn *fc) -{ - int intr; sigset_t oldset; + int err; - atomic_inc(&fc->num_waiting); block_sigs(&oldset); - intr = down_interruptible(&fc->outstanding_sem); + err = wait_event_interruptible(fc->blocked_waitq, !fc->blocked); restore_sigs(&oldset); - if (intr) { - atomic_dec(&fc->num_waiting); - return NULL; - } - return do_get_request(fc); -} + if (err) + return ERR_PTR(-EINTR); -/* Must be called with fuse_lock held */ -static void fuse_putback_request(struct fuse_conn *fc, struct fuse_req *req) -{ - if (req->preallocated) { - atomic_dec(&fc->num_waiting); - list_add(&req->list, &fc->unused_list); - } else - fuse_request_free(req); + req = fuse_request_alloc(); + if (!req) + return ERR_PTR(-ENOMEM); - /* If we are in debt decrease that first */ - if (fc->outstanding_debt) - fc->outstanding_debt--; - else - up(&fc->outstanding_sem); + atomic_inc(&fc->num_waiting); + fuse_request_init(req); + req->in.h.uid = current->fsuid; + req->in.h.gid = current->fsgid; + req->in.h.pid = current->pid; + return req; } void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req) { if (atomic_dec_and_test(&req->count)) { - spin_lock(&fuse_lock); - fuse_putback_request(fc, req); - spin_unlock(&fuse_lock); + atomic_dec(&fc->num_waiting); + fuse_request_free(req); } } -static void fuse_put_request_locked(struct fuse_conn *fc, struct fuse_req *req) -{ - if (atomic_dec_and_test(&req->count)) - fuse_putback_request(fc, req); -} - -void fuse_release_background(struct fuse_req *req) +void fuse_release_background(struct fuse_conn *fc, struct fuse_req *req) { iput(req->inode); iput(req->inode2); if (req->file) fput(req->file); - spin_lock(&fuse_lock); + spin_lock(&fc->lock); list_del(&req->bg_entry); - spin_unlock(&fuse_lock); + if (fc->num_background == FUSE_MAX_BACKGROUND) { + fc->blocked = 0; + wake_up_all(&fc->blocked_waitq); + } + fc->num_background--; + spin_unlock(&fc->lock); } /* @@ -184,23 +152,23 @@ void fuse_release_background(struct fuse_req *req) * interrupted and put in the background, it will return with an error * and hence never be reset and reused. * - * Called with fuse_lock, unlocks it + * Called with fc->lock, unlocks it */ static void request_end(struct fuse_conn *fc, struct fuse_req *req) { list_del(&req->list); req->state = FUSE_REQ_FINISHED; if (!req->background) { + spin_unlock(&fc->lock); wake_up(&req->waitq); - fuse_put_request_locked(fc, req); - spin_unlock(&fuse_lock); + fuse_put_request(fc, req); } else { void (*end) (struct fuse_conn *, struct fuse_req *) = req->end; req->end = NULL; - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); down_read(&fc->sbput_sem); if (fc->mounted) - fuse_release_background(req); + fuse_release_background(fc, req); up_read(&fc->sbput_sem); if (end) end(fc, req); @@ -242,6 +210,9 @@ static void background_request(struct fuse_conn *fc, struct fuse_req *req) { req->background = 1; list_add(&req->bg_entry, &fc->background); + fc->num_background++; + if (fc->num_background == FUSE_MAX_BACKGROUND) + fc->blocked = 1; if (req->inode) req->inode = igrab(req->inode); if (req->inode2) @@ -250,16 +221,16 @@ static void background_request(struct fuse_conn *fc, struct fuse_req *req) get_file(req->file); } -/* Called with fuse_lock held. Releases, and then reacquires it. */ +/* Called with fc->lock held. Releases, and then reacquires it. */ static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req) { sigset_t oldset; - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); block_sigs(&oldset); wait_event_interruptible(req->waitq, req->state == FUSE_REQ_FINISHED); restore_sigs(&oldset); - spin_lock(&fuse_lock); + spin_lock(&fc->lock); if (req->state == FUSE_REQ_FINISHED && !req->interrupted) return; @@ -273,9 +244,9 @@ static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req) locked state, there mustn't be any filesystem operation (e.g. page fault), since that could lead to deadlock */ - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); wait_event(req->waitq, !req->locked); - spin_lock(&fuse_lock); + spin_lock(&fc->lock); } if (req->state == FUSE_REQ_PENDING) { list_del(&req->list); @@ -304,19 +275,10 @@ static void queue_request(struct fuse_conn *fc, struct fuse_req *req) req->in.h.unique = fc->reqctr; req->in.h.len = sizeof(struct fuse_in_header) + len_args(req->in.numargs, (struct fuse_arg *) req->in.args); - if (!req->preallocated) { - /* If request is not preallocated (either FORGET or - RELEASE), then still decrease outstanding_sem, so - user can't open infinite number of files while not - processing the RELEASE requests. However for - efficiency do it without blocking, so if down() - would block, just increase the debt instead */ - if (down_trylock(&fc->outstanding_sem)) - fc->outstanding_debt++; - } list_add_tail(&req->list, &fc->pending); req->state = FUSE_REQ_PENDING; wake_up(&fc->waitq); + kill_fasync(&fc->fasync, SIGIO, POLL_IN); } /* @@ -325,7 +287,7 @@ static void queue_request(struct fuse_conn *fc, struct fuse_req *req) void request_send(struct fuse_conn *fc, struct fuse_req *req) { req->isreply = 1; - spin_lock(&fuse_lock); + spin_lock(&fc->lock); if (!fc->connected) req->out.h.error = -ENOTCONN; else if (fc->conn_error) @@ -338,15 +300,16 @@ void request_send(struct fuse_conn *fc, struct fuse_req *req) request_wait_answer(fc, req); } - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); } static void request_send_nowait(struct fuse_conn *fc, struct fuse_req *req) { - spin_lock(&fuse_lock); + spin_lock(&fc->lock); + background_request(fc, req); if (fc->connected) { queue_request(fc, req); - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); } else { req->out.h.error = -ENOTCONN; request_end(fc, req); @@ -362,9 +325,6 @@ void request_send_noreply(struct fuse_conn *fc, struct fuse_req *req) void request_send_background(struct fuse_conn *fc, struct fuse_req *req) { req->isreply = 1; - spin_lock(&fuse_lock); - background_request(fc, req); - spin_unlock(&fuse_lock); request_send_nowait(fc, req); } @@ -373,16 +333,16 @@ void request_send_background(struct fuse_conn *fc, struct fuse_req *req) * anything that could cause a page-fault. If the request was already * interrupted bail out. */ -static int lock_request(struct fuse_req *req) +static int lock_request(struct fuse_conn *fc, struct fuse_req *req) { int err = 0; if (req) { - spin_lock(&fuse_lock); + spin_lock(&fc->lock); if (req->interrupted) err = -ENOENT; else req->locked = 1; - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); } return err; } @@ -392,18 +352,19 @@ static int lock_request(struct fuse_req *req) * requester thread is currently waiting for it to be unlocked, so * wake it up. */ -static void unlock_request(struct fuse_req *req) +static void unlock_request(struct fuse_conn *fc, struct fuse_req *req) { if (req) { - spin_lock(&fuse_lock); + spin_lock(&fc->lock); req->locked = 0; if (req->interrupted) wake_up(&req->waitq); - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); } } struct fuse_copy_state { + struct fuse_conn *fc; int write; struct fuse_req *req; const struct iovec *iov; @@ -416,11 +377,12 @@ struct fuse_copy_state { unsigned len; }; -static void fuse_copy_init(struct fuse_copy_state *cs, int write, - struct fuse_req *req, const struct iovec *iov, - unsigned long nr_segs) +static void fuse_copy_init(struct fuse_copy_state *cs, struct fuse_conn *fc, + int write, struct fuse_req *req, + const struct iovec *iov, unsigned long nr_segs) { memset(cs, 0, sizeof(*cs)); + cs->fc = fc; cs->write = write; cs->req = req; cs->iov = iov; @@ -450,7 +412,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) unsigned long offset; int err; - unlock_request(cs->req); + unlock_request(cs->fc, cs->req); fuse_copy_finish(cs); if (!cs->seglen) { BUG_ON(!cs->nr_segs); @@ -473,7 +435,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) cs->seglen -= cs->len; cs->addr += cs->len; - return lock_request(cs->req); + return lock_request(cs->fc, cs->req); } /* Do as much copy to/from userspace buffer as we can */ @@ -585,9 +547,9 @@ static void request_wait(struct fuse_conn *fc) if (signal_pending(current)) break; - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); schedule(); - spin_lock(&fuse_lock); + spin_lock(&fc->lock); } set_current_state(TASK_RUNNING); remove_wait_queue(&fc->waitq, &wait); @@ -606,18 +568,21 @@ static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov, unsigned long nr_segs, loff_t *off) { int err; - struct fuse_conn *fc; struct fuse_req *req; struct fuse_in *in; struct fuse_copy_state cs; unsigned reqsize; + struct fuse_conn *fc = fuse_get_conn(file); + if (!fc) + return -EPERM; restart: - spin_lock(&fuse_lock); - fc = file->private_data; - err = -EPERM; - if (!fc) + spin_lock(&fc->lock); + err = -EAGAIN; + if ((file->f_flags & O_NONBLOCK) && fc->connected && + list_empty(&fc->pending)) goto err_unlock; + request_wait(fc); err = -ENODEV; if (!fc->connected) @@ -641,14 +606,14 @@ static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov, request_end(fc, req); goto restart; } - spin_unlock(&fuse_lock); - fuse_copy_init(&cs, 1, req, iov, nr_segs); + spin_unlock(&fc->lock); + fuse_copy_init(&cs, fc, 1, req, iov, nr_segs); err = fuse_copy_one(&cs, &in->h, sizeof(in->h)); if (!err) err = fuse_copy_args(&cs, in->numargs, in->argpages, (struct fuse_arg *) in->args, 0); fuse_copy_finish(&cs); - spin_lock(&fuse_lock); + spin_lock(&fc->lock); req->locked = 0; if (!err && req->interrupted) err = -ENOENT; @@ -663,12 +628,12 @@ static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov, else { req->state = FUSE_REQ_SENT; list_move_tail(&req->list, &fc->processing); - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); } return reqsize; err_unlock: - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); return err; } @@ -735,9 +700,9 @@ static ssize_t fuse_dev_writev(struct file *file, const struct iovec *iov, struct fuse_copy_state cs; struct fuse_conn *fc = fuse_get_conn(file); if (!fc) - return -ENODEV; + return -EPERM; - fuse_copy_init(&cs, 0, NULL, iov, nr_segs); + fuse_copy_init(&cs, fc, 0, NULL, iov, nr_segs); if (nbytes < sizeof(struct fuse_out_header)) return -EINVAL; @@ -749,7 +714,7 @@ static ssize_t fuse_dev_writev(struct file *file, const struct iovec *iov, oh.len != nbytes) goto err_finish; - spin_lock(&fuse_lock); + spin_lock(&fc->lock); err = -ENOENT; if (!fc->connected) goto err_unlock; @@ -760,9 +725,9 @@ static ssize_t fuse_dev_writev(struct file *file, const struct iovec *iov, goto err_unlock; if (req->interrupted) { - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); fuse_copy_finish(&cs); - spin_lock(&fuse_lock); + spin_lock(&fc->lock); request_end(fc, req); return -ENOENT; } @@ -770,12 +735,12 @@ static ssize_t fuse_dev_writev(struct file *file, const struct iovec *iov, req->out.h = oh; req->locked = 1; cs.req = req; - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); err = copy_out_args(&cs, &req->out, nbytes); fuse_copy_finish(&cs); - spin_lock(&fuse_lock); + spin_lock(&fc->lock); req->locked = 0; if (!err) { if (req->interrupted) @@ -787,7 +752,7 @@ static ssize_t fuse_dev_writev(struct file *file, const struct iovec *iov, return err ? err : nbytes; err_unlock: - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); err_finish: fuse_copy_finish(&cs); return err; @@ -804,18 +769,19 @@ static ssize_t fuse_dev_write(struct file *file, const char __user *buf, static unsigned fuse_dev_poll(struct file *file, poll_table *wait) { - struct fuse_conn *fc = fuse_get_conn(file); unsigned mask = POLLOUT | POLLWRNORM; - + struct fuse_conn *fc = fuse_get_conn(file); if (!fc) - return -ENODEV; + return POLLERR; poll_wait(file, &fc->waitq, wait); - spin_lock(&fuse_lock); - if (!list_empty(&fc->pending)) - mask |= POLLIN | POLLRDNORM; - spin_unlock(&fuse_lock); + spin_lock(&fc->lock); + if (!fc->connected) + mask = POLLERR; + else if (!list_empty(&fc->pending)) + mask |= POLLIN | POLLRDNORM; + spin_unlock(&fc->lock); return mask; } @@ -823,7 +789,7 @@ static unsigned fuse_dev_poll(struct file *file, poll_table *wait) /* * Abort all requests on the given list (pending or processing) * - * This function releases and reacquires fuse_lock + * This function releases and reacquires fc->lock */ static void end_requests(struct fuse_conn *fc, struct list_head *head) { @@ -832,7 +798,7 @@ static void end_requests(struct fuse_conn *fc, struct list_head *head) req = list_entry(head->next, struct fuse_req, list); req->out.h.error = -ECONNABORTED; request_end(fc, req); - spin_lock(&fuse_lock); + spin_lock(&fc->lock); } } @@ -863,10 +829,10 @@ static void end_io_requests(struct fuse_conn *fc) req->end = NULL; /* The end function will consume this reference */ __fuse_get_request(req); - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); wait_event(req->waitq, !req->locked); end(fc, req); - spin_lock(&fuse_lock); + spin_lock(&fc->lock); } } } @@ -893,35 +859,44 @@ static void end_io_requests(struct fuse_conn *fc) */ void fuse_abort_conn(struct fuse_conn *fc) { - spin_lock(&fuse_lock); + spin_lock(&fc->lock); if (fc->connected) { fc->connected = 0; end_io_requests(fc); end_requests(fc, &fc->pending); end_requests(fc, &fc->processing); wake_up_all(&fc->waitq); + kill_fasync(&fc->fasync, SIGIO, POLL_IN); } - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); } static int fuse_dev_release(struct inode *inode, struct file *file) { - struct fuse_conn *fc; - - spin_lock(&fuse_lock); - fc = file->private_data; + struct fuse_conn *fc = fuse_get_conn(file); if (fc) { + spin_lock(&fc->lock); fc->connected = 0; end_requests(fc, &fc->pending); end_requests(fc, &fc->processing); - } - spin_unlock(&fuse_lock); - if (fc) + spin_unlock(&fc->lock); + fasync_helper(-1, file, 0, &fc->fasync); kobject_put(&fc->kobj); + } return 0; } +static int fuse_dev_fasync(int fd, struct file *file, int on) +{ + struct fuse_conn *fc = fuse_get_conn(file); + if (!fc) + return -EPERM; + + /* No locking - fasync_helper does its own locking */ + return fasync_helper(fd, file, on, &fc->fasync); +} + const struct file_operations fuse_dev_operations = { .owner = THIS_MODULE, .llseek = no_llseek, @@ -931,6 +906,7 @@ const struct file_operations fuse_dev_operations = { .writev = fuse_dev_writev, .poll = fuse_dev_poll, .release = fuse_dev_release, + .fasync = fuse_dev_fasync, }; static struct miscdevice fuse_miscdevice = { diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 256355b8025..8d7546e832e 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -117,8 +117,8 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd) return 0; fc = get_fuse_conn(inode); - req = fuse_get_request(fc); - if (!req) + req = fuse_get_req(fc); + if (IS_ERR(req)) return 0; fuse_lookup_init(req, entry->d_parent->d_inode, entry, &outarg); @@ -188,9 +188,9 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, if (entry->d_name.len > FUSE_NAME_MAX) return ERR_PTR(-ENAMETOOLONG); - req = fuse_get_request(fc); - if (!req) - return ERR_PTR(-EINTR); + req = fuse_get_req(fc); + if (IS_ERR(req)) + return ERR_PTR(PTR_ERR(req)); fuse_lookup_init(req, dir, entry, &outarg); request_send(fc, req); @@ -244,15 +244,14 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode, struct file *file; int flags = nd->intent.open.flags - 1; - err = -ENOSYS; if (fc->no_create) - goto out; + return -ENOSYS; - err = -EINTR; - req = fuse_get_request(fc); - if (!req) - goto out; + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); + err = -ENOMEM; ff = fuse_file_alloc(); if (!ff) goto out_put_request; @@ -314,7 +313,6 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode, fuse_file_free(ff); out_put_request: fuse_put_request(fc, req); - out: return err; } @@ -375,9 +373,9 @@ static int fuse_mknod(struct inode *dir, struct dentry *entry, int mode, { struct fuse_mknod_in inarg; struct fuse_conn *fc = get_fuse_conn(dir); - struct fuse_req *req = fuse_get_request(fc); - if (!req) - return -EINTR; + struct fuse_req *req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); memset(&inarg, 0, sizeof(inarg)); inarg.mode = mode; @@ -407,9 +405,9 @@ static int fuse_mkdir(struct inode *dir, struct dentry *entry, int mode) { struct fuse_mkdir_in inarg; struct fuse_conn *fc = get_fuse_conn(dir); - struct fuse_req *req = fuse_get_request(fc); - if (!req) - return -EINTR; + struct fuse_req *req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); memset(&inarg, 0, sizeof(inarg)); inarg.mode = mode; @@ -427,9 +425,9 @@ static int fuse_symlink(struct inode *dir, struct dentry *entry, { struct fuse_conn *fc = get_fuse_conn(dir); unsigned len = strlen(link) + 1; - struct fuse_req *req = fuse_get_request(fc); - if (!req) - return -EINTR; + struct fuse_req *req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); req->in.h.opcode = FUSE_SYMLINK; req->in.numargs = 2; @@ -444,9 +442,9 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry) { int err; struct fuse_conn *fc = get_fuse_conn(dir); - struct fuse_req *req = fuse_get_request(fc); - if (!req) - return -EINTR; + struct fuse_req *req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); req->in.h.opcode = FUSE_UNLINK; req->in.h.nodeid = get_node_id(dir); @@ -476,9 +474,9 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry) { int err; struct fuse_conn *fc = get_fuse_conn(dir); - struct fuse_req *req = fuse_get_request(fc); - if (!req) - return -EINTR; + struct fuse_req *req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); req->in.h.opcode = FUSE_RMDIR; req->in.h.nodeid = get_node_id(dir); @@ -504,9 +502,9 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent, int err; struct fuse_rename_in inarg; struct fuse_conn *fc = get_fuse_conn(olddir); - struct fuse_req *req = fuse_get_request(fc); - if (!req) - return -EINTR; + struct fuse_req *req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); memset(&inarg, 0, sizeof(inarg)); inarg.newdir = get_node_id(newdir); @@ -553,9 +551,9 @@ static int fuse_link(struct dentry *entry, struct inode *newdir, struct fuse_link_in inarg; struct inode *inode = entry->d_inode; struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_req *req = fuse_get_request(fc); - if (!req) - return -EINTR; + struct fuse_req *req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); memset(&inarg, 0, sizeof(inarg)); inarg.oldnodeid = get_node_id(inode); @@ -583,9 +581,9 @@ int fuse_do_getattr(struct inode *inode) int err; struct fuse_attr_out arg; struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_req *req = fuse_get_request(fc); - if (!req) - return -EINTR; + struct fuse_req *req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); req->in.h.opcode = FUSE_GETATTR; req->in.h.nodeid = get_node_id(inode); @@ -673,9 +671,9 @@ static int fuse_access(struct inode *inode, int mask) if (fc->no_access) return 0; - req = fuse_get_request(fc); - if (!req) - return -EINTR; + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); memset(&inarg, 0, sizeof(inarg)); inarg.mask = mask; @@ -780,9 +778,9 @@ static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir) if (is_bad_inode(inode)) return -EIO; - req = fuse_get_request(fc); - if (!req) - return -EINTR; + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); page = alloc_page(GFP_KERNEL); if (!page) { @@ -809,11 +807,11 @@ static char *read_link(struct dentry *dentry) { struct inode *inode = dentry->d_inode; struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_req *req = fuse_get_request(fc); + struct fuse_req *req = fuse_get_req(fc); char *link; - if (!req) - return ERR_PTR(-EINTR); + if (IS_ERR(req)) + return ERR_PTR(PTR_ERR(req)); link = (char *) __get_free_page(GFP_KERNEL); if (!link) { @@ -933,9 +931,9 @@ static int fuse_setattr(struct dentry *entry, struct iattr *attr) } } - req = fuse_get_request(fc); - if (!req) - return -EINTR; + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); memset(&inarg, 0, sizeof(inarg)); iattr_to_fattr(attr, &inarg); @@ -995,9 +993,9 @@ static int fuse_setxattr(struct dentry *entry, const char *name, if (fc->no_setxattr) return -EOPNOTSUPP; - req = fuse_get_request(fc); - if (!req) - return -EINTR; + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); memset(&inarg, 0, sizeof(inarg)); inarg.size = size; @@ -1035,9 +1033,9 @@ static ssize_t fuse_getxattr(struct dentry *entry, const char *name, if (fc->no_getxattr) return -EOPNOTSUPP; - req = fuse_get_request(fc); - if (!req) - return -EINTR; + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); memset(&inarg, 0, sizeof(inarg)); inarg.size = size; @@ -1085,9 +1083,9 @@ static ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size) if (fc->no_listxattr) return -EOPNOTSUPP; - req = fuse_get_request(fc); - if (!req) - return -EINTR; + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); memset(&inarg, 0, sizeof(inarg)); inarg.size = size; @@ -1131,9 +1129,9 @@ static int fuse_removexattr(struct dentry *entry, const char *name) if (fc->no_removexattr) return -EOPNOTSUPP; - req = fuse_get_request(fc); - if (!req) - return -EINTR; + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); req->in.h.opcode = FUSE_REMOVEXATTR; req->in.h.nodeid = get_node_id(inode); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 975f2697e86..e4f041a11bb 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -22,9 +22,9 @@ static int fuse_send_open(struct inode *inode, struct file *file, int isdir, struct fuse_req *req; int err; - req = fuse_get_request(fc); - if (!req) - return -EINTR; + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); memset(&inarg, 0, sizeof(inarg)); inarg.flags = file->f_flags & ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); @@ -184,9 +184,9 @@ static int fuse_flush(struct file *file) if (fc->no_flush) return 0; - req = fuse_get_request(fc); - if (!req) - return -EINTR; + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); memset(&inarg, 0, sizeof(inarg)); inarg.fh = ff->fh; @@ -223,9 +223,9 @@ int fuse_fsync_common(struct file *file, struct dentry *de, int datasync, if ((!isdir && fc->no_fsync) || (isdir && fc->no_fsyncdir)) return 0; - req = fuse_get_request(fc); - if (!req) - return -EINTR; + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); memset(&inarg, 0, sizeof(inarg)); inarg.fh = ff->fh; @@ -297,9 +297,9 @@ static int fuse_readpage(struct file *file, struct page *page) if (is_bad_inode(inode)) goto out; - err = -EINTR; - req = fuse_get_request(fc); - if (!req) + req = fuse_get_req(fc); + err = PTR_ERR(req); + if (IS_ERR(req)) goto out; req->out.page_zeroing = 1; @@ -368,10 +368,10 @@ static int fuse_readpages_fill(void *_data, struct page *page) (req->num_pages + 1) * PAGE_CACHE_SIZE > fc->max_read || req->pages[req->num_pages - 1]->index + 1 != page->index)) { fuse_send_readpages(req, data->file, inode); - data->req = req = fuse_get_request(fc); - if (!req) { + data->req = req = fuse_get_req(fc); + if (IS_ERR(req)) { unlock_page(page); - return -EINTR; + return PTR_ERR(req); } } req->pages[req->num_pages] = page; @@ -392,13 +392,17 @@ static int fuse_readpages(struct file *file, struct address_space *mapping, data.file = file; data.inode = inode; - data.req = fuse_get_request(fc); - if (!data.req) - return -EINTR; + data.req = fuse_get_req(fc); + if (IS_ERR(data.req)) + return PTR_ERR(data.req); err = read_cache_pages(mapping, pages, fuse_readpages_fill, &data); - if (!err) - fuse_send_readpages(data.req, file, inode); + if (!err) { + if (data.req->num_pages) + fuse_send_readpages(data.req, file, inode); + else + fuse_put_request(fc, data.req); + } return err; } @@ -451,9 +455,9 @@ static int fuse_commit_write(struct file *file, struct page *page, if (is_bad_inode(inode)) return -EIO; - req = fuse_get_request(fc); - if (!req) - return -EINTR; + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); req->num_pages = 1; req->pages[0] = page; @@ -528,9 +532,9 @@ static ssize_t fuse_direct_io(struct file *file, const char __user *buf, if (is_bad_inode(inode)) return -EIO; - req = fuse_get_request(fc); - if (!req) - return -EINTR; + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); while (count) { size_t nres; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index a16a04fcf41..19c7185a754 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -1,6 +1,6 @@ /* FUSE: Filesystem in Userspace - Copyright (C) 2001-2005 Miklos Szeredi <miklos@szeredi.hu> + Copyright (C) 2001-2006 Miklos Szeredi <miklos@szeredi.hu> This program can be distributed under the terms of the GNU GPL. See the file COPYING. @@ -18,8 +18,8 @@ /** Max number of pages that can be used in a single read request */ #define FUSE_MAX_PAGES_PER_REQ 32 -/** If more requests are outstanding, then the operation will block */ -#define FUSE_MAX_OUTSTANDING 10 +/** Maximum number of outstanding background requests */ +#define FUSE_MAX_BACKGROUND 10 /** It could be as large as PATH_MAX, but would that have any uses? */ #define FUSE_NAME_MAX 1024 @@ -131,8 +131,8 @@ struct fuse_conn; * A request to the client */ struct fuse_req { - /** This can be on either unused_list, pending processing or - io lists in fuse_conn */ + /** This can be on either pending processing or io lists in + fuse_conn */ struct list_head list; /** Entry on the background list */ @@ -144,15 +144,12 @@ struct fuse_req { /* * The following bitfields are either set once before the * request is queued or setting/clearing them is protected by - * fuse_lock + * fuse_conn->lock */ /** True if the request has reply */ unsigned isreply:1; - /** The request is preallocated */ - unsigned preallocated:1; - /** The request was interrupted */ unsigned interrupted:1; @@ -213,6 +210,9 @@ struct fuse_req { * unmounted. */ struct fuse_conn { + /** Lock protecting accessess to members of this structure */ + spinlock_t lock; + /** The user id for this mount */ uid_t user_id; @@ -244,19 +244,20 @@ struct fuse_conn { interrupted request) */ struct list_head background; - /** Controls the maximum number of outstanding requests */ - struct semaphore outstanding_sem; + /** Number of requests currently in the background */ + unsigned num_background; + + /** Flag indicating if connection is blocked. This will be + the case before the INIT reply is received, and if there + are too many outstading backgrounds requests */ + int blocked; - /** This counts the number of outstanding requests if - outstanding_sem would go negative */ - unsigned outstanding_debt; + /** waitq for blocked connection */ + wait_queue_head_t blocked_waitq; /** RW semaphore for exclusion with fuse_put_super() */ struct rw_semaphore sbput_sem; - /** The list of unused requests */ - struct list_head unused_list; - /** The next unique request id */ u64 reqctr; @@ -318,6 +319,9 @@ struct fuse_conn { /** kobject */ struct kobject kobj; + + /** O_ASYNC requests */ + struct fasync_struct *fasync; }; static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb) @@ -349,21 +353,6 @@ static inline u64 get_node_id(struct inode *inode) extern const struct file_operations fuse_dev_operations; /** - * This is the single global spinlock which protects FUSE's structures - * - * The following data is protected by this lock: - * - * - the private_data field of the device file - * - the s_fs_info field of the super block - * - unused_list, pending, processing lists in fuse_conn - * - background list in fuse_conn - * - the unique request ID counter reqctr in fuse_conn - * - the sb (super_block) field in fuse_conn - * - the file (device file) field in fuse_conn - */ -extern spinlock_t fuse_lock; - -/** * Get a filled in inode */ struct inode *fuse_iget(struct super_block *sb, unsigned long nodeid, @@ -461,11 +450,11 @@ void fuse_reset_request(struct fuse_req *req); /** * Reserve a preallocated request */ -struct fuse_req *fuse_get_request(struct fuse_conn *fc); +struct fuse_req *fuse_get_req(struct fuse_conn *fc); /** - * Decrement reference count of a request. If count goes to zero put - * on unused list (preallocated) or free request (not preallocated). + * Decrement reference count of a request. If count goes to zero free + * the request. */ void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req); @@ -487,7 +476,7 @@ void request_send_background(struct fuse_conn *fc, struct fuse_req *req); /** * Release inodes and file associated with background request */ -void fuse_release_background(struct fuse_req *req); +void fuse_release_background(struct fuse_conn *fc, struct fuse_req *req); /* Abort all requests */ void fuse_abort_conn(struct fuse_conn *fc); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 879e6fba948..fd34037b058 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1,6 +1,6 @@ /* FUSE: Filesystem in Userspace - Copyright (C) 2001-2005 Miklos Szeredi <miklos@szeredi.hu> + Copyright (C) 2001-2006 Miklos Szeredi <miklos@szeredi.hu> This program can be distributed under the terms of the GNU GPL. See the file COPYING. @@ -22,7 +22,6 @@ MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>"); MODULE_DESCRIPTION("Filesystem in Userspace"); MODULE_LICENSE("GPL"); -spinlock_t fuse_lock; static kmem_cache_t *fuse_inode_cachep; static struct subsystem connections_subsys; @@ -207,15 +206,17 @@ static void fuse_put_super(struct super_block *sb) down_write(&fc->sbput_sem); while (!list_empty(&fc->background)) - fuse_release_background(list_entry(fc->background.next, + fuse_release_background(fc, + list_entry(fc->background.next, struct fuse_req, bg_entry)); - spin_lock(&fuse_lock); + spin_lock(&fc->lock); fc->mounted = 0; fc->connected = 0; - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); up_write(&fc->sbput_sem); /* Flush all readers on this fs */ + kill_fasync(&fc->fasync, SIGIO, POLL_IN); wake_up_all(&fc->waitq); kobject_del(&fc->kobj); kobject_put(&fc->kobj); @@ -242,9 +243,9 @@ static int fuse_statfs(struct super_block *sb, struct kstatfs *buf) struct fuse_statfs_out outarg; int err; - req = fuse_get_request(fc); - if (!req) - return -EINTR; + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); memset(&outarg, 0, sizeof(outarg)); req->in.numargs = 0; @@ -369,15 +370,7 @@ static int fuse_show_options(struct seq_file *m, struct vfsmount *mnt) static void fuse_conn_release(struct kobject *kobj) { - struct fuse_conn *fc = get_fuse_conn_kobj(kobj); - - while (!list_empty(&fc->unused_list)) { - struct fuse_req *req; - req = list_entry(fc->unused_list.next, struct fuse_req, list); - list_del(&req->list); - fuse_request_free(req); - } - kfree(fc); + kfree(get_fuse_conn_kobj(kobj)); } static struct fuse_conn *new_conn(void) @@ -386,64 +379,25 @@ static struct fuse_conn *new_conn(void) fc = kzalloc(sizeof(*fc), GFP_KERNEL); if (fc) { - int i; + spin_lock_init(&fc->lock); init_waitqueue_head(&fc->waitq); + init_waitqueue_head(&fc->blocked_waitq); INIT_LIST_HEAD(&fc->pending); INIT_LIST_HEAD(&fc->processing); INIT_LIST_HEAD(&fc->io); - INIT_LIST_HEAD(&fc->unused_list); INIT_LIST_HEAD(&fc->background); - sema_init(&fc->outstanding_sem, 1); /* One for INIT */ init_rwsem(&fc->sbput_sem); kobj_set_kset_s(fc, connections_subsys); kobject_init(&fc->kobj); atomic_set(&fc->num_waiting, 0); - for (i = 0; i < FUSE_MAX_OUTSTANDING; i++) { - struct fuse_req *req = fuse_request_alloc(); - if (!req) { - kobject_put(&fc->kobj); - return NULL; - } - list_add(&req->list, &fc->unused_list); - } fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; fc->bdi.unplug_io_fn = default_unplug_io_fn; fc->reqctr = 0; + fc->blocked = 1; } return fc; } -static struct fuse_conn *get_conn(struct file *file, struct super_block *sb) -{ - struct fuse_conn *fc; - int err; - - err = -EINVAL; - if (file->f_op != &fuse_dev_operations) - goto out_err; - - err = -ENOMEM; - fc = new_conn(); - if (!fc) - goto out_err; - - spin_lock(&fuse_lock); - err = -EINVAL; - if (file->private_data) - goto out_unlock; - - kobject_get(&fc->kobj); - file->private_data = fc; - spin_unlock(&fuse_lock); - return fc; - - out_unlock: - spin_unlock(&fuse_lock); - kobject_put(&fc->kobj); - out_err: - return ERR_PTR(err); -} - static struct inode *get_root_inode(struct super_block *sb, unsigned mode) { struct fuse_attr attr; @@ -467,7 +421,6 @@ static struct super_operations fuse_super_operations = { static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) { - int i; struct fuse_init_out *arg = &req->misc.init_out; if (req->out.h.error || arg->major != FUSE_KERNEL_VERSION) @@ -486,22 +439,13 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) fc->minor = arg->minor; fc->max_write = arg->minor < 5 ? 4096 : arg->max_write; } - - /* After INIT reply is received other requests can go - out. So do (FUSE_MAX_OUTSTANDING - 1) number of - up()s on outstanding_sem. The last up() is done in - fuse_putback_request() */ - for (i = 1; i < FUSE_MAX_OUTSTANDING; i++) - up(&fc->outstanding_sem); - fuse_put_request(fc, req); + fc->blocked = 0; + wake_up_all(&fc->blocked_waitq); } -static void fuse_send_init(struct fuse_conn *fc) +static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) { - /* This is called from fuse_read_super() so there's guaranteed - to be exactly one request available */ - struct fuse_req *req = fuse_get_request(fc); struct fuse_init_in *arg = &req->misc.init_in; arg->major = FUSE_KERNEL_VERSION; @@ -525,12 +469,9 @@ static void fuse_send_init(struct fuse_conn *fc) static unsigned long long conn_id(void) { + /* BKL is held for ->get_sb() */ static unsigned long long ctr = 1; - unsigned long long val; - spin_lock(&fuse_lock); - val = ctr++; - spin_unlock(&fuse_lock); - return val; + return ctr++; } static int fuse_fill_super(struct super_block *sb, void *data, int silent) @@ -540,6 +481,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) struct fuse_mount_data d; struct file *file; struct dentry *root_dentry; + struct fuse_req *init_req; int err; if (!parse_fuse_opt((char *) data, &d)) @@ -555,10 +497,17 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) if (!file) return -EINVAL; - fc = get_conn(file, sb); - fput(file); - if (IS_ERR(fc)) - return PTR_ERR(fc); + if (file->f_op != &fuse_dev_operations) + return -EINVAL; + + /* Setting file->private_data can't race with other mount() + instances, since BKL is held for ->get_sb() */ + if (file->private_data) + return -EINVAL; + + fc = new_conn(); + if (!fc) + return -ENOMEM; fc->flags = d.flags; fc->user_id = d.user_id; @@ -579,27 +528,40 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) goto err; } + init_req = fuse_request_alloc(); + if (!init_req) + goto err_put_root; + err = kobject_set_name(&fc->kobj, "%llu", conn_id()); if (err) - goto err_put_root; + goto err_free_req; err = kobject_add(&fc->kobj); if (err) - goto err_put_root; + goto err_free_req; sb->s_root = root_dentry; - spin_lock(&fuse_lock); fc->mounted = 1; fc->connected = 1; - spin_unlock(&fuse_lock); + kobject_get(&fc->kobj); + file->private_data = fc; + /* + * atomic_dec_and_test() in fput() provides the necessary + * memory barrier for file->private_data to be visible on all + * CPUs after this + */ + fput(file); - fuse_send_init(fc); + fuse_send_init(fc, init_req); return 0; + err_free_req: + fuse_request_free(init_req); err_put_root: dput(root_dentry); err: + fput(file); kobject_put(&fc->kobj); return err; } @@ -753,7 +715,6 @@ static int __init fuse_init(void) printk("fuse init (API version %i.%i)\n", FUSE_KERNEL_VERSION, FUSE_KERNEL_MINOR_VERSION); - spin_lock_init(&fuse_lock); res = fuse_fs_init(); if (res) goto err; diff --git a/fs/inotify.c b/fs/inotify.c index 367c487c014..1f50302849c 100644 --- a/fs/inotify.c +++ b/fs/inotify.c @@ -538,7 +538,7 @@ void inotify_d_instantiate(struct dentry *entry, struct inode *inode) WARN_ON(entry->d_flags & DCACHE_INOTIFY_PARENT_WATCHED); spin_lock(&entry->d_lock); parent = entry->d_parent; - if (inotify_inode_watched(parent->d_inode)) + if (parent->d_inode && inotify_inode_watched(parent->d_inode)) entry->d_flags |= DCACHE_INOTIFY_PARENT_WATCHED; spin_unlock(&entry->d_lock); } diff --git a/fs/namespace.c b/fs/namespace.c index bf478addb85..2c5f1f80bdc 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -899,11 +899,13 @@ static int do_change_type(struct nameidata *nd, int flag) /* * do loopback mount. */ -static int do_loopback(struct nameidata *nd, char *old_name, int recurse) +static int do_loopback(struct nameidata *nd, char *old_name, unsigned long flags, int mnt_flags) { struct nameidata old_nd; struct vfsmount *mnt = NULL; + int recurse = flags & MS_REC; int err = mount_is_safe(nd); + if (err) return err; if (!old_name || !*old_name) @@ -937,6 +939,7 @@ static int do_loopback(struct nameidata *nd, char *old_name, int recurse) spin_unlock(&vfsmount_lock); release_mounts(&umount_list); } + mnt->mnt_flags = mnt_flags; out: up_write(&namespace_sem); @@ -1350,7 +1353,7 @@ long do_mount(char *dev_name, char *dir_name, char *type_page, retval = do_remount(&nd, flags & ~MS_REMOUNT, mnt_flags, data_page); else if (flags & MS_BIND) - retval = do_loopback(&nd, dev_name, flags & MS_REC); + retval = do_loopback(&nd, dev_name, flags, mnt_flags); else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) retval = do_change_type(&nd, flags); else if (flags & MS_MOVE) diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c index cfe9ce88161..6e92b0fe532 100644 --- a/fs/nfsd/auth.c +++ b/fs/nfsd/auth.c @@ -14,46 +14,46 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) { - struct svc_cred *cred = &rqstp->rq_cred; + struct svc_cred cred = rqstp->rq_cred; int i; int ret; if (exp->ex_flags & NFSEXP_ALLSQUASH) { - cred->cr_uid = exp->ex_anon_uid; - cred->cr_gid = exp->ex_anon_gid; - put_group_info(cred->cr_group_info); - cred->cr_group_info = groups_alloc(0); + cred.cr_uid = exp->ex_anon_uid; + cred.cr_gid = exp->ex_anon_gid; + cred.cr_group_info = groups_alloc(0); } else if (exp->ex_flags & NFSEXP_ROOTSQUASH) { struct group_info *gi; - if (!cred->cr_uid) - cred->cr_uid = exp->ex_anon_uid; - if (!cred->cr_gid) - cred->cr_gid = exp->ex_anon_gid; - gi = groups_alloc(cred->cr_group_info->ngroups); + if (!cred.cr_uid) + cred.cr_uid = exp->ex_anon_uid; + if (!cred.cr_gid) + cred.cr_gid = exp->ex_anon_gid; + gi = groups_alloc(cred.cr_group_info->ngroups); if (gi) - for (i = 0; i < cred->cr_group_info->ngroups; i++) { - if (!GROUP_AT(cred->cr_group_info, i)) + for (i = 0; i < cred.cr_group_info->ngroups; i++) { + if (!GROUP_AT(cred.cr_group_info, i)) GROUP_AT(gi, i) = exp->ex_anon_gid; else - GROUP_AT(gi, i) = GROUP_AT(cred->cr_group_info, i); + GROUP_AT(gi, i) = GROUP_AT(cred.cr_group_info, i); } - put_group_info(cred->cr_group_info); - cred->cr_group_info = gi; - } + cred.cr_group_info = gi; + } else + get_group_info(cred.cr_group_info); - if (cred->cr_uid != (uid_t) -1) - current->fsuid = cred->cr_uid; + if (cred.cr_uid != (uid_t) -1) + current->fsuid = cred.cr_uid; else current->fsuid = exp->ex_anon_uid; - if (cred->cr_gid != (gid_t) -1) - current->fsgid = cred->cr_gid; + if (cred.cr_gid != (gid_t) -1) + current->fsgid = cred.cr_gid; else current->fsgid = exp->ex_anon_gid; - if (!cred->cr_group_info) + if (!cred.cr_group_info) return -ENOMEM; - ret = set_current_groups(cred->cr_group_info); - if ((cred->cr_uid)) { + ret = set_current_groups(cred.cr_group_info); + put_group_info(cred.cr_group_info); + if ((cred.cr_uid)) { cap_t(current->cap_effective) &= ~CAP_NFSD_MASK; } else { cap_t(current->cap_effective) |= (CAP_NFSD_MASK & diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index c340be0a3f5..4e0578121d9 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -422,7 +422,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) if ((len=qword_get(&mesg, buf, PAGE_SIZE)) <= 0) goto out; err = path_lookup(buf, 0, &nd); - if (err) goto out; + if (err) goto out_no_path; exp.h.flags = 0; exp.ex_client = dom; @@ -475,6 +475,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) out: if (nd.dentry) path_release(&nd); + out_no_path: if (dom) auth_domain_put(dom); kfree(buf); diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 6d2dfed1de0..f61142afea4 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -682,7 +682,7 @@ static struct svc_procedure nfsd_procedures3[22] = { PROC(lookup, dirop, dirop, fhandle2, RC_NOCACHE, ST+FH+pAT+pAT), PROC(access, access, access, fhandle, RC_NOCACHE, ST+pAT+1), PROC(readlink, readlink, readlink, fhandle, RC_NOCACHE, ST+pAT+1+NFS3_MAXPATHLEN/4), - PROC(read, read, read, fhandle, RC_NOCACHE, ST+pAT+4+NFSSVC_MAXBLKSIZE), + PROC(read, read, read, fhandle, RC_NOCACHE, ST+pAT+4+NFSSVC_MAXBLKSIZE/4), PROC(write, write, write, fhandle, RC_REPLBUFF, ST+WC+4), PROC(create, create, create, fhandle2, RC_REPLBUFF, ST+(1+FH+pAT)+WC), PROC(mkdir, mkdir, create, fhandle2, RC_REPLBUFF, ST+(1+FH+pAT)+WC), diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c index 7391f4aabed..edb107e61b9 100644 --- a/fs/nfsd/nfs4acl.c +++ b/fs/nfsd/nfs4acl.c @@ -710,9 +710,9 @@ calculate_posix_ace_count(struct nfs4_acl *n4acl) /* Also, the remaining entries are for named users and * groups, and come in threes (mask, allow, deny): */ if (n4acl->naces < 7) - return -1; + return -EINVAL; if ((n4acl->naces - 7) % 3) - return -1; + return -EINVAL; return 4 + (n4acl->naces - 7)/3; } } @@ -790,7 +790,7 @@ nfs4_acl_split(struct nfs4_acl *acl, struct nfs4_acl *dacl) continue; error = nfs4_acl_add_ace(dacl, ace->type, ace->flag, - ace->access_mask, ace->whotype, ace->who) == -1; + ace->access_mask, ace->whotype, ace->who); if (error < 0) goto out; @@ -866,7 +866,7 @@ nfs4_acl_add_ace(struct nfs4_acl *acl, u32 type, u32 flag, u32 access_mask, struct nfs4_ace *ace; if ((ace = kmalloc(sizeof(*ace), GFP_KERNEL)) == NULL) - return -1; + return -ENOMEM; ace->type = type; ace->flag = flag; diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index c872bd07fc1..dbaf3f93f32 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -441,8 +441,9 @@ nfsd4_probe_callback(struct nfs4_client *clp) goto out_clnt; } - /* the task holds a reference to the nfs4_client struct */ cb->cb_client = clnt; + + /* the task holds a reference to the nfs4_client struct */ atomic_inc(&clp->cl_count); msg.rpc_cred = nfsd4_lookupcred(clp,0); @@ -460,13 +461,12 @@ nfsd4_probe_callback(struct nfs4_client *clp) out_rpciod: atomic_dec(&clp->cl_count); rpciod_down(); + cb->cb_client = NULL; out_clnt: rpc_shutdown_client(clnt); - goto out_err; out_err: dprintk("NFSD: warning: no callback path to client %.*s\n", (int)clp->cl_name.len, clp->cl_name.data); - cb->cb_client = NULL; } static void diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 6d63f1d9e5f..b0e095ea0c0 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -288,8 +288,6 @@ nfsd4_putrootfh(struct svc_rqst *rqstp, struct svc_fh *current_fh) fh_put(current_fh); status = exp_pseudoroot(rqstp->rq_client, current_fh, &rqstp->rq_chandle); - if (!status) - status = nfserrno(nfsd_setuser(rqstp, current_fh->fh_export)); return status; } @@ -975,7 +973,7 @@ struct nfsd4_voidargs { int dummy; }; */ static struct svc_procedure nfsd_procedures4[2] = { PROC(null, void, void, void, RC_NOCACHE, 1), - PROC(compound, compound, compound, compound, RC_NOCACHE, NFSD_BUFSIZE) + PROC(compound, compound, compound, compound, RC_NOCACHE, NFSD_BUFSIZE/4) }; struct svc_version nfsd_version4 = { diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 47ec112b266..96c7578cbe1 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -147,6 +147,42 @@ get_nfs4_file(struct nfs4_file *fi) kref_get(&fi->fi_ref); } +static int num_delegations; + +/* + * Open owner state (share locks) + */ + +/* hash tables for nfs4_stateowner */ +#define OWNER_HASH_BITS 8 +#define OWNER_HASH_SIZE (1 << OWNER_HASH_BITS) +#define OWNER_HASH_MASK (OWNER_HASH_SIZE - 1) + +#define ownerid_hashval(id) \ + ((id) & OWNER_HASH_MASK) +#define ownerstr_hashval(clientid, ownername) \ + (((clientid) + opaque_hashval((ownername.data), (ownername.len))) & OWNER_HASH_MASK) + +static struct list_head ownerid_hashtbl[OWNER_HASH_SIZE]; +static struct list_head ownerstr_hashtbl[OWNER_HASH_SIZE]; + +/* hash table for nfs4_file */ +#define FILE_HASH_BITS 8 +#define FILE_HASH_SIZE (1 << FILE_HASH_BITS) +#define FILE_HASH_MASK (FILE_HASH_SIZE - 1) +/* hash table for (open)nfs4_stateid */ +#define STATEID_HASH_BITS 10 +#define STATEID_HASH_SIZE (1 << STATEID_HASH_BITS) +#define STATEID_HASH_MASK (STATEID_HASH_SIZE - 1) + +#define file_hashval(x) \ + hash_ptr(x, FILE_HASH_BITS) +#define stateid_hashval(owner_id, file_id) \ + (((owner_id) + (file_id)) & STATEID_HASH_MASK) + +static struct list_head file_hashtbl[FILE_HASH_SIZE]; +static struct list_head stateid_hashtbl[STATEID_HASH_SIZE]; + static struct nfs4_delegation * alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_fh *current_fh, u32 type) { @@ -155,9 +191,12 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f struct nfs4_callback *cb = &stp->st_stateowner->so_client->cl_callback; dprintk("NFSD alloc_init_deleg\n"); + if (num_delegations > STATEID_HASH_SIZE * 4) + return NULL; dp = kmem_cache_alloc(deleg_slab, GFP_KERNEL); if (dp == NULL) return dp; + num_delegations++; INIT_LIST_HEAD(&dp->dl_perfile); INIT_LIST_HEAD(&dp->dl_perclnt); INIT_LIST_HEAD(&dp->dl_recall_lru); @@ -192,6 +231,7 @@ nfs4_put_delegation(struct nfs4_delegation *dp) dprintk("NFSD: freeing dp %p\n",dp); put_nfs4_file(dp->dl_file); kmem_cache_free(deleg_slab, dp); + num_delegations--; } } @@ -330,22 +370,29 @@ put_nfs4_client(struct nfs4_client *clp) } static void +shutdown_callback_client(struct nfs4_client *clp) +{ + struct rpc_clnt *clnt = clp->cl_callback.cb_client; + + /* shutdown rpc client, ending any outstanding recall rpcs */ + if (clnt) { + clp->cl_callback.cb_client = NULL; + rpc_shutdown_client(clnt); + rpciod_down(); + } +} + +static void expire_client(struct nfs4_client *clp) { struct nfs4_stateowner *sop; struct nfs4_delegation *dp; - struct nfs4_callback *cb = &clp->cl_callback; - struct rpc_clnt *clnt = clp->cl_callback.cb_client; struct list_head reaplist; dprintk("NFSD: expire_client cl_count %d\n", atomic_read(&clp->cl_count)); - /* shutdown rpc client, ending any outstanding recall rpcs */ - if (atomic_read(&cb->cb_set) == 1 && clnt) { - rpc_shutdown_client(clnt); - clnt = clp->cl_callback.cb_client = NULL; - } + shutdown_callback_client(clp); INIT_LIST_HEAD(&reaplist); spin_lock(&recall_lock); @@ -936,40 +983,6 @@ out: return status; } -/* - * Open owner state (share locks) - */ - -/* hash tables for nfs4_stateowner */ -#define OWNER_HASH_BITS 8 -#define OWNER_HASH_SIZE (1 << OWNER_HASH_BITS) -#define OWNER_HASH_MASK (OWNER_HASH_SIZE - 1) - -#define ownerid_hashval(id) \ - ((id) & OWNER_HASH_MASK) -#define ownerstr_hashval(clientid, ownername) \ - (((clientid) + opaque_hashval((ownername.data), (ownername.len))) & OWNER_HASH_MASK) - -static struct list_head ownerid_hashtbl[OWNER_HASH_SIZE]; -static struct list_head ownerstr_hashtbl[OWNER_HASH_SIZE]; - -/* hash table for nfs4_file */ -#define FILE_HASH_BITS 8 -#define FILE_HASH_SIZE (1 << FILE_HASH_BITS) -#define FILE_HASH_MASK (FILE_HASH_SIZE - 1) -/* hash table for (open)nfs4_stateid */ -#define STATEID_HASH_BITS 10 -#define STATEID_HASH_SIZE (1 << STATEID_HASH_BITS) -#define STATEID_HASH_MASK (STATEID_HASH_SIZE - 1) - -#define file_hashval(x) \ - hash_ptr(x, FILE_HASH_BITS) -#define stateid_hashval(owner_id, file_id) \ - (((owner_id) + (file_id)) & STATEID_HASH_MASK) - -static struct list_head file_hashtbl[FILE_HASH_SIZE]; -static struct list_head stateid_hashtbl[STATEID_HASH_SIZE]; - /* OPEN Share state helper functions */ static inline struct nfs4_file * alloc_init_file(struct inode *ino) @@ -1186,8 +1199,7 @@ move_to_close_lru(struct nfs4_stateowner *sop) { dprintk("NFSD: move_to_close_lru nfs4_stateowner %p\n", sop); - unhash_stateowner(sop); - list_add_tail(&sop->so_close_lru, &close_lru); + list_move_tail(&sop->so_close_lru, &close_lru); sop->so_time = get_seconds(); } @@ -1916,8 +1928,7 @@ nfs4_laundromat(void) } dprintk("NFSD: purging unused open stateowner (so_id %d)\n", sop->so_id); - list_del(&sop->so_close_lru); - nfs4_put_stateowner(sop); + release_stateowner(sop); } if (clientid_val < NFSD_LAUNDROMAT_MINTIMEOUT) clientid_val = NFSD_LAUNDROMAT_MINTIMEOUT; @@ -2495,36 +2506,27 @@ nfs4_transform_lock_offset(struct file_lock *lock) lock->fl_end = OFFSET_MAX; } -static int -nfs4_verify_lock_stateowner(struct nfs4_stateowner *sop, unsigned int hashval) -{ - struct nfs4_stateowner *local = NULL; - int status = 0; - - if (hashval >= LOCK_HASH_SIZE) - goto out; - list_for_each_entry(local, &lock_ownerid_hashtbl[hashval], so_idhash) { - if (local == sop) { - status = 1; - goto out; - } - } -out: - return status; -} - +/* Hack!: For now, we're defining this just so we can use a pointer to it + * as a unique cookie to identify our (NFSv4's) posix locks. */ +static struct lock_manager_operations nfsd_posix_mng_ops = { +}; static inline void nfs4_set_lock_denied(struct file_lock *fl, struct nfsd4_lock_denied *deny) { - struct nfs4_stateowner *sop = (struct nfs4_stateowner *) fl->fl_owner; - unsigned int hval = lockownerid_hashval(sop->so_id); + struct nfs4_stateowner *sop; + unsigned int hval; - deny->ld_sop = NULL; - if (nfs4_verify_lock_stateowner(sop, hval)) { + if (fl->fl_lmops == &nfsd_posix_mng_ops) { + sop = (struct nfs4_stateowner *) fl->fl_owner; + hval = lockownerid_hashval(sop->so_id); kref_get(&sop->so_ref); deny->ld_sop = sop; deny->ld_clientid = sop->so_client->cl_clientid; + } else { + deny->ld_sop = NULL; + deny->ld_clientid.cl_boot = 0; + deny->ld_clientid.cl_id = 0; } deny->ld_start = fl->fl_start; deny->ld_length = ~(u64)0; @@ -2736,6 +2738,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lock file_lock.fl_pid = current->tgid; file_lock.fl_file = filp; file_lock.fl_flags = FL_POSIX; + file_lock.fl_lmops = &nfsd_posix_mng_ops; file_lock.fl_start = lock->lk_offset; if ((lock->lk_length == ~(u64)0) || @@ -2841,6 +2844,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lock file_lock.fl_owner = (fl_owner_t)lockt->lt_stateowner; file_lock.fl_pid = current->tgid; file_lock.fl_flags = FL_POSIX; + file_lock.fl_lmops = &nfsd_posix_mng_ops; file_lock.fl_start = lockt->lt_offset; if ((lockt->lt_length == ~(u64)0) || LOFF_OVERFLOW(lockt->lt_offset, lockt->lt_length)) @@ -2900,6 +2904,7 @@ nfsd4_locku(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lock file_lock.fl_pid = current->tgid; file_lock.fl_file = filp; file_lock.fl_flags = FL_POSIX; + file_lock.fl_lmops = &nfsd_posix_mng_ops; file_lock.fl_start = locku->lu_offset; if ((locku->lu_length == ~(u64)0) || LOFF_OVERFLOW(locku->lu_offset, locku->lu_length)) @@ -3211,15 +3216,8 @@ __nfs4_state_shutdown(void) int i; struct nfs4_client *clp = NULL; struct nfs4_delegation *dp = NULL; - struct nfs4_stateowner *sop = NULL; struct list_head *pos, *next, reaplist; - list_for_each_safe(pos, next, &close_lru) { - sop = list_entry(pos, struct nfs4_stateowner, so_close_lru); - list_del(&sop->so_close_lru); - nfs4_put_stateowner(sop); - } - for (i = 0; i < CLIENT_HASH_SIZE; i++) { while (!list_empty(&conf_id_hashtbl[i])) { clp = list_entry(conf_id_hashtbl[i].next, struct nfs4_client, cl_idhash); @@ -3244,8 +3242,6 @@ __nfs4_state_shutdown(void) } cancel_delayed_work(&laundromat_work); - flush_workqueue(laundry_wq); - destroy_workqueue(laundry_wq); nfsd4_shutdown_recdir(); nfs4_init = 0; } @@ -3253,6 +3249,8 @@ __nfs4_state_shutdown(void) void nfs4_state_shutdown(void) { + cancel_rearming_delayed_workqueue(laundry_wq, &laundromat_work); + destroy_workqueue(laundry_wq); nfs4_lock_state(); nfs4_release_reclaim(); __nfs4_state_shutdown(); diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 03857fd8112..de3998f15f1 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -299,11 +299,10 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *ia buf, dummy32, &ace.who); if (status) goto out_nfserr; - if (nfs4_acl_add_ace(*acl, ace.type, ace.flag, - ace.access_mask, ace.whotype, ace.who) != 0) { - status = -ENOMEM; + status = nfs4_acl_add_ace(*acl, ace.type, ace.flag, + ace.access_mask, ace.whotype, ace.who); + if (status) goto out_nfserr; - } } } else *acl = NULL; @@ -2085,27 +2084,20 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_read WRITE32(eof); WRITE32(maxcount); ADJUST_ARGS(); - resp->xbuf->head[0].iov_len = ((char*)resp->p) - (char*)resp->xbuf->head[0].iov_base; - + resp->xbuf->head[0].iov_len = (char*)p + - (char*)resp->xbuf->head[0].iov_base; resp->xbuf->page_len = maxcount; - /* read zero bytes -> don't set up tail */ - if(!maxcount) - return 0; - - /* set up page for remaining responses */ - svc_take_page(resp->rqstp); - resp->xbuf->tail[0].iov_base = - page_address(resp->rqstp->rq_respages[resp->rqstp->rq_resused-1]); - resp->rqstp->rq_restailpage = resp->rqstp->rq_resused-1; + /* Use rest of head for padding and remaining ops: */ + resp->rqstp->rq_restailpage = 0; + resp->xbuf->tail[0].iov_base = p; resp->xbuf->tail[0].iov_len = 0; - resp->p = resp->xbuf->tail[0].iov_base; - resp->end = resp->p + PAGE_SIZE/4; - if (maxcount&3) { - *(resp->p)++ = 0; + RESERVE_SPACE(4); + WRITE32(0); resp->xbuf->tail[0].iov_base += maxcount&3; resp->xbuf->tail[0].iov_len = 4 - (maxcount&3); + ADJUST_ARGS(); } return 0; } @@ -2142,21 +2134,20 @@ nfsd4_encode_readlink(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_r WRITE32(maxcount); ADJUST_ARGS(); - resp->xbuf->head[0].iov_len = ((char*)resp->p) - (char*)resp->xbuf->head[0].iov_base; + resp->xbuf->head[0].iov_len = (char*)p + - (char*)resp->xbuf->head[0].iov_base; + resp->xbuf->page_len = maxcount; - svc_take_page(resp->rqstp); - resp->xbuf->tail[0].iov_base = - page_address(resp->rqstp->rq_respages[resp->rqstp->rq_resused-1]); - resp->rqstp->rq_restailpage = resp->rqstp->rq_resused-1; + /* Use rest of head for padding and remaining ops: */ + resp->rqstp->rq_restailpage = 0; + resp->xbuf->tail[0].iov_base = p; resp->xbuf->tail[0].iov_len = 0; - resp->p = resp->xbuf->tail[0].iov_base; - resp->end = resp->p + PAGE_SIZE/4; - - resp->xbuf->page_len = maxcount; if (maxcount&3) { - *(resp->p)++ = 0; + RESERVE_SPACE(4); + WRITE32(0); resp->xbuf->tail[0].iov_base += maxcount&3; resp->xbuf->tail[0].iov_len = 4 - (maxcount&3); + ADJUST_ARGS(); } return 0; } @@ -2166,7 +2157,7 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_re { int maxcount; loff_t offset; - u32 *page, *savep; + u32 *page, *savep, *tailbase; ENCODE_HEAD; if (nfserr) @@ -2182,6 +2173,7 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_re WRITE32(0); ADJUST_ARGS(); resp->xbuf->head[0].iov_len = ((char*)resp->p) - (char*)resp->xbuf->head[0].iov_base; + tailbase = p; maxcount = PAGE_SIZE; if (maxcount > readdir->rd_maxcount) @@ -2226,14 +2218,12 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_re *p++ = htonl(readdir->common.err == nfserr_eof); resp->xbuf->page_len = ((char*)p) - (char*)page_address(resp->rqstp->rq_respages[resp->rqstp->rq_resused-1]); - /* allocate a page for the tail */ - svc_take_page(resp->rqstp); - resp->xbuf->tail[0].iov_base = - page_address(resp->rqstp->rq_respages[resp->rqstp->rq_resused-1]); - resp->rqstp->rq_restailpage = resp->rqstp->rq_resused-1; + /* Use rest of head for padding and remaining ops: */ + resp->rqstp->rq_restailpage = 0; + resp->xbuf->tail[0].iov_base = tailbase; resp->xbuf->tail[0].iov_len = 0; resp->p = resp->xbuf->tail[0].iov_base; - resp->end = resp->p + PAGE_SIZE/4; + resp->end = resp->p + (PAGE_SIZE - resp->xbuf->head[0].iov_len)/4; return 0; err_no_verf: diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 3e6b75cd90f..06cd0db0f32 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -553,7 +553,7 @@ static struct svc_procedure nfsd_procedures2[18] = { PROC(none, void, void, none, RC_NOCACHE, ST), PROC(lookup, diropargs, diropres, fhandle, RC_NOCACHE, ST+FH+AT), PROC(readlink, readlinkargs, readlinkres, none, RC_NOCACHE, ST+1+NFS_MAXPATHLEN/4), - PROC(read, readargs, readres, fhandle, RC_NOCACHE, ST+AT+1+NFSSVC_MAXBLKSIZE), + PROC(read, readargs, readres, fhandle, RC_NOCACHE, ST+AT+1+NFSSVC_MAXBLKSIZE/4), PROC(none, void, void, none, RC_NOCACHE, ST), PROC(write, writeargs, attrstat, fhandle, RC_REPLBUFF, ST+AT), PROC(create, createargs, diropres, fhandle, RC_REPLBUFF, ST+FH+AT), diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 31018333dc3..6aa92d0e687 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -371,7 +371,6 @@ out_nfserr: static ssize_t nfsd_getxattr(struct dentry *dentry, char *key, void **buf) { ssize_t buflen; - int error; buflen = vfs_getxattr(dentry, key, NULL, 0); if (buflen <= 0) @@ -381,10 +380,7 @@ static ssize_t nfsd_getxattr(struct dentry *dentry, char *key, void **buf) if (!*buf) return -ENOMEM; - error = vfs_getxattr(dentry, key, *buf, buflen); - if (error < 0) - return error; - return buflen; + return vfs_getxattr(dentry, key, *buf, buflen); } #endif diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index bff0f0d0686..21f38accd03 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -153,6 +153,7 @@ struct o2hb_region { struct o2hb_bio_wait_ctxt { atomic_t wc_num_reqs; struct completion wc_io_complete; + int wc_error; }; static void o2hb_write_timeout(void *arg) @@ -186,6 +187,7 @@ static inline void o2hb_bio_wait_init(struct o2hb_bio_wait_ctxt *wc, { atomic_set(&wc->wc_num_reqs, num_ios); init_completion(&wc->wc_io_complete); + wc->wc_error = 0; } /* Used in error paths too */ @@ -218,8 +220,10 @@ static int o2hb_bio_end_io(struct bio *bio, { struct o2hb_bio_wait_ctxt *wc = bio->bi_private; - if (error) + if (error) { mlog(ML_ERROR, "IO Error %d\n", error); + wc->wc_error = error; + } if (bio->bi_size) return 1; @@ -390,6 +394,8 @@ static int o2hb_read_slots(struct o2hb_region *reg, bail_and_wait: o2hb_wait_on_io(reg, &wc); + if (wc.wc_error && !status) + status = wc.wc_error; if (bios) { for(i = 0; i < num_bios; i++) @@ -790,20 +796,24 @@ static int o2hb_highest_node(unsigned long *nodes, return highest; } -static void o2hb_do_disk_heartbeat(struct o2hb_region *reg) +static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) { int i, ret, highest_node, change = 0; unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)]; struct bio *write_bio; struct o2hb_bio_wait_ctxt write_wc; - if (o2nm_configured_node_map(configured_nodes, sizeof(configured_nodes))) - return; + ret = o2nm_configured_node_map(configured_nodes, + sizeof(configured_nodes)); + if (ret) { + mlog_errno(ret); + return ret; + } highest_node = o2hb_highest_node(configured_nodes, O2NM_MAX_NODES); if (highest_node >= O2NM_MAX_NODES) { mlog(ML_NOTICE, "ocfs2_heartbeat: no configured nodes found!\n"); - return; + return -EINVAL; } /* No sense in reading the slots of nodes that don't exist @@ -813,7 +823,7 @@ static void o2hb_do_disk_heartbeat(struct o2hb_region *reg) ret = o2hb_read_slots(reg, highest_node + 1); if (ret < 0) { mlog_errno(ret); - return; + return ret; } /* With an up to date view of the slots, we can check that no @@ -831,7 +841,7 @@ static void o2hb_do_disk_heartbeat(struct o2hb_region *reg) ret = o2hb_issue_node_write(reg, &write_bio, &write_wc); if (ret < 0) { mlog_errno(ret); - return; + return ret; } i = -1; @@ -847,6 +857,15 @@ static void o2hb_do_disk_heartbeat(struct o2hb_region *reg) */ o2hb_wait_on_io(reg, &write_wc); bio_put(write_bio); + if (write_wc.wc_error) { + /* Do not re-arm the write timeout on I/O error - we + * can't be sure that the new block ever made it to + * disk */ + mlog(ML_ERROR, "Write error %d on device \"%s\"\n", + write_wc.wc_error, reg->hr_dev_name); + return write_wc.wc_error; + } + o2hb_arm_write_timeout(reg); /* let the person who launched us know when things are steady */ @@ -854,6 +873,8 @@ static void o2hb_do_disk_heartbeat(struct o2hb_region *reg) if (atomic_dec_and_test(®->hr_steady_iterations)) wake_up(&o2hb_steady_queue); } + + return 0; } /* Subtract b from a, storing the result in a. a *must* have a larger @@ -913,7 +934,10 @@ static int o2hb_thread(void *data) * likely to time itself out. */ do_gettimeofday(&before_hb); - o2hb_do_disk_heartbeat(reg); + i = 0; + do { + ret = o2hb_do_disk_heartbeat(reg); + } while (ret && ++i < 2); do_gettimeofday(&after_hb); elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb); diff --git a/fs/ocfs2/dlm/userdlm.c b/fs/ocfs2/dlm/userdlm.c index c3764f4744e..74ca4e5f976 100644 --- a/fs/ocfs2/dlm/userdlm.c +++ b/fs/ocfs2/dlm/userdlm.c @@ -139,6 +139,10 @@ static void user_ast(void *opaque) return; } + mlog_bug_on_msg(lockres->l_requested == LKM_IVMODE, + "Lockres %s, requested ivmode. flags 0x%x\n", + lockres->l_name, lockres->l_flags); + /* we're downconverting. */ if (lockres->l_requested < lockres->l_level) { if (lockres->l_requested <= @@ -229,23 +233,42 @@ static void user_unlock_ast(void *opaque, enum dlm_status status) mlog(0, "UNLOCK AST called on lock %s\n", lockres->l_name); - if (status != DLM_NORMAL) + if (status != DLM_NORMAL && status != DLM_CANCELGRANT) mlog(ML_ERROR, "Dlm returns status %d\n", status); spin_lock(&lockres->l_lock); - if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) + /* The teardown flag gets set early during the unlock process, + * so test the cancel flag to make sure that this ast isn't + * for a concurrent cancel. */ + if (lockres->l_flags & USER_LOCK_IN_TEARDOWN + && !(lockres->l_flags & USER_LOCK_IN_CANCEL)) { lockres->l_level = LKM_IVMODE; - else { + } else if (status == DLM_CANCELGRANT) { + mlog(0, "Lock %s, cancel fails, flags 0x%x\n", + lockres->l_name, lockres->l_flags); + /* We tried to cancel a convert request, but it was + * already granted. Don't clear the busy flag - the + * ast should've done this already. */ + BUG_ON(!(lockres->l_flags & USER_LOCK_IN_CANCEL)); + lockres->l_flags &= ~USER_LOCK_IN_CANCEL; + goto out_noclear; + } else { + BUG_ON(!(lockres->l_flags & USER_LOCK_IN_CANCEL)); + /* Cancel succeeded, we want to re-queue */ + mlog(0, "Lock %s, cancel succeeds, flags 0x%x\n", + lockres->l_name, lockres->l_flags); lockres->l_requested = LKM_IVMODE; /* cancel an * upconvert * request. */ lockres->l_flags &= ~USER_LOCK_IN_CANCEL; /* we want the unblock thread to look at it again * now. */ - __user_dlm_queue_lockres(lockres); + if (lockres->l_flags & USER_LOCK_BLOCKED) + __user_dlm_queue_lockres(lockres); } lockres->l_flags &= ~USER_LOCK_BUSY; +out_noclear: spin_unlock(&lockres->l_lock); wake_up(&lockres->l_event); @@ -268,13 +291,26 @@ static void user_dlm_unblock_lock(void *opaque) spin_lock(&lockres->l_lock); - BUG_ON(!(lockres->l_flags & USER_LOCK_BLOCKED)); - BUG_ON(!(lockres->l_flags & USER_LOCK_QUEUED)); + mlog_bug_on_msg(!(lockres->l_flags & USER_LOCK_QUEUED), + "Lockres %s, flags 0x%x\n", + lockres->l_name, lockres->l_flags); - /* notice that we don't clear USER_LOCK_BLOCKED here. That's - * for user_ast to do. */ + /* notice that we don't clear USER_LOCK_BLOCKED here. If it's + * set, we want user_ast clear it. */ lockres->l_flags &= ~USER_LOCK_QUEUED; + /* It's valid to get here and no longer be blocked - if we get + * several basts in a row, we might be queued by the first + * one, the unblock thread might run and clear the queued + * flag, and finally we might get another bast which re-queues + * us before our ast for the downconvert is called. */ + if (!(lockres->l_flags & USER_LOCK_BLOCKED)) { + mlog(0, "Lockres %s, flags 0x%x: queued but not blocking\n", + lockres->l_name, lockres->l_flags); + spin_unlock(&lockres->l_lock); + goto drop_ref; + } + if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) { mlog(0, "lock is in teardown so we do nothing\n"); spin_unlock(&lockres->l_lock); @@ -282,7 +318,9 @@ static void user_dlm_unblock_lock(void *opaque) } if (lockres->l_flags & USER_LOCK_BUSY) { - mlog(0, "BUSY flag detected...\n"); + mlog(0, "Cancel lock %s, flags 0x%x\n", + lockres->l_name, lockres->l_flags); + if (lockres->l_flags & USER_LOCK_IN_CANCEL) { spin_unlock(&lockres->l_lock); goto drop_ref; @@ -296,14 +334,7 @@ static void user_dlm_unblock_lock(void *opaque) LKM_CANCEL, user_unlock_ast, lockres); - if (status == DLM_CANCELGRANT) { - /* If we got this, then the ast was fired - * before we could cancel. We cleanup our - * state, and restart the function. */ - spin_lock(&lockres->l_lock); - lockres->l_flags &= ~USER_LOCK_IN_CANCEL; - spin_unlock(&lockres->l_lock); - } else if (status != DLM_NORMAL) + if (status != DLM_NORMAL) user_log_dlm_error("dlmunlock", status, lockres); goto drop_ref; } @@ -581,6 +612,14 @@ int user_dlm_destroy_lock(struct user_lock_res *lockres) mlog(0, "asked to destroy %s\n", lockres->l_name); spin_lock(&lockres->l_lock); + if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) { + mlog(0, "Lock is already torn down\n"); + spin_unlock(&lockres->l_lock); + return 0; + } + + lockres->l_flags |= USER_LOCK_IN_TEARDOWN; + while (lockres->l_flags & USER_LOCK_BUSY) { spin_unlock(&lockres->l_lock); @@ -606,7 +645,6 @@ int user_dlm_destroy_lock(struct user_lock_res *lockres) lockres->l_flags &= ~USER_LOCK_ATTACHED; lockres->l_flags |= USER_LOCK_BUSY; - lockres->l_flags |= USER_LOCK_IN_TEARDOWN; spin_unlock(&lockres->l_lock); mlog(0, "unlocking lockres %s\n", lockres->l_name); diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 34e903a6a46..581eb451a41 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -260,6 +260,17 @@ static int ocfs2_truncate_file(struct inode *inode, if (new_i_size == le64_to_cpu(fe->i_size)) goto bail; + /* This forces other nodes to sync and drop their pages. Do + * this even if we have a truncate without allocation change - + * ocfs2 cluster sizes can be much greater than page size, so + * we have to truncate them anyway. */ + status = ocfs2_data_lock(inode, 1); + if (status < 0) { + mlog_errno(status); + goto bail; + } + ocfs2_data_unlock(inode, 1); + if (le32_to_cpu(fe->i_clusters) == ocfs2_clusters_for_bytes(osb->sb, new_i_size)) { mlog(0, "fe->i_clusters = %u, so we do a simple truncate\n", @@ -272,14 +283,6 @@ static int ocfs2_truncate_file(struct inode *inode, goto bail; } - /* This forces other nodes to sync and drop their pages */ - status = ocfs2_data_lock(inode, 1); - if (status < 0) { - mlog_errno(status); - goto bail; - } - ocfs2_data_unlock(inode, 1); - /* alright, we're going to need to do a full blown alloc size * change. Orphan the inode so that recovery can complete the * truncate if necessary. This does the task of marking diff --git a/fs/pipe.c b/fs/pipe.c index 795df987cd3..e984beb93a0 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -36,7 +36,7 @@ */ /* Drop the inode semaphore and wait for a pipe event, atomically */ -void pipe_wait(struct inode * inode) +void pipe_wait(struct pipe_inode_info *pipe) { DEFINE_WAIT(wait); @@ -44,11 +44,14 @@ void pipe_wait(struct inode * inode) * Pipes are system-local resources, so sleeping on them * is considered a noninteractive wait: */ - prepare_to_wait(PIPE_WAIT(*inode), &wait, TASK_INTERRUPTIBLE|TASK_NONINTERACTIVE); - mutex_unlock(PIPE_MUTEX(*inode)); + prepare_to_wait(&pipe->wait, &wait, + TASK_INTERRUPTIBLE | TASK_NONINTERACTIVE); + if (pipe->inode) + mutex_unlock(&pipe->inode->i_mutex); schedule(); - finish_wait(PIPE_WAIT(*inode), &wait); - mutex_lock(PIPE_MUTEX(*inode)); + finish_wait(&pipe->wait, &wait); + if (pipe->inode) + mutex_lock(&pipe->inode->i_mutex); } static int @@ -91,7 +94,8 @@ pipe_iov_copy_to_user(struct iovec *iov, const void *from, unsigned long len) return 0; } -static void anon_pipe_buf_release(struct pipe_inode_info *info, struct pipe_buffer *buf) +static void anon_pipe_buf_release(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) { struct page *page = buf->page; @@ -100,30 +104,27 @@ static void anon_pipe_buf_release(struct pipe_inode_info *info, struct pipe_buff /* * If nobody else uses this page, and we don't already have a * temporary page, let's keep track of it as a one-deep - * allocation cache + * allocation cache. (Otherwise just release our reference to it) */ - if (page_count(page) == 1 && !info->tmp_page) { - info->tmp_page = page; - return; - } - - /* - * Otherwise just release our reference to it - */ - page_cache_release(page); + if (page_count(page) == 1 && !pipe->tmp_page) + pipe->tmp_page = page; + else + page_cache_release(page); } -static void *anon_pipe_buf_map(struct file *file, struct pipe_inode_info *info, struct pipe_buffer *buf) +static void * anon_pipe_buf_map(struct file *file, struct pipe_inode_info *pipe, + struct pipe_buffer *buf) { return kmap(buf->page); } -static void anon_pipe_buf_unmap(struct pipe_inode_info *info, struct pipe_buffer *buf) +static void anon_pipe_buf_unmap(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) { kunmap(buf->page); } -static int anon_pipe_buf_steal(struct pipe_inode_info *info, +static int anon_pipe_buf_steal(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { buf->flags |= PIPE_BUF_FLAG_STOLEN; @@ -143,7 +144,7 @@ pipe_readv(struct file *filp, const struct iovec *_iov, unsigned long nr_segs, loff_t *ppos) { struct inode *inode = filp->f_dentry->d_inode; - struct pipe_inode_info *info; + struct pipe_inode_info *pipe; int do_wakeup; ssize_t ret; struct iovec *iov = (struct iovec *)_iov; @@ -156,13 +157,13 @@ pipe_readv(struct file *filp, const struct iovec *_iov, do_wakeup = 0; ret = 0; - mutex_lock(PIPE_MUTEX(*inode)); - info = inode->i_pipe; + mutex_lock(&inode->i_mutex); + pipe = inode->i_pipe; for (;;) { - int bufs = info->nrbufs; + int bufs = pipe->nrbufs; if (bufs) { - int curbuf = info->curbuf; - struct pipe_buffer *buf = info->bufs + curbuf; + int curbuf = pipe->curbuf; + struct pipe_buffer *buf = pipe->bufs + curbuf; struct pipe_buf_operations *ops = buf->ops; void *addr; size_t chars = buf->len; @@ -171,16 +172,17 @@ pipe_readv(struct file *filp, const struct iovec *_iov, if (chars > total_len) chars = total_len; - addr = ops->map(filp, info, buf); + addr = ops->map(filp, pipe, buf); if (IS_ERR(addr)) { if (!ret) ret = PTR_ERR(addr); break; } error = pipe_iov_copy_to_user(iov, addr + buf->offset, chars); - ops->unmap(info, buf); + ops->unmap(pipe, buf); if (unlikely(error)) { - if (!ret) ret = -EFAULT; + if (!ret) + ret = -EFAULT; break; } ret += chars; @@ -188,10 +190,10 @@ pipe_readv(struct file *filp, const struct iovec *_iov, buf->len -= chars; if (!buf->len) { buf->ops = NULL; - ops->release(info, buf); + ops->release(pipe, buf); curbuf = (curbuf + 1) & (PIPE_BUFFERS-1); - info->curbuf = curbuf; - info->nrbufs = --bufs; + pipe->curbuf = curbuf; + pipe->nrbufs = --bufs; do_wakeup = 1; } total_len -= chars; @@ -200,9 +202,9 @@ pipe_readv(struct file *filp, const struct iovec *_iov, } if (bufs) /* More to do? */ continue; - if (!PIPE_WRITERS(*inode)) + if (!pipe->writers) break; - if (!PIPE_WAITING_WRITERS(*inode)) { + if (!pipe->waiting_writers) { /* syscall merging: Usually we must not sleep * if O_NONBLOCK is set, or if we got some data. * But if a writer sleeps in kernel space, then @@ -216,20 +218,22 @@ pipe_readv(struct file *filp, const struct iovec *_iov, } } if (signal_pending(current)) { - if (!ret) ret = -ERESTARTSYS; + if (!ret) + ret = -ERESTARTSYS; break; } if (do_wakeup) { - wake_up_interruptible_sync(PIPE_WAIT(*inode)); - kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT); + wake_up_interruptible_sync(&pipe->wait); + kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } - pipe_wait(inode); + pipe_wait(pipe); } - mutex_unlock(PIPE_MUTEX(*inode)); - /* Signal writers asynchronously that there is more room. */ + mutex_unlock(&inode->i_mutex); + + /* Signal writers asynchronously that there is more room. */ if (do_wakeup) { - wake_up_interruptible(PIPE_WAIT(*inode)); - kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT); + wake_up_interruptible(&pipe->wait); + kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } if (ret > 0) file_accessed(filp); @@ -240,6 +244,7 @@ static ssize_t pipe_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos) { struct iovec iov = { .iov_base = buf, .iov_len = count }; + return pipe_readv(filp, &iov, 1, ppos); } @@ -248,7 +253,7 @@ pipe_writev(struct file *filp, const struct iovec *_iov, unsigned long nr_segs, loff_t *ppos) { struct inode *inode = filp->f_dentry->d_inode; - struct pipe_inode_info *info; + struct pipe_inode_info *pipe; ssize_t ret; int do_wakeup; struct iovec *iov = (struct iovec *)_iov; @@ -262,10 +267,10 @@ pipe_writev(struct file *filp, const struct iovec *_iov, do_wakeup = 0; ret = 0; - mutex_lock(PIPE_MUTEX(*inode)); - info = inode->i_pipe; + mutex_lock(&inode->i_mutex); + pipe = inode->i_pipe; - if (!PIPE_READERS(*inode)) { + if (!pipe->readers) { send_sig(SIGPIPE, current, 0); ret = -EPIPE; goto out; @@ -273,23 +278,25 @@ pipe_writev(struct file *filp, const struct iovec *_iov, /* We try to merge small writes */ chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */ - if (info->nrbufs && chars != 0) { - int lastbuf = (info->curbuf + info->nrbufs - 1) & (PIPE_BUFFERS-1); - struct pipe_buffer *buf = info->bufs + lastbuf; + if (pipe->nrbufs && chars != 0) { + int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) & + (PIPE_BUFFERS-1); + struct pipe_buffer *buf = pipe->bufs + lastbuf; struct pipe_buf_operations *ops = buf->ops; int offset = buf->offset + buf->len; + if (ops->can_merge && offset + chars <= PAGE_SIZE) { void *addr; int error; - addr = ops->map(filp, info, buf); + addr = ops->map(filp, pipe, buf); if (IS_ERR(addr)) { error = PTR_ERR(addr); goto out; } error = pipe_iov_copy_from_user(offset + addr, iov, chars); - ops->unmap(info, buf); + ops->unmap(pipe, buf); ret = error; do_wakeup = 1; if (error) @@ -304,16 +311,18 @@ pipe_writev(struct file *filp, const struct iovec *_iov, for (;;) { int bufs; - if (!PIPE_READERS(*inode)) { + + if (!pipe->readers) { send_sig(SIGPIPE, current, 0); - if (!ret) ret = -EPIPE; + if (!ret) + ret = -EPIPE; break; } - bufs = info->nrbufs; + bufs = pipe->nrbufs; if (bufs < PIPE_BUFFERS) { - int newbuf = (info->curbuf + bufs) & (PIPE_BUFFERS-1); - struct pipe_buffer *buf = info->bufs + newbuf; - struct page *page = info->tmp_page; + int newbuf = (pipe->curbuf + bufs) & (PIPE_BUFFERS-1); + struct pipe_buffer *buf = pipe->bufs + newbuf; + struct page *page = pipe->tmp_page; int error; if (!page) { @@ -322,9 +331,9 @@ pipe_writev(struct file *filp, const struct iovec *_iov, ret = ret ? : -ENOMEM; break; } - info->tmp_page = page; + pipe->tmp_page = page; } - /* Always wakeup, even if the copy fails. Otherwise + /* Always wake up, even if the copy fails. Otherwise * we lock up (O_NONBLOCK-)readers that sleep due to * syscall merging. * FIXME! Is this really true? @@ -337,7 +346,8 @@ pipe_writev(struct file *filp, const struct iovec *_iov, error = pipe_iov_copy_from_user(kmap(page), iov, chars); kunmap(page); if (unlikely(error)) { - if (!ret) ret = -EFAULT; + if (!ret) + ret = -EFAULT; break; } ret += chars; @@ -347,8 +357,8 @@ pipe_writev(struct file *filp, const struct iovec *_iov, buf->ops = &anon_pipe_buf_ops; buf->offset = 0; buf->len = chars; - info->nrbufs = ++bufs; - info->tmp_page = NULL; + pipe->nrbufs = ++bufs; + pipe->tmp_page = NULL; total_len -= chars; if (!total_len) @@ -357,27 +367,29 @@ pipe_writev(struct file *filp, const struct iovec *_iov, if (bufs < PIPE_BUFFERS) continue; if (filp->f_flags & O_NONBLOCK) { - if (!ret) ret = -EAGAIN; + if (!ret) + ret = -EAGAIN; break; } if (signal_pending(current)) { - if (!ret) ret = -ERESTARTSYS; + if (!ret) + ret = -ERESTARTSYS; break; } if (do_wakeup) { - wake_up_interruptible_sync(PIPE_WAIT(*inode)); - kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN); + wake_up_interruptible_sync(&pipe->wait); + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); do_wakeup = 0; } - PIPE_WAITING_WRITERS(*inode)++; - pipe_wait(inode); - PIPE_WAITING_WRITERS(*inode)--; + pipe->waiting_writers++; + pipe_wait(pipe); + pipe->waiting_writers--; } out: - mutex_unlock(PIPE_MUTEX(*inode)); + mutex_unlock(&inode->i_mutex); if (do_wakeup) { - wake_up_interruptible(PIPE_WAIT(*inode)); - kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN); + wake_up_interruptible(&pipe->wait); + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); } if (ret > 0) file_update_time(filp); @@ -389,6 +401,7 @@ pipe_write(struct file *filp, const char __user *buf, size_t count, loff_t *ppos) { struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = count }; + return pipe_writev(filp, &iov, 1, ppos); } @@ -399,7 +412,8 @@ bad_pipe_r(struct file *filp, char __user *buf, size_t count, loff_t *ppos) } static ssize_t -bad_pipe_w(struct file *filp, const char __user *buf, size_t count, loff_t *ppos) +bad_pipe_w(struct file *filp, const char __user *buf, size_t count, + loff_t *ppos) { return -EBADF; } @@ -409,21 +423,22 @@ pipe_ioctl(struct inode *pino, struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = filp->f_dentry->d_inode; - struct pipe_inode_info *info; + struct pipe_inode_info *pipe; int count, buf, nrbufs; switch (cmd) { case FIONREAD: - mutex_lock(PIPE_MUTEX(*inode)); - info = inode->i_pipe; + mutex_lock(&inode->i_mutex); + pipe = inode->i_pipe; count = 0; - buf = info->curbuf; - nrbufs = info->nrbufs; + buf = pipe->curbuf; + nrbufs = pipe->nrbufs; while (--nrbufs >= 0) { - count += info->bufs[buf].len; + count += pipe->bufs[buf].len; buf = (buf+1) & (PIPE_BUFFERS-1); } - mutex_unlock(PIPE_MUTEX(*inode)); + mutex_unlock(&inode->i_mutex); + return put_user(count, (int __user *)arg); default: return -EINVAL; @@ -436,17 +451,17 @@ pipe_poll(struct file *filp, poll_table *wait) { unsigned int mask; struct inode *inode = filp->f_dentry->d_inode; - struct pipe_inode_info *info = inode->i_pipe; + struct pipe_inode_info *pipe = inode->i_pipe; int nrbufs; - poll_wait(filp, PIPE_WAIT(*inode), wait); + poll_wait(filp, &pipe->wait, wait); /* Reading only -- no need for acquiring the semaphore. */ - nrbufs = info->nrbufs; + nrbufs = pipe->nrbufs; mask = 0; if (filp->f_mode & FMODE_READ) { mask = (nrbufs > 0) ? POLLIN | POLLRDNORM : 0; - if (!PIPE_WRITERS(*inode) && filp->f_version != PIPE_WCOUNTER(*inode)) + if (!pipe->writers && filp->f_version != pipe->w_counter) mask |= POLLHUP; } @@ -456,7 +471,7 @@ pipe_poll(struct file *filp, poll_table *wait) * Most Unices do not set POLLERR for FIFOs but on Linux they * behave exactly like pipes for poll(). */ - if (!PIPE_READERS(*inode)) + if (!pipe->readers) mask |= POLLERR; } @@ -466,17 +481,21 @@ pipe_poll(struct file *filp, poll_table *wait) static int pipe_release(struct inode *inode, int decr, int decw) { - mutex_lock(PIPE_MUTEX(*inode)); - PIPE_READERS(*inode) -= decr; - PIPE_WRITERS(*inode) -= decw; - if (!PIPE_READERS(*inode) && !PIPE_WRITERS(*inode)) { + struct pipe_inode_info *pipe; + + mutex_lock(&inode->i_mutex); + pipe = inode->i_pipe; + pipe->readers -= decr; + pipe->writers -= decw; + + if (!pipe->readers && !pipe->writers) { free_pipe_info(inode); } else { - wake_up_interruptible(PIPE_WAIT(*inode)); - kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN); - kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT); + wake_up_interruptible(&pipe->wait); + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); + kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } - mutex_unlock(PIPE_MUTEX(*inode)); + mutex_unlock(&inode->i_mutex); return 0; } @@ -487,9 +506,9 @@ pipe_read_fasync(int fd, struct file *filp, int on) struct inode *inode = filp->f_dentry->d_inode; int retval; - mutex_lock(PIPE_MUTEX(*inode)); - retval = fasync_helper(fd, filp, on, PIPE_FASYNC_READERS(*inode)); - mutex_unlock(PIPE_MUTEX(*inode)); + mutex_lock(&inode->i_mutex); + retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_readers); + mutex_unlock(&inode->i_mutex); if (retval < 0) return retval; @@ -504,9 +523,9 @@ pipe_write_fasync(int fd, struct file *filp, int on) struct inode *inode = filp->f_dentry->d_inode; int retval; - mutex_lock(PIPE_MUTEX(*inode)); - retval = fasync_helper(fd, filp, on, PIPE_FASYNC_WRITERS(*inode)); - mutex_unlock(PIPE_MUTEX(*inode)); + mutex_lock(&inode->i_mutex); + retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_writers); + mutex_unlock(&inode->i_mutex); if (retval < 0) return retval; @@ -519,16 +538,17 @@ static int pipe_rdwr_fasync(int fd, struct file *filp, int on) { struct inode *inode = filp->f_dentry->d_inode; + struct pipe_inode_info *pipe = inode->i_pipe; int retval; - mutex_lock(PIPE_MUTEX(*inode)); + mutex_lock(&inode->i_mutex); - retval = fasync_helper(fd, filp, on, PIPE_FASYNC_READERS(*inode)); + retval = fasync_helper(fd, filp, on, &pipe->fasync_readers); if (retval >= 0) - retval = fasync_helper(fd, filp, on, PIPE_FASYNC_WRITERS(*inode)); + retval = fasync_helper(fd, filp, on, &pipe->fasync_writers); - mutex_unlock(PIPE_MUTEX(*inode)); + mutex_unlock(&inode->i_mutex); if (retval < 0) return retval; @@ -567,9 +587,9 @@ pipe_read_open(struct inode *inode, struct file *filp) { /* We could have perhaps used atomic_t, but this and friends below are the only places. So it doesn't seem worthwhile. */ - mutex_lock(PIPE_MUTEX(*inode)); - PIPE_READERS(*inode)++; - mutex_unlock(PIPE_MUTEX(*inode)); + mutex_lock(&inode->i_mutex); + inode->i_pipe->readers++; + mutex_unlock(&inode->i_mutex); return 0; } @@ -577,9 +597,9 @@ pipe_read_open(struct inode *inode, struct file *filp) static int pipe_write_open(struct inode *inode, struct file *filp) { - mutex_lock(PIPE_MUTEX(*inode)); - PIPE_WRITERS(*inode)++; - mutex_unlock(PIPE_MUTEX(*inode)); + mutex_lock(&inode->i_mutex); + inode->i_pipe->writers++; + mutex_unlock(&inode->i_mutex); return 0; } @@ -587,12 +607,12 @@ pipe_write_open(struct inode *inode, struct file *filp) static int pipe_rdwr_open(struct inode *inode, struct file *filp) { - mutex_lock(PIPE_MUTEX(*inode)); + mutex_lock(&inode->i_mutex); if (filp->f_mode & FMODE_READ) - PIPE_READERS(*inode)++; + inode->i_pipe->readers++; if (filp->f_mode & FMODE_WRITE) - PIPE_WRITERS(*inode)++; - mutex_unlock(PIPE_MUTEX(*inode)); + inode->i_pipe->writers++; + mutex_unlock(&inode->i_mutex); return 0; } @@ -675,37 +695,38 @@ static struct file_operations rdwr_pipe_fops = { .fasync = pipe_rdwr_fasync, }; -void free_pipe_info(struct inode *inode) +struct pipe_inode_info * alloc_pipe_info(struct inode *inode) +{ + struct pipe_inode_info *pipe; + + pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL); + if (pipe) { + init_waitqueue_head(&pipe->wait); + pipe->r_counter = pipe->w_counter = 1; + pipe->inode = inode; + } + + return pipe; +} + +void __free_pipe_info(struct pipe_inode_info *pipe) { int i; - struct pipe_inode_info *info = inode->i_pipe; - inode->i_pipe = NULL; for (i = 0; i < PIPE_BUFFERS; i++) { - struct pipe_buffer *buf = info->bufs + i; + struct pipe_buffer *buf = pipe->bufs + i; if (buf->ops) - buf->ops->release(info, buf); + buf->ops->release(pipe, buf); } - if (info->tmp_page) - __free_page(info->tmp_page); - kfree(info); + if (pipe->tmp_page) + __free_page(pipe->tmp_page); + kfree(pipe); } -struct inode* pipe_new(struct inode* inode) +void free_pipe_info(struct inode *inode) { - struct pipe_inode_info *info; - - info = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL); - if (!info) - goto fail_page; - inode->i_pipe = info; - - init_waitqueue_head(PIPE_WAIT(*inode)); - PIPE_RCOUNTER(*inode) = PIPE_WCOUNTER(*inode) = 1; - - return inode; -fail_page: - return NULL; + __free_pipe_info(inode->i_pipe); + inode->i_pipe = NULL; } static struct vfsmount *pipe_mnt __read_mostly; @@ -713,6 +734,7 @@ static int pipefs_delete_dentry(struct dentry *dentry) { return 1; } + static struct dentry_operations pipefs_dentry_operations = { .d_delete = pipefs_delete_dentry, }; @@ -720,13 +742,17 @@ static struct dentry_operations pipefs_dentry_operations = { static struct inode * get_pipe_inode(void) { struct inode *inode = new_inode(pipe_mnt->mnt_sb); + struct pipe_inode_info *pipe; if (!inode) goto fail_inode; - if(!pipe_new(inode)) + pipe = alloc_pipe_info(inode); + if (!pipe) goto fail_iput; - PIPE_READERS(*inode) = PIPE_WRITERS(*inode) = 1; + inode->i_pipe = pipe; + + pipe->readers = pipe->writers = 1; inode->i_fop = &rdwr_pipe_fops; /* @@ -741,10 +767,12 @@ static struct inode * get_pipe_inode(void) inode->i_gid = current->fsgid; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; inode->i_blksize = PAGE_SIZE; + return inode; fail_iput: iput(inode); + fail_inode: return NULL; } @@ -757,7 +785,7 @@ int do_pipe(int *fd) struct inode * inode; struct file *f1, *f2; int error; - int i,j; + int i, j; error = -ENFILE; f1 = get_empty_filp(); @@ -790,6 +818,7 @@ int do_pipe(int *fd) dentry = d_alloc(pipe_mnt->mnt_sb->s_root, &this); if (!dentry) goto close_f12_inode_i_j; + dentry->d_op = &pipefs_dentry_operations; d_add(dentry, inode); f1->f_vfsmnt = f2->f_vfsmnt = mntget(mntget(pipe_mnt)); @@ -813,6 +842,7 @@ int do_pipe(int *fd) fd_install(j, f2); fd[0] = i; fd[1] = j; + return 0; close_f12_inode_i_j: @@ -837,8 +867,9 @@ no_files: * d_name - pipe: will go nicely and kill the special-casing in procfs. */ -static struct super_block *pipefs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static struct super_block * +pipefs_get_sb(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data) { return get_sb_pseudo(fs_type, "pipe:", NULL, PIPEFS_MAGIC); } @@ -852,6 +883,7 @@ static struct file_system_type pipe_fs_type = { static int __init init_pipe_fs(void) { int err = register_filesystem(&pipe_fs_type); + if (!err) { pipe_mnt = kern_mount(&pipe_fs_type); if (IS_ERR(pipe_mnt)) { diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 7efa73d44c9..20d4b2237fc 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -103,8 +103,8 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) { ssize_t acc = 0, tmp; - size_t tsz, nr_bytes; - u64 start; + size_t tsz; + u64 start, nr_bytes; struct vmcore *curr_m = NULL; if (buflen == 0 || *fpos >= vmcore_size) diff --git a/fs/read_write.c b/fs/read_write.c index 6256ca81a71..5bc0e9234f9 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -202,7 +202,7 @@ int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count goto Einval; inode = file->f_dentry->d_inode; - if (inode->i_flock && MANDATORY_LOCK(inode)) { + if (unlikely(inode->i_flock && MANDATORY_LOCK(inode))) { int retval = locks_mandatory_area( read_write == READ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE, inode, file, pos, count); diff --git a/fs/select.c b/fs/select.c index 071660fa7b0..a8109baa5e4 100644 --- a/fs/select.c +++ b/fs/select.c @@ -310,8 +310,9 @@ static int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, s64 *timeout) { fd_set_bits fds; - char *bits; - int ret, size, max_fdset; + void *bits; + int ret, max_fdset; + unsigned int size; struct fdtable *fdt; /* Allocate small arguments on the stack to save memory and be faster */ long stack_fds[SELECT_STACK_ALLOC/sizeof(long)]; @@ -333,20 +334,21 @@ static int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, * since we used fdset we need to allocate memory in units of * long-words. */ - ret = -ENOMEM; size = FDS_BYTES(n); - if (6*size < SELECT_STACK_ALLOC) - bits = stack_fds; - else + bits = stack_fds; + if (size > sizeof(stack_fds) / 6) { + /* Not enough space in on-stack array; must use kmalloc */ + ret = -ENOMEM; bits = kmalloc(6 * size, GFP_KERNEL); - if (!bits) - goto out_nofds; - fds.in = (unsigned long *) bits; - fds.out = (unsigned long *) (bits + size); - fds.ex = (unsigned long *) (bits + 2*size); - fds.res_in = (unsigned long *) (bits + 3*size); - fds.res_out = (unsigned long *) (bits + 4*size); - fds.res_ex = (unsigned long *) (bits + 5*size); + if (!bits) + goto out_nofds; + } + fds.in = bits; + fds.out = bits + size; + fds.ex = bits + 2*size; + fds.res_in = bits + 3*size; + fds.res_out = bits + 4*size; + fds.res_ex = bits + 5*size; if ((ret = get_fd_set(n, inp, fds.in)) || (ret = get_fd_set(n, outp, fds.out)) || diff --git a/fs/splice.c b/fs/splice.c index bfa42a277bb..e50a460239d 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -9,11 +9,12 @@ * that transfers data buffers to or from a pipe buffer. * * Named by Larry McVoy, original implementation from Linus, extended by - * Jens to support splicing to files and fixing the initial implementation - * bugs. + * Jens to support splicing to files, network, direct splicing, etc and + * fixing lots of bugs. * - * Copyright (C) 2005 Jens Axboe <axboe@suse.de> - * Copyright (C) 2005 Linus Torvalds <torvalds@osdl.org> + * Copyright (C) 2005-2006 Jens Axboe <axboe@suse.de> + * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org> + * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu> * */ #include <linux/fs.h> @@ -84,26 +85,43 @@ static void *page_cache_pipe_buf_map(struct file *file, struct pipe_buffer *buf) { struct page *page = buf->page; - - lock_page(page); + int err; if (!PageUptodate(page)) { - unlock_page(page); - return ERR_PTR(-EIO); - } + lock_page(page); + + /* + * Page got truncated/unhashed. This will cause a 0-byte + * splice, if this is the first page. + */ + if (!page->mapping) { + err = -ENODATA; + goto error; + } - if (!page->mapping) { + /* + * Uh oh, read-error from disk. + */ + if (!PageUptodate(page)) { + err = -EIO; + goto error; + } + + /* + * Page is ok afterall, fall through to mapping. + */ unlock_page(page); - return ERR_PTR(-ENODATA); } - return kmap(buf->page); + return kmap(page); +error: + unlock_page(page); + return ERR_PTR(err); } static void page_cache_pipe_buf_unmap(struct pipe_inode_info *info, struct pipe_buffer *buf) { - unlock_page(buf->page); kunmap(buf->page); } @@ -119,34 +137,30 @@ static struct pipe_buf_operations page_cache_pipe_buf_ops = { * Pipe output worker. This sets up our pipe format with the page cache * pipe buffer operations. Otherwise very similar to the regular pipe_writev(). */ -static ssize_t move_to_pipe(struct inode *inode, struct page **pages, +static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages, int nr_pages, unsigned long offset, unsigned long len, unsigned int flags) { - struct pipe_inode_info *info; int ret, do_wakeup, i; ret = 0; do_wakeup = 0; i = 0; - mutex_lock(PIPE_MUTEX(*inode)); + if (pipe->inode) + mutex_lock(&pipe->inode->i_mutex); - info = inode->i_pipe; for (;;) { - int bufs; - - if (!PIPE_READERS(*inode)) { + if (!pipe->readers) { send_sig(SIGPIPE, current, 0); if (!ret) ret = -EPIPE; break; } - bufs = info->nrbufs; - if (bufs < PIPE_BUFFERS) { - int newbuf = (info->curbuf + bufs) & (PIPE_BUFFERS - 1); - struct pipe_buffer *buf = info->bufs + newbuf; + if (pipe->nrbufs < PIPE_BUFFERS) { + int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1); + struct pipe_buffer *buf = pipe->bufs + newbuf; struct page *page = pages[i++]; unsigned long this_len; @@ -158,8 +172,9 @@ static ssize_t move_to_pipe(struct inode *inode, struct page **pages, buf->offset = offset; buf->len = this_len; buf->ops = &page_cache_pipe_buf_ops; - info->nrbufs = ++bufs; - do_wakeup = 1; + pipe->nrbufs++; + if (pipe->inode) + do_wakeup = 1; ret += this_len; len -= this_len; @@ -168,7 +183,7 @@ static ssize_t move_to_pipe(struct inode *inode, struct page **pages, break; if (!len) break; - if (bufs < PIPE_BUFFERS) + if (pipe->nrbufs < PIPE_BUFFERS) continue; break; @@ -187,22 +202,26 @@ static ssize_t move_to_pipe(struct inode *inode, struct page **pages, } if (do_wakeup) { - wake_up_interruptible_sync(PIPE_WAIT(*inode)); - kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, - POLL_IN); + smp_mb(); + if (waitqueue_active(&pipe->wait)) + wake_up_interruptible_sync(&pipe->wait); + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); do_wakeup = 0; } - PIPE_WAITING_WRITERS(*inode)++; - pipe_wait(inode); - PIPE_WAITING_WRITERS(*inode)--; + pipe->waiting_writers++; + pipe_wait(pipe); + pipe->waiting_writers--; } - mutex_unlock(PIPE_MUTEX(*inode)); + if (pipe->inode) + mutex_unlock(&pipe->inode->i_mutex); if (do_wakeup) { - wake_up_interruptible(PIPE_WAIT(*inode)); - kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN); + smp_mb(); + if (waitqueue_active(&pipe->wait)) + wake_up_interruptible(&pipe->wait); + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); } while (i < nr_pages) @@ -211,15 +230,16 @@ static ssize_t move_to_pipe(struct inode *inode, struct page **pages, return ret; } -static int __generic_file_splice_read(struct file *in, struct inode *pipe, - size_t len, unsigned int flags) +static int +__generic_file_splice_read(struct file *in, struct pipe_inode_info *pipe, + size_t len, unsigned int flags) { struct address_space *mapping = in->f_mapping; unsigned int offset, nr_pages; - struct page *pages[PIPE_BUFFERS], *shadow[PIPE_BUFFERS]; + struct page *pages[PIPE_BUFFERS]; struct page *page; - pgoff_t index, pidx; - int i, j; + pgoff_t index; + int i, error; index = in->f_pos >> PAGE_CACHE_SHIFT; offset = in->f_pos & ~PAGE_CACHE_MASK; @@ -229,78 +249,94 @@ static int __generic_file_splice_read(struct file *in, struct inode *pipe, nr_pages = PIPE_BUFFERS; /* - * initiate read-ahead on this page range - */ - do_page_cache_readahead(mapping, in, index, nr_pages); - - /* - * Get as many pages from the page cache as possible.. - * Start IO on the page cache entries we create (we - * can assume that any pre-existing ones we find have - * already had IO started on them). + * Initiate read-ahead on this page range. however, don't call into + * read-ahead if this is a non-zero offset (we are likely doing small + * chunk splice and the page is already there) for a single page. */ - i = find_get_pages(mapping, index, nr_pages, pages); + if (!offset || nr_pages > 1) + do_page_cache_readahead(mapping, in, index, nr_pages); /* - * common case - we found all pages and they are contiguous, - * kick them off + * Now fill in the holes: */ - if (i && (pages[i - 1]->index == index + i - 1)) - goto splice_them; + error = 0; + for (i = 0; i < nr_pages; i++, index++) { +find_page: + /* + * lookup the page for this index + */ + page = find_get_page(mapping, index); + if (!page) { + /* + * If in nonblock mode then dont block on + * readpage (we've kicked readahead so there + * will be asynchronous progress): + */ + if (flags & SPLICE_F_NONBLOCK) + break; - /* - * fill shadow[] with pages at the right locations, so we only - * have to fill holes - */ - memset(shadow, 0, nr_pages * sizeof(struct page *)); - for (j = 0; j < i; j++) - shadow[pages[j]->index - index] = pages[j]; + /* + * page didn't exist, allocate one + */ + page = page_cache_alloc_cold(mapping); + if (!page) + break; - /* - * now fill in the holes - */ - for (i = 0, pidx = index; i < nr_pages; pidx++, i++) { - int error; + error = add_to_page_cache_lru(page, mapping, index, + mapping_gfp_mask(mapping)); + if (unlikely(error)) { + page_cache_release(page); + break; + } - if (shadow[i]) - continue; + goto readpage; + } /* - * no page there, look one up / create it + * If the page isn't uptodate, we may need to start io on it */ - page = find_or_create_page(mapping, pidx, - mapping_gfp_mask(mapping)); - if (!page) - break; + if (!PageUptodate(page)) { + lock_page(page); + + /* + * page was truncated, stop here. if this isn't the + * first page, we'll just complete what we already + * added + */ + if (!page->mapping) { + unlock_page(page); + page_cache_release(page); + break; + } + /* + * page was already under io and is now done, great + */ + if (PageUptodate(page)) { + unlock_page(page); + goto fill_it; + } - if (PageUptodate(page)) - unlock_page(page); - else { +readpage: + /* + * need to read in the page + */ error = mapping->a_ops->readpage(in, page); if (unlikely(error)) { page_cache_release(page); + if (error == AOP_TRUNCATED_PAGE) + goto find_page; break; } } - shadow[i] = page; +fill_it: + pages[i] = page; } - if (!i) { - for (i = 0; i < nr_pages; i++) { - if (shadow[i]) - page_cache_release(shadow[i]); - } - return 0; - } + if (i) + return move_to_pipe(pipe, pages, i, offset, len, flags); - memcpy(pages, shadow, i * sizeof(struct page *)); - - /* - * Now we splice them into the pipe.. - */ -splice_them: - return move_to_pipe(pipe, pages, i, offset, len, flags); + return error; } /** @@ -311,9 +347,8 @@ splice_them: * @flags: splice modifier flags * * Will read pages from given file and fill them into a pipe. - * */ -ssize_t generic_file_splice_read(struct file *in, struct inode *pipe, +ssize_t generic_file_splice_read(struct file *in, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { ssize_t spliced; @@ -321,6 +356,7 @@ ssize_t generic_file_splice_read(struct file *in, struct inode *pipe, ret = 0; spliced = 0; + while (len) { ret = __generic_file_splice_read(in, pipe, len, flags); @@ -360,10 +396,10 @@ static int pipe_to_sendpage(struct pipe_inode_info *info, int more; /* - * sub-optimal, but we are limited by the pipe ->map. we don't + * Sub-optimal, but we are limited by the pipe ->map. We don't * need a kmap'ed buffer here, we just want to make sure we * have the page pinned if the pipe page originates from the - * page cache + * page cache. */ ptr = buf->ops->map(file, info, buf); if (IS_ERR(ptr)) @@ -414,7 +450,7 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf, int ret; /* - * after this, page will be locked and unmapped + * make sure the data in this buffer is uptodate */ src = buf->ops->map(file, info, buf); if (IS_ERR(src)) @@ -424,7 +460,7 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf, offset = sd->pos & ~PAGE_CACHE_MASK; /* - * reuse buf page, if SPLICE_F_MOVE is set + * Reuse buf page, if SPLICE_F_MOVE is set. */ if (sd->flags & SPLICE_F_MOVE) { /* @@ -434,6 +470,9 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf, if (buf->ops->steal(info, buf)) goto find_page; + /* + * this will also set the page locked + */ page = buf->page; if (add_to_page_cache(page, mapping, index, gfp_mask)) goto find_page; @@ -445,7 +484,7 @@ find_page: ret = -ENOMEM; page = find_or_create_page(mapping, index, gfp_mask); if (!page) - goto out; + goto out_nomem; /* * If the page is uptodate, it is also locked. If it isn't @@ -462,7 +501,7 @@ find_page: if (!PageUptodate(page)) { /* - * page got invalidated, repeat + * Page got invalidated, repeat. */ if (!page->mapping) { unlock_page(page); @@ -501,12 +540,14 @@ find_page: } else if (ret) goto out; + mark_page_accessed(page); balance_dirty_pages_ratelimited(mapping); out: if (!(buf->flags & PIPE_BUF_FLAG_STOLEN)) { page_cache_release(page); unlock_page(page); } +out_nomem: buf->ops->unmap(info, buf); return ret; } @@ -519,11 +560,10 @@ typedef int (splice_actor)(struct pipe_inode_info *, struct pipe_buffer *, * key here is the 'actor' worker passed in that actually moves the data * to the wanted destination. See pipe_to_file/pipe_to_sendpage above. */ -static ssize_t move_from_pipe(struct inode *inode, struct file *out, +static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out, size_t len, unsigned int flags, splice_actor *actor) { - struct pipe_inode_info *info; int ret, do_wakeup, err; struct splice_desc sd; @@ -535,22 +575,19 @@ static ssize_t move_from_pipe(struct inode *inode, struct file *out, sd.file = out; sd.pos = out->f_pos; - mutex_lock(PIPE_MUTEX(*inode)); + if (pipe->inode) + mutex_lock(&pipe->inode->i_mutex); - info = inode->i_pipe; for (;;) { - int bufs = info->nrbufs; - - if (bufs) { - int curbuf = info->curbuf; - struct pipe_buffer *buf = info->bufs + curbuf; + if (pipe->nrbufs) { + struct pipe_buffer *buf = pipe->bufs + pipe->curbuf; struct pipe_buf_operations *ops = buf->ops; sd.len = buf->len; if (sd.len > sd.total_len) sd.len = sd.total_len; - err = actor(info, buf, &sd); + err = actor(pipe, buf, &sd); if (err) { if (!ret && err != -ENODATA) ret = err; @@ -561,13 +598,14 @@ static ssize_t move_from_pipe(struct inode *inode, struct file *out, ret += sd.len; buf->offset += sd.len; buf->len -= sd.len; + if (!buf->len) { buf->ops = NULL; - ops->release(info, buf); - curbuf = (curbuf + 1) & (PIPE_BUFFERS - 1); - info->curbuf = curbuf; - info->nrbufs = --bufs; - do_wakeup = 1; + ops->release(pipe, buf); + pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1); + pipe->nrbufs--; + if (pipe->inode) + do_wakeup = 1; } sd.pos += sd.len; @@ -576,11 +614,11 @@ static ssize_t move_from_pipe(struct inode *inode, struct file *out, break; } - if (bufs) + if (pipe->nrbufs) continue; - if (!PIPE_WRITERS(*inode)) + if (!pipe->writers) break; - if (!PIPE_WAITING_WRITERS(*inode)) { + if (!pipe->waiting_writers) { if (ret) break; } @@ -598,31 +636,34 @@ static ssize_t move_from_pipe(struct inode *inode, struct file *out, } if (do_wakeup) { - wake_up_interruptible_sync(PIPE_WAIT(*inode)); - kill_fasync(PIPE_FASYNC_WRITERS(*inode),SIGIO,POLL_OUT); + smp_mb(); + if (waitqueue_active(&pipe->wait)) + wake_up_interruptible_sync(&pipe->wait); + kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); do_wakeup = 0; } - pipe_wait(inode); + pipe_wait(pipe); } - mutex_unlock(PIPE_MUTEX(*inode)); + if (pipe->inode) + mutex_unlock(&pipe->inode->i_mutex); if (do_wakeup) { - wake_up_interruptible(PIPE_WAIT(*inode)); - kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT); + smp_mb(); + if (waitqueue_active(&pipe->wait)) + wake_up_interruptible(&pipe->wait); + kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } - mutex_lock(&out->f_mapping->host->i_mutex); out->f_pos = sd.pos; - mutex_unlock(&out->f_mapping->host->i_mutex); return ret; } /** * generic_file_splice_write - splice data from a pipe to a file - * @inode: pipe inode + * @pipe: pipe info * @out: file to write to * @len: number of bytes to splice * @flags: splice modifier flags @@ -631,14 +672,17 @@ static ssize_t move_from_pipe(struct inode *inode, struct file *out, * the given pipe inode to the given file. * */ -ssize_t generic_file_splice_write(struct inode *inode, struct file *out, - size_t len, unsigned int flags) +ssize_t +generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, + size_t len, unsigned int flags) { struct address_space *mapping = out->f_mapping; - ssize_t ret = move_from_pipe(inode, out, len, flags, pipe_to_file); + ssize_t ret; + + ret = move_from_pipe(pipe, out, len, flags, pipe_to_file); /* - * if file or inode is SYNC and we actually wrote some data, sync it + * If file or inode is SYNC and we actually wrote some data, sync it. */ if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(mapping->host)) && ret > 0) { @@ -647,7 +691,7 @@ ssize_t generic_file_splice_write(struct inode *inode, struct file *out, mutex_lock(&inode->i_mutex); err = generic_osync_inode(mapping->host, mapping, - OSYNC_METADATA|OSYNC_DATA); + OSYNC_METADATA|OSYNC_DATA); mutex_unlock(&inode->i_mutex); if (err) @@ -670,10 +714,10 @@ EXPORT_SYMBOL(generic_file_splice_write); * is involved. * */ -ssize_t generic_splice_sendpage(struct inode *inode, struct file *out, +ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out, size_t len, unsigned int flags) { - return move_from_pipe(inode, out, len, flags, pipe_to_sendpage); + return move_from_pipe(pipe, out, len, flags, pipe_to_sendpage); } EXPORT_SYMBOL(generic_splice_sendpage); @@ -681,19 +725,20 @@ EXPORT_SYMBOL(generic_splice_sendpage); /* * Attempt to initiate a splice from pipe to file. */ -static long do_splice_from(struct inode *pipe, struct file *out, size_t len, - unsigned int flags) +static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, + size_t len, unsigned int flags) { loff_t pos; int ret; - if (!out->f_op || !out->f_op->splice_write) + if (unlikely(!out->f_op || !out->f_op->splice_write)) return -EINVAL; - if (!(out->f_mode & FMODE_WRITE)) + if (unlikely(!(out->f_mode & FMODE_WRITE))) return -EBADF; pos = out->f_pos; + ret = rw_verify_area(WRITE, out, &pos, len); if (unlikely(ret < 0)) return ret; @@ -704,19 +749,20 @@ static long do_splice_from(struct inode *pipe, struct file *out, size_t len, /* * Attempt to initiate a splice from a file to a pipe. */ -static long do_splice_to(struct file *in, struct inode *pipe, size_t len, - unsigned int flags) +static long do_splice_to(struct file *in, struct pipe_inode_info *pipe, + size_t len, unsigned int flags) { loff_t pos, isize, left; int ret; - if (!in->f_op || !in->f_op->splice_read) + if (unlikely(!in->f_op || !in->f_op->splice_read)) return -EINVAL; - if (!(in->f_mode & FMODE_READ)) + if (unlikely(!(in->f_mode & FMODE_READ))) return -EBADF; pos = in->f_pos; + ret = rw_verify_area(READ, in, &pos, len); if (unlikely(ret < 0)) return ret; @@ -726,32 +772,168 @@ static long do_splice_to(struct file *in, struct inode *pipe, size_t len, return 0; left = isize - in->f_pos; - if (left < len) + if (unlikely(left < len)) len = left; return in->f_op->splice_read(in, pipe, len, flags); } +long do_splice_direct(struct file *in, struct file *out, size_t len, + unsigned int flags) +{ + struct pipe_inode_info *pipe; + long ret, bytes; + umode_t i_mode; + int i; + + /* + * We require the input being a regular file, as we don't want to + * randomly drop data for eg socket -> socket splicing. Use the + * piped splicing for that! + */ + i_mode = in->f_dentry->d_inode->i_mode; + if (unlikely(!S_ISREG(i_mode) && !S_ISBLK(i_mode))) + return -EINVAL; + + /* + * neither in nor out is a pipe, setup an internal pipe attached to + * 'out' and transfer the wanted data from 'in' to 'out' through that + */ + pipe = current->splice_pipe; + if (unlikely(!pipe)) { + pipe = alloc_pipe_info(NULL); + if (!pipe) + return -ENOMEM; + + /* + * We don't have an immediate reader, but we'll read the stuff + * out of the pipe right after the move_to_pipe(). So set + * PIPE_READERS appropriately. + */ + pipe->readers = 1; + + current->splice_pipe = pipe; + } + + /* + * Do the splice. + */ + ret = 0; + bytes = 0; + + while (len) { + size_t read_len, max_read_len; + + /* + * Do at most PIPE_BUFFERS pages worth of transfer: + */ + max_read_len = min(len, (size_t)(PIPE_BUFFERS*PAGE_SIZE)); + + ret = do_splice_to(in, pipe, max_read_len, flags); + if (unlikely(ret < 0)) + goto out_release; + + read_len = ret; + + /* + * NOTE: nonblocking mode only applies to the input. We + * must not do the output in nonblocking mode as then we + * could get stuck data in the internal pipe: + */ + ret = do_splice_from(pipe, out, read_len, + flags & ~SPLICE_F_NONBLOCK); + if (unlikely(ret < 0)) + goto out_release; + + bytes += ret; + len -= ret; + + /* + * In nonblocking mode, if we got back a short read then + * that was due to either an IO error or due to the + * pagecache entry not being there. In the IO error case + * the _next_ splice attempt will produce a clean IO error + * return value (not a short read), so in both cases it's + * correct to break out of the loop here: + */ + if ((flags & SPLICE_F_NONBLOCK) && (read_len < max_read_len)) + break; + } + + pipe->nrbufs = pipe->curbuf = 0; + + return bytes; + +out_release: + /* + * If we did an incomplete transfer we must release + * the pipe buffers in question: + */ + for (i = 0; i < PIPE_BUFFERS; i++) { + struct pipe_buffer *buf = pipe->bufs + i; + + if (buf->ops) { + buf->ops->release(pipe, buf); + buf->ops = NULL; + } + } + pipe->nrbufs = pipe->curbuf = 0; + + /* + * If we transferred some data, return the number of bytes: + */ + if (bytes > 0) + return bytes; + + return ret; +} + +EXPORT_SYMBOL(do_splice_direct); + /* * Determine where to splice to/from. */ -static long do_splice(struct file *in, struct file *out, size_t len, - unsigned int flags) +static long do_splice(struct file *in, loff_t __user *off_in, + struct file *out, loff_t __user *off_out, + size_t len, unsigned int flags) { - struct inode *pipe; + struct pipe_inode_info *pipe; + + pipe = in->f_dentry->d_inode->i_pipe; + if (pipe) { + if (off_in) + return -ESPIPE; + if (off_out) { + if (out->f_op->llseek == no_llseek) + return -EINVAL; + if (copy_from_user(&out->f_pos, off_out, + sizeof(loff_t))) + return -EFAULT; + } - pipe = in->f_dentry->d_inode; - if (pipe->i_pipe) return do_splice_from(pipe, out, len, flags); + } + + pipe = out->f_dentry->d_inode->i_pipe; + if (pipe) { + if (off_out) + return -ESPIPE; + if (off_in) { + if (in->f_op->llseek == no_llseek) + return -EINVAL; + if (copy_from_user(&in->f_pos, off_in, sizeof(loff_t))) + return -EFAULT; + } - pipe = out->f_dentry->d_inode; - if (pipe->i_pipe) return do_splice_to(in, pipe, len, flags); + } return -EINVAL; } -asmlinkage long sys_splice(int fdin, int fdout, size_t len, unsigned int flags) +asmlinkage long sys_splice(int fd_in, loff_t __user *off_in, + int fd_out, loff_t __user *off_out, + size_t len, unsigned int flags) { long error; struct file *in, *out; @@ -761,13 +943,15 @@ asmlinkage long sys_splice(int fdin, int fdout, size_t len, unsigned int flags) return 0; error = -EBADF; - in = fget_light(fdin, &fput_in); + in = fget_light(fd_in, &fput_in); if (in) { if (in->f_mode & FMODE_READ) { - out = fget_light(fdout, &fput_out); + out = fget_light(fd_out, &fput_out); if (out) { if (out->f_mode & FMODE_WRITE) - error = do_splice(in, out, len, flags); + error = do_splice(in, off_in, + out, off_out, + len, flags); fput_light(out, fput_out); } } diff --git a/fs/sync.c b/fs/sync.c index 8616006d209..aab5ffe77e9 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -61,7 +61,7 @@ * will be available after a crash. */ asmlinkage long sys_sync_file_range(int fd, loff_t offset, loff_t nbytes, - int flags) + unsigned int flags) { int ret; struct file *file; @@ -126,7 +126,7 @@ out: * `endbyte' is inclusive */ int do_sync_file_range(struct file *file, loff_t offset, loff_t endbyte, - int flags) + unsigned int flags) { int ret; struct address_space *mapping; diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index 6cbbd165c60..4d191ef39b6 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -870,12 +870,14 @@ xfs_page_state_convert( pgoff_t end_index, last_index, tlast; ssize_t size, len; int flags, err, iomap_valid = 0, uptodate = 1; - int page_dirty, count = 0, trylock_flag = 0; + int page_dirty, count = 0; + int trylock = 0; int all_bh = unmapped; - /* wait for other IO threads? */ - if (startio && (wbc->sync_mode == WB_SYNC_NONE && wbc->nonblocking)) - trylock_flag |= BMAPI_TRYLOCK; + if (startio) { + if (wbc->sync_mode == WB_SYNC_NONE && wbc->nonblocking) + trylock |= BMAPI_TRYLOCK; + } /* Is this page beyond the end of the file? */ offset = i_size_read(inode); @@ -956,15 +958,13 @@ xfs_page_state_convert( if (buffer_unwritten(bh)) { type = IOMAP_UNWRITTEN; - flags = BMAPI_WRITE|BMAPI_IGNSTATE; + flags = BMAPI_WRITE | BMAPI_IGNSTATE; } else if (buffer_delay(bh)) { type = IOMAP_DELAY; - flags = BMAPI_ALLOCATE; - if (!startio) - flags |= trylock_flag; + flags = BMAPI_ALLOCATE | trylock; } else { type = IOMAP_NEW; - flags = BMAPI_WRITE|BMAPI_MMAP; + flags = BMAPI_WRITE | BMAPI_MMAP; } if (!iomap_valid) { diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 9fb0312665c..26fed0756f0 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -182,7 +182,7 @@ free_address( { a_list_t *aentry; - aentry = kmalloc(sizeof(a_list_t), GFP_ATOMIC & ~__GFP_HIGH); + aentry = kmalloc(sizeof(a_list_t), GFP_NOWAIT); if (likely(aentry)) { spin_lock(&as_lock); aentry->next = as_free_head; diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index ae4c4754ed3..269721af02f 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -252,7 +252,7 @@ xfs_file_sendfile_invis( STATIC ssize_t xfs_file_splice_read( struct file *infilp, - struct inode *pipe, + struct pipe_inode_info *pipe, size_t len, unsigned int flags) { @@ -266,7 +266,7 @@ xfs_file_splice_read( STATIC ssize_t xfs_file_splice_read_invis( struct file *infilp, - struct inode *pipe, + struct pipe_inode_info *pipe, size_t len, unsigned int flags) { @@ -279,7 +279,7 @@ xfs_file_splice_read_invis( STATIC ssize_t xfs_file_splice_write( - struct inode *pipe, + struct pipe_inode_info *pipe, struct file *outfilp, size_t len, unsigned int flags) @@ -293,7 +293,7 @@ xfs_file_splice_write( STATIC ssize_t xfs_file_splice_write_invis( - struct inode *pipe, + struct pipe_inode_info *pipe, struct file *outfilp, size_t len, unsigned int flags) diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index 149237304fb..2e2e275c786 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -673,8 +673,7 @@ xfs_vn_setattr( if (ia_valid & ATTR_ATIME) { vattr.va_mask |= XFS_AT_ATIME; vattr.va_atime = attr->ia_atime; - if (ia_valid & ATTR_ATIME_SET) - inode->i_atime = attr->ia_atime; + inode->i_atime = attr->ia_atime; } if (ia_valid & ATTR_MTIME) { vattr.va_mask |= XFS_AT_MTIME; diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c index 90cd314acba..74a52937f20 100644 --- a/fs/xfs/linux-2.6/xfs_lrw.c +++ b/fs/xfs/linux-2.6/xfs_lrw.c @@ -338,7 +338,7 @@ ssize_t xfs_splice_read( bhv_desc_t *bdp, struct file *infilp, - struct inode *pipe, + struct pipe_inode_info *pipe, size_t count, int flags, int ioflags, @@ -380,7 +380,7 @@ xfs_splice_read( ssize_t xfs_splice_write( bhv_desc_t *bdp, - struct inode *pipe, + struct pipe_inode_info *pipe, struct file *outfilp, size_t count, int flags, diff --git a/fs/xfs/linux-2.6/xfs_lrw.h b/fs/xfs/linux-2.6/xfs_lrw.h index eaa5659713f..55c689a86ad 100644 --- a/fs/xfs/linux-2.6/xfs_lrw.h +++ b/fs/xfs/linux-2.6/xfs_lrw.h @@ -94,9 +94,9 @@ extern ssize_t xfs_sendfile(struct bhv_desc *, struct file *, loff_t *, int, size_t, read_actor_t, void *, struct cred *); extern ssize_t xfs_splice_read(struct bhv_desc *, struct file *, - struct inode *, size_t, int, int, + struct pipe_inode_info *, size_t, int, int, struct cred *); -extern ssize_t xfs_splice_write(struct bhv_desc *, struct inode *, +extern ssize_t xfs_splice_write(struct bhv_desc *, struct pipe_inode_info *, struct file *, size_t, int, int, struct cred *); diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h index 6f1c79a28f8..88b09f18628 100644 --- a/fs/xfs/linux-2.6/xfs_vnode.h +++ b/fs/xfs/linux-2.6/xfs_vnode.h @@ -174,9 +174,9 @@ typedef ssize_t (*vop_sendfile_t)(bhv_desc_t *, struct file *, loff_t *, int, size_t, read_actor_t, void *, struct cred *); typedef ssize_t (*vop_splice_read_t)(bhv_desc_t *, struct file *, - struct inode *, size_t, int, int, + struct pipe_inode_info *, size_t, int, int, struct cred *); -typedef ssize_t (*vop_splice_write_t)(bhv_desc_t *, struct inode *, +typedef ssize_t (*vop_splice_write_t)(bhv_desc_t *, struct pipe_inode_info *, struct file *, size_t, int, int, struct cred *); typedef int (*vop_ioctl_t)(bhv_desc_t *, struct inode *, struct file *, diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index 4eeb856183b..deddbd03c16 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -158,9 +158,10 @@ xfs_ialloc_ag_alloc( */ agi = XFS_BUF_TO_AGI(agbp); newino = be32_to_cpu(agi->agi_newino); - if(likely(newino != NULLAGINO)) { - args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) + - XFS_IALLOC_BLOCKS(args.mp); + args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) + + XFS_IALLOC_BLOCKS(args.mp); + if (likely(newino != NULLAGINO && + (args.agbno < be32_to_cpu(agi->agi_length)))) { args.fsbno = XFS_AGB_TO_FSB(args.mp, be32_to_cpu(agi->agi_seqno), args.agbno); args.type = XFS_ALLOCTYPE_THIS_BNO; @@ -182,8 +183,8 @@ xfs_ialloc_ag_alloc( * Set the alignment for the allocation. * If stripe alignment is turned on then align at stripe unit * boundary. - * If the cluster size is smaller than a filesystem block - * then we're doing I/O for inodes in filesystem block size + * If the cluster size is smaller than a filesystem block + * then we're doing I/O for inodes in filesystem block size * pieces, so don't need alignment anyway. */ isaligned = 0; @@ -192,7 +193,7 @@ xfs_ialloc_ag_alloc( args.alignment = args.mp->m_dalign; isaligned = 1; } else if (XFS_SB_VERSION_HASALIGN(&args.mp->m_sb) && - args.mp->m_sb.sb_inoalignmt >= + args.mp->m_sb.sb_inoalignmt >= XFS_B_TO_FSBT(args.mp, XFS_INODE_CLUSTER_SIZE(args.mp))) args.alignment = args.mp->m_sb.sb_inoalignmt; @@ -220,7 +221,7 @@ xfs_ialloc_ag_alloc( if ((error = xfs_alloc_vextent(&args))) return error; } - + /* * If stripe alignment is turned on, then try again with cluster * alignment. diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index bb33113eef9..b5385432526 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -421,7 +421,10 @@ finish_inode: ip->i_chash = chlnew; chlnew->chl_ip = ip; chlnew->chl_blkno = ip->i_blkno; + if (ch->ch_list) + ch->ch_list->chl_prev = chlnew; chlnew->chl_next = ch->ch_list; + chlnew->chl_prev = NULL; ch->ch_list = chlnew; chlnew = NULL; } @@ -723,23 +726,15 @@ xfs_iextract( ASSERT(ip->i_cnext == ip && ip->i_cprev == ip); ASSERT(ip->i_chash != NULL); chm=NULL; - for (chl = ch->ch_list; chl != NULL; chl = chl->chl_next) { - if (chl->chl_blkno == ip->i_blkno) { - if (chm == NULL) { - /* first item on the list */ - ch->ch_list = chl->chl_next; - } else { - chm->chl_next = chl->chl_next; - } - kmem_zone_free(xfs_chashlist_zone, chl); - break; - } else { - ASSERT(chl->chl_ip != ip); - chm = chl; - } - } - ASSERT_ALWAYS(chl != NULL); - } else { + chl = ip->i_chash; + if (chl->chl_prev) + chl->chl_prev->chl_next = chl->chl_next; + else + ch->ch_list = chl->chl_next; + if (chl->chl_next) + chl->chl_next->chl_prev = chl->chl_prev; + kmem_zone_free(xfs_chashlist_zone, chl); + } else { /* delete one inode from a non-empty list */ iq = ip->i_cnext; iq->i_cprev = ip->i_cprev; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 48146bdc6bd..94b60dd0380 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2732,16 +2732,29 @@ xfs_iunpin( ASSERT(atomic_read(&ip->i_pincount) > 0); if (atomic_dec_and_test(&ip->i_pincount)) { - vnode_t *vp = XFS_ITOV_NULL(ip); + /* + * If the inode is currently being reclaimed, the + * linux inode _and_ the xfs vnode may have been + * freed so we cannot reference either of them safely. + * Hence we should not try to do anything to them + * if the xfs inode is currently in the reclaim + * path. + * + * However, we still need to issue the unpin wakeup + * call as the inode reclaim may be blocked waiting for + * the inode to become unpinned. + */ + if (!(ip->i_flags & (XFS_IRECLAIM|XFS_IRECLAIMABLE))) { + vnode_t *vp = XFS_ITOV_NULL(ip); - /* make sync come back and flush this inode */ - if (vp) { - struct inode *inode = vn_to_inode(vp); + /* make sync come back and flush this inode */ + if (vp) { + struct inode *inode = vn_to_inode(vp); - if (!(inode->i_state & I_NEW)) - mark_inode_dirty_sync(inode); + if (!(inode->i_state & I_NEW)) + mark_inode_dirty_sync(inode); + } } - wake_up(&ip->i_ipin_wait); } } diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 39ef9c36ea5..3b544db1790 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -189,6 +189,7 @@ typedef struct xfs_ihash { */ typedef struct xfs_chashlist { struct xfs_chashlist *chl_next; + struct xfs_chashlist *chl_prev; struct xfs_inode *chl_ip; xfs_daddr_t chl_blkno; /* starting block number of * the cluster */ diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 049fabb7f7e..c0b1c290688 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -270,7 +270,7 @@ xfs_mount_validate_sb( (sbp->sb_blocklog - sbp->sb_inodelog != sbp->sb_inopblog) || (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE) || (sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) || - (sbp->sb_imax_pct > 100 || sbp->sb_imax_pct < 1))) { + (sbp->sb_imax_pct > 100 /* zero sb_imax_pct is valid */))) { xfs_fs_mount_cmn_err(flags, "SB sanity check 1 failed"); return XFS_ERROR(EFSCORRUPTED); } |