Diffstat (limited to 'fs')
129 files changed, 3599 insertions, 1933 deletions
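The fs/9p changes that follow rework request cancellation in the mux: instead of marking a request with ERREQFLUSH and forgetting it, each request now carries its own spinlock and an explicit flush state (None/Flushing/Flushed), so a response that races with a Tflush is completed exactly once, by whichever side loses the race. A minimal user-space model of that handshake, with a pthread mutex standing in for the kernel spinlock (the names and simplified types here are illustrative, not taken from the patch):

/* Sketch of the new 9p flush handshake; illustrative only. */
#include <pthread.h>
#include <stdio.h>

enum { None, Flushing, Flushed };

struct req {
    pthread_mutex_t lock;
    int flush;          /* None, Flushing or Flushed */
    const char *rcall;  /* response, if one has arrived */
    int err;
};

/* Caller side: returns 0 if a response already won the race
 * (nothing to flush), 1 if a TFLUSH must actually be sent. */
static int flush_request(struct req *r)
{
    pthread_mutex_lock(&r->lock);
    if (r->rcall || r->err) {
        pthread_mutex_unlock(&r->lock);
        return 0;
    }
    r->flush = Flushing;
    pthread_mutex_unlock(&r->lock);
    return 1;
}

/* Reader side: a response for a request that is being flushed is
 * recorded but not completed; the flush callback finishes it. */
static void on_response(struct req *r, const char *rc)
{
    pthread_mutex_lock(&r->lock);
    r->rcall = rc;
    if (r->flush != Flushing)
        printf("complete request, rcall=%s\n", rc);
    else
        printf("flush in progress, defer completion\n");
    pthread_mutex_unlock(&r->lock);
}

/* Flush-callback side: completes a deferred response exactly once. */
static void on_flush_complete(struct req *r)
{
    pthread_mutex_lock(&r->lock);
    r->flush = Flushed;
    pthread_mutex_unlock(&r->lock);
    if (r->rcall)
        printf("completing deferred rcall=%s\n", r->rcall);
}

int main(void)
{
    struct req r = { PTHREAD_MUTEX_INITIALIZER, None, NULL, 0 };

    if (flush_request(&r))             /* would send TFLUSH */
        on_response(&r, "RCLUNK");     /* response races with the flush */
    on_flush_complete(&r);             /* RFLUSH arrives */
    return 0;
}

The per-request lock is what keeps the response path and the flush path from both completing (and both freeing) the same request.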
diff --git a/fs/9p/fcall.c b/fs/9p/fcall.c index 71742ba150c..6f2617820a4 100644 --- a/fs/9p/fcall.c +++ b/fs/9p/fcall.c @@ -98,23 +98,20 @@ v9fs_t_attach(struct v9fs_session_info *v9ses, char *uname, char *aname, static void v9fs_t_clunk_cb(void *a, struct v9fs_fcall *tc, struct v9fs_fcall *rc, int err) { - int fid; + int fid, id; struct v9fs_session_info *v9ses; - if (err) - return; - + id = 0; fid = tc->params.tclunk.fid; - kfree(tc); - - if (!rc) - return; - - v9ses = a; - if (rc->id == RCLUNK) - v9fs_put_idpool(fid, &v9ses->fidpool); + if (rc) + id = rc->id; + kfree(tc); kfree(rc); + if (id == RCLUNK) { + v9ses = a; + v9fs_put_idpool(fid, &v9ses->fidpool); + } } /** diff --git a/fs/9p/mux.c b/fs/9p/mux.c index 3e5b124a721..f4407eb276c 100644 --- a/fs/9p/mux.c +++ b/fs/9p/mux.c @@ -50,15 +50,23 @@ enum { Wpending = 8, /* can write */ }; +enum { + None, + Flushing, + Flushed, +}; + struct v9fs_mux_poll_task; struct v9fs_req { + spinlock_t lock; int tag; struct v9fs_fcall *tcall; struct v9fs_fcall *rcall; int err; v9fs_mux_req_callback cb; void *cba; + int flush; struct list_head req_list; }; @@ -96,8 +104,8 @@ struct v9fs_mux_poll_task { struct v9fs_mux_rpc { struct v9fs_mux_data *m; - struct v9fs_req *req; int err; + struct v9fs_fcall *tcall; struct v9fs_fcall *rcall; wait_queue_head_t wqueue; }; @@ -524,10 +532,9 @@ again: static void process_request(struct v9fs_mux_data *m, struct v9fs_req *req) { - int ecode, tag; + int ecode; struct v9fs_str *ename; - tag = req->tag; if (!req->err && req->rcall->id == RERROR) { ecode = req->rcall->params.rerror.errno; ename = &req->rcall->params.rerror.error; @@ -553,23 +560,6 @@ static void process_request(struct v9fs_mux_data *m, struct v9fs_req *req) if (!req->err) req->err = -EIO; } - - if (req->err == ERREQFLUSH) - return; - - if (req->cb) { - dprintk(DEBUG_MUX, "calling callback tcall %p rcall %p\n", - req->tcall, req->rcall); - - (*req->cb) (req->cba, req->tcall, req->rcall, req->err); - req->cb = NULL; - } else - kfree(req->rcall); - - v9fs_mux_put_tag(m, tag); - - wake_up(&m->equeue); - kfree(req); } /** @@ -669,17 +659,26 @@ static void v9fs_read_work(void *a) list_for_each_entry_safe(rreq, rptr, &m->req_list, req_list) { if (rreq->tag == rcall->tag) { req = rreq; - req->rcall = rcall; - list_del(&req->req_list); - spin_unlock(&m->lock); - process_request(m, req); + if (req->flush != Flushing) + list_del(&req->req_list); break; } - } + spin_unlock(&m->lock); - if (!req) { - spin_unlock(&m->lock); + if (req) { + req->rcall = rcall; + process_request(m, req); + + if (req->flush != Flushing) { + if (req->cb) + (*req->cb) (req, req->cba); + else + kfree(req->rcall); + + wake_up(&m->equeue); + } + } else { if (err >= 0 && rcall->id != RFLUSH) dprintk(DEBUG_ERROR, "unexpected response mux %p id %d tag %d\n", @@ -746,7 +745,6 @@ static struct v9fs_req *v9fs_send_request(struct v9fs_mux_data *m, return ERR_PTR(-ENOMEM); v9fs_set_tag(tc, n); - if ((v9fs_debug_level&DEBUG_FCALL) == DEBUG_FCALL) { char buf[150]; @@ -754,12 +752,14 @@ static struct v9fs_req *v9fs_send_request(struct v9fs_mux_data *m, printk(KERN_NOTICE "<<< %p %s\n", m, buf); } + spin_lock_init(&req->lock); req->tag = n; req->tcall = tc; req->rcall = NULL; req->err = 0; req->cb = cb; req->cba = cba; + req->flush = None; spin_lock(&m->lock); list_add_tail(&req->req_list, &m->unsent_req_list); @@ -776,72 +776,108 @@ static struct v9fs_req *v9fs_send_request(struct v9fs_mux_data *m, return req; } -static void v9fs_mux_flush_cb(void *a, struct v9fs_fcall *tc, - struct v9fs_fcall *rc, int 
err) +static void v9fs_mux_free_request(struct v9fs_mux_data *m, struct v9fs_req *req) +{ + v9fs_mux_put_tag(m, req->tag); + kfree(req); +} + +static void v9fs_mux_flush_cb(struct v9fs_req *freq, void *a) { v9fs_mux_req_callback cb; int tag; struct v9fs_mux_data *m; - struct v9fs_req *req, *rptr; + struct v9fs_req *req, *rreq, *rptr; m = a; - dprintk(DEBUG_MUX, "mux %p tc %p rc %p err %d oldtag %d\n", m, tc, - rc, err, tc->params.tflush.oldtag); + dprintk(DEBUG_MUX, "mux %p tc %p rc %p err %d oldtag %d\n", m, + freq->tcall, freq->rcall, freq->err, + freq->tcall->params.tflush.oldtag); spin_lock(&m->lock); cb = NULL; - tag = tc->params.tflush.oldtag; - list_for_each_entry_safe(req, rptr, &m->req_list, req_list) { - if (req->tag == tag) { + tag = freq->tcall->params.tflush.oldtag; + req = NULL; + list_for_each_entry_safe(rreq, rptr, &m->req_list, req_list) { + if (rreq->tag == tag) { + req = rreq; list_del(&req->req_list); - if (req->cb) { - cb = req->cb; - req->cb = NULL; - spin_unlock(&m->lock); - (*cb) (req->cba, req->tcall, req->rcall, - req->err); - } - kfree(req); - wake_up(&m->equeue); break; } } + spin_unlock(&m->lock); - if (!cb) - spin_unlock(&m->lock); + if (req) { + spin_lock(&req->lock); + req->flush = Flushed; + spin_unlock(&req->lock); + + if (req->cb) + (*req->cb) (req, req->cba); + else + kfree(req->rcall); + + wake_up(&m->equeue); + } - v9fs_mux_put_tag(m, tag); - kfree(tc); - kfree(rc); + kfree(freq->tcall); + kfree(freq->rcall); + v9fs_mux_free_request(m, freq); } -static void +static int v9fs_mux_flush_request(struct v9fs_mux_data *m, struct v9fs_req *req) { struct v9fs_fcall *fc; + struct v9fs_req *rreq, *rptr; dprintk(DEBUG_MUX, "mux %p req %p tag %d\n", m, req, req->tag); + /* if a response was received for a request, do nothing */ + spin_lock(&req->lock); + if (req->rcall || req->err) { + spin_unlock(&req->lock); + dprintk(DEBUG_MUX, "mux %p req %p response already received\n", m, req); + return 0; + } + + req->flush = Flushing; + spin_unlock(&req->lock); + + spin_lock(&m->lock); + /* if the request is not sent yet, just remove it from the list */ + list_for_each_entry_safe(rreq, rptr, &m->unsent_req_list, req_list) { + if (rreq->tag == req->tag) { + dprintk(DEBUG_MUX, "mux %p req %p request is not sent yet\n", m, req); + list_del(&rreq->req_list); + req->flush = Flushed; + spin_unlock(&m->lock); + if (req->cb) + (*req->cb) (req, req->cba); + return 0; + } + } + spin_unlock(&m->lock); + + clear_thread_flag(TIF_SIGPENDING); fc = v9fs_create_tflush(req->tag); v9fs_send_request(m, fc, v9fs_mux_flush_cb, m); + return 1; } static void -v9fs_mux_rpc_cb(void *a, struct v9fs_fcall *tc, struct v9fs_fcall *rc, int err) +v9fs_mux_rpc_cb(struct v9fs_req *req, void *a) { struct v9fs_mux_rpc *r; - if (err == ERREQFLUSH) { - kfree(rc); - dprintk(DEBUG_MUX, "err req flush\n"); - return; - } - + dprintk(DEBUG_MUX, "req %p r %p\n", req, a); r = a; - dprintk(DEBUG_MUX, "mux %p req %p tc %p rc %p err %d\n", r->m, r->req, - tc, rc, err); - r->rcall = rc; - r->err = err; + r->rcall = req->rcall; + r->err = req->err; + + if (req->flush!=None && !req->err) + r->err = -ERESTARTSYS; + wake_up(&r->wqueue); } @@ -856,12 +892,13 @@ int v9fs_mux_rpc(struct v9fs_mux_data *m, struct v9fs_fcall *tc, struct v9fs_fcall **rc) { - int err; + int err, sigpending; unsigned long flags; struct v9fs_req *req; struct v9fs_mux_rpc r; r.err = 0; + r.tcall = tc; r.rcall = NULL; r.m = m; init_waitqueue_head(&r.wqueue); @@ -869,48 +906,50 @@ v9fs_mux_rpc(struct v9fs_mux_data *m, struct v9fs_fcall *tc, if (rc) *rc 
= NULL; + sigpending = 0; + if (signal_pending(current)) { + sigpending = 1; + clear_thread_flag(TIF_SIGPENDING); + } + req = v9fs_send_request(m, tc, v9fs_mux_rpc_cb, &r); if (IS_ERR(req)) { err = PTR_ERR(req); dprintk(DEBUG_MUX, "error %d\n", err); - return PTR_ERR(req); + return err; } - r.req = req; - dprintk(DEBUG_MUX, "mux %p tc %p tag %d rpc %p req %p\n", m, tc, - req->tag, &r, req); err = wait_event_interruptible(r.wqueue, r.rcall != NULL || r.err < 0); if (r.err < 0) err = r.err; if (err == -ERESTARTSYS && m->trans->status == Connected && m->err == 0) { - spin_lock(&m->lock); - req->tcall = NULL; - req->err = ERREQFLUSH; - spin_unlock(&m->lock); + if (v9fs_mux_flush_request(m, req)) { + /* wait until we get response of the flush message */ + do { + clear_thread_flag(TIF_SIGPENDING); + err = wait_event_interruptible(r.wqueue, + r.rcall || r.err); + } while (!r.rcall && !r.err && err==-ERESTARTSYS && + m->trans->status==Connected && !m->err); + } + sigpending = 1; + } - clear_thread_flag(TIF_SIGPENDING); - v9fs_mux_flush_request(m, req); + if (sigpending) { spin_lock_irqsave(¤t->sighand->siglock, flags); recalc_sigpending(); spin_unlock_irqrestore(¤t->sighand->siglock, flags); } - if (!err) { - if (r.rcall) - dprintk(DEBUG_MUX, "got response id %d tag %d\n", - r.rcall->id, r.rcall->tag); - - if (rc) - *rc = r.rcall; - else - kfree(r.rcall); - } else { + if (rc) + *rc = r.rcall; + else kfree(r.rcall); - dprintk(DEBUG_MUX, "got error %d\n", err); - if (err > 0) - err = -EIO; - } + + v9fs_mux_free_request(m, req); + if (err > 0) + err = -EIO; return err; } @@ -951,12 +990,15 @@ void v9fs_mux_cancel(struct v9fs_mux_data *m, int err) struct v9fs_req *req, *rtmp; LIST_HEAD(cancel_list); - dprintk(DEBUG_MUX, "mux %p err %d\n", m, err); + dprintk(DEBUG_ERROR, "mux %p err %d\n", m, err); m->err = err; spin_lock(&m->lock); list_for_each_entry_safe(req, rtmp, &m->req_list, req_list) { list_move(&req->req_list, &cancel_list); } + list_for_each_entry_safe(req, rtmp, &m->unsent_req_list, req_list) { + list_move(&req->req_list, &cancel_list); + } spin_unlock(&m->lock); list_for_each_entry_safe(req, rtmp, &cancel_list, req_list) { @@ -965,11 +1007,9 @@ void v9fs_mux_cancel(struct v9fs_mux_data *m, int err) req->err = err; if (req->cb) - (*req->cb) (req->cba, req->tcall, req->rcall, req->err); + (*req->cb) (req, req->cba); else kfree(req->rcall); - - kfree(req); } wake_up(&m->equeue); diff --git a/fs/9p/mux.h b/fs/9p/mux.h index e90bfd32ea4..fb10c50186a 100644 --- a/fs/9p/mux.h +++ b/fs/9p/mux.h @@ -24,6 +24,7 @@ */ struct v9fs_mux_data; +struct v9fs_req; /** * v9fs_mux_req_callback - callback function that is called when the @@ -36,8 +37,7 @@ struct v9fs_mux_data; * @rc - response call * @err - error code (non-zero if error occured) */ -typedef void (*v9fs_mux_req_callback)(void *a, struct v9fs_fcall *tc, - struct v9fs_fcall *rc, int err); +typedef void (*v9fs_mux_req_callback)(struct v9fs_req *req, void *a); int v9fs_mux_global_init(void); void v9fs_mux_global_exit(void); diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 083dcfcd158..1a8e46084f0 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -72,11 +72,17 @@ int v9fs_file_open(struct inode *inode, struct file *file) return -ENOSPC; } - err = v9fs_t_walk(v9ses, vfid->fid, fid, NULL, NULL); + err = v9fs_t_walk(v9ses, vfid->fid, fid, NULL, &fcall); if (err < 0) { dprintk(DEBUG_ERROR, "rewalk didn't work\n"); - goto put_fid; + if (fcall && fcall->id == RWALK) + goto clunk_fid; + else { + v9fs_put_idpool(fid, &v9ses->fidpool); + goto 
free_fcall; + } } + kfree(fcall); /* TODO: do special things for O_EXCL, O_NOFOLLOW, O_SYNC */ /* translate open mode appropriately */ @@ -109,8 +115,7 @@ int v9fs_file_open(struct inode *inode, struct file *file) clunk_fid: v9fs_t_clunk(v9ses, fid); -put_fid: - v9fs_put_idpool(fid, &v9ses->fidpool); +free_fcall: kfree(fcall); return err; diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 133db366d30..2cb87ba4b1c 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -270,7 +270,10 @@ v9fs_create(struct v9fs_session_info *v9ses, u32 pfid, char *name, u32 perm, err = v9fs_t_walk(v9ses, pfid, fid, NULL, &fcall); if (err < 0) { PRINT_FCALL_ERROR("clone error", fcall); - goto put_fid; + if (fcall && fcall->id == RWALK) + goto clunk_fid; + else + goto put_fid; } kfree(fcall); @@ -322,6 +325,9 @@ v9fs_clone_walk(struct v9fs_session_info *v9ses, u32 fid, struct dentry *dentry) &fcall); if (err < 0) { + if (fcall && fcall->id == RWALK) + goto clunk_fid; + PRINT_FCALL_ERROR("walk error", fcall); v9fs_put_idpool(nfid, &v9ses->fidpool); goto error; @@ -640,19 +646,26 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, } result = v9fs_t_walk(v9ses, dirfidnum, newfid, - (char *)dentry->d_name.name, NULL); + (char *)dentry->d_name.name, &fcall); + if (result < 0) { - v9fs_put_idpool(newfid, &v9ses->fidpool); + if (fcall && fcall->id == RWALK) + v9fs_t_clunk(v9ses, newfid); + else + v9fs_put_idpool(newfid, &v9ses->fidpool); + if (result == -ENOENT) { d_add(dentry, NULL); dprintk(DEBUG_VFS, "Return negative dentry %p count %d\n", dentry, atomic_read(&dentry->d_count)); + kfree(fcall); return NULL; } dprintk(DEBUG_ERROR, "walk error:%d\n", result); goto FreeFcall; } + kfree(fcall); result = v9fs_t_stat(v9ses, newfid, &fcall); if (result < 0) { diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index b0a0ae509c0..61c599b4a1e 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -127,12 +127,13 @@ static struct super_block *v9fs_get_sb(struct file_system_type if ((newfid = v9fs_session_init(v9ses, dev_name, data)) < 0) { dprintk(DEBUG_ERROR, "problem initiating session\n"); - kfree(v9ses); - return ERR_PTR(newfid); + sb = ERR_PTR(newfid); + goto out_free_session; } sb = sget(fs_type, NULL, v9fs_set_super, v9ses); - + if (IS_ERR(sb)) + goto out_close_session; v9fs_fill_super(sb, v9ses, flags); inode = v9fs_get_inode(sb, S_IFDIR | mode); @@ -185,6 +186,12 @@ static struct super_block *v9fs_get_sb(struct file_system_type return sb; +out_close_session: + v9fs_session_close(v9ses); +out_free_session: + kfree(v9ses); + return sb; + put_back_sb: /* deactivate_super calls v9fs_kill_super which will frees the rest */ up_write(&sb->s_umount); diff --git a/fs/Kconfig b/fs/Kconfig index e207be68d4c..f9b5842c8d2 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -799,6 +799,7 @@ config PROC_KCORE config PROC_VMCORE bool "/proc/vmcore support (EXPERIMENTAL)" depends on PROC_FS && EXPERIMENTAL && CRASH_DUMP + default y help Exports the dump image of crashed kernel in ELF format. @@ -841,6 +842,12 @@ config TMPFS config HUGETLBFS bool "HugeTLB file system support" depends X86 || IA64 || PPC64 || SPARC64 || SUPERH || BROKEN + help + hugetlbfs is a filesystem backing for HugeTLB pages, based on + ramfs. For architectures that support it, say Y here and read + <file:Documentation/vm/hugetlbpage.txt> for details. + + If unsure, say N. 
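The HUGETLBFS help text added above points at Documentation/vm/hugetlbpage.txt; for context, a program typically consumes huge pages by creating a file on a hugetlbfs mount and mapping it. A minimal sketch, assuming hugetlbfs is mounted at /mnt/huge and the system uses 2 MB huge pages (both assumptions vary by setup):

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

#define HUGE_FILE "/mnt/huge/example"  /* assumes hugetlbfs mounted here */
#define LENGTH    (2UL * 1024 * 1024)  /* assumes 2 MB huge pages */

int main(void)
{
    int fd = open(HUGE_FILE, O_CREAT | O_RDWR, 0600);
    if (fd < 0) {
        perror("open");
        return 1;
    }
    /* Pages backing this mapping come from the reserved huge page
     * pool (see /proc/sys/vm/nr_hugepages), not the page cache. */
    char *p = mmap(NULL, LENGTH, PROT_READ | PROT_WRITE,
                   MAP_SHARED, fd, 0);
    if (p == MAP_FAILED) {
        perror("mmap");
        close(fd);
        unlink(HUGE_FILE);
        return 1;
    }
    p[0] = 1; /* first touch faults in a huge page */
    munmap(p, LENGTH);
    close(fd);
    unlink(HUGE_FILE);
    return 0;
}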
config HUGETLB_PAGE def_bool HUGETLBFS @@ -861,7 +868,7 @@ config RAMFS config CONFIGFS_FS tristate "Userspace-driven configuration filesystem (EXPERIMENTAL)" - depends on EXPERIMENTAL + depends on SYSFS && EXPERIMENTAL help configfs is a ram-based filesystem that provides the converse of sysfs's functionality. Where sysfs is a filesystem-based diff --git a/fs/Makefile b/fs/Makefile index 83bf478e786..078d3d1191a 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -45,6 +45,7 @@ obj-$(CONFIG_DNOTIFY) += dnotify.o obj-$(CONFIG_PROC_FS) += proc/ obj-y += partitions/ obj-$(CONFIG_SYSFS) += sysfs/ +obj-$(CONFIG_CONFIGFS_FS) += configfs/ obj-y += devpts/ obj-$(CONFIG_PROFILING) += dcookies.o @@ -100,5 +101,4 @@ obj-$(CONFIG_BEFS_FS) += befs/ obj-$(CONFIG_HOSTFS) += hostfs/ obj-$(CONFIG_HPPFS) += hppfs/ obj-$(CONFIG_DEBUG_FS) += debugfs/ -obj-$(CONFIG_CONFIGFS_FS) += configfs/ obj-$(CONFIG_OCFS2_FS) += ocfs2/ diff --git a/fs/affs/namei.c b/fs/affs/namei.c index d4c2d636c47..a42143ca016 100644 --- a/fs/affs/namei.c +++ b/fs/affs/namei.c @@ -416,10 +416,9 @@ affs_rename(struct inode *old_dir, struct dentry *old_dentry, return retval; } - retval = -EIO; bh = affs_bread(sb, old_dentry->d_inode->i_ino); if (!bh) - goto done; + return -EIO; /* Remove header from its parent directory. */ affs_lock_dir(old_dir); diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index 57c4903614e..d6603d02304 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -74,8 +74,8 @@ struct autofs_wait_queue { struct autofs_wait_queue *next; autofs_wqt_t wait_queue_token; /* We use the following to see what we are waiting for */ - int hash; - int len; + unsigned int hash; + unsigned int len; char *name; u32 dev; u64 ino; @@ -85,7 +85,6 @@ struct autofs_wait_queue { pid_t tgid; /* This is for status reporting upon return */ int status; - atomic_t notify; atomic_t wait_ctr; }; diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 84e030c8ddd..5100f984783 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -327,6 +327,7 @@ static int try_to_fill_dentry(struct dentry *dentry, int flags) static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd) { struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + struct autofs_info *ino = autofs4_dentry_ino(dentry); int oz_mode = autofs4_oz_mode(sbi); unsigned int lookup_type; int status; @@ -340,13 +341,8 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd) if (oz_mode || !lookup_type) goto done; - /* - * If a request is pending wait for it. - * If it's a mount then it won't be expired till at least - * a liitle later and if it's an expire then we might need - * to mount it again. - */ - if (autofs4_ispending(dentry)) { + /* If an expire request is pending wait for it. 
*/ + if (ino && (ino->flags & AUTOFS_INF_EXPIRING)) { DPRINTK("waiting for active request %p name=%.*s", dentry, dentry->d_name.len, dentry->d_name.name); diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index 142ab6aa2aa..ce103e7b0bc 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -189,14 +189,30 @@ static int autofs4_getpath(struct autofs_sb_info *sbi, return len; } +static struct autofs_wait_queue * +autofs4_find_wait(struct autofs_sb_info *sbi, + char *name, unsigned int hash, unsigned int len) +{ + struct autofs_wait_queue *wq; + + for (wq = sbi->queues; wq; wq = wq->next) { + if (wq->hash == hash && + wq->len == len && + wq->name && !memcmp(wq->name, name, len)) + break; + } + return wq; +} + int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, enum autofs_notify notify) { + struct autofs_info *ino; struct autofs_wait_queue *wq; char *name; unsigned int len = 0; unsigned int hash = 0; - int status; + int status, type; /* In catatonic mode, we don't wait for nobody */ if (sbi->catatonic) @@ -223,21 +239,41 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, return -EINTR; } - for (wq = sbi->queues ; wq ; wq = wq->next) { - if (wq->hash == dentry->d_name.hash && - wq->len == len && - wq->name && !memcmp(wq->name, name, len)) - break; - } + wq = autofs4_find_wait(sbi, name, hash, len); + ino = autofs4_dentry_ino(dentry); + if (!wq && ino && notify == NFY_NONE) { + /* + * Either we've beaten the pending expire to post its + * wait or it finished while we waited on the mutex. + * So we need to wait till either the wait appears + * or the expire finishes. + */ + + while (ino->flags & AUTOFS_INF_EXPIRING) { + mutex_unlock(&sbi->wq_mutex); + schedule_timeout_interruptible(HZ/10); + if (mutex_lock_interruptible(&sbi->wq_mutex)) { + kfree(name); + return -EINTR; + } + wq = autofs4_find_wait(sbi, name, hash, len); + if (wq) + break; + } - if (!wq) { - /* Can't wait for an expire if there's no mount */ - if (notify == NFY_NONE && !d_mountpoint(dentry)) { + /* + * Not ideal but the status has already gone. Of the two + * cases where we wait on NFY_NONE neither depends on the + * return status of the wait. 
+ */ + if (!wq) { kfree(name); mutex_unlock(&sbi->wq_mutex); - return -ENOENT; + return 0; } + } + if (!wq) { /* Create a new wait queue */ wq = kmalloc(sizeof(struct autofs_wait_queue),GFP_KERNEL); if (!wq) { @@ -263,20 +299,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, wq->tgid = current->tgid; wq->status = -EINTR; /* Status return if interrupted */ atomic_set(&wq->wait_ctr, 2); - atomic_set(&wq->notify, 1); - mutex_unlock(&sbi->wq_mutex); - } else { - atomic_inc(&wq->wait_ctr); mutex_unlock(&sbi->wq_mutex); - kfree(name); - DPRINTK("existing wait id = 0x%08lx, name = %.*s, nfy=%d", - (unsigned long) wq->wait_queue_token, wq->len, wq->name, notify); - } - - if (notify != NFY_NONE && atomic_read(&wq->notify)) { - int type; - - atomic_dec(&wq->notify); if (sbi->version < 5) { if (notify == NFY_MOUNT) @@ -299,6 +322,12 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, /* autofs4_notify_daemon() may block */ autofs4_notify_daemon(sbi, wq, type); + } else { + atomic_inc(&wq->wait_ctr); + mutex_unlock(&sbi->wq_mutex); + kfree(name); + DPRINTK("existing wait id = 0x%08lx, name = %.*s, nfy=%d", + (unsigned long) wq->wait_queue_token, wq->len, wq->name, notify); } /* wq->name is NULL if and only if the lock is already released */ diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index 69f44dcdb0b..b1c902e319c 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -428,7 +428,6 @@ static int load_flat_file(struct linux_binprm * bprm, loff_t fpos; unsigned long start_code, end_code; int ret; - int exec_fileno; hdr = ((struct flat_hdr *) bprm->buf); /* exec-header */ inode = bprm->file->f_dentry->d_inode; @@ -502,21 +501,12 @@ static int load_flat_file(struct linux_binprm * bprm, goto err; } - /* check file descriptor */ - exec_fileno = get_unused_fd(); - if (exec_fileno < 0) { - ret = -EMFILE; - goto err; - } - get_file(bprm->file); - fd_install(exec_fileno, bprm->file); - /* Flush all traces of the currently running executable */ if (id == 0) { result = flush_old_exec(bprm); if (result) { ret = result; - goto err_close; + goto err; } /* OK, This is the point of no return */ @@ -548,7 +538,7 @@ static int load_flat_file(struct linux_binprm * bprm, textpos = (unsigned long) -ENOMEM; printk("Unable to mmap process text, errno %d\n", (int)-textpos); ret = textpos; - goto err_close; + goto err; } down_write(¤t->mm->mmap_sem); @@ -564,7 +554,7 @@ static int load_flat_file(struct linux_binprm * bprm, (int)-datapos); do_munmap(current->mm, textpos, text_len); ret = realdatastart; - goto err_close; + goto err; } datapos = realdatastart + MAX_SHARED_LIBS * sizeof(unsigned long); @@ -587,7 +577,7 @@ static int load_flat_file(struct linux_binprm * bprm, do_munmap(current->mm, textpos, text_len); do_munmap(current->mm, realdatastart, data_len + extra); ret = result; - goto err_close; + goto err; } reloc = (unsigned long *) (datapos+(ntohl(hdr->reloc_start)-text_len)); @@ -606,7 +596,7 @@ static int load_flat_file(struct linux_binprm * bprm, printk("Unable to allocate RAM for process text/data, errno %d\n", (int)-textpos); ret = textpos; - goto err_close; + goto err; } realdatastart = textpos + ntohl(hdr->data_start); @@ -652,7 +642,7 @@ static int load_flat_file(struct linux_binprm * bprm, do_munmap(current->mm, textpos, text_len + data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long)); ret = result; - goto err_close; + goto err; } } @@ -717,7 +707,7 @@ static int load_flat_file(struct linux_binprm * bprm, addr = calc_reloc(*rp, libinfo, id, 0); if (addr == 
RELOC_FAILED) { ret = -ENOEXEC; - goto err_close; + goto err; } *rp = addr; } @@ -747,7 +737,7 @@ static int load_flat_file(struct linux_binprm * bprm, rp = (unsigned long *) calc_reloc(addr, libinfo, id, 1); if (rp == (unsigned long *)RELOC_FAILED) { ret = -ENOEXEC; - goto err_close; + goto err; } /* Get the pointer's value. */ @@ -762,7 +752,7 @@ static int load_flat_file(struct linux_binprm * bprm, addr = calc_reloc(addr, libinfo, id, 0); if (addr == RELOC_FAILED) { ret = -ENOEXEC; - goto err_close; + goto err; } /* Write back the relocated pointer. */ @@ -783,8 +773,6 @@ static int load_flat_file(struct linux_binprm * bprm, stack_len); return 0; -err_close: - sys_close(exec_fileno); err: return ret; } diff --git a/fs/bio.c b/fs/bio.c --- a/fs/bio.c +++ b/fs/bio.c @@ -1116,6 +1116,9 @@ struct bio_pair *bio_split(struct bio *bi, mempool_t *pool, int first_sectors) bp->bio1.bi_io_vec = &bp->bv1; bp->bio2.bi_io_vec = &bp->bv2; + bp->bio1.bi_max_vecs = 1; + bp->bio2.bi_max_vecs = 1; + bp->bio1.bi_end_io = bio_pair_end_1; bp->bio2.bi_end_io = bio_pair_end_2; diff --git a/fs/block_dev.c b/fs/block_dev.c index af88c43043d..f5958f413bd 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1104,6 +1104,8 @@ const struct file_operations def_blk_fops = { .readv = generic_file_readv, .writev = generic_file_write_nolock, .sendfile = generic_file_sendfile, + .splice_read = generic_file_splice_read, + .splice_write = generic_file_splice_write, }; int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg) diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES index 8a2de038882..7271bb0257f 100644 --- a/fs/cifs/CHANGES +++ b/fs/cifs/CHANGES @@ -1,7 +1,18 @@ +Version 1.43 +------------ +POSIX locking to servers which support CIFS POSIX Extensions +(disabled by default, controlled by proc/fs/cifs/Experimental). +Handle conversion of long share names (especially Asian languages) +to Unicode during mount. + Version 1.42 ------------ Fix slow oplock break when mounted to different servers at the same time and -the tids match and we try to find matching fid on wrong server. +the tids match and we try to find matching fid on wrong server. Fix read +looping when signing required by server (2.6.16 kernel only). Fix readdir +vs. rename race which could cause each to hang. Return . and .. even +if server does not. Allow searches to skip first three entries and +begin at any location. Fix oops in find_writeable_file. Version 1.41 ------------ diff --git a/fs/cifs/README b/fs/cifs/README index b2b4d080376..0355003f4f0 100644 --- a/fs/cifs/README +++ b/fs/cifs/README @@ -511,6 +511,14 @@ LinuxExtensionsEnabled If set to one then the client will attempt to support and want to map the uid and gid fields to values supplied at mount (rather than the actual values, then set this to zero. (default 1) +Experimental When set to 1, enables certain experimental + features (currently multipage writes when signing + is enabled; the multipage write performance + enhancement was disabled when signing was turned + on, in case the buffer was modified just before + it was sent; this flag will also be used to enable + the new experimental sessionsetup code). 
These experimental features and tracing can be enabled by changing flags in /proc/fs/cifs (after the cifs module has been installed or built into the diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index d4b713e5aff..c262d8874ce 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -33,6 +33,7 @@ #include <linux/vfs.h> #include <linux/mempool.h> #include <linux/delay.h> +#include <linux/kthread.h> #include "cifsfs.h" #include "cifspdu.h" #define DECLARE_GLOBALS_HERE @@ -75,9 +76,6 @@ unsigned int cifs_max_pending = CIFS_MAX_REQ; module_param(cifs_max_pending, int, 0); MODULE_PARM_DESC(cifs_max_pending,"Simultaneous requests to server. Default: 50 Range: 2 to 256"); -static DECLARE_COMPLETION(cifs_oplock_exited); -static DECLARE_COMPLETION(cifs_dnotify_exited); - extern mempool_t *cifs_sm_req_poolp; extern mempool_t *cifs_req_poolp; extern mempool_t *cifs_mid_poolp; @@ -841,10 +839,6 @@ static int cifs_oplock_thread(void * dummyarg) __u16 netfid; int rc; - daemonize("cifsoplockd"); - allow_signal(SIGTERM); - - oplockThread = current; do { if (try_to_freeze()) continue; @@ -900,9 +894,9 @@ static int cifs_oplock_thread(void * dummyarg) set_current_state(TASK_INTERRUPTIBLE); schedule_timeout(1); /* yield in case q were corrupt */ } - } while(!signal_pending(current)); - oplockThread = NULL; - complete_and_exit (&cifs_oplock_exited, 0); + } while (!kthread_should_stop()); + + return 0; } static int cifs_dnotify_thread(void * dummyarg) @@ -910,10 +904,6 @@ static int cifs_dnotify_thread(void * dummyarg) struct list_head *tmp; struct cifsSesInfo *ses; - daemonize("cifsdnotifyd"); - allow_signal(SIGTERM); - - dnotifyThread = current; do { if(try_to_freeze()) continue; @@ -931,8 +921,9 @@ static int cifs_dnotify_thread(void * dummyarg) wake_up_all(&ses->server->response_q); } read_unlock(&GlobalSMBSeslock); - } while(!signal_pending(current)); - complete_and_exit (&cifs_dnotify_exited, 0); + } while (!kthread_should_stop()); + + return 0; } static int __init @@ -982,32 +973,48 @@ init_cifs(void) } rc = cifs_init_inodecache(); - if (!rc) { - rc = cifs_init_mids(); - if (!rc) { - rc = cifs_init_request_bufs(); - if (!rc) { - rc = register_filesystem(&cifs_fs_type); - if (!rc) { - rc = (int)kernel_thread(cifs_oplock_thread, NULL, - CLONE_FS | CLONE_FILES | CLONE_VM); - if(rc > 0) { - rc = (int)kernel_thread(cifs_dnotify_thread, NULL, - CLONE_FS | CLONE_FILES | CLONE_VM); - if(rc > 0) - return 0; - else - cERROR(1,("error %d create dnotify thread", rc)); - } else { - cERROR(1,("error %d create oplock thread",rc)); - } - } - cifs_destroy_request_bufs(); - } - cifs_destroy_mids(); - } - cifs_destroy_inodecache(); + if (rc) + goto out_clean_proc; + + rc = cifs_init_mids(); + if (rc) + goto out_destroy_inodecache; + + rc = cifs_init_request_bufs(); + if (rc) + goto out_destroy_mids; + + rc = register_filesystem(&cifs_fs_type); + if (rc) + goto out_destroy_request_bufs; + + oplockThread = kthread_run(cifs_oplock_thread, NULL, "cifsoplockd"); + if (IS_ERR(oplockThread)) { + rc = PTR_ERR(oplockThread); + cERROR(1,("error %d create oplock thread", rc)); + goto out_unregister_filesystem; } + + dnotifyThread = kthread_run(cifs_dnotify_thread, NULL, "cifsdnotifyd"); + if (IS_ERR(dnotifyThread)) { + rc = PTR_ERR(dnotifyThread); + cERROR(1,("error %d create dnotify thread", rc)); + goto out_stop_oplock_thread; + } + + return 0; + + out_stop_oplock_thread: + kthread_stop(oplockThread); + out_unregister_filesystem: + unregister_filesystem(&cifs_fs_type); + out_destroy_request_bufs: + cifs_destroy_request_bufs(); 
+ out_destroy_mids: + cifs_destroy_mids(); + out_destroy_inodecache: + cifs_destroy_inodecache(); + out_clean_proc: #ifdef CONFIG_PROC_FS cifs_proc_clean(); #endif @@ -1025,14 +1032,8 @@ exit_cifs(void) cifs_destroy_inodecache(); cifs_destroy_mids(); cifs_destroy_request_bufs(); - if(oplockThread) { - send_sig(SIGTERM, oplockThread, 1); - wait_for_completion(&cifs_oplock_exited); - } - if(dnotifyThread) { - send_sig(SIGTERM, dnotifyThread, 1); - wait_for_completion(&cifs_dnotify_exited); - } + kthread_stop(oplockThread); + kthread_stop(dnotifyThread); } MODULE_AUTHOR("Steve French <sfrench@us.ibm.com>"); diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 4e829dc672a..c98755dca86 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -99,5 +99,5 @@ extern ssize_t cifs_getxattr(struct dentry *, const char *, void *, size_t); extern ssize_t cifs_listxattr(struct dentry *, char *, size_t); extern int cifs_ioctl (struct inode * inode, struct file * filep, unsigned int command, unsigned long arg); -#define CIFS_VERSION "1.42" +#define CIFS_VERSION "1.43" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 2879ba343ca..310ea2f0e0b 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -267,7 +267,7 @@ extern int CIFSSMBLock(const int xid, struct cifsTconInfo *tcon, const int waitFlag); extern int CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon, const __u16 smb_file_id, const int get_flag, - const __u64 len, const __u64 offset, + const __u64 len, struct file_lock *, const __u16 lock_type, const int waitFlag); extern int CIFSSMBTDis(const int xid, struct cifsTconInfo *tcon); extern int CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses); diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index d705500aa28..925881e00ff 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -1355,7 +1355,8 @@ CIFSSMBLock(const int xid, struct cifsTconInfo *tcon, int CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon, const __u16 smb_file_id, const int get_flag, const __u64 len, - const __u64 lkoffset, const __u16 lock_type, const int waitFlag) + struct file_lock *pLockData, const __u16 lock_type, + const int waitFlag) { struct smb_com_transaction2_sfi_req *pSMB = NULL; struct smb_com_transaction2_sfi_rsp *pSMBr = NULL; @@ -1366,6 +1367,10 @@ CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon, __u16 params, param_offset, offset, byte_count, count; cFYI(1, ("Posix Lock")); + + if(pLockData == NULL) + return -EINVAL; + rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB); if (rc) @@ -1404,10 +1409,10 @@ CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon, parm_data->lock_type = cpu_to_le16(lock_type); if(waitFlag) - parm_data->lock_flags = 1; + parm_data->lock_flags = cpu_to_le16(1); parm_data->pid = cpu_to_le32(current->tgid); - parm_data->start = lkoffset; - parm_data->length = len; /* normalize negative numbers */ + parm_data->start = cpu_to_le64(pLockData->fl_start); + parm_data->length = cpu_to_le64(len); /* normalize negative numbers */ pSMB->DataOffset = cpu_to_le16(offset); pSMB->Fid = smb_file_id; @@ -1419,8 +1424,33 @@ CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon, (struct smb_hdr *) pSMBr, &bytes_returned, 0); if (rc) { cFYI(1, ("Send error in Posix Lock = %d", rc)); - } + } else if (get_flag) { + /* lock structure can be returned on get */ + __u16 data_offset; + __u16 data_count; + rc = validate_t2((struct smb_t2_rsp *)pSMBr); + if (rc || (pSMBr->ByteCount < sizeof(struct cifs_posix_lock))) { + rc = 
-EIO; /* bad smb */ + goto plk_err_exit; + } + if(pLockData == NULL) { + rc = -EINVAL; + goto plk_err_exit; + } + data_offset = le16_to_cpu(pSMBr->t2.DataOffset); + data_count = le16_to_cpu(pSMBr->t2.DataCount); + if(data_count < sizeof(struct cifs_posix_lock)) { + rc = -EIO; + goto plk_err_exit; + } + parm_data = (struct cifs_posix_lock *) + ((char *)&pSMBr->hdr.Protocol + data_offset); + if(parm_data->lock_type == cpu_to_le16(CIFS_UNLCK)) + pLockData->fl_type = F_UNLCK; + } + +plk_err_exit: if (pSMB) cifs_small_buf_release(pSMB); @@ -3119,7 +3149,7 @@ findFirstRetry: psrch_inf->endOfSearch = FALSE; psrch_inf->entries_in_buffer = le16_to_cpu(parms->SearchCount); - psrch_inf->index_of_last_entry = + psrch_inf->index_of_last_entry = 2 /* skip . and .. */ + psrch_inf->entries_in_buffer; *pnetfid = parms->SearchHandle; } else { diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 0b86d5ca901..bae1479318d 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -2148,6 +2148,8 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses, /* We look for obvious messed up bcc or strings in response so we do not go off the end since (at least) WIN2K and Windows XP have a major bug in not null terminating last Unicode string in response */ + if(ses->serverOS) + kfree(ses->serverOS); ses->serverOS = kzalloc(2 * (len + 1), GFP_KERNEL); if(ses->serverOS == NULL) goto sesssetup_nomem; @@ -2160,6 +2162,8 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses, if (remaining_words > 0) { len = UniStrnlen((wchar_t *)bcc_ptr, remaining_words-1); + if(ses->serverNOS) + kfree(ses->serverNOS); ses->serverNOS = kzalloc(2 * (len + 1),GFP_KERNEL); if(ses->serverNOS == NULL) goto sesssetup_nomem; @@ -2177,6 +2181,8 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses, if (remaining_words > 0) { len = UniStrnlen((wchar_t *) bcc_ptr, remaining_words); /* last string is not always null terminated (for e.g. 
for Windows XP & 2000) */ + if(ses->serverDomain) + kfree(ses->serverDomain); ses->serverDomain = kzalloc(2*(len+1),GFP_KERNEL); if(ses->serverDomain == NULL) @@ -2187,15 +2193,22 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses, ses->serverDomain[2*len] = 0; ses->serverDomain[1+(2*len)] = 0; } /* else no more room so create dummy domain string */ - else + else { + if(ses->serverDomain) + kfree(ses->serverDomain); ses->serverDomain = kzalloc(2, GFP_KERNEL); + } } else { /* no room so create dummy domain and NOS string */ /* if these kcallocs fail not much we can do, but better to not fail the sesssetup itself */ + if(ses->serverDomain) + kfree(ses->serverDomain); ses->serverDomain = kzalloc(2, GFP_KERNEL); + if(ses->serverNOS) + kfree(ses->serverNOS); ses->serverNOS = kzalloc(2, GFP_KERNEL); } @@ -2204,6 +2217,8 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses, if (((long) bcc_ptr + len) - (long) pByteArea(smb_buffer_response) <= BCC(smb_buffer_response)) { + if(ses->serverOS) + kfree(ses->serverOS); ses->serverOS = kzalloc(len + 1,GFP_KERNEL); if(ses->serverOS == NULL) goto sesssetup_nomem; @@ -2214,6 +2229,8 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses, bcc_ptr++; len = strnlen(bcc_ptr, 1024); + if(ses->serverNOS) + kfree(ses->serverNOS); ses->serverNOS = kzalloc(len + 1,GFP_KERNEL); if(ses->serverNOS == NULL) goto sesssetup_nomem; @@ -2223,6 +2240,8 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses, bcc_ptr++; len = strnlen(bcc_ptr, 1024); + if(ses->serverDomain) + kfree(ses->serverDomain); ses->serverDomain = kzalloc(len + 1,GFP_KERNEL); if(ses->serverDomain == NULL) goto sesssetup_nomem; @@ -2427,6 +2446,8 @@ CIFSSpnegoSessSetup(unsigned int xid, struct cifsSesInfo *ses, /* We look for obvious messed up bcc or strings in response so we do not go off the end since (at least) WIN2K and Windows XP have a major bug in not null terminating last Unicode string in response */ + if(ses->serverOS) + kfree(ses->serverOS); ses->serverOS = kzalloc(2 * (len + 1), GFP_KERNEL); cifs_strfromUCS_le(ses->serverOS, @@ -2441,6 +2462,8 @@ CIFSSpnegoSessSetup(unsigned int xid, struct cifsSesInfo *ses, len = UniStrnlen((wchar_t *)bcc_ptr, remaining_words - 1); + if(ses->serverNOS) + kfree(ses->serverNOS); ses->serverNOS = kzalloc(2 * (len + 1), GFP_KERNEL); @@ -2454,7 +2477,9 @@ CIFSSpnegoSessSetup(unsigned int xid, struct cifsSesInfo *ses, remaining_words -= len + 1; if (remaining_words > 0) { len = UniStrnlen((wchar_t *) bcc_ptr, remaining_words); - /* last string is not always null terminated (for e.g. 
for Windows XP & 2000) */ + /* last string not null terminated (e.g.Windows XP/2000) */ + if(ses->serverDomain) + kfree(ses->serverDomain); ses->serverDomain = kzalloc(2*(len+1),GFP_KERNEL); cifs_strfromUCS_le(ses->serverDomain, (__le16 *)bcc_ptr, @@ -2463,11 +2488,18 @@ CIFSSpnegoSessSetup(unsigned int xid, struct cifsSesInfo *ses, ses->serverDomain[2*len] = 0; ses->serverDomain[1+(2*len)] = 0; } /* else no more room so create dummy domain string */ - else + else { + if(ses->serverDomain) + kfree(ses->serverDomain); ses->serverDomain = kzalloc(2,GFP_KERNEL); - } else { /* no room so create dummy domain and NOS string */ + } + } else {/* no room use dummy domain&NOS */ + if(ses->serverDomain) + kfree(ses->serverDomain); ses->serverDomain = kzalloc(2, GFP_KERNEL); + if(ses->serverNOS) + kfree(ses->serverNOS); ses->serverNOS = kzalloc(2, GFP_KERNEL); } } else { /* ASCII */ @@ -2476,6 +2508,8 @@ CIFSSpnegoSessSetup(unsigned int xid, struct cifsSesInfo *ses, if (((long) bcc_ptr + len) - (long) pByteArea(smb_buffer_response) <= BCC(smb_buffer_response)) { + if(ses->serverOS) + kfree(ses->serverOS); ses->serverOS = kzalloc(len + 1, GFP_KERNEL); strncpy(ses->serverOS, bcc_ptr, len); @@ -2484,6 +2518,8 @@ CIFSSpnegoSessSetup(unsigned int xid, struct cifsSesInfo *ses, bcc_ptr++; len = strnlen(bcc_ptr, 1024); + if(ses->serverNOS) + kfree(ses->serverNOS); ses->serverNOS = kzalloc(len + 1,GFP_KERNEL); strncpy(ses->serverNOS, bcc_ptr, len); bcc_ptr += len; @@ -2491,6 +2527,8 @@ CIFSSpnegoSessSetup(unsigned int xid, struct cifsSesInfo *ses, bcc_ptr++; len = strnlen(bcc_ptr, 1024); + if(ses->serverDomain) + kfree(ses->serverDomain); ses->serverDomain = kzalloc(len + 1, GFP_KERNEL); strncpy(ses->serverDomain, bcc_ptr, len); bcc_ptr += len; @@ -2728,6 +2766,8 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid, /* We look for obvious messed up bcc or strings in response so we do not go off the end since (at least) WIN2K and Windows XP have a major bug in not null terminating last Unicode string in response */ + if(ses->serverOS) + kfree(ses->serverOS); ses->serverOS = kzalloc(2 * (len + 1), GFP_KERNEL); cifs_strfromUCS_le(ses->serverOS, @@ -2743,6 +2783,8 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid, bcc_ptr, remaining_words - 1); + if(ses->serverNOS) + kfree(ses->serverNOS); ses->serverNOS = kzalloc(2 * (len + 1), GFP_KERNEL); @@ -2760,6 +2802,8 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid, if (remaining_words > 0) { len = UniStrnlen((wchar_t *) bcc_ptr, remaining_words); /* last string is not always null terminated (for e.g. 
for Windows XP & 2000) */ + if(ses->serverDomain) + kfree(ses->serverDomain); ses->serverDomain = kzalloc(2 * (len + @@ -2777,13 +2821,20 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid, [1 + (2 * len)] = 0; } /* else no more room so create dummy domain string */ - else + else { + if(ses->serverDomain) + kfree(ses->serverDomain); ses->serverDomain = kzalloc(2, GFP_KERNEL); + } } else { /* no room so create dummy domain and NOS string */ + if(ses->serverDomain) + kfree(ses->serverDomain); ses->serverDomain = kzalloc(2, GFP_KERNEL); + if(ses->serverNOS) + kfree(ses->serverNOS); ses->serverNOS = kzalloc(2, GFP_KERNEL); } @@ -2792,6 +2843,8 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid, if (((long) bcc_ptr + len) - (long) pByteArea(smb_buffer_response) <= BCC(smb_buffer_response)) { + if(ses->serverOS) + kfree(ses->serverOS); ses->serverOS = kzalloc(len + 1, GFP_KERNEL); @@ -2803,6 +2856,8 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid, bcc_ptr++; len = strnlen(bcc_ptr, 1024); + if(ses->serverNOS) + kfree(ses->serverNOS); ses->serverNOS = kzalloc(len + 1, GFP_KERNEL); @@ -2812,6 +2867,8 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid, bcc_ptr++; len = strnlen(bcc_ptr, 1024); + if(ses->serverDomain) + kfree(ses->serverDomain); ses->serverDomain = kzalloc(len + 1, GFP_KERNEL); @@ -3116,6 +3173,8 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xid, struct cifsSesInfo *ses, /* We look for obvious messed up bcc or strings in response so we do not go off the end since (at least) WIN2K and Windows XP have a major bug in not null terminating last Unicode string in response */ + if(ses->serverOS) + kfree(ses->serverOS); ses->serverOS = kzalloc(2 * (len + 1), GFP_KERNEL); cifs_strfromUCS_le(ses->serverOS, @@ -3131,6 +3190,8 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xid, struct cifsSesInfo *ses, bcc_ptr, remaining_words - 1); + if(ses->serverNOS) + kfree(ses->serverNOS); ses->serverNOS = kzalloc(2 * (len + 1), GFP_KERNEL); @@ -3147,6 +3208,8 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xid, struct cifsSesInfo *ses, if (remaining_words > 0) { len = UniStrnlen((wchar_t *) bcc_ptr, remaining_words); /* last string not always null terminated (e.g. 
for Windows XP & 2000) */ + if(ses->serverDomain) + kfree(ses->serverDomain); ses->serverDomain = kzalloc(2 * (len + @@ -3172,10 +3235,17 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xid, struct cifsSesInfo *ses, len)] = 0; } /* else no more room so create dummy domain string */ - else + else { + if(ses->serverDomain) + kfree(ses->serverDomain); ses->serverDomain = kzalloc(2,GFP_KERNEL); + } } else { /* no room so create dummy domain and NOS string */ + if(ses->serverDomain) + kfree(ses->serverDomain); ses->serverDomain = kzalloc(2, GFP_KERNEL); + if(ses->serverNOS) + kfree(ses->serverNOS); ses->serverNOS = kzalloc(2, GFP_KERNEL); } } else { /* ASCII */ @@ -3183,6 +3253,8 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xid, struct cifsSesInfo *ses, if (((long) bcc_ptr + len) - (long) pByteArea(smb_buffer_response) <= BCC(smb_buffer_response)) { + if(ses->serverOS) + kfree(ses->serverOS); ses->serverOS = kzalloc(len + 1,GFP_KERNEL); strncpy(ses->serverOS,bcc_ptr, len); @@ -3191,6 +3263,8 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xid, struct cifsSesInfo *ses, bcc_ptr++; len = strnlen(bcc_ptr, 1024); + if(ses->serverNOS) + kfree(ses->serverNOS); ses->serverNOS = kzalloc(len+1,GFP_KERNEL); strncpy(ses->serverNOS, bcc_ptr, len); bcc_ptr += len; @@ -3198,6 +3272,8 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xid, struct cifsSesInfo *ses, bcc_ptr++; len = strnlen(bcc_ptr, 1024); + if(ses->serverDomain) + kfree(ses->serverDomain); ses->serverDomain = kzalloc(len+1,GFP_KERNEL); strncpy(ses->serverDomain, bcc_ptr, len); bcc_ptr += len; @@ -3282,7 +3358,8 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses, bcc_ptr++; /* align */ } - if(ses->server->secMode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) + if(ses->server->secMode & + (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) smb_buffer->Flags2 |= SMBFLG2_SECURITY_SIGNATURE; if (ses->capabilities & CAP_STATUS32) { @@ -3294,8 +3371,10 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses, if (ses->capabilities & CAP_UNICODE) { smb_buffer->Flags2 |= SMBFLG2_UNICODE; length = - cifs_strtoUCS((__le16 *) bcc_ptr, tree, 100, nls_codepage); - bcc_ptr += 2 * length; /* convert num of 16 bit words to bytes */ + cifs_strtoUCS((__le16 *) bcc_ptr, tree, + 6 /* max utf8 char length in bytes */ * + (/* server len*/ + 256 /* share len */), nls_codepage); + bcc_ptr += 2 * length; /* convert num 16 bit words to bytes */ bcc_ptr += 2; /* skip trailing null */ } else { /* ASCII */ strcpy(bcc_ptr, tree); @@ -3447,6 +3526,12 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo, pSesInfo->server->secMode, pSesInfo->server->capabilities, pSesInfo->server->timeZone)); +#ifdef CONFIG_CIFS_EXPERIMENTAL + if(experimEnabled > 1) + rc = CIFS_SessSetup(xid, pSesInfo, CIFS_NTLM /* type */, + &ntlmv2_flag, nls_info); + else +#endif if (extended_security && (pSesInfo->capabilities & CAP_EXTENDED_SECURITY) && (pSesInfo->server->secType == NTLMSSP)) { diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 1d0ca3eaaca..82315edc77d 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -139,9 +139,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode, cifs_sb = CIFS_SB(inode->i_sb); pTcon = cifs_sb->tcon; - mutex_lock(&direntry->d_sb->s_vfs_rename_mutex); full_path = build_path_from_dentry(direntry); - mutex_unlock(&direntry->d_sb->s_vfs_rename_mutex); if(full_path == NULL) { FreeXid(xid); return -ENOMEM; @@ -316,9 +314,7 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode, cifs_sb = CIFS_SB(inode->i_sb); pTcon = cifs_sb->tcon; - 
mutex_lock(&direntry->d_sb->s_vfs_rename_mutex); full_path = build_path_from_dentry(direntry); - mutex_unlock(&direntry->d_sb->s_vfs_rename_mutex); if(full_path == NULL) rc = -ENOMEM; else if (pTcon->ses->capabilities & CAP_UNIX) { @@ -440,6 +436,20 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, struct name cifs_sb = CIFS_SB(parent_dir_inode->i_sb); pTcon = cifs_sb->tcon; + /* + * Don't allow the separator character in a path component. + * The VFS will not allow "/", but "\" is allowed by posix. + */ + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS)) { + int i; + for (i = 0; i < direntry->d_name.len; i++) + if (direntry->d_name.name[i] == '\\') { + cFYI(1, ("Invalid file name")); + FreeXid(xid); + return ERR_PTR(-EINVAL); + } + } + /* can not grab the rename sem here since it would deadlock in the cases (beginning of sys_rename itself) in which we already have the sb rename sem */ diff --git a/fs/cifs/fcntl.c b/fs/cifs/fcntl.c index ec4dfe9bf5e..633a9381132 100644 --- a/fs/cifs/fcntl.c +++ b/fs/cifs/fcntl.c @@ -86,9 +86,7 @@ int cifs_dir_notify(struct file * file, unsigned long arg) cifs_sb = CIFS_SB(file->f_dentry->d_sb); pTcon = cifs_sb->tcon; - mutex_lock(&file->f_dentry->d_sb->s_vfs_rename_mutex); full_path = build_path_from_dentry(file->f_dentry); - mutex_unlock(&file->f_dentry->d_sb->s_vfs_rename_mutex); if(full_path == NULL) { rc = -ENOMEM; diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 5c497c52977..e2b4ce1dad6 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -84,6 +84,8 @@ static inline int cifs_get_disposition(unsigned int flags) return FILE_OVERWRITE_IF; else if ((flags & O_CREAT) == O_CREAT) return FILE_OPEN_IF; + else if ((flags & O_TRUNC) == O_TRUNC) + return FILE_OVERWRITE; else return FILE_OPEN; } @@ -203,9 +205,7 @@ int cifs_open(struct inode *inode, struct file *file) } } - mutex_lock(&inode->i_sb->s_vfs_rename_mutex); full_path = build_path_from_dentry(file->f_dentry); - mutex_unlock(&inode->i_sb->s_vfs_rename_mutex); if (full_path == NULL) { FreeXid(xid); return -ENOMEM; @@ -658,7 +658,7 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock) else posix_lock_type = CIFS_WRLCK; rc = CIFSSMBPosixLock(xid, pTcon, netfid, 1 /* get */, - length, pfLock->fl_start, + length, pfLock, posix_lock_type, wait_flag); FreeXid(xid); return rc; @@ -706,7 +706,7 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock) return -EOPNOTSUPP; } rc = CIFSSMBPosixLock(xid, pTcon, netfid, 0 /* set */, - length, pfLock->fl_start, + length, pfLock, posix_lock_type, wait_flag); } else rc = CIFSSMBLock(xid, pTcon, netfid, length, pfLock->fl_start, @@ -906,9 +906,10 @@ static ssize_t cifs_write(struct file *file, const char *write_data, if (rc != 0) break; } - /* BB FIXME We can not sign across two buffers yet */ - if((pTcon->ses->server->secMode & - (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) == 0) { + if(experimEnabled || (pTcon->ses->server && + ((pTcon->ses->server->secMode & + (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) + == 0))) { struct kvec iov[2]; unsigned int len; @@ -923,13 +924,13 @@ static ssize_t cifs_write(struct file *file, const char *write_data, *poffset, &bytes_written, iov, 1, long_op); } else - /* BB FIXME fixup indentation of line below */ - rc = CIFSSMBWrite(xid, pTcon, - open_file->netfid, - min_t(const int, cifs_sb->wsize, - write_size - total_written), - *poffset, &bytes_written, - write_data + total_written, NULL, long_op); + rc = CIFSSMBWrite(xid, pTcon, + open_file->netfid, + min_t(const int, 
cifs_sb->wsize, + write_size - total_written), + *poffset, &bytes_written, + write_data + total_written, + NULL, long_op); } if (rc || (bytes_written == 0)) { if (total_written) @@ -968,6 +969,16 @@ struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode) struct cifsFileInfo *open_file; int rc; + /* Having a null inode here (because mapping->host was set to zero by + the VFS or MM) should not happen but we had reports of an oops (due to + it being zero) during stress test cases, so we need to check for it */ + + if(cifs_inode == NULL) { + cERROR(1,("Null inode passed to cifs_writeable_file")); + dump_stack(); + return NULL; + } + read_lock(&GlobalSMBSeslock); list_for_each_entry(open_file, &cifs_inode->openFileList, flist) { if (open_file->closePend) @@ -1093,12 +1104,11 @@ static int cifs_writepages(struct address_space *mapping, if (cifs_sb->wsize < PAGE_CACHE_SIZE) return generic_writepages(mapping, wbc); - /* BB FIXME we do not have code to sign across multiple buffers yet, - so go to older writepage style write which we can sign if needed */ if((cifs_sb->tcon->ses) && (cifs_sb->tcon->ses->server)) if(cifs_sb->tcon->ses->server->secMode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) - return generic_writepages(mapping, wbc); + if(!experimEnabled) + return generic_writepages(mapping, wbc); /* * BB: Is this meaningful for a non-block-device file system? diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 957ddd1571c..4093764ef46 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -722,9 +722,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode) cifs_sb = CIFS_SB(inode->i_sb); pTcon = cifs_sb->tcon; - mutex_lock(&inode->i_sb->s_vfs_rename_mutex); full_path = build_path_from_dentry(direntry); - mutex_unlock(&inode->i_sb->s_vfs_rename_mutex); if (full_path == NULL) { FreeXid(xid); return -ENOMEM; } @@ -807,9 +805,7 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry) cifs_sb = CIFS_SB(inode->i_sb); pTcon = cifs_sb->tcon; - mutex_lock(&inode->i_sb->s_vfs_rename_mutex); full_path = build_path_from_dentry(direntry); - mutex_unlock(&inode->i_sb->s_vfs_rename_mutex); if (full_path == NULL) { FreeXid(xid); return -ENOMEM; } @@ -1141,9 +1137,7 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs) rc = 0; } - mutex_lock(&direntry->d_sb->s_vfs_rename_mutex); full_path = build_path_from_dentry(direntry); - mutex_unlock(&direntry->d_sb->s_vfs_rename_mutex); if (full_path == NULL) { FreeXid(xid); return -ENOMEM; } diff --git a/fs/cifs/link.c b/fs/cifs/link.c index 9562f5bba65..2ec99f83314 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -48,10 +48,8 @@ cifs_hardlink(struct dentry *old_file, struct inode *inode, /* No need to check for cross device links since server will do that BB note DFS case in future though (when we may have to check) */ - mutex_lock(&inode->i_sb->s_vfs_rename_mutex); fromName = build_path_from_dentry(old_file); toName = build_path_from_dentry(direntry); - mutex_unlock(&inode->i_sb->s_vfs_rename_mutex); if((fromName == NULL) || (toName == NULL)) { rc = -ENOMEM; goto cifs_hl_exit; } @@ -103,9 +101,7 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd) xid = GetXid(); - mutex_lock(&direntry->d_sb->s_vfs_rename_mutex); full_path = build_path_from_dentry(direntry); - mutex_unlock(&direntry->d_sb->s_vfs_rename_mutex); if (!full_path) goto out_no_free; @@ -164,9 +160,7 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname) cifs_sb = CIFS_SB(inode->i_sb); pTcon = cifs_sb->tcon; - 
mutex_lock(&inode->i_sb->s_vfs_rename_mutex); full_path = build_path_from_dentry(direntry); - mutex_unlock(&inode->i_sb->s_vfs_rename_mutex); if(full_path == NULL) { FreeXid(xid); diff --git a/fs/cifs/ntlmssp.c b/fs/cifs/ntlmssp.c index 78866f92574..115359cc7a3 100644 --- a/fs/cifs/ntlmssp.c +++ b/fs/cifs/ntlmssp.c @@ -121,6 +121,20 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, const int type, } + /* copy session key */ + + /* if Unicode, align strings to two byte boundary */ + + /* copy user name */ /* BB Do we need to special case null user name? */ + + /* copy domain name */ + + /* copy Linux version */ + + /* copy network operating system name */ + + /* update bcc and smb buffer length */ + /* rc = SendReceive2(xid, ses, iov, num_iovecs, &resp_buf_type, 0); */ /* SMB request buf freed in SendReceive2 */ diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 2f6e2825571..b689c503512 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -404,9 +404,7 @@ static int initiate_cifs_search(const int xid, struct file *file) if(pTcon == NULL) return -EINVAL; - mutex_lock(&file->f_dentry->d_sb->s_vfs_rename_mutex); full_path = build_path_from_dentry(file->f_dentry); - mutex_unlock(&file->f_dentry->d_sb->s_vfs_rename_mutex); if(full_path == NULL) { return -ENOMEM; @@ -592,6 +590,13 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon, first_entry_in_buffer = cifsFile->srch_inf.index_of_last_entry - cifsFile->srch_inf.entries_in_buffer; + + /* if first entry in buf is zero then is first buffer + in search response data which means it is likely . and .. + will be in this buffer, although some servers do not return + . and .. for the root of a drive and for those we need + to start two entries earlier */ + /* dump_cifs_file_struct(file, "In fce ");*/ if(((index_to_find < cifsFile->srch_inf.index_of_last_entry) && is_dir_changed(file)) || @@ -634,23 +639,14 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon, char * end_of_smb = cifsFile->srch_inf.ntwrk_buf_start + smbCalcSize((struct smb_hdr *) cifsFile->srch_inf.ntwrk_buf_start); + + current_entry = cifsFile->srch_inf.srch_entries_start; first_entry_in_buffer = cifsFile->srch_inf.index_of_last_entry - cifsFile->srch_inf.entries_in_buffer; pos_in_buf = index_to_find - first_entry_in_buffer; cFYI(1,("found entry - pos_in_buf %d",pos_in_buf)); - current_entry = cifsFile->srch_inf.srch_entries_start; for(i=0;(i<(pos_in_buf)) && (current_entry != NULL);i++) { /* go entry by entry figuring out which is first */ - /* if( . or ..) - skip */ - rc = cifs_entry_is_dot(current_entry,cifsFile); - if(rc == 1) /* is . or .. so skip */ { - cFYI(1,("Entry is .")); /* BB removeme BB */ - /* continue; */ - } else if (rc == 2 ) { - cFYI(1,("Entry is ..")); /* BB removeme BB */ - /* continue; */ - } current_entry = nxt_dir_entry(current_entry,end_of_smb); } if((current_entry == NULL) && (i < pos_in_buf)) { @@ -770,6 +766,11 @@ static int cifs_filldir(char *pfindEntry, struct file *file, if(file->f_dentry == NULL) return -ENOENT; + rc = cifs_entry_is_dot(pfindEntry,pCifsF); + /* skip . and .. 
since we added them first */ + if(rc != 0) + return 0; + cifs_sb = CIFS_SB(file->f_dentry->d_sb); qstring.name = scratch_buf; @@ -898,22 +899,22 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir) switch ((int) file->f_pos) { case 0: - /*if (filldir(direntry, ".", 1, file->f_pos, + if (filldir(direntry, ".", 1, file->f_pos, file->f_dentry->d_inode->i_ino, DT_DIR) < 0) { - cERROR(1, ("Filldir for current dir failed ")); + cERROR(1, ("Filldir for current dir failed")); rc = -ENOMEM; break; } - file->f_pos++; */ + file->f_pos++; case 1: - /* if (filldir(direntry, "..", 2, file->f_pos, + if (filldir(direntry, "..", 2, file->f_pos, file->f_dentry->d_parent->d_inode->i_ino, DT_DIR) < 0) { cERROR(1, ("Filldir for parent dir failed ")); rc = -ENOMEM; break; } - file->f_pos++; */ - case 2: + file->f_pos++; + default: /* 1) If search is active, is in current search buffer? if it before then restart search @@ -927,7 +928,6 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir) return rc; } } - default: if(file->private_data == NULL) { rc = -EINVAL; FreeXid(xid); @@ -947,8 +947,6 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir) kfree(cifsFile->search_resume_name); cifsFile->search_resume_name = NULL; */ - /* BB account for . and .. in f_pos as special case */ - rc = find_cifs_entry(xid,pTcon, file, ¤t_entry,&num_to_fill); if(rc) { @@ -977,7 +975,8 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir) num_to_fill, i)); break; } - + /* if buggy server returns . and .. late do + we want to check for that here? */ rc = cifs_filldir(current_entry, file, filldir, direntry,tmp_buf); file->f_pos++; diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c index 3938444d87b..7754d641775 100644 --- a/fs/cifs/xattr.c +++ b/fs/cifs/xattr.c @@ -62,9 +62,7 @@ int cifs_removexattr(struct dentry * direntry, const char * ea_name) cifs_sb = CIFS_SB(sb); pTcon = cifs_sb->tcon; - mutex_lock(&sb->s_vfs_rename_mutex); full_path = build_path_from_dentry(direntry); - mutex_unlock(&sb->s_vfs_rename_mutex); if(full_path == NULL) { FreeXid(xid); return -ENOMEM; @@ -116,9 +114,7 @@ int cifs_setxattr(struct dentry * direntry, const char * ea_name, cifs_sb = CIFS_SB(sb); pTcon = cifs_sb->tcon; - mutex_lock(&sb->s_vfs_rename_mutex); full_path = build_path_from_dentry(direntry); - mutex_unlock(&sb->s_vfs_rename_mutex); if(full_path == NULL) { FreeXid(xid); return -ENOMEM; @@ -223,9 +219,7 @@ ssize_t cifs_getxattr(struct dentry * direntry, const char * ea_name, cifs_sb = CIFS_SB(sb); pTcon = cifs_sb->tcon; - mutex_lock(&sb->s_vfs_rename_mutex); full_path = build_path_from_dentry(direntry); - mutex_unlock(&sb->s_vfs_rename_mutex); if(full_path == NULL) { FreeXid(xid); return -ENOMEM; @@ -341,9 +335,7 @@ ssize_t cifs_listxattr(struct dentry * direntry, char * data, size_t buf_size) cifs_sb = CIFS_SB(sb); pTcon = cifs_sb->tcon; - mutex_lock(&sb->s_vfs_rename_mutex); full_path = build_path_from_dentry(direntry); - mutex_unlock(&sb->s_vfs_rename_mutex); if(full_path == NULL) { FreeXid(xid); return -ENOMEM; diff --git a/fs/compat.c b/fs/compat.c index 7f8e26ea427..b1f64786a61 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -1217,6 +1217,10 @@ static ssize_t compat_do_readv_writev(int type, struct file *file, if (ret < 0) goto out; + ret = security_file_permission(file, type == READ ? 
MAY_READ:MAY_WRITE); + if (ret) + goto out; + fnv = NULL; if (type == READ) { fn = file->f_op->read; @@ -1313,6 +1317,26 @@ out: return ret; } +asmlinkage long +compat_sys_vmsplice(int fd, const struct compat_iovec __user *iov32, + unsigned int nr_segs, unsigned int flags) +{ + unsigned i; + struct iovec *iov; + if (nr_segs > UIO_MAXIOV) + return -EINVAL; + iov = compat_alloc_user_space(nr_segs * sizeof(struct iovec)); + for (i = 0; i < nr_segs; i++) { + struct compat_iovec v; + if (get_user(v.iov_base, &iov32[i].iov_base) || + get_user(v.iov_len, &iov32[i].iov_len) || + put_user(compat_ptr(v.iov_base), &iov[i].iov_base) || + put_user(v.iov_len, &iov[i].iov_len)) + return -EFAULT; + } + return sys_vmsplice(fd, iov, nr_segs, flags); +} + /* * Exactly like fs/open.c:sys_open(), except that it doesn't set the * O_LARGEFILE flag. @@ -1889,7 +1913,7 @@ asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds, } if (sigmask) { - if (sigsetsize |= sizeof(compat_sigset_t)) + if (sigsetsize != sizeof(compat_sigset_t)) return -EINVAL; if (copy_from_user(&ss32, sigmask, sizeof(ss32))) return -EFAULT; @@ -2006,109 +2030,115 @@ union compat_nfsctl_res { struct knfsd_fh cr32_getfs; }; -static int compat_nfs_svc_trans(struct nfsctl_arg *karg, struct compat_nfsctl_arg __user *arg) +static int compat_nfs_svc_trans(struct nfsctl_arg *karg, + struct compat_nfsctl_arg __user *arg) { - int err; - - err = access_ok(VERIFY_READ, &arg->ca32_svc, sizeof(arg->ca32_svc)); - err |= get_user(karg->ca_version, &arg->ca32_version); - err |= __get_user(karg->ca_svc.svc_port, &arg->ca32_svc.svc32_port); - err |= __get_user(karg->ca_svc.svc_nthreads, &arg->ca32_svc.svc32_nthreads); - return (err) ? -EFAULT : 0; + if (!access_ok(VERIFY_READ, &arg->ca32_svc, sizeof(arg->ca32_svc)) || + get_user(karg->ca_version, &arg->ca32_version) || + __get_user(karg->ca_svc.svc_port, &arg->ca32_svc.svc32_port) || + __get_user(karg->ca_svc.svc_nthreads, + &arg->ca32_svc.svc32_nthreads)) + return -EFAULT; + return 0; } -static int compat_nfs_clnt_trans(struct nfsctl_arg *karg, struct compat_nfsctl_arg __user *arg) +static int compat_nfs_clnt_trans(struct nfsctl_arg *karg, + struct compat_nfsctl_arg __user *arg) { - int err; - - err = access_ok(VERIFY_READ, &arg->ca32_client, sizeof(arg->ca32_client)); - err |= get_user(karg->ca_version, &arg->ca32_version); - err |= __copy_from_user(&karg->ca_client.cl_ident[0], - &arg->ca32_client.cl32_ident[0], - NFSCLNT_IDMAX); - err |= __get_user(karg->ca_client.cl_naddr, &arg->ca32_client.cl32_naddr); - err |= __copy_from_user(&karg->ca_client.cl_addrlist[0], - &arg->ca32_client.cl32_addrlist[0], - (sizeof(struct in_addr) * NFSCLNT_ADDRMAX)); - err |= __get_user(karg->ca_client.cl_fhkeytype, - &arg->ca32_client.cl32_fhkeytype); - err |= __get_user(karg->ca_client.cl_fhkeylen, - &arg->ca32_client.cl32_fhkeylen); - err |= __copy_from_user(&karg->ca_client.cl_fhkey[0], - &arg->ca32_client.cl32_fhkey[0], - NFSCLNT_KEYMAX); + if (!access_ok(VERIFY_READ, &arg->ca32_client, + sizeof(arg->ca32_client)) || + get_user(karg->ca_version, &arg->ca32_version) || + __copy_from_user(&karg->ca_client.cl_ident[0], + &arg->ca32_client.cl32_ident[0], + NFSCLNT_IDMAX) || + __get_user(karg->ca_client.cl_naddr, + &arg->ca32_client.cl32_naddr) || + __copy_from_user(&karg->ca_client.cl_addrlist[0], + &arg->ca32_client.cl32_addrlist[0], + (sizeof(struct in_addr) * NFSCLNT_ADDRMAX)) || + __get_user(karg->ca_client.cl_fhkeytype, + &arg->ca32_client.cl32_fhkeytype) || + __get_user(karg->ca_client.cl_fhkeylen, + 
&arg->ca32_client.cl32_fhkeylen) || + __copy_from_user(&karg->ca_client.cl_fhkey[0], + &arg->ca32_client.cl32_fhkey[0], + NFSCLNT_KEYMAX)) + return -EFAULT; - return (err) ? -EFAULT : 0; + return 0; } -static int compat_nfs_exp_trans(struct nfsctl_arg *karg, struct compat_nfsctl_arg __user *arg) +static int compat_nfs_exp_trans(struct nfsctl_arg *karg, + struct compat_nfsctl_arg __user *arg) { - int err; - - err = access_ok(VERIFY_READ, &arg->ca32_export, sizeof(arg->ca32_export)); - err |= get_user(karg->ca_version, &arg->ca32_version); - err |= __copy_from_user(&karg->ca_export.ex_client[0], - &arg->ca32_export.ex32_client[0], - NFSCLNT_IDMAX); - err |= __copy_from_user(&karg->ca_export.ex_path[0], - &arg->ca32_export.ex32_path[0], - NFS_MAXPATHLEN); - err |= __get_user(karg->ca_export.ex_dev, - &arg->ca32_export.ex32_dev); - err |= __get_user(karg->ca_export.ex_ino, - &arg->ca32_export.ex32_ino); - err |= __get_user(karg->ca_export.ex_flags, - &arg->ca32_export.ex32_flags); - err |= __get_user(karg->ca_export.ex_anon_uid, - &arg->ca32_export.ex32_anon_uid); - err |= __get_user(karg->ca_export.ex_anon_gid, - &arg->ca32_export.ex32_anon_gid); + if (!access_ok(VERIFY_READ, &arg->ca32_export, + sizeof(arg->ca32_export)) || + get_user(karg->ca_version, &arg->ca32_version) || + __copy_from_user(&karg->ca_export.ex_client[0], + &arg->ca32_export.ex32_client[0], + NFSCLNT_IDMAX) || + __copy_from_user(&karg->ca_export.ex_path[0], + &arg->ca32_export.ex32_path[0], + NFS_MAXPATHLEN) || + __get_user(karg->ca_export.ex_dev, + &arg->ca32_export.ex32_dev) || + __get_user(karg->ca_export.ex_ino, + &arg->ca32_export.ex32_ino) || + __get_user(karg->ca_export.ex_flags, + &arg->ca32_export.ex32_flags) || + __get_user(karg->ca_export.ex_anon_uid, + &arg->ca32_export.ex32_anon_uid) || + __get_user(karg->ca_export.ex_anon_gid, + &arg->ca32_export.ex32_anon_gid)) + return -EFAULT; SET_UID(karg->ca_export.ex_anon_uid, karg->ca_export.ex_anon_uid); SET_GID(karg->ca_export.ex_anon_gid, karg->ca_export.ex_anon_gid); - return (err) ? -EFAULT : 0; + return 0; } -static int compat_nfs_getfd_trans(struct nfsctl_arg *karg, struct compat_nfsctl_arg __user *arg) +static int compat_nfs_getfd_trans(struct nfsctl_arg *karg, + struct compat_nfsctl_arg __user *arg) { - int err; - - err = access_ok(VERIFY_READ, &arg->ca32_getfd, sizeof(arg->ca32_getfd)); - err |= get_user(karg->ca_version, &arg->ca32_version); - err |= __copy_from_user(&karg->ca_getfd.gd_addr, - &arg->ca32_getfd.gd32_addr, - (sizeof(struct sockaddr))); - err |= __copy_from_user(&karg->ca_getfd.gd_path, - &arg->ca32_getfd.gd32_path, - (NFS_MAXPATHLEN+1)); - err |= __get_user(karg->ca_getfd.gd_version, - &arg->ca32_getfd.gd32_version); + if (!access_ok(VERIFY_READ, &arg->ca32_getfd, + sizeof(arg->ca32_getfd)) || + get_user(karg->ca_version, &arg->ca32_version) || + __copy_from_user(&karg->ca_getfd.gd_addr, + &arg->ca32_getfd.gd32_addr, + (sizeof(struct sockaddr))) || + __copy_from_user(&karg->ca_getfd.gd_path, + &arg->ca32_getfd.gd32_path, + (NFS_MAXPATHLEN+1)) || + __get_user(karg->ca_getfd.gd_version, + &arg->ca32_getfd.gd32_version)) + return -EFAULT; - return (err) ? 
-EFAULT : 0; + return 0; } -static int compat_nfs_getfs_trans(struct nfsctl_arg *karg, struct compat_nfsctl_arg __user *arg) +static int compat_nfs_getfs_trans(struct nfsctl_arg *karg, + struct compat_nfsctl_arg __user *arg) { - int err; - - err = access_ok(VERIFY_READ, &arg->ca32_getfs, sizeof(arg->ca32_getfs)); - err |= get_user(karg->ca_version, &arg->ca32_version); - err |= __copy_from_user(&karg->ca_getfs.gd_addr, - &arg->ca32_getfs.gd32_addr, - (sizeof(struct sockaddr))); - err |= __copy_from_user(&karg->ca_getfs.gd_path, - &arg->ca32_getfs.gd32_path, - (NFS_MAXPATHLEN+1)); - err |= __get_user(karg->ca_getfs.gd_maxlen, - &arg->ca32_getfs.gd32_maxlen); + if (!access_ok(VERIFY_READ,&arg->ca32_getfs,sizeof(arg->ca32_getfs)) || + get_user(karg->ca_version, &arg->ca32_version) || + __copy_from_user(&karg->ca_getfs.gd_addr, + &arg->ca32_getfs.gd32_addr, + (sizeof(struct sockaddr))) || + __copy_from_user(&karg->ca_getfs.gd_path, + &arg->ca32_getfs.gd32_path, + (NFS_MAXPATHLEN+1)) || + __get_user(karg->ca_getfs.gd_maxlen, + &arg->ca32_getfs.gd32_maxlen)) + return -EFAULT; - return (err) ? -EFAULT : 0; + return 0; } /* This really doesn't need translations, we are only passing * back a union which contains opaque nfs file handle data. */ -static int compat_nfs_getfh_res_trans(union nfsctl_res *kres, union compat_nfsctl_res __user *res) +static int compat_nfs_getfh_res_trans(union nfsctl_res *kres, + union compat_nfsctl_res __user *res) { int err; @@ -2117,8 +2147,9 @@ static int compat_nfs_getfh_res_trans(union nfsctl_res *kres, union compat_nfsct return (err) ? -EFAULT : 0; } -asmlinkage long compat_sys_nfsservctl(int cmd, struct compat_nfsctl_arg __user *arg, - union compat_nfsctl_res __user *res) +asmlinkage long compat_sys_nfsservctl(int cmd, + struct compat_nfsctl_arg __user *arg, + union compat_nfsctl_res __user *res) { struct nfsctl_arg *karg; union nfsctl_res *kres; diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index 8ed9b06a982..5f952187fc5 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -504,14 +504,16 @@ static int populate_groups(struct config_group *group) int ret = 0; int i; - if (group && group->default_groups) { - /* FYI, we're faking mkdir here + if (group->default_groups) { + /* + * FYI, we're faking mkdir here * I'm not sure we need this semaphore, as we're called * from our parent's mkdir. That holds our parent's * i_mutex, so afaik lookup cannot continue through our * parent to find us, let alone mess with our tree. * That said, taking our i_mutex is closer to mkdir - * emulation, and shouldn't hurt. */ + * emulation, and shouldn't hurt. + */ mutex_lock(&dentry->d_inode->i_mutex); for (i = 0; group->default_groups[i]; i++) { @@ -546,20 +548,34 @@ static void unlink_obj(struct config_item *item) item->ci_group = NULL; item->ci_parent = NULL; + + /* Drop the reference for ci_entry */ config_item_put(item); + /* Drop the reference for ci_parent */ config_group_put(group); } } static void link_obj(struct config_item *parent_item, struct config_item *item) { - /* Parent seems redundant with group, but it makes certain - * traversals much nicer. */ + /* + * Parent seems redundant with group, but it makes certain + * traversals much nicer. + */ item->ci_parent = parent_item; + + /* + * We hold a reference on the parent for the child's ci_parent + * link. 
+ */ item->ci_group = config_group_get(to_config_group(parent_item)); list_add_tail(&item->ci_entry, &item->ci_group->cg_children); + /* + * We hold a reference on the child for ci_entry on the parent's + * cg_children + */ config_item_get(item); } @@ -684,6 +700,10 @@ static void client_drop_item(struct config_item *parent_item, type = parent_item->ci_type; BUG_ON(!type); + /* + * If ->drop_item() exists, it is responsible for the + * config_item_put(). + */ if (type->ct_group_ops && type->ct_group_ops->drop_item) type->ct_group_ops->drop_item(to_config_group(parent_item), item); @@ -694,23 +714,28 @@ static void client_drop_item(struct config_item *parent_item, static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) { - int ret; + int ret, module_got = 0; struct config_group *group; struct config_item *item; struct config_item *parent_item; struct configfs_subsystem *subsys; struct configfs_dirent *sd; struct config_item_type *type; - struct module *owner; + struct module *owner = NULL; char *name; - if (dentry->d_parent == configfs_sb->s_root) - return -EPERM; + if (dentry->d_parent == configfs_sb->s_root) { + ret = -EPERM; + goto out; + } sd = dentry->d_parent->d_fsdata; - if (!(sd->s_type & CONFIGFS_USET_DIR)) - return -EPERM; + if (!(sd->s_type & CONFIGFS_USET_DIR)) { + ret = -EPERM; + goto out; + } + /* Get a working ref for the duration of this function */ parent_item = configfs_get_config_item(dentry->d_parent); type = parent_item->ci_type; subsys = to_config_group(parent_item)->cg_subsys; @@ -719,15 +744,16 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) if (!type || !type->ct_group_ops || (!type->ct_group_ops->make_group && !type->ct_group_ops->make_item)) { - config_item_put(parent_item); - return -EPERM; /* What lack-of-mkdir returns */ + ret = -EPERM; /* Lack-of-mkdir returns -EPERM */ + goto out_put; } name = kmalloc(dentry->d_name.len + 1, GFP_KERNEL); if (!name) { - config_item_put(parent_item); - return -ENOMEM; + ret = -ENOMEM; + goto out_put; } + snprintf(name, dentry->d_name.len + 1, "%s", dentry->d_name.name); down(&subsys->su_sem); @@ -748,40 +774,67 @@ static int configfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) kfree(name); if (!item) { - config_item_put(parent_item); - return -ENOMEM; + /* + * If item == NULL, then link_obj() was never called. + * There are no extra references to clean up. + */ + ret = -ENOMEM; + goto out_put; } - ret = -EINVAL; + /* + * link_obj() has been called (via link_group() for groups). + * From here on out, errors must clean that up. + */ + type = item->ci_type; - if (type) { - owner = type->ct_owner; - if (try_module_get(owner)) { - if (group) { - ret = configfs_attach_group(parent_item, - item, - dentry); - } else { - ret = configfs_attach_item(parent_item, - item, - dentry); - } + if (!type) { + ret = -EINVAL; + goto out_unlink; + } - if (ret) { - down(&subsys->su_sem); - if (group) - unlink_group(group); - else - unlink_obj(item); - client_drop_item(parent_item, item); - up(&subsys->su_sem); + owner = type->ct_owner; + if (!try_module_get(owner)) { + ret = -EINVAL; + goto out_unlink; + } - config_item_put(parent_item); - module_put(owner); - } - } + /* + * I hate doing it this way, but if there is + * an error, module_put() probably should + * happen after any cleanup. 
+ */ + module_got = 1; + + if (group) + ret = configfs_attach_group(parent_item, item, dentry); + else + ret = configfs_attach_item(parent_item, item, dentry); + +out_unlink: + if (ret) { + /* Tear down everything we built up */ + down(&subsys->su_sem); + if (group) + unlink_group(group); + else + unlink_obj(item); + client_drop_item(parent_item, item); + up(&subsys->su_sem); + + if (module_got) + module_put(owner); } +out_put: + /* + * link_obj()/link_group() took a reference from child->parent, + * so the parent is safely pinned. We can drop our working + * reference. + */ + config_item_put(parent_item); + +out: return ret; } @@ -801,6 +854,7 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry) if (sd->s_type & CONFIGFS_USET_DEFAULT) return -EPERM; + /* Get a working ref until we have the child */ parent_item = configfs_get_config_item(dentry->d_parent); subsys = to_config_group(parent_item)->cg_subsys; BUG_ON(!subsys); @@ -817,6 +871,7 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry) return ret; } + /* Get a working ref for the duration of this function */ item = configfs_get_config_item(dentry); /* Drop reference from above, item already holds one. */ diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 85d166cdcae..b55b4ea9a67 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -67,12 +67,13 @@ static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t d static int debugfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) { - struct inode *inode = debugfs_get_inode(dir->i_sb, mode, dev); + struct inode *inode; int error = -EPERM; if (dentry->d_inode) return -EEXIST; + inode = debugfs_get_inode(dir->i_sb, mode, dev); if (inode) { d_instantiate(dentry, inode); dget(dentry); diff --git a/fs/direct-io.c b/fs/direct-io.c index 910a8ed74b5..b05d1b21877 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -929,8 +929,7 @@ do_holes: block_in_page += this_chunk_blocks; dio->blocks_available -= this_chunk_blocks; next_block: - if (dio->block_in_file > dio->final_block_in_request) - BUG(); + BUG_ON(dio->block_in_file > dio->final_block_in_request); if (dio->block_in_file == dio->final_block_in_request) break; } diff --git a/fs/dquot.c b/fs/dquot.c index 6b388692093..81d87a413c6 100644 --- a/fs/dquot.c +++ b/fs/dquot.c @@ -590,8 +590,7 @@ we_slept: atomic_dec(&dquot->dq_count); #ifdef __DQUOT_PARANOIA /* sanity check */ - if (!list_empty(&dquot->dq_free)) - BUG(); + BUG_ON(!list_empty(&dquot->dq_free)); #endif put_dquot_last(dquot); spin_unlock(&dq_list_lock); @@ -666,8 +665,7 @@ we_slept: return NODQUOT; } #ifdef __DQUOT_PARANOIA - if (!dquot->dq_sb) /* Has somebody invalidated entry under us? */ - BUG(); + BUG_ON(!dquot->dq_sb); /* Has somebody invalidated entry under us? 
*/ #endif return dquot; } diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 242fe1a66ce..1b4491cdd11 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -599,7 +599,7 @@ sys_epoll_ctl(int epfd, int op, int fd, struct epoll_event __user *event) switch (op) { case EPOLL_CTL_ADD: if (!epi) { - epds.events |= POLLERR | POLLHUP | POLLRDHUP; + epds.events |= POLLERR | POLLHUP; error = ep_insert(ep, &epds, tfile, fd); } else @@ -613,7 +613,7 @@ sys_epoll_ctl(int epfd, int op, int fd, struct epoll_event __user *event) break; case EPOLL_CTL_MOD: if (epi) { - epds.events |= POLLERR | POLLHUP | POLLRDHUP; + epds.events |= POLLERR | POLLHUP; error = ep_modify(ep, epi, &epds); } else error = -ENOENT; diff --git a/fs/exec.c b/fs/exec.c index 950ebd43cdc..3a79d97ac23 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -561,7 +561,7 @@ static int exec_mmap(struct mm_struct *mm) arch_pick_mmap_layout(mm); if (old_mm) { up_read(&old_mm->mmap_sem); - if (active_mm != old_mm) BUG(); + BUG_ON(active_mm != old_mm); mmput(old_mm); return 0; } @@ -665,9 +665,7 @@ static int de_thread(struct task_struct *tsk) * and to assume its PID: */ if (!thread_group_leader(current)) { - struct task_struct *parent; struct dentry *proc_dentry1, *proc_dentry2; - unsigned long ptrace; /* * Wait for the thread group leader to be a zombie. @@ -678,6 +676,18 @@ static int de_thread(struct task_struct *tsk) while (leader->exit_state != EXIT_ZOMBIE) yield(); + /* + * The only record we have of the real-time age of a + * process, regardless of execs it's done, is start_time. + * All the past CPU time is accumulated in signal_struct + * from sister threads now dead. But in this non-leader + * exec, nothing survives from the original leader thread, + * whose birth marks the true age of this process now. + * When we take on its identity by switching to its PID, we + * also take its birthdate (always earlier than our own). + */ + current->start_time = leader->start_time; + spin_lock(&leader->proc_lock); spin_lock(&current->proc_lock); proc_dentry1 = proc_pid_unhash(current); @@ -692,22 +702,6 @@ static int de_thread(struct task_struct *tsk) * two threads with a switched PID, and release * the former thread group leader: */ - ptrace = leader->ptrace; - parent = leader->parent; - if (unlikely(ptrace) && unlikely(parent == current)) { - /* - * Joker was ptracing his own group leader, - * and now he wants to be his own parent! - * We can't have that. - */ - ptrace = 0; - } - - ptrace_unlink(current); - ptrace_unlink(leader); - remove_parent(current); - remove_parent(leader); - /* Become a process group leader with the old leader's pid. 
* Note: The old leader also uses this pid until release_task @@ -718,19 +712,15 @@ static int de_thread(struct task_struct *tsk) attach_pid(current, PIDTYPE_PID, current->pid); attach_pid(current, PIDTYPE_PGID, current->signal->pgrp); attach_pid(current, PIDTYPE_SID, current->signal->session); - list_add_tail(&current->tasks, &init_task.tasks); + list_add_tail_rcu(&current->tasks, &init_task.tasks); - current->parent = current->real_parent = leader->real_parent; - leader->parent = leader->real_parent = child_reaper; current->group_leader = current; - leader->group_leader = leader; + leader->group_leader = current; - add_parent(current); - add_parent(leader); - if (ptrace) { - current->ptrace = ptrace; - __ptrace_link(current, parent); - } + /* Reduce leader to a thread */ + detach_pid(leader, PIDTYPE_PGID); + detach_pid(leader, PIDTYPE_SID); + list_del_init(&leader->tasks); current->exit_signal = SIGCHLD; diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index b06b54f1bbb..4c39009350f 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -102,7 +102,7 @@ find_exported_dentry(struct super_block *sb, void *obj, void *parent, if (acceptable(context, result)) return result; if (S_ISDIR(result->d_inode->i_mode)) { - /* there is no other dentry, so fail */ + err = -EACCES; goto err_result; } diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 48ae0339af1..2edd7eec88f 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -711,7 +711,7 @@ static int ext3_splice_branch(handle_t *handle, struct inode *inode, * direct blocks blocks */ if (num == 0 && blks > 1) { - current_block = le32_to_cpu(where->key + 1); + current_block = le32_to_cpu(where->key) + 1; for (i = 1; i < blks; i++) *(where->p + i ) = cpu_to_le32(current_block++); } @@ -724,7 +724,7 @@ static int ext3_splice_branch(handle_t *handle, struct inode *inode, if (block_i) { block_i->last_alloc_logical_block = block + blks - 1; block_i->last_alloc_physical_block = - le32_to_cpu(where[num].key + blks - 1); + le32_to_cpu(where[num].key) + blks - 1; } /* We are done with atomic stuff, now do the rest of housekeeping */ @@ -814,11 +814,13 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode, /* Simplest case - block found, no allocation needed */ if (!partial) { - first_block = chain[depth - 1].key; + first_block = le32_to_cpu(chain[depth - 1].key); clear_buffer_new(bh_result); count++; /*map more blocks*/ while (count < maxblocks && count <= blocks_to_boundary) { + unsigned long blk; + if (!verify_chain(chain, partial)) { /* * Indirect block might be removed by @@ -831,8 +833,9 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode, count = 0; break; } - if (le32_to_cpu(*(chain[depth-1].p+count) == - (first_block + count))) + blk = le32_to_cpu(*(chain[depth-1].p + count)); + + if (blk == first_block + count) count++; else break; diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c index aaf1da17b6d..8c22aa9a7fb 100644 --- a/fs/ext3/ioctl.c +++ b/fs/ext3/ioctl.c @@ -48,6 +48,7 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, if (!S_ISDIR(inode->i_mode)) flags &= ~EXT3_DIRSYNC_FL; + mutex_lock(&inode->i_mutex); oldflags = ei->i_flags; /* The JOURNAL_DATA flag is modifiable only by root */ @@ -60,8 +61,10 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, * This test looks nicer. 
Thanks to Pauline Middelink */ if ((flags ^ oldflags) & (EXT3_APPEND_FL | EXT3_IMMUTABLE_FL)) { - if (!capable(CAP_LINUX_IMMUTABLE)) + if (!capable(CAP_LINUX_IMMUTABLE)) { + mutex_unlock(&inode->i_mutex); return -EPERM; + } } /* @@ -69,14 +72,18 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, * the relevant capability. */ if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) { - if (!capable(CAP_SYS_RESOURCE)) + if (!capable(CAP_SYS_RESOURCE)) { + mutex_unlock(&inode->i_mutex); return -EPERM; + } } handle = ext3_journal_start(inode, 1); - if (IS_ERR(handle)) + if (IS_ERR(handle)) { + mutex_unlock(&inode->i_mutex); return PTR_ERR(handle); + } if (IS_SYNC(inode)) handle->h_sync = 1; err = ext3_reserve_inode_write(handle, inode, &iloc); @@ -93,11 +100,14 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, err = ext3_mark_iloc_dirty(handle, inode, &iloc); flags_err: ext3_journal_stop(handle); - if (err) + if (err) { + mutex_unlock(&inode->i_mutex); return err; + } if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) err = ext3_change_inode_journal_flag(inode, jflag); + mutex_unlock(&inode->i_mutex); return err; } case EXT3_IOC_GETVERSION: diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c index 1041dab6de2..34b39e9a1e5 100644 --- a/fs/ext3/resize.c +++ b/fs/ext3/resize.c @@ -213,7 +213,7 @@ static int setup_new_group_blocks(struct super_block *sb, goto exit_bh; } lock_buffer(bh); - memcpy(gdb->b_data, sbi->s_group_desc[i], bh->b_size); + memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, bh->b_size); set_buffer_uptodate(gdb); unlock_buffer(bh); ext3_journal_dirty_metadata(handle, gdb); @@ -974,6 +974,7 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es, if (o_blocks_count != le32_to_cpu(es->s_blocks_count)) { ext3_warning(sb, __FUNCTION__, "multiple resizers run on filesystem!"); + unlock_super(sb); err = -EBUSY; goto exit_put; } diff --git a/fs/fcntl.c b/fs/fcntl.c index 2a2479196f9..d35cbc6bc11 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -453,8 +453,7 @@ static void send_sigio_to_task(struct task_struct *p, /* Make sure we are called with one of the POLL_* reasons, otherwise we could leak kernel stack into userspace. */ - if ((reason & __SI_MASK) != __SI_POLL) - BUG(); + BUG_ON((reason & __SI_MASK) != __SI_POLL); if (reason - POLL_IN >= NSIGPOLL) si.si_band = ~0L; else diff --git a/fs/fifo.c b/fs/fifo.c index 889f722ee36..49035b174b4 100644 --- a/fs/fifo.c +++ b/fs/fifo.c @@ -15,30 +15,35 @@ #include <linux/fs.h> #include <linux/pipe_fs_i.h> -static void wait_for_partner(struct inode* inode, unsigned int* cnt) +static void wait_for_partner(struct inode* inode, unsigned int *cnt) { int cur = *cnt; - while(cur == *cnt) { - pipe_wait(inode); - if(signal_pending(current)) + + while (cur == *cnt) { + pipe_wait(inode->i_pipe); + if (signal_pending(current)) break; } } static void wake_up_partner(struct inode* inode) { - wake_up_interruptible(PIPE_WAIT(*inode)); + wake_up_interruptible(&inode->i_pipe->wait); } static int fifo_open(struct inode *inode, struct file *filp) { + struct pipe_inode_info *pipe; int ret; - mutex_lock(PIPE_MUTEX(*inode)); - if (!inode->i_pipe) { + mutex_lock(&inode->i_mutex); + pipe = inode->i_pipe; + if (!pipe) { ret = -ENOMEM; - if(!pipe_new(inode)) + pipe = alloc_pipe_info(inode); + if (!pipe) goto err_nocleanup; + inode->i_pipe = pipe; } filp->f_version = 0; @@ -53,18 +58,18 @@ static int fifo_open(struct inode *inode, struct file *filp) * opened, even when there is no process writing the FIFO. 
*/ filp->f_op = &read_fifo_fops; - PIPE_RCOUNTER(*inode)++; - if (PIPE_READERS(*inode)++ == 0) + pipe->r_counter++; + if (pipe->readers++ == 0) wake_up_partner(inode); - if (!PIPE_WRITERS(*inode)) { + if (!pipe->writers) { if ((filp->f_flags & O_NONBLOCK)) { /* suppress POLLHUP until we have * seen a writer */ - filp->f_version = PIPE_WCOUNTER(*inode); + filp->f_version = pipe->w_counter; } else { - wait_for_partner(inode, &PIPE_WCOUNTER(*inode)); + wait_for_partner(inode, &pipe->w_counter); if(signal_pending(current)) goto err_rd; } @@ -78,16 +83,16 @@ static int fifo_open(struct inode *inode, struct file *filp) * errno=ENXIO when there is no process reading the FIFO. */ ret = -ENXIO; - if ((filp->f_flags & O_NONBLOCK) && !PIPE_READERS(*inode)) + if ((filp->f_flags & O_NONBLOCK) && !pipe->readers) goto err; filp->f_op = &write_fifo_fops; - PIPE_WCOUNTER(*inode)++; - if (!PIPE_WRITERS(*inode)++) + pipe->w_counter++; + if (!pipe->writers++) wake_up_partner(inode); - if (!PIPE_READERS(*inode)) { - wait_for_partner(inode, &PIPE_RCOUNTER(*inode)); + if (!pipe->readers) { + wait_for_partner(inode, &pipe->r_counter); if (signal_pending(current)) goto err_wr; } @@ -102,11 +107,11 @@ static int fifo_open(struct inode *inode, struct file *filp) */ filp->f_op = &rdwr_fifo_fops; - PIPE_READERS(*inode)++; - PIPE_WRITERS(*inode)++; - PIPE_RCOUNTER(*inode)++; - PIPE_WCOUNTER(*inode)++; - if (PIPE_READERS(*inode) == 1 || PIPE_WRITERS(*inode) == 1) + pipe->readers++; + pipe->writers++; + pipe->r_counter++; + pipe->w_counter++; + if (pipe->readers == 1 || pipe->writers == 1) wake_up_partner(inode); break; @@ -116,27 +121,27 @@ static int fifo_open(struct inode *inode, struct file *filp) } /* Ok! */ - mutex_unlock(PIPE_MUTEX(*inode)); + mutex_unlock(&inode->i_mutex); return 0; err_rd: - if (!--PIPE_READERS(*inode)) - wake_up_interruptible(PIPE_WAIT(*inode)); + if (!--pipe->readers) + wake_up_interruptible(&pipe->wait); ret = -ERESTARTSYS; goto err; err_wr: - if (!--PIPE_WRITERS(*inode)) - wake_up_interruptible(PIPE_WAIT(*inode)); + if (!--pipe->writers) + wake_up_interruptible(&pipe->wait); ret = -ERESTARTSYS; goto err; err: - if (!PIPE_READERS(*inode) && !PIPE_WRITERS(*inode)) + if (!pipe->readers && !pipe->writers) free_pipe_info(inode); err_nocleanup: - mutex_unlock(PIPE_MUTEX(*inode)); + mutex_unlock(&inode->i_mutex); return ret; } diff --git a/fs/freevxfs/vxfs_olt.c b/fs/freevxfs/vxfs_olt.c index 76a0708ae97..04950084790 100644 --- a/fs/freevxfs/vxfs_olt.c +++ b/fs/freevxfs/vxfs_olt.c @@ -42,24 +42,21 @@ static inline void vxfs_get_fshead(struct vxfs_oltfshead *fshp, struct vxfs_sb_info *infp) { - if (infp->vsi_fshino) - BUG(); + BUG_ON(infp->vsi_fshino); infp->vsi_fshino = fshp->olt_fsino[0]; } static inline void vxfs_get_ilist(struct vxfs_oltilist *ilistp, struct vxfs_sb_info *infp) { - if (infp->vsi_iext) - BUG(); + BUG_ON(infp->vsi_iext); infp->vsi_iext = ilistp->olt_iext[0]; } static inline u_long vxfs_oblock(struct super_block *sbp, daddr_t block, u_long bsize) { - if (sbp->s_blocksize % bsize) - BUG(); + BUG_ON(sbp->s_blocksize % bsize); return (block * (sbp->s_blocksize / bsize)); } diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 23d1f52eb1b..104a62dadb9 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1,6 +1,6 @@ /* FUSE: Filesystem in Userspace - Copyright (C) 2001-2005 Miklos Szeredi <miklos@szeredi.hu> + Copyright (C) 2001-2006 Miklos Szeredi <miklos@szeredi.hu> This program can be distributed under the terms of the GNU GPL. See the file COPYING. 
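The fs/fuse/dev.c hunks that follow replace the single global fuse_lock with a per-connection fc->lock, and drop the pool of preallocated requests in favour of on-demand allocation throttled by the new fc->blocked flag and fc->blocked_waitq. A condensed sketch of the resulting allocation path, with field names taken from this patch but bodies simplified (sketch_get_req is an invented name, not part of the diff):

	struct fuse_req *sketch_get_req(struct fuse_conn *fc)
	{
		atomic_inc(&fc->num_waiting);
		/* Sleep while the connection is blocked: before the INIT
		 * reply arrives, or while FUSE_MAX_BACKGROUND background
		 * requests are in flight. */
		if (wait_event_interruptible(fc->blocked_waitq, !fc->blocked)) {
			atomic_dec(&fc->num_waiting);
			return ERR_PTR(-EINTR);
		}
		/* Requests are now allocated per call; the real code maps an
		 * allocation failure to ERR_PTR(-ENOMEM). */
		return fuse_request_alloc();
	}

Since allocation can now fail, every caller in the later fs/fuse/dir.c and fs/fuse/file.c hunks switches from the fuse_get_request()/-EINTR pattern to fuse_get_req()/IS_ERR()/PTR_ERR().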
@@ -23,13 +23,11 @@ static kmem_cache_t *fuse_req_cachep; static struct fuse_conn *fuse_get_conn(struct file *file) { - struct fuse_conn *fc; - spin_lock(&fuse_lock); - fc = file->private_data; - if (fc && !fc->connected) - fc = NULL; - spin_unlock(&fuse_lock); - return fc; + /* + * Lockless access is OK, because file->private data is set + * once during mount and is valid until the file is released. + */ + return file->private_data; } static void fuse_request_init(struct fuse_req *req) @@ -74,10 +72,8 @@ static void restore_sigs(sigset_t *oldset) */ void fuse_reset_request(struct fuse_req *req) { - int preallocated = req->preallocated; BUG_ON(atomic_read(&req->count) != 1); fuse_request_init(req); - req->preallocated = preallocated; } static void __fuse_get_request(struct fuse_req *req) @@ -92,80 +88,64 @@ static void __fuse_put_request(struct fuse_req *req) atomic_dec(&req->count); } -static struct fuse_req *do_get_request(struct fuse_conn *fc) +struct fuse_req *fuse_get_req(struct fuse_conn *fc) { struct fuse_req *req; - - spin_lock(&fuse_lock); - BUG_ON(list_empty(&fc->unused_list)); - req = list_entry(fc->unused_list.next, struct fuse_req, list); - list_del_init(&req->list); - spin_unlock(&fuse_lock); - fuse_request_init(req); - req->preallocated = 1; - req->in.h.uid = current->fsuid; - req->in.h.gid = current->fsgid; - req->in.h.pid = current->pid; - return req; -} - -/* This can return NULL, but only in case it's interrupted by a SIGKILL */ -struct fuse_req *fuse_get_request(struct fuse_conn *fc) -{ - int intr; sigset_t oldset; + int intr; + int err; atomic_inc(&fc->num_waiting); block_sigs(&oldset); - intr = down_interruptible(&fc->outstanding_sem); + intr = wait_event_interruptible(fc->blocked_waitq, !fc->blocked); restore_sigs(&oldset); - if (intr) { - atomic_dec(&fc->num_waiting); - return NULL; - } - return do_get_request(fc); -} + err = -EINTR; + if (intr) + goto out; -/* Must be called with fuse_lock held */ -static void fuse_putback_request(struct fuse_conn *fc, struct fuse_req *req) -{ - if (req->preallocated) { - atomic_dec(&fc->num_waiting); - list_add(&req->list, &fc->unused_list); - } else - fuse_request_free(req); + req = fuse_request_alloc(); + err = -ENOMEM; + if (!req) + goto out; - /* If we are in debt decrease that first */ - if (fc->outstanding_debt) - fc->outstanding_debt--; - else - up(&fc->outstanding_sem); + req->in.h.uid = current->fsuid; + req->in.h.gid = current->fsgid; + req->in.h.pid = current->pid; + req->waiting = 1; + return req; + + out: + atomic_dec(&fc->num_waiting); + return ERR_PTR(err); } void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req) { if (atomic_dec_and_test(&req->count)) { - spin_lock(&fuse_lock); - fuse_putback_request(fc, req); - spin_unlock(&fuse_lock); + if (req->waiting) + atomic_dec(&fc->num_waiting); + fuse_request_free(req); } } -static void fuse_put_request_locked(struct fuse_conn *fc, struct fuse_req *req) -{ - if (atomic_dec_and_test(&req->count)) - fuse_putback_request(fc, req); -} - -void fuse_release_background(struct fuse_req *req) +/* + * Called with sbput_sem held for read (request_end) or write + * (fuse_put_super). By the time fuse_put_super() is finished, all + * inodes belonging to background requests must be released, so the + * iputs have to be done within the locked region. 
+ */ +void fuse_release_background(struct fuse_conn *fc, struct fuse_req *req) { iput(req->inode); iput(req->inode2); - if (req->file) - fput(req->file); - spin_lock(&fuse_lock); + spin_lock(&fc->lock); list_del(&req->bg_entry); - spin_unlock(&fuse_lock); + if (fc->num_background == FUSE_MAX_BACKGROUND) { + fc->blocked = 0; + wake_up_all(&fc->blocked_waitq); + } + fc->num_background--; + spin_unlock(&fc->lock); } /* @@ -184,24 +164,29 @@ void fuse_release_background(struct fuse_req *req) * interrupted and put in the background, it will return with an error * and hence never be reset and reused. * - * Called with fuse_lock, unlocks it + * Called with fc->lock, unlocks it */ static void request_end(struct fuse_conn *fc, struct fuse_req *req) { list_del(&req->list); req->state = FUSE_REQ_FINISHED; if (!req->background) { + spin_unlock(&fc->lock); wake_up(&req->waitq); - fuse_put_request_locked(fc, req); - spin_unlock(&fuse_lock); + fuse_put_request(fc, req); } else { void (*end) (struct fuse_conn *, struct fuse_req *) = req->end; req->end = NULL; - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); down_read(&fc->sbput_sem); if (fc->mounted) - fuse_release_background(req); + fuse_release_background(fc, req); up_read(&fc->sbput_sem); + + /* fput must go outside sbput_sem, otherwise it can deadlock */ + if (req->file) + fput(req->file); + if (end) end(fc, req); else @@ -242,6 +227,9 @@ static void background_request(struct fuse_conn *fc, struct fuse_req *req) { req->background = 1; list_add(&req->bg_entry, &fc->background); + fc->num_background++; + if (fc->num_background == FUSE_MAX_BACKGROUND) + fc->blocked = 1; if (req->inode) req->inode = igrab(req->inode); if (req->inode2) @@ -250,16 +238,16 @@ static void background_request(struct fuse_conn *fc, struct fuse_req *req) get_file(req->file); } -/* Called with fuse_lock held. Releases, and then reacquires it. */ +/* Called with fc->lock held. Releases, and then reacquires it. */ static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req) { sigset_t oldset; - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); block_sigs(&oldset); wait_event_interruptible(req->waitq, req->state == FUSE_REQ_FINISHED); restore_sigs(&oldset); - spin_lock(&fuse_lock); + spin_lock(&fc->lock); if (req->state == FUSE_REQ_FINISHED && !req->interrupted) return; @@ -273,9 +261,9 @@ static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req) locked state, there mustn't be any filesystem operation (e.g. page fault), since that could lead to deadlock */ - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); wait_event(req->waitq, !req->locked); - spin_lock(&fuse_lock); + spin_lock(&fc->lock); } if (req->state == FUSE_REQ_PENDING) { list_del(&req->list); @@ -304,19 +292,14 @@ static void queue_request(struct fuse_conn *fc, struct fuse_req *req) req->in.h.unique = fc->reqctr; req->in.h.len = sizeof(struct fuse_in_header) + len_args(req->in.numargs, (struct fuse_arg *) req->in.args); - if (!req->preallocated) { - /* If request is not preallocated (either FORGET or - RELEASE), then still decrease outstanding_sem, so - user can't open infinite number of files while not - processing the RELEASE requests. 
However for - efficiency do it without blocking, so if down() - would block, just increase the debt instead */ - if (down_trylock(&fc->outstanding_sem)) - fc->outstanding_debt++; - } list_add_tail(&req->list, &fc->pending); req->state = FUSE_REQ_PENDING; + if (!req->waiting) { + req->waiting = 1; + atomic_inc(&fc->num_waiting); + } wake_up(&fc->waitq); + kill_fasync(&fc->fasync, SIGIO, POLL_IN); } /* @@ -325,7 +308,7 @@ static void queue_request(struct fuse_conn *fc, struct fuse_req *req) void request_send(struct fuse_conn *fc, struct fuse_req *req) { req->isreply = 1; - spin_lock(&fuse_lock); + spin_lock(&fc->lock); if (!fc->connected) req->out.h.error = -ENOTCONN; else if (fc->conn_error) @@ -338,15 +321,16 @@ void request_send(struct fuse_conn *fc, struct fuse_req *req) request_wait_answer(fc, req); } - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); } static void request_send_nowait(struct fuse_conn *fc, struct fuse_req *req) { - spin_lock(&fuse_lock); + spin_lock(&fc->lock); + background_request(fc, req); if (fc->connected) { queue_request(fc, req); - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); } else { req->out.h.error = -ENOTCONN; request_end(fc, req); @@ -362,9 +346,6 @@ void request_send_noreply(struct fuse_conn *fc, struct fuse_req *req) void request_send_background(struct fuse_conn *fc, struct fuse_req *req) { req->isreply = 1; - spin_lock(&fuse_lock); - background_request(fc, req); - spin_unlock(&fuse_lock); request_send_nowait(fc, req); } @@ -373,16 +354,16 @@ void request_send_background(struct fuse_conn *fc, struct fuse_req *req) * anything that could cause a page-fault. If the request was already * interrupted bail out. */ -static int lock_request(struct fuse_req *req) +static int lock_request(struct fuse_conn *fc, struct fuse_req *req) { int err = 0; if (req) { - spin_lock(&fuse_lock); + spin_lock(&fc->lock); if (req->interrupted) err = -ENOENT; else req->locked = 1; - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); } return err; } @@ -392,18 +373,19 @@ static int lock_request(struct fuse_req *req) * requester thread is currently waiting for it to be unlocked, so * wake it up. 
*/ -static void unlock_request(struct fuse_req *req) +static void unlock_request(struct fuse_conn *fc, struct fuse_req *req) { if (req) { - spin_lock(&fuse_lock); + spin_lock(&fc->lock); req->locked = 0; if (req->interrupted) wake_up(&req->waitq); - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); } } struct fuse_copy_state { + struct fuse_conn *fc; int write; struct fuse_req *req; const struct iovec *iov; @@ -416,11 +398,12 @@ struct fuse_copy_state { unsigned len; }; -static void fuse_copy_init(struct fuse_copy_state *cs, int write, - struct fuse_req *req, const struct iovec *iov, - unsigned long nr_segs) +static void fuse_copy_init(struct fuse_copy_state *cs, struct fuse_conn *fc, + int write, struct fuse_req *req, + const struct iovec *iov, unsigned long nr_segs) { memset(cs, 0, sizeof(*cs)); + cs->fc = fc; cs->write = write; cs->req = req; cs->iov = iov; @@ -450,7 +433,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) unsigned long offset; int err; - unlock_request(cs->req); + unlock_request(cs->fc, cs->req); fuse_copy_finish(cs); if (!cs->seglen) { BUG_ON(!cs->nr_segs); @@ -473,7 +456,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) cs->seglen -= cs->len; cs->addr += cs->len; - return lock_request(cs->req); + return lock_request(cs->fc, cs->req); } /* Do as much copy to/from userspace buffer as we can */ @@ -585,9 +568,9 @@ static void request_wait(struct fuse_conn *fc) if (signal_pending(current)) break; - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); schedule(); - spin_lock(&fuse_lock); + spin_lock(&fc->lock); } set_current_state(TASK_RUNNING); remove_wait_queue(&fc->waitq, &wait); @@ -606,18 +589,21 @@ static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov, unsigned long nr_segs, loff_t *off) { int err; - struct fuse_conn *fc; struct fuse_req *req; struct fuse_in *in; struct fuse_copy_state cs; unsigned reqsize; + struct fuse_conn *fc = fuse_get_conn(file); + if (!fc) + return -EPERM; restart: - spin_lock(&fuse_lock); - fc = file->private_data; - err = -EPERM; - if (!fc) + spin_lock(&fc->lock); + err = -EAGAIN; + if ((file->f_flags & O_NONBLOCK) && fc->connected && + list_empty(&fc->pending)) goto err_unlock; + request_wait(fc); err = -ENODEV; if (!fc->connected) @@ -641,14 +627,14 @@ static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov, request_end(fc, req); goto restart; } - spin_unlock(&fuse_lock); - fuse_copy_init(&cs, 1, req, iov, nr_segs); + spin_unlock(&fc->lock); + fuse_copy_init(&cs, fc, 1, req, iov, nr_segs); err = fuse_copy_one(&cs, &in->h, sizeof(in->h)); if (!err) err = fuse_copy_args(&cs, in->numargs, in->argpages, (struct fuse_arg *) in->args, 0); fuse_copy_finish(&cs); - spin_lock(&fuse_lock); + spin_lock(&fc->lock); req->locked = 0; if (!err && req->interrupted) err = -ENOENT; @@ -663,12 +649,12 @@ static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov, else { req->state = FUSE_REQ_SENT; list_move_tail(&req->list, &fc->processing); - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); } return reqsize; err_unlock: - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); return err; } @@ -735,9 +721,9 @@ static ssize_t fuse_dev_writev(struct file *file, const struct iovec *iov, struct fuse_copy_state cs; struct fuse_conn *fc = fuse_get_conn(file); if (!fc) - return -ENODEV; + return -EPERM; - fuse_copy_init(&cs, 0, NULL, iov, nr_segs); + fuse_copy_init(&cs, fc, 0, NULL, iov, nr_segs); if (nbytes < sizeof(struct fuse_out_header)) return -EINVAL; @@ -749,7 +735,7 @@ static ssize_t 
fuse_dev_writev(struct file *file, const struct iovec *iov, oh.len != nbytes) goto err_finish; - spin_lock(&fuse_lock); + spin_lock(&fc->lock); err = -ENOENT; if (!fc->connected) goto err_unlock; @@ -760,9 +746,9 @@ static ssize_t fuse_dev_writev(struct file *file, const struct iovec *iov, goto err_unlock; if (req->interrupted) { - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); fuse_copy_finish(&cs); - spin_lock(&fuse_lock); + spin_lock(&fc->lock); request_end(fc, req); return -ENOENT; } @@ -770,12 +756,12 @@ static ssize_t fuse_dev_writev(struct file *file, const struct iovec *iov, req->out.h = oh; req->locked = 1; cs.req = req; - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); err = copy_out_args(&cs, &req->out, nbytes); fuse_copy_finish(&cs); - spin_lock(&fuse_lock); + spin_lock(&fc->lock); req->locked = 0; if (!err) { if (req->interrupted) @@ -787,7 +773,7 @@ static ssize_t fuse_dev_writev(struct file *file, const struct iovec *iov, return err ? err : nbytes; err_unlock: - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); err_finish: fuse_copy_finish(&cs); return err; @@ -804,18 +790,19 @@ static ssize_t fuse_dev_write(struct file *file, const char __user *buf, static unsigned fuse_dev_poll(struct file *file, poll_table *wait) { - struct fuse_conn *fc = fuse_get_conn(file); unsigned mask = POLLOUT | POLLWRNORM; - + struct fuse_conn *fc = fuse_get_conn(file); if (!fc) - return -ENODEV; + return POLLERR; poll_wait(file, &fc->waitq, wait); - spin_lock(&fuse_lock); - if (!list_empty(&fc->pending)) - mask |= POLLIN | POLLRDNORM; - spin_unlock(&fuse_lock); + spin_lock(&fc->lock); + if (!fc->connected) + mask = POLLERR; + else if (!list_empty(&fc->pending)) + mask |= POLLIN | POLLRDNORM; + spin_unlock(&fc->lock); return mask; } @@ -823,7 +810,7 @@ static unsigned fuse_dev_poll(struct file *file, poll_table *wait) /* * Abort all requests on the given list (pending or processing) * - * This function releases and reacquires fuse_lock + * This function releases and reacquires fc->lock */ static void end_requests(struct fuse_conn *fc, struct list_head *head) { @@ -832,7 +819,7 @@ static void end_requests(struct fuse_conn *fc, struct list_head *head) req = list_entry(head->next, struct fuse_req, list); req->out.h.error = -ECONNABORTED; request_end(fc, req); - spin_lock(&fuse_lock); + spin_lock(&fc->lock); } } @@ -863,10 +850,10 @@ static void end_io_requests(struct fuse_conn *fc) req->end = NULL; /* The end function will consume this reference */ __fuse_get_request(req); - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); wait_event(req->waitq, !req->locked); end(fc, req); - spin_lock(&fuse_lock); + spin_lock(&fc->lock); } } } @@ -893,35 +880,44 @@ static void end_io_requests(struct fuse_conn *fc) */ void fuse_abort_conn(struct fuse_conn *fc) { - spin_lock(&fuse_lock); + spin_lock(&fc->lock); if (fc->connected) { fc->connected = 0; end_io_requests(fc); end_requests(fc, &fc->pending); end_requests(fc, &fc->processing); wake_up_all(&fc->waitq); + kill_fasync(&fc->fasync, SIGIO, POLL_IN); } - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); } static int fuse_dev_release(struct inode *inode, struct file *file) { - struct fuse_conn *fc; - - spin_lock(&fuse_lock); - fc = file->private_data; + struct fuse_conn *fc = fuse_get_conn(file); if (fc) { + spin_lock(&fc->lock); fc->connected = 0; end_requests(fc, &fc->pending); end_requests(fc, &fc->processing); - } - spin_unlock(&fuse_lock); - if (fc) + spin_unlock(&fc->lock); + fasync_helper(-1, file, 0, &fc->fasync); kobject_put(&fc->kobj); + } 
return 0; } +static int fuse_dev_fasync(int fd, struct file *file, int on) +{ + struct fuse_conn *fc = fuse_get_conn(file); + if (!fc) + return -EPERM; + + /* No locking - fasync_helper does its own locking */ + return fasync_helper(fd, file, on, &fc->fasync); +} + const struct file_operations fuse_dev_operations = { .owner = THIS_MODULE, .llseek = no_llseek, @@ -931,6 +927,7 @@ const struct file_operations fuse_dev_operations = { .writev = fuse_dev_writev, .poll = fuse_dev_poll, .release = fuse_dev_release, + .fasync = fuse_dev_fasync, }; static struct miscdevice fuse_miscdevice = { diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 256355b8025..8d7546e832e 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -117,8 +117,8 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd) return 0; fc = get_fuse_conn(inode); - req = fuse_get_request(fc); - if (!req) + req = fuse_get_req(fc); + if (IS_ERR(req)) return 0; fuse_lookup_init(req, entry->d_parent->d_inode, entry, &outarg); @@ -188,9 +188,9 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, if (entry->d_name.len > FUSE_NAME_MAX) return ERR_PTR(-ENAMETOOLONG); - req = fuse_get_request(fc); - if (!req) - return ERR_PTR(-EINTR); + req = fuse_get_req(fc); + if (IS_ERR(req)) + return ERR_PTR(PTR_ERR(req)); fuse_lookup_init(req, dir, entry, &outarg); request_send(fc, req); @@ -244,15 +244,14 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode, struct file *file; int flags = nd->intent.open.flags - 1; - err = -ENOSYS; if (fc->no_create) - goto out; + return -ENOSYS; - err = -EINTR; - req = fuse_get_request(fc); - if (!req) - goto out; + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); + err = -ENOMEM; ff = fuse_file_alloc(); if (!ff) goto out_put_request; @@ -314,7 +313,6 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode, fuse_file_free(ff); out_put_request: fuse_put_request(fc, req); - out: return err; } @@ -375,9 +373,9 @@ static int fuse_mknod(struct inode *dir, struct dentry *entry, int mode, { struct fuse_mknod_in inarg; struct fuse_conn *fc = get_fuse_conn(dir); - struct fuse_req *req = fuse_get_request(fc); - if (!req) - return -EINTR; + struct fuse_req *req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); memset(&inarg, 0, sizeof(inarg)); inarg.mode = mode; @@ -407,9 +405,9 @@ static int fuse_mkdir(struct inode *dir, struct dentry *entry, int mode) { struct fuse_mkdir_in inarg; struct fuse_conn *fc = get_fuse_conn(dir); - struct fuse_req *req = fuse_get_request(fc); - if (!req) - return -EINTR; + struct fuse_req *req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); memset(&inarg, 0, sizeof(inarg)); inarg.mode = mode; @@ -427,9 +425,9 @@ static int fuse_symlink(struct inode *dir, struct dentry *entry, { struct fuse_conn *fc = get_fuse_conn(dir); unsigned len = strlen(link) + 1; - struct fuse_req *req = fuse_get_request(fc); - if (!req) - return -EINTR; + struct fuse_req *req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); req->in.h.opcode = FUSE_SYMLINK; req->in.numargs = 2; @@ -444,9 +442,9 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry) { int err; struct fuse_conn *fc = get_fuse_conn(dir); - struct fuse_req *req = fuse_get_request(fc); - if (!req) - return -EINTR; + struct fuse_req *req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); req->in.h.opcode = FUSE_UNLINK; req->in.h.nodeid = get_node_id(dir); @@ -476,9 +474,9 @@ static int 
fuse_rmdir(struct inode *dir, struct dentry *entry) { int err; struct fuse_conn *fc = get_fuse_conn(dir); - struct fuse_req *req = fuse_get_request(fc); - if (!req) - return -EINTR; + struct fuse_req *req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); req->in.h.opcode = FUSE_RMDIR; req->in.h.nodeid = get_node_id(dir); @@ -504,9 +502,9 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent, int err; struct fuse_rename_in inarg; struct fuse_conn *fc = get_fuse_conn(olddir); - struct fuse_req *req = fuse_get_request(fc); - if (!req) - return -EINTR; + struct fuse_req *req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); memset(&inarg, 0, sizeof(inarg)); inarg.newdir = get_node_id(newdir); @@ -553,9 +551,9 @@ static int fuse_link(struct dentry *entry, struct inode *newdir, struct fuse_link_in inarg; struct inode *inode = entry->d_inode; struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_req *req = fuse_get_request(fc); - if (!req) - return -EINTR; + struct fuse_req *req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); memset(&inarg, 0, sizeof(inarg)); inarg.oldnodeid = get_node_id(inode); @@ -583,9 +581,9 @@ int fuse_do_getattr(struct inode *inode) int err; struct fuse_attr_out arg; struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_req *req = fuse_get_request(fc); - if (!req) - return -EINTR; + struct fuse_req *req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); req->in.h.opcode = FUSE_GETATTR; req->in.h.nodeid = get_node_id(inode); @@ -673,9 +671,9 @@ static int fuse_access(struct inode *inode, int mask) if (fc->no_access) return 0; - req = fuse_get_request(fc); - if (!req) - return -EINTR; + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); memset(&inarg, 0, sizeof(inarg)); inarg.mask = mask; @@ -780,9 +778,9 @@ static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir) if (is_bad_inode(inode)) return -EIO; - req = fuse_get_request(fc); - if (!req) - return -EINTR; + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); page = alloc_page(GFP_KERNEL); if (!page) { @@ -809,11 +807,11 @@ static char *read_link(struct dentry *dentry) { struct inode *inode = dentry->d_inode; struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_req *req = fuse_get_request(fc); + struct fuse_req *req = fuse_get_req(fc); char *link; - if (!req) - return ERR_PTR(-EINTR); + if (IS_ERR(req)) + return ERR_PTR(PTR_ERR(req)); link = (char *) __get_free_page(GFP_KERNEL); if (!link) { @@ -933,9 +931,9 @@ static int fuse_setattr(struct dentry *entry, struct iattr *attr) } } - req = fuse_get_request(fc); - if (!req) - return -EINTR; + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); memset(&inarg, 0, sizeof(inarg)); iattr_to_fattr(attr, &inarg); @@ -995,9 +993,9 @@ static int fuse_setxattr(struct dentry *entry, const char *name, if (fc->no_setxattr) return -EOPNOTSUPP; - req = fuse_get_request(fc); - if (!req) - return -EINTR; + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); memset(&inarg, 0, sizeof(inarg)); inarg.size = size; @@ -1035,9 +1033,9 @@ static ssize_t fuse_getxattr(struct dentry *entry, const char *name, if (fc->no_getxattr) return -EOPNOTSUPP; - req = fuse_get_request(fc); - if (!req) - return -EINTR; + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); memset(&inarg, 0, sizeof(inarg)); inarg.size = size; @@ -1085,9 +1083,9 @@ static ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size) if (fc->no_listxattr) 
return -EOPNOTSUPP; - req = fuse_get_request(fc); - if (!req) - return -EINTR; + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); memset(&inarg, 0, sizeof(inarg)); inarg.size = size; @@ -1131,9 +1129,9 @@ static int fuse_removexattr(struct dentry *entry, const char *name) if (fc->no_removexattr) return -EOPNOTSUPP; - req = fuse_get_request(fc); - if (!req) - return -EINTR; + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); req->in.h.opcode = FUSE_REMOVEXATTR; req->in.h.nodeid = get_node_id(inode); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 975f2697e86..fc342cf7c2c 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1,6 +1,6 @@ /* FUSE: Filesystem in Userspace - Copyright (C) 2001-2005 Miklos Szeredi <miklos@szeredi.hu> + Copyright (C) 2001-2006 Miklos Szeredi <miklos@szeredi.hu> This program can be distributed under the terms of the GNU GPL. See the file COPYING. @@ -22,9 +22,9 @@ static int fuse_send_open(struct inode *inode, struct file *file, int isdir, struct fuse_req *req; int err; - req = fuse_get_request(fc); - if (!req) - return -EINTR; + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); memset(&inarg, 0, sizeof(inarg)); inarg.flags = file->f_flags & ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); @@ -184,9 +184,9 @@ static int fuse_flush(struct file *file) if (fc->no_flush) return 0; - req = fuse_get_request(fc); - if (!req) - return -EINTR; + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); memset(&inarg, 0, sizeof(inarg)); inarg.fh = ff->fh; @@ -223,9 +223,9 @@ int fuse_fsync_common(struct file *file, struct dentry *de, int datasync, if ((!isdir && fc->no_fsync) || (isdir && fc->no_fsyncdir)) return 0; - req = fuse_get_request(fc); - if (!req) - return -EINTR; + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); memset(&inarg, 0, sizeof(inarg)); inarg.fh = ff->fh; @@ -297,9 +297,9 @@ static int fuse_readpage(struct file *file, struct page *page) if (is_bad_inode(inode)) goto out; - err = -EINTR; - req = fuse_get_request(fc); - if (!req) + req = fuse_get_req(fc); + err = PTR_ERR(req); + if (IS_ERR(req)) goto out; req->out.page_zeroing = 1; @@ -368,10 +368,10 @@ static int fuse_readpages_fill(void *_data, struct page *page) (req->num_pages + 1) * PAGE_CACHE_SIZE > fc->max_read || req->pages[req->num_pages - 1]->index + 1 != page->index)) { fuse_send_readpages(req, data->file, inode); - data->req = req = fuse_get_request(fc); - if (!req) { + data->req = req = fuse_get_req(fc); + if (IS_ERR(req)) { unlock_page(page); - return -EINTR; + return PTR_ERR(req); } } req->pages[req->num_pages] = page; @@ -392,13 +392,17 @@ static int fuse_readpages(struct file *file, struct address_space *mapping, data.file = file; data.inode = inode; - data.req = fuse_get_request(fc); - if (!data.req) - return -EINTR; + data.req = fuse_get_req(fc); + if (IS_ERR(data.req)) + return PTR_ERR(data.req); err = read_cache_pages(mapping, pages, fuse_readpages_fill, &data); - if (!err) - fuse_send_readpages(data.req, file, inode); + if (!err) { + if (data.req->num_pages) + fuse_send_readpages(data.req, file, inode); + else + fuse_put_request(fc, data.req); + } return err; } @@ -451,9 +455,9 @@ static int fuse_commit_write(struct file *file, struct page *page, if (is_bad_inode(inode)) return -EIO; - req = fuse_get_request(fc); - if (!req) - return -EINTR; + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); req->num_pages = 1; req->pages[0] = page; @@ -528,9 +532,9 @@ static ssize_t fuse_direct_io(struct file *file, 
const char __user *buf, if (is_bad_inode(inode)) return -EIO; - req = fuse_get_request(fc); - if (!req) - return -EINTR; + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); while (count) { size_t nres; @@ -561,8 +565,12 @@ static ssize_t fuse_direct_io(struct file *file, const char __user *buf, buf += nres; if (nres != nbytes) break; - if (count) - fuse_reset_request(req); + if (count) { + fuse_put_request(fc, req); + req = fuse_get_req(fc); + if (IS_ERR(req)) + break; + } } fuse_put_request(fc, req); if (res > 0) { diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index a16a04fcf41..0474202cb5d 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -1,6 +1,6 @@ /* FUSE: Filesystem in Userspace - Copyright (C) 2001-2005 Miklos Szeredi <miklos@szeredi.hu> + Copyright (C) 2001-2006 Miklos Szeredi <miklos@szeredi.hu> This program can be distributed under the terms of the GNU GPL. See the file COPYING. @@ -18,8 +18,8 @@ /** Max number of pages that can be used in a single read request */ #define FUSE_MAX_PAGES_PER_REQ 32 -/** If more requests are outstanding, then the operation will block */ -#define FUSE_MAX_OUTSTANDING 10 +/** Maximum number of outstanding background requests */ +#define FUSE_MAX_BACKGROUND 10 /** It could be as large as PATH_MAX, but would that have any uses? */ #define FUSE_NAME_MAX 1024 @@ -131,8 +131,8 @@ struct fuse_conn; * A request to the client */ struct fuse_req { - /** This can be on either unused_list, pending processing or - io lists in fuse_conn */ + /** This can be on either pending processing or io lists in + fuse_conn */ struct list_head list; /** Entry on the background list */ @@ -144,15 +144,12 @@ struct fuse_req { /* * The following bitfields are either set once before the * request is queued or setting/clearing them is protected by - * fuse_lock + * fuse_conn->lock */ /** True if the request has reply */ unsigned isreply:1; - /** The request is preallocated */ - unsigned preallocated:1; - /** The request was interrupted */ unsigned interrupted:1; @@ -162,6 +159,9 @@ struct fuse_req { /** Data is being copied to/from the request */ unsigned locked:1; + /** Request is counted as "waiting" */ + unsigned waiting:1; + /** State of the request */ enum fuse_req_state state; @@ -213,6 +213,9 @@ struct fuse_req { * unmounted. */ struct fuse_conn { + /** Lock protecting accesses to members of this structure */ + spinlock_t lock; + /** The user id for this mount */ uid_t user_id; @@ -244,19 +247,20 @@ struct fuse_conn { interrupted request) */ struct list_head background; - /** Controls the maximum number of outstanding requests */ - struct semaphore outstanding_sem; + /** Number of requests currently in the background */ + unsigned num_background; - /** This counts the number of outstanding requests if - outstanding_sem would go negative */ - unsigned outstanding_debt; + /** Flag indicating if connection is blocked. 
This will be + the case before the INIT reply is received, and if there + are too many outstanding background requests */ + int blocked; + + /** waitq for blocked connection */ + wait_queue_head_t blocked_waitq; /** RW semaphore for exclusion with fuse_put_super() */ struct rw_semaphore sbput_sem; - /** The list of unused requests */ - struct list_head unused_list; - /** The next unique request id */ u64 reqctr; @@ -318,6 +322,9 @@ struct fuse_conn { /** kobject */ struct kobject kobj; + + /** O_ASYNC requests */ + struct fasync_struct *fasync; }; static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb) @@ -349,21 +356,6 @@ static inline u64 get_node_id(struct inode *inode) extern const struct file_operations fuse_dev_operations; /** - * This is the single global spinlock which protects FUSE's structures - * - * The following data is protected by this lock: - * - * - the private_data field of the device file - * - the s_fs_info field of the super block - * - unused_list, pending, processing lists in fuse_conn - * - background list in fuse_conn - * - the unique request ID counter reqctr in fuse_conn - * - the sb (super_block) field in fuse_conn - * - the file (device file) field in fuse_conn - */ -extern spinlock_t fuse_lock; - -/** * Get a filled in inode */ struct inode *fuse_iget(struct super_block *sb, unsigned long nodeid, @@ -461,11 +453,11 @@ void fuse_reset_request(struct fuse_req *req); /** * Reserve a preallocated request */ -struct fuse_req *fuse_get_request(struct fuse_conn *fc); +struct fuse_req *fuse_get_req(struct fuse_conn *fc); /** - * Decrement reference count of a request. If count goes to zero put - * on unused list (preallocated) or free request (not preallocated). + * Decrement reference count of a request. If count goes to zero free + * the request. */ void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req); @@ -487,7 +479,7 @@ void request_send_background(struct fuse_conn *fc, struct fuse_req *req); /** * Release inodes and file associated with background request */ -void fuse_release_background(struct fuse_req *req); +void fuse_release_background(struct fuse_conn *fc, struct fuse_req *req); /* Abort all requests */ void fuse_abort_conn(struct fuse_conn *fc); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 879e6fba948..7627022446b 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1,6 +1,6 @@ /* FUSE: Filesystem in Userspace - Copyright (C) 2001-2005 Miklos Szeredi <miklos@szeredi.hu> + Copyright (C) 2001-2006 Miklos Szeredi <miklos@szeredi.hu> This program can be distributed under the terms of the GNU GPL. See the file COPYING. 
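The blocked flag and blocked_waitq introduced above replace the old outstanding_sem scheme: the connection starts out blocked, the INIT reply unblocks it, and reaching FUSE_MAX_BACKGROUND re-blocks it. A pthread analogy of that gating pattern (a sketch only, not the FUSE code; all names invented):

#include <pthread.h>

#define MAX_BACKGROUND	10	/* mirrors FUSE_MAX_BACKGROUND */

struct conn {
	pthread_mutex_t lock;
	pthread_cond_t blocked_waitq;
	int blocked;		/* starts at 1 until the "INIT reply" */
	unsigned num_background;
};

static struct conn fc = {
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, 1, 0
};

static void init_reply(void)	/* unblock all waiters at once */
{
	pthread_mutex_lock(&fc.lock);
	fc.blocked = 0;
	pthread_cond_broadcast(&fc.blocked_waitq);
	pthread_mutex_unlock(&fc.lock);
}

static void wait_unblocked(void)	/* new requests sleep here */
{
	pthread_mutex_lock(&fc.lock);
	while (fc.blocked)
		pthread_cond_wait(&fc.blocked_waitq, &fc.lock);
	pthread_mutex_unlock(&fc.lock);
}

static void background_begin(void)	/* re-block at the limit */
{
	pthread_mutex_lock(&fc.lock);
	if (++fc.num_background == MAX_BACKGROUND)
		fc.blocked = 1;
	pthread_mutex_unlock(&fc.lock);
}

static void background_end(void)
{
	pthread_mutex_lock(&fc.lock);
	if (fc.num_background-- == MAX_BACKGROUND) {
		fc.blocked = 0;
		pthread_cond_broadcast(&fc.blocked_waitq);
	}
	pthread_mutex_unlock(&fc.lock);
}

int main(void)
{
	init_reply();
	wait_unblocked();
	background_begin();
	background_end();
	return 0;
}

The win over a semaphore preloaded with N tokens is that the limit only applies to background requests, and nothing needs a preallocated request pool.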
@@ -22,7 +22,6 @@ MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>"); MODULE_DESCRIPTION("Filesystem in Userspace"); MODULE_LICENSE("GPL"); -spinlock_t fuse_lock; static kmem_cache_t *fuse_inode_cachep; static struct subsystem connections_subsys; @@ -207,15 +206,17 @@ static void fuse_put_super(struct super_block *sb) down_write(&fc->sbput_sem); while (!list_empty(&fc->background)) - fuse_release_background(list_entry(fc->background.next, + fuse_release_background(fc, + list_entry(fc->background.next, struct fuse_req, bg_entry)); - spin_lock(&fuse_lock); + spin_lock(&fc->lock); fc->mounted = 0; fc->connected = 0; - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); up_write(&fc->sbput_sem); /* Flush all readers on this fs */ + kill_fasync(&fc->fasync, SIGIO, POLL_IN); wake_up_all(&fc->waitq); kobject_del(&fc->kobj); kobject_put(&fc->kobj); @@ -242,9 +243,9 @@ static int fuse_statfs(struct super_block *sb, struct kstatfs *buf) struct fuse_statfs_out outarg; int err; - req = fuse_get_request(fc); - if (!req) - return -EINTR; + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); memset(&outarg, 0, sizeof(outarg)); req->in.numargs = 0; @@ -369,15 +370,7 @@ static int fuse_show_options(struct seq_file *m, struct vfsmount *mnt) static void fuse_conn_release(struct kobject *kobj) { - struct fuse_conn *fc = get_fuse_conn_kobj(kobj); - - while (!list_empty(&fc->unused_list)) { - struct fuse_req *req; - req = list_entry(fc->unused_list.next, struct fuse_req, list); - list_del(&req->list); - fuse_request_free(req); - } - kfree(fc); + kfree(get_fuse_conn_kobj(kobj)); } static struct fuse_conn *new_conn(void) @@ -386,64 +379,25 @@ static struct fuse_conn *new_conn(void) fc = kzalloc(sizeof(*fc), GFP_KERNEL); if (fc) { - int i; + spin_lock_init(&fc->lock); init_waitqueue_head(&fc->waitq); + init_waitqueue_head(&fc->blocked_waitq); INIT_LIST_HEAD(&fc->pending); INIT_LIST_HEAD(&fc->processing); INIT_LIST_HEAD(&fc->io); - INIT_LIST_HEAD(&fc->unused_list); INIT_LIST_HEAD(&fc->background); - sema_init(&fc->outstanding_sem, 1); /* One for INIT */ init_rwsem(&fc->sbput_sem); kobj_set_kset_s(fc, connections_subsys); kobject_init(&fc->kobj); atomic_set(&fc->num_waiting, 0); - for (i = 0; i < FUSE_MAX_OUTSTANDING; i++) { - struct fuse_req *req = fuse_request_alloc(); - if (!req) { - kobject_put(&fc->kobj); - return NULL; - } - list_add(&req->list, &fc->unused_list); - } fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; fc->bdi.unplug_io_fn = default_unplug_io_fn; fc->reqctr = 0; + fc->blocked = 1; } return fc; } -static struct fuse_conn *get_conn(struct file *file, struct super_block *sb) -{ - struct fuse_conn *fc; - int err; - - err = -EINVAL; - if (file->f_op != &fuse_dev_operations) - goto out_err; - - err = -ENOMEM; - fc = new_conn(); - if (!fc) - goto out_err; - - spin_lock(&fuse_lock); - err = -EINVAL; - if (file->private_data) - goto out_unlock; - - kobject_get(&fc->kobj); - file->private_data = fc; - spin_unlock(&fuse_lock); - return fc; - - out_unlock: - spin_unlock(&fuse_lock); - kobject_put(&fc->kobj); - out_err: - return ERR_PTR(err); -} - static struct inode *get_root_inode(struct super_block *sb, unsigned mode) { struct fuse_attr attr; @@ -467,7 +421,6 @@ static struct super_operations fuse_super_operations = { static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) { - int i; struct fuse_init_out *arg = &req->misc.init_out; if (req->out.h.error || arg->major != FUSE_KERNEL_VERSION) @@ -486,22 +439,13 @@ static void process_init_reply(struct fuse_conn 
*fc, struct fuse_req *req) fc->minor = arg->minor; fc->max_write = arg->minor < 5 ? 4096 : arg->max_write; } - - /* After INIT reply is received other requests can go - out. So do (FUSE_MAX_OUTSTANDING - 1) number of - up()s on outstanding_sem. The last up() is done in - fuse_putback_request() */ - for (i = 1; i < FUSE_MAX_OUTSTANDING; i++) - up(&fc->outstanding_sem); - fuse_put_request(fc, req); + fc->blocked = 0; + wake_up_all(&fc->blocked_waitq); } -static void fuse_send_init(struct fuse_conn *fc) +static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) { - /* This is called from fuse_read_super() so there's guaranteed - to be exactly one request available */ - struct fuse_req *req = fuse_get_request(fc); struct fuse_init_in *arg = &req->misc.init_in; arg->major = FUSE_KERNEL_VERSION; @@ -525,12 +469,9 @@ static void fuse_send_init(struct fuse_conn *fc) static unsigned long long conn_id(void) { + /* BKL is held for ->get_sb() */ static unsigned long long ctr = 1; - unsigned long long val; - spin_lock(&fuse_lock); - val = ctr++; - spin_unlock(&fuse_lock); - return val; + return ctr++; } static int fuse_fill_super(struct super_block *sb, void *data, int silent) @@ -540,6 +481,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) struct fuse_mount_data d; struct file *file; struct dentry *root_dentry; + struct fuse_req *init_req; int err; if (!parse_fuse_opt((char *) data, &d)) @@ -555,10 +497,12 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) if (!file) return -EINVAL; - fc = get_conn(file, sb); - fput(file); - if (IS_ERR(fc)) - return PTR_ERR(fc); + if (file->f_op != &fuse_dev_operations) + return -EINVAL; + + fc = new_conn(); + if (!fc) + return -ENOMEM; fc->flags = d.flags; fc->user_id = d.user_id; @@ -579,27 +523,48 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) goto err; } + init_req = fuse_request_alloc(); + if (!init_req) + goto err_put_root; + err = kobject_set_name(&fc->kobj, "%llu", conn_id()); if (err) - goto err_put_root; + goto err_free_req; err = kobject_add(&fc->kobj); if (err) - goto err_put_root; + goto err_free_req; + + /* Setting file->private_data can't race with other mount() + instances, since BKL is held for ->get_sb() */ + err = -EINVAL; + if (file->private_data) + goto err_kobject_del; sb->s_root = root_dentry; - spin_lock(&fuse_lock); fc->mounted = 1; fc->connected = 1; - spin_unlock(&fuse_lock); + kobject_get(&fc->kobj); + file->private_data = fc; + /* + * atomic_dec_and_test() in fput() provides the necessary + * memory barrier for file->private_data to be visible on all + * CPUs after this + */ + fput(file); - fuse_send_init(fc); + fuse_send_init(fc, init_req); return 0; + err_kobject_del: + kobject_del(&fc->kobj); + err_free_req: + fuse_request_free(init_req); err_put_root: dput(root_dentry); err: + fput(file); kobject_put(&fc->kobj); return err; } @@ -753,7 +718,6 @@ static int __init fuse_init(void) printk("fuse init (API version %i.%i)\n", FUSE_KERNEL_VERSION, FUSE_KERNEL_MINOR_VERSION); - spin_lock_init(&fuse_lock); res = fuse_fs_init(); if (res) goto err; diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c index 8f07e8fbd03..746abc9ecf7 100644 --- a/fs/hfsplus/bnode.c +++ b/fs/hfsplus/bnode.c @@ -466,8 +466,7 @@ void hfs_bnode_unhash(struct hfs_bnode *node) for (p = &node->tree->node_hash[hfs_bnode_hash(node->this)]; *p && *p != node; p = &(*p)->next_hash) ; - if (!*p) - BUG(); + BUG_ON(!*p); *p = node->next_hash; node->tree->node_hash_cnt--; } @@ 
-622,8 +621,7 @@ void hfs_bnode_put(struct hfs_bnode *node) dprint(DBG_BNODE_REFS, "put_node(%d:%d): %d\n", node->tree->cnid, node->this, atomic_read(&node->refcnt)); - if (!atomic_read(&node->refcnt)) - BUG(); + BUG_ON(!atomic_read(&node->refcnt)); if (!atomic_dec_and_lock(&node->refcnt, &tree->hash_lock)) return; for (i = 0; i < tree->pages_per_bnode; i++) { diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c index a67edfa34e9..effa8991999 100644 --- a/fs/hfsplus/btree.c +++ b/fs/hfsplus/btree.c @@ -269,8 +269,7 @@ void hfs_bmap_free(struct hfs_bnode *node) u8 *data, byte, m; dprint(DBG_BNODE_MOD, "btree_free_node: %u\n", node->this); - if (!node->this) - BUG(); + BUG_ON(!node->this); tree = node->tree; nidx = node->this; node = hfs_bnode_find(tree, 0); diff --git a/fs/inode.c b/fs/inode.c index 32b7c337502..3a2446a27d2 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -172,8 +172,7 @@ static struct inode *alloc_inode(struct super_block *sb) void destroy_inode(struct inode *inode) { - if (inode_has_buffers(inode)) - BUG(); + BUG_ON(inode_has_buffers(inode)); security_inode_free(inode); if (inode->i_sb->s_op->destroy_inode) inode->i_sb->s_op->destroy_inode(inode); @@ -249,12 +248,9 @@ void clear_inode(struct inode *inode) might_sleep(); invalidate_inode_buffers(inode); - if (inode->i_data.nrpages) - BUG(); - if (!(inode->i_state & I_FREEING)) - BUG(); - if (inode->i_state & I_CLEAR) - BUG(); + BUG_ON(inode->i_data.nrpages); + BUG_ON(!(inode->i_state & I_FREEING)); + BUG_ON(inode->i_state & I_CLEAR); wait_on_inode(inode); DQUOT_DROP(inode); if (inode->i_sb && inode->i_sb->s_op->clear_inode) @@ -1054,8 +1050,7 @@ void generic_delete_inode(struct inode *inode) hlist_del_init(&inode->i_hash); spin_unlock(&inode_lock); wake_up_inode(inode); - if (inode->i_state != I_CLEAR) - BUG(); + BUG_ON(inode->i_state != I_CLEAR); destroy_inode(inode); } diff --git a/fs/inotify.c b/fs/inotify.c index 367c487c014..732ec4bd577 100644 --- a/fs/inotify.c +++ b/fs/inotify.c @@ -538,7 +538,7 @@ void inotify_d_instantiate(struct dentry *entry, struct inode *inode) WARN_ON(entry->d_flags & DCACHE_INOTIFY_PARENT_WATCHED); spin_lock(&entry->d_lock); parent = entry->d_parent; - if (inotify_inode_watched(parent->d_inode)) + if (parent->d_inode && inotify_inode_watched(parent->d_inode)) entry->d_flags |= DCACHE_INOTIFY_PARENT_WATCHED; spin_unlock(&entry->d_lock); } @@ -848,7 +848,11 @@ static int inotify_release(struct inode *ignored, struct file *file) inode = watch->inode; mutex_lock(&inode->inotify_mutex); mutex_lock(&dev->mutex); - remove_watch_no_event(watch, dev); + + /* make sure we didn't race with another list removal */ + if (likely(idr_find(&dev->idr, watch->wd))) + remove_watch_no_event(watch, dev); + mutex_unlock(&dev->mutex); mutex_unlock(&inode->inotify_mutex); put_inotify_watch(watch); @@ -890,8 +894,7 @@ static int inotify_ignore(struct inotify_device *dev, s32 wd) mutex_lock(&dev->mutex); /* make sure that we did not race */ - watch = idr_find(&dev->idr, wd); - if (likely(watch)) + if (likely(idr_find(&dev->idr, wd) == watch)) remove_watch(watch, dev); mutex_unlock(&dev->mutex); diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c index 7b77a954112..ff2a872e80e 100644 --- a/fs/jffs2/background.c +++ b/fs/jffs2/background.c @@ -35,8 +35,7 @@ int jffs2_start_garbage_collect_thread(struct jffs2_sb_info *c) pid_t pid; int ret = 0; - if (c->gc_task) - BUG(); + BUG_ON(c->gc_task); init_completion(&c->gc_thread_start); init_completion(&c->gc_thread_exit); diff --git a/fs/jffs2/nodelist.c 
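The hfsplus, inode.c and jffs2 hunks above are pure cleanups: the open-coded `if (!x) BUG();` becomes `BUG_ON(!x)`, which reads as an assertion. A userspace stand-in showing the equivalence (the real BUG() oopses the kernel, and the real macro also wraps the condition in unlikely()):

#include <stdio.h>
#include <stdlib.h>

#define BUG()		do { fprintf(stderr, "BUG at %s:%d\n", \
				__FILE__, __LINE__); abort(); } while (0)
#define BUG_ON(cond)	do { if (cond) BUG(); } while (0)

int main(void)
{
	void *p = malloc(16);

	/* old spelling:            equivalent new spelling: */
	if (!p)
		BUG();
	BUG_ON(!p);

	free(p);
	return 0;
}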
b/fs/jffs2/nodelist.c index d4d0c41490c..1d46677afd1 100644 --- a/fs/jffs2/nodelist.c +++ b/fs/jffs2/nodelist.c @@ -438,7 +438,8 @@ static int check_node_data(struct jffs2_sb_info *c, struct jffs2_tmp_dnode_info if (c->mtd->point) { err = c->mtd->point(c->mtd, ofs, len, &retlen, &buffer); if (!err && retlen < tn->csize) { - JFFS2_WARNING("MTD point returned len too short: %u instead of %u.\n", retlen, tn->csize); + JFFS2_WARNING("MTD point returned len too short: %zu " + "instead of %u.\n", retlen, tn->csize); c->mtd->unpoint(c->mtd, buffer, ofs, len); } else if (err) JFFS2_WARNING("MTD point failed: error code %d.\n", err); @@ -461,7 +462,8 @@ static int check_node_data(struct jffs2_sb_info *c, struct jffs2_tmp_dnode_info } if (retlen != len) { - JFFS2_ERROR("short read at %#08x: %d instead of %d.\n", ofs, retlen, len); + JFFS2_ERROR("short read at %#08x: %zd instead of %d.\n", + ofs, retlen, len); err = -EIO; goto free_out; } diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c index f28696f235c..2b220dd6b4e 100644 --- a/fs/jfs/jfs_metapage.c +++ b/fs/jfs/jfs_metapage.c @@ -542,7 +542,7 @@ add_failed: static int metapage_releasepage(struct page *page, gfp_t gfp_mask) { struct metapage *mp; - int busy = 0; + int ret = 1; unsigned int offset; for (offset = 0; offset < PAGE_CACHE_SIZE; offset += PSIZE) { @@ -552,30 +552,20 @@ static int metapage_releasepage(struct page *page, gfp_t gfp_mask) continue; jfs_info("metapage_releasepage: mp = 0x%p", mp); - if (mp->count || mp->nohomeok) { + if (mp->count || mp->nohomeok || + test_bit(META_dirty, &mp->flag)) { jfs_info("count = %ld, nohomeok = %d", mp->count, mp->nohomeok); - busy = 1; + ret = 0; continue; } - wait_on_page_writeback(page); - //WARN_ON(test_bit(META_dirty, &mp->flag)); - if (test_bit(META_dirty, &mp->flag)) { - dump_mem("dirty mp in metapage_releasepage", mp, - sizeof(struct metapage)); - dump_mem("page", page, sizeof(struct page)); - dump_stack(); - } if (mp->lsn) remove_from_logsync(mp); remove_metapage(page, mp); INCREMENT(mpStat.pagefree); free_metapage(mp); } - if (busy) - return -1; - - return 0; + return ret; } static void metapage_invalidatepage(struct page *page, unsigned long offset) diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index d2b66bad7d5..3ef739120df 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -650,7 +650,7 @@ static void nlmsvc_grant_callback(struct rpc_task *task, void *data) svc_wake_up(block->b_daemon); } -void nlmsvc_grant_release(void *data) +static void nlmsvc_grant_release(void *data) { struct nlm_rqst *call = data; diff --git a/fs/locks.c b/fs/locks.c index dda83d6cd48..ab61a8b5482 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -446,15 +446,14 @@ static struct lock_manager_operations lease_manager_ops = { */ static int lease_init(struct file *filp, int type, struct file_lock *fl) { + if (assign_type(fl, type) != 0) + return -EINVAL; + fl->fl_owner = current->files; fl->fl_pid = current->tgid; fl->fl_file = filp; fl->fl_flags = FL_LEASE; - if (assign_type(fl, type) != 0) { - locks_free_lock(fl); - return -EINVAL; - } fl->fl_start = 0; fl->fl_end = OFFSET_MAX; fl->fl_ops = NULL; @@ -466,16 +465,19 @@ static int lease_init(struct file *filp, int type, struct file_lock *fl) static int lease_alloc(struct file *filp, int type, struct file_lock **flp) { struct file_lock *fl = locks_alloc_lock(); - int error; + int error = -ENOMEM; if (fl == NULL) - return -ENOMEM; + goto out; error = lease_init(filp, type, fl); - if (error) - return error; + if (error) { + locks_free_lock(fl); + fl = 
NULL; + } +out: *flp = fl; - return 0; + return error; } /* Check if two locks overlap each other. @@ -753,6 +755,7 @@ static int flock_lock_file(struct file *filp, struct file_lock *request) if (request->fl_type == F_UNLCK) goto out; + error = -ENOMEM; new_fl = locks_alloc_lock(); if (new_fl == NULL) goto out; @@ -779,6 +782,7 @@ static int flock_lock_file(struct file *filp, struct file_lock *request) locks_copy_lock(new_fl, request); locks_insert_lock(&inode->i_flock, new_fl); new_fl = NULL; + error = 0; out: unlock_kernel(); @@ -1372,6 +1376,7 @@ static int __setlease(struct file *filp, long arg, struct file_lock **flp) goto out; if (my_before != NULL) { + *flp = *my_before; error = lease->fl_lmops->fl_change(my_before, arg); goto out; } @@ -2230,7 +2235,12 @@ void steal_locks(fl_owner_t from) lock_kernel(); j = 0; - rcu_read_lock(); + + /* + * We are not taking a ref to the file structures, so + * we need to acquire ->file_lock. + */ + spin_lock(&files->file_lock); fdt = files_fdtable(files); for (;;) { unsigned long set; @@ -2248,7 +2258,7 @@ void steal_locks(fl_owner_t from) set >>= 1; } } - rcu_read_unlock(); + spin_unlock(&files->file_lock); unlock_kernel(); } EXPORT_SYMBOL(steal_locks); diff --git a/fs/namei.c b/fs/namei.c index 96723ae83c8..d6e2ee25173 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1080,8 +1080,8 @@ static int fastcall do_path_lookup(int dfd, const char *name, nd->flags = flags; nd->depth = 0; - read_lock(¤t->fs->lock); if (*name=='/') { + read_lock(¤t->fs->lock); if (current->fs->altroot && !(nd->flags & LOOKUP_NOALT)) { nd->mnt = mntget(current->fs->altrootmnt); nd->dentry = dget(current->fs->altroot); @@ -1092,33 +1092,35 @@ static int fastcall do_path_lookup(int dfd, const char *name, } nd->mnt = mntget(current->fs->rootmnt); nd->dentry = dget(current->fs->root); + read_unlock(¤t->fs->lock); } else if (dfd == AT_FDCWD) { + read_lock(¤t->fs->lock); nd->mnt = mntget(current->fs->pwdmnt); nd->dentry = dget(current->fs->pwd); + read_unlock(¤t->fs->lock); } else { struct dentry *dentry; file = fget_light(dfd, &fput_needed); retval = -EBADF; if (!file) - goto unlock_fail; + goto out_fail; dentry = file->f_dentry; retval = -ENOTDIR; if (!S_ISDIR(dentry->d_inode->i_mode)) - goto fput_unlock_fail; + goto fput_fail; retval = file_permission(file, MAY_EXEC); if (retval) - goto fput_unlock_fail; + goto fput_fail; nd->mnt = mntget(file->f_vfsmnt); nd->dentry = dget(dentry); fput_light(file, fput_needed); } - read_unlock(¤t->fs->lock); current->total_link_count = 0; retval = link_path_walk(name, nd); out: @@ -1127,13 +1129,12 @@ out: nd->dentry->d_inode)) audit_inode(name, nd->dentry->d_inode, flags); } +out_fail: return retval; -fput_unlock_fail: +fput_fail: fput_light(file, fput_needed); -unlock_fail: - read_unlock(¤t->fs->lock); - return retval; + goto out_fail; } int fastcall path_lookup(const char *name, unsigned int flags, diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index a23f3489416..cae74dd4c7f 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -128,15 +128,14 @@ struct inode_operations nfs4_dir_inode_operations = { static int nfs_opendir(struct inode *inode, struct file *filp) { - int res = 0; + int res; dfprintk(VFS, "NFS: opendir(%s/%ld)\n", inode->i_sb->s_id, inode->i_ino); lock_kernel(); /* Call generic open code in order to cache credentials */ - if (!res) - res = nfs_open(inode, filp); + res = nfs_open(inode, filp); unlock_kernel(); return res; } diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 0f583cb16dd..3c72b0c0728 100644 --- a/fs/nfs/direct.c +++ 
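The namei.c hunks above shrink the scope of current->fs->lock to the branches that actually read the shared root/pwd fields, instead of holding it across fget_light() and the error paths. The shape of that fix, sketched with a pthread rwlock and hypothetical names:

#include <pthread.h>

struct fs_struct {
	pthread_rwlock_t lock;
	void *pwdmnt, *pwd;	/* shared; changed by chdir() etc. */
};

/* Take fs->lock only around the snapshot of the shared fields;
 * everything afterwards (permission checks, path walk) runs unlocked. */
static void snapshot_cwd(struct fs_struct *fs, void **mnt, void **dentry)
{
	pthread_rwlock_rdlock(&fs->lock);
	*mnt = fs->pwdmnt;	/* the real code takes refs here */
	*dentry = fs->pwd;
	pthread_rwlock_unlock(&fs->lock);
}

int main(void)
{
	struct fs_struct fs = { PTHREAD_RWLOCK_INITIALIZER, 0, 0 };
	void *mnt, *dentry;

	snapshot_cwd(&fs, &mnt, &dentry);
	return 0;
}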
b/fs/nfs/direct.c @@ -112,10 +112,9 @@ static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode */ ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t pos, unsigned long nr_segs) { - struct dentry *dentry = iocb->ki_filp->f_dentry; - dprintk("NFS: nfs_direct_IO (%s) off/no(%Ld/%lu) EINVAL\n", - dentry->d_name.name, (long long) pos, nr_segs); + iocb->ki_filp->f_dentry->d_name.name, + (long long) pos, nr_segs); return -EINVAL; } @@ -468,7 +467,6 @@ static const struct rpc_call_ops nfs_commit_direct_ops = { static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq) { struct nfs_write_data *data = dreq->commit_data; - struct rpc_task *task = &data->task; data->inode = dreq->inode; data->cred = dreq->ctx->cred; @@ -489,7 +487,7 @@ static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq) /* Note: task.tk_ops->rpc_release will free dreq->commit_data */ dreq->commit_data = NULL; - dprintk("NFS: %5u initiated commit call\n", task->tk_pid); + dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); lock_kernel(); rpc_execute(&data->task); diff --git a/fs/nfs/file.c b/fs/nfs/file.c index f1df2c8d925..fade02c15e6 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -534,10 +534,9 @@ static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl) */ static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl) { - struct inode * inode = filp->f_mapping->host; - dprintk("NFS: nfs_flock(f=%s/%ld, t=%x, fl=%x)\n", - inode->i_sb->s_id, inode->i_ino, + filp->f_dentry->d_inode->i_sb->s_id, + filp->f_dentry->d_inode->i_ino, fl->fl_type, fl->fl_flags); /* diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 2f7656b911b..d0b991a9232 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -700,12 +700,9 @@ static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt) /* * Display superblock I/O counters */ - for (cpu = 0; cpu < NR_CPUS; cpu++) { + for_each_possible_cpu(cpu) { struct nfs_iostats *stats; - if (!cpu_possible(cpu)) - continue; - preempt_disable(); stats = per_cpu_ptr(nfss->io_stats, cpu); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 47ece1dd3c6..d86c0db7b1e 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1218,7 +1218,7 @@ out: return status; } -static void nfs4_intent_set_file(struct nameidata *nd, struct dentry *dentry, struct nfs4_state *state) +static int nfs4_intent_set_file(struct nameidata *nd, struct dentry *dentry, struct nfs4_state *state) { struct file *filp; @@ -1227,8 +1227,10 @@ static void nfs4_intent_set_file(struct nameidata *nd, struct dentry *dentry, st struct nfs_open_context *ctx; ctx = (struct nfs_open_context *)filp->private_data; ctx->state = state; - } else - nfs4_close_state(state, nd->intent.open.flags); + return 0; + } + nfs4_close_state(state, nd->intent.open.flags); + return PTR_ERR(filp); } struct dentry * @@ -1835,7 +1837,7 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, nfs_setattr_update_inode(state->inode, sattr); } if (status == 0 && nd != NULL && (nd->flags & LOOKUP_OPEN)) - nfs4_intent_set_file(nd, dentry, state); + status = nfs4_intent_set_file(nd, dentry, state); else nfs4_close_state(state, flags); out: diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c index cfe9ce88161..6e92b0fe532 100644 --- a/fs/nfsd/auth.c +++ b/fs/nfsd/auth.c @@ -14,46 +14,46 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) { - struct svc_cred *cred = &rqstp->rq_cred; + struct svc_cred cred = rqstp->rq_cred; int i; int ret; if 
(exp->ex_flags & NFSEXP_ALLSQUASH) { - cred->cr_uid = exp->ex_anon_uid; - cred->cr_gid = exp->ex_anon_gid; - put_group_info(cred->cr_group_info); - cred->cr_group_info = groups_alloc(0); + cred.cr_uid = exp->ex_anon_uid; + cred.cr_gid = exp->ex_anon_gid; + cred.cr_group_info = groups_alloc(0); } else if (exp->ex_flags & NFSEXP_ROOTSQUASH) { struct group_info *gi; - if (!cred->cr_uid) - cred->cr_uid = exp->ex_anon_uid; - if (!cred->cr_gid) - cred->cr_gid = exp->ex_anon_gid; - gi = groups_alloc(cred->cr_group_info->ngroups); + if (!cred.cr_uid) + cred.cr_uid = exp->ex_anon_uid; + if (!cred.cr_gid) + cred.cr_gid = exp->ex_anon_gid; + gi = groups_alloc(cred.cr_group_info->ngroups); if (gi) - for (i = 0; i < cred->cr_group_info->ngroups; i++) { - if (!GROUP_AT(cred->cr_group_info, i)) + for (i = 0; i < cred.cr_group_info->ngroups; i++) { + if (!GROUP_AT(cred.cr_group_info, i)) GROUP_AT(gi, i) = exp->ex_anon_gid; else - GROUP_AT(gi, i) = GROUP_AT(cred->cr_group_info, i); + GROUP_AT(gi, i) = GROUP_AT(cred.cr_group_info, i); } - put_group_info(cred->cr_group_info); - cred->cr_group_info = gi; - } + cred.cr_group_info = gi; + } else + get_group_info(cred.cr_group_info); - if (cred->cr_uid != (uid_t) -1) - current->fsuid = cred->cr_uid; + if (cred.cr_uid != (uid_t) -1) + current->fsuid = cred.cr_uid; else current->fsuid = exp->ex_anon_uid; - if (cred->cr_gid != (gid_t) -1) - current->fsgid = cred->cr_gid; + if (cred.cr_gid != (gid_t) -1) + current->fsgid = cred.cr_gid; else current->fsgid = exp->ex_anon_gid; - if (!cred->cr_group_info) + if (!cred.cr_group_info) return -ENOMEM; - ret = set_current_groups(cred->cr_group_info); - if ((cred->cr_uid)) { + ret = set_current_groups(cred.cr_group_info); + put_group_info(cred.cr_group_info); + if ((cred.cr_uid)) { cap_t(current->cap_effective) &= ~CAP_NFSD_MASK; } else { cap_t(current->cap_effective) |= (CAP_NFSD_MASK & diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index c340be0a3f5..3eec30000f3 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -422,7 +422,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) if ((len=qword_get(&mesg, buf, PAGE_SIZE)) <= 0) goto out; err = path_lookup(buf, 0, &nd); - if (err) goto out; + if (err) goto out_no_path; exp.h.flags = 0; exp.ex_client = dom; @@ -475,6 +475,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) out: if (nd.dentry) path_release(&nd); + out_no_path: if (dom) auth_domain_put(dom); kfree(buf); @@ -1065,9 +1066,11 @@ exp_pseudoroot(struct auth_domain *clp, struct svc_fh *fhp, rv = nfserr_perm; else if (IS_ERR(exp)) rv = nfserrno(PTR_ERR(exp)); - else + else { rv = fh_compose(fhp, exp, fsid_key->ek_dentry, NULL); + exp_put(exp); + } cache_put(&fsid_key->h, &svc_expkey_cache); return rv; } diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 6d2dfed1de0..f61142afea4 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -682,7 +682,7 @@ static struct svc_procedure nfsd_procedures3[22] = { PROC(lookup, dirop, dirop, fhandle2, RC_NOCACHE, ST+FH+pAT+pAT), PROC(access, access, access, fhandle, RC_NOCACHE, ST+pAT+1), PROC(readlink, readlink, readlink, fhandle, RC_NOCACHE, ST+pAT+1+NFS3_MAXPATHLEN/4), - PROC(read, read, read, fhandle, RC_NOCACHE, ST+pAT+4+NFSSVC_MAXBLKSIZE), + PROC(read, read, read, fhandle, RC_NOCACHE, ST+pAT+4+NFSSVC_MAXBLKSIZE/4), PROC(write, write, write, fhandle, RC_REPLBUFF, ST+WC+4), PROC(create, create, create, fhandle2, RC_REPLBUFF, ST+(1+FH+pAT)+WC), PROC(mkdir, mkdir, create, fhandle2, RC_REPLBUFF, 
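The NFSSVC_MAXBLKSIZE/4 corrections in the procedure tables just above are unit fixes: the reply-size estimates in these svc_procedure tables are counted in 32-bit XDR words, while NFSSVC_MAXBLKSIZE is a byte count, so quoting it unscaled overstated the needed reply space fourfold. The arithmetic, assuming the historical 32KB value (assumption, check the era's nfsd headers):

#include <stdio.h>

#define NFSSVC_MAXBLKSIZE	(32 * 1024)	/* bytes; value assumed */

int main(void)
{
	/* reply sizes are estimated in 32-bit XDR words */
	printf("%d bytes of READ payload = %d reply words\n",
	       NFSSVC_MAXBLKSIZE, NFSSVC_MAXBLKSIZE / 4);
	return 0;
}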
ST+(1+FH+pAT)+WC), diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c index 7391f4aabed..edb107e61b9 100644 --- a/fs/nfsd/nfs4acl.c +++ b/fs/nfsd/nfs4acl.c @@ -710,9 +710,9 @@ calculate_posix_ace_count(struct nfs4_acl *n4acl) /* Also, the remaining entries are for named users and * groups, and come in threes (mask, allow, deny): */ if (n4acl->naces < 7) - return -1; + return -EINVAL; if ((n4acl->naces - 7) % 3) - return -1; + return -EINVAL; return 4 + (n4acl->naces - 7)/3; } } @@ -790,7 +790,7 @@ nfs4_acl_split(struct nfs4_acl *acl, struct nfs4_acl *dacl) continue; error = nfs4_acl_add_ace(dacl, ace->type, ace->flag, - ace->access_mask, ace->whotype, ace->who) == -1; + ace->access_mask, ace->whotype, ace->who); if (error < 0) goto out; @@ -866,7 +866,7 @@ nfs4_acl_add_ace(struct nfs4_acl *acl, u32 type, u32 flag, u32 access_mask, struct nfs4_ace *ace; if ((ace = kmalloc(sizeof(*ace), GFP_KERNEL)) == NULL) - return -1; + return -ENOMEM; ace->type = type; ace->flag = flag; diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index c872bd07fc1..dbaf3f93f32 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -441,8 +441,9 @@ nfsd4_probe_callback(struct nfs4_client *clp) goto out_clnt; } - /* the task holds a reference to the nfs4_client struct */ cb->cb_client = clnt; + + /* the task holds a reference to the nfs4_client struct */ atomic_inc(&clp->cl_count); msg.rpc_cred = nfsd4_lookupcred(clp,0); @@ -460,13 +461,12 @@ nfsd4_probe_callback(struct nfs4_client *clp) out_rpciod: atomic_dec(&clp->cl_count); rpciod_down(); + cb->cb_client = NULL; out_clnt: rpc_shutdown_client(clnt); - goto out_err; out_err: dprintk("NFSD: warning: no callback path to client %.*s\n", (int)clp->cl_name.len, clp->cl_name.data); - cb->cb_client = NULL; } static void diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 6d63f1d9e5f..b0e095ea0c0 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -288,8 +288,6 @@ nfsd4_putrootfh(struct svc_rqst *rqstp, struct svc_fh *current_fh) fh_put(current_fh); status = exp_pseudoroot(rqstp->rq_client, current_fh, &rqstp->rq_chandle); - if (!status) - status = nfserrno(nfsd_setuser(rqstp, current_fh->fh_export)); return status; } @@ -975,7 +973,7 @@ struct nfsd4_voidargs { int dummy; }; */ static struct svc_procedure nfsd_procedures4[2] = { PROC(null, void, void, void, RC_NOCACHE, 1), - PROC(compound, compound, compound, compound, RC_NOCACHE, NFSD_BUFSIZE) + PROC(compound, compound, compound, compound, RC_NOCACHE, NFSD_BUFSIZE/4) }; struct svc_version nfsd_version4 = { diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 47ec112b266..96c7578cbe1 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -147,6 +147,42 @@ get_nfs4_file(struct nfs4_file *fi) kref_get(&fi->fi_ref); } +static int num_delegations; + +/* + * Open owner state (share locks) + */ + +/* hash tables for nfs4_stateowner */ +#define OWNER_HASH_BITS 8 +#define OWNER_HASH_SIZE (1 << OWNER_HASH_BITS) +#define OWNER_HASH_MASK (OWNER_HASH_SIZE - 1) + +#define ownerid_hashval(id) \ + ((id) & OWNER_HASH_MASK) +#define ownerstr_hashval(clientid, ownername) \ + (((clientid) + opaque_hashval((ownername.data), (ownername.len))) & OWNER_HASH_MASK) + +static struct list_head ownerid_hashtbl[OWNER_HASH_SIZE]; +static struct list_head ownerstr_hashtbl[OWNER_HASH_SIZE]; + +/* hash table for nfs4_file */ +#define FILE_HASH_BITS 8 +#define FILE_HASH_SIZE (1 << FILE_HASH_BITS) +#define FILE_HASH_MASK (FILE_HASH_SIZE - 1) +/* hash table for (open)nfs4_stateid */ +#define 
STATEID_HASH_BITS 10 +#define STATEID_HASH_SIZE (1 << STATEID_HASH_BITS) +#define STATEID_HASH_MASK (STATEID_HASH_SIZE - 1) + +#define file_hashval(x) \ + hash_ptr(x, FILE_HASH_BITS) +#define stateid_hashval(owner_id, file_id) \ + (((owner_id) + (file_id)) & STATEID_HASH_MASK) + +static struct list_head file_hashtbl[FILE_HASH_SIZE]; +static struct list_head stateid_hashtbl[STATEID_HASH_SIZE]; + static struct nfs4_delegation * alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_fh *current_fh, u32 type) { @@ -155,9 +191,12 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f struct nfs4_callback *cb = &stp->st_stateowner->so_client->cl_callback; dprintk("NFSD alloc_init_deleg\n"); + if (num_delegations > STATEID_HASH_SIZE * 4) + return NULL; dp = kmem_cache_alloc(deleg_slab, GFP_KERNEL); if (dp == NULL) return dp; + num_delegations++; INIT_LIST_HEAD(&dp->dl_perfile); INIT_LIST_HEAD(&dp->dl_perclnt); INIT_LIST_HEAD(&dp->dl_recall_lru); @@ -192,6 +231,7 @@ nfs4_put_delegation(struct nfs4_delegation *dp) dprintk("NFSD: freeing dp %p\n",dp); put_nfs4_file(dp->dl_file); kmem_cache_free(deleg_slab, dp); + num_delegations--; } } @@ -330,22 +370,29 @@ put_nfs4_client(struct nfs4_client *clp) } static void +shutdown_callback_client(struct nfs4_client *clp) +{ + struct rpc_clnt *clnt = clp->cl_callback.cb_client; + + /* shutdown rpc client, ending any outstanding recall rpcs */ + if (clnt) { + clp->cl_callback.cb_client = NULL; + rpc_shutdown_client(clnt); + rpciod_down(); + } +} + +static void expire_client(struct nfs4_client *clp) { struct nfs4_stateowner *sop; struct nfs4_delegation *dp; - struct nfs4_callback *cb = &clp->cl_callback; - struct rpc_clnt *clnt = clp->cl_callback.cb_client; struct list_head reaplist; dprintk("NFSD: expire_client cl_count %d\n", atomic_read(&clp->cl_count)); - /* shutdown rpc client, ending any outstanding recall rpcs */ - if (atomic_read(&cb->cb_set) == 1 && clnt) { - rpc_shutdown_client(clnt); - clnt = clp->cl_callback.cb_client = NULL; - } + shutdown_callback_client(clp); INIT_LIST_HEAD(&reaplist); spin_lock(&recall_lock); @@ -936,40 +983,6 @@ out: return status; } -/* - * Open owner state (share locks) - */ - -/* hash tables for nfs4_stateowner */ -#define OWNER_HASH_BITS 8 -#define OWNER_HASH_SIZE (1 << OWNER_HASH_BITS) -#define OWNER_HASH_MASK (OWNER_HASH_SIZE - 1) - -#define ownerid_hashval(id) \ - ((id) & OWNER_HASH_MASK) -#define ownerstr_hashval(clientid, ownername) \ - (((clientid) + opaque_hashval((ownername.data), (ownername.len))) & OWNER_HASH_MASK) - -static struct list_head ownerid_hashtbl[OWNER_HASH_SIZE]; -static struct list_head ownerstr_hashtbl[OWNER_HASH_SIZE]; - -/* hash table for nfs4_file */ -#define FILE_HASH_BITS 8 -#define FILE_HASH_SIZE (1 << FILE_HASH_BITS) -#define FILE_HASH_MASK (FILE_HASH_SIZE - 1) -/* hash table for (open)nfs4_stateid */ -#define STATEID_HASH_BITS 10 -#define STATEID_HASH_SIZE (1 << STATEID_HASH_BITS) -#define STATEID_HASH_MASK (STATEID_HASH_SIZE - 1) - -#define file_hashval(x) \ - hash_ptr(x, FILE_HASH_BITS) -#define stateid_hashval(owner_id, file_id) \ - (((owner_id) + (file_id)) & STATEID_HASH_MASK) - -static struct list_head file_hashtbl[FILE_HASH_SIZE]; -static struct list_head stateid_hashtbl[STATEID_HASH_SIZE]; - /* OPEN Share state helper functions */ static inline struct nfs4_file * alloc_init_file(struct inode *ino) @@ -1186,8 +1199,7 @@ move_to_close_lru(struct nfs4_stateowner *sop) { dprintk("NFSD: move_to_close_lru nfs4_stateowner %p\n", sop); - 
unhash_stateowner(sop); - list_add_tail(&sop->so_close_lru, &close_lru); + list_move_tail(&sop->so_close_lru, &close_lru); sop->so_time = get_seconds(); } @@ -1916,8 +1928,7 @@ nfs4_laundromat(void) } dprintk("NFSD: purging unused open stateowner (so_id %d)\n", sop->so_id); - list_del(&sop->so_close_lru); - nfs4_put_stateowner(sop); + release_stateowner(sop); } if (clientid_val < NFSD_LAUNDROMAT_MINTIMEOUT) clientid_val = NFSD_LAUNDROMAT_MINTIMEOUT; @@ -2495,36 +2506,27 @@ nfs4_transform_lock_offset(struct file_lock *lock) lock->fl_end = OFFSET_MAX; } -static int -nfs4_verify_lock_stateowner(struct nfs4_stateowner *sop, unsigned int hashval) -{ - struct nfs4_stateowner *local = NULL; - int status = 0; - - if (hashval >= LOCK_HASH_SIZE) - goto out; - list_for_each_entry(local, &lock_ownerid_hashtbl[hashval], so_idhash) { - if (local == sop) { - status = 1; - goto out; - } - } -out: - return status; -} - +/* Hack!: For now, we're defining this just so we can use a pointer to it + * as a unique cookie to identify our (NFSv4's) posix locks. */ +static struct lock_manager_operations nfsd_posix_mng_ops = { +}; static inline void nfs4_set_lock_denied(struct file_lock *fl, struct nfsd4_lock_denied *deny) { - struct nfs4_stateowner *sop = (struct nfs4_stateowner *) fl->fl_owner; - unsigned int hval = lockownerid_hashval(sop->so_id); + struct nfs4_stateowner *sop; + unsigned int hval; - deny->ld_sop = NULL; - if (nfs4_verify_lock_stateowner(sop, hval)) { + if (fl->fl_lmops == &nfsd_posix_mng_ops) { + sop = (struct nfs4_stateowner *) fl->fl_owner; + hval = lockownerid_hashval(sop->so_id); kref_get(&sop->so_ref); deny->ld_sop = sop; deny->ld_clientid = sop->so_client->cl_clientid; + } else { + deny->ld_sop = NULL; + deny->ld_clientid.cl_boot = 0; + deny->ld_clientid.cl_id = 0; } deny->ld_start = fl->fl_start; deny->ld_length = ~(u64)0; @@ -2736,6 +2738,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lock file_lock.fl_pid = current->tgid; file_lock.fl_file = filp; file_lock.fl_flags = FL_POSIX; + file_lock.fl_lmops = &nfsd_posix_mng_ops; file_lock.fl_start = lock->lk_offset; if ((lock->lk_length == ~(u64)0) || @@ -2841,6 +2844,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lock file_lock.fl_owner = (fl_owner_t)lockt->lt_stateowner; file_lock.fl_pid = current->tgid; file_lock.fl_flags = FL_POSIX; + file_lock.fl_lmops = &nfsd_posix_mng_ops; file_lock.fl_start = lockt->lt_offset; if ((lockt->lt_length == ~(u64)0) || LOFF_OVERFLOW(lockt->lt_offset, lockt->lt_length)) @@ -2900,6 +2904,7 @@ nfsd4_locku(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lock file_lock.fl_pid = current->tgid; file_lock.fl_file = filp; file_lock.fl_flags = FL_POSIX; + file_lock.fl_lmops = &nfsd_posix_mng_ops; file_lock.fl_start = locku->lu_offset; if ((locku->lu_length == ~(u64)0) || LOFF_OVERFLOW(locku->lu_offset, locku->lu_length)) @@ -3211,15 +3216,8 @@ __nfs4_state_shutdown(void) int i; struct nfs4_client *clp = NULL; struct nfs4_delegation *dp = NULL; - struct nfs4_stateowner *sop = NULL; struct list_head *pos, *next, reaplist; - list_for_each_safe(pos, next, &close_lru) { - sop = list_entry(pos, struct nfs4_stateowner, so_close_lru); - list_del(&sop->so_close_lru); - nfs4_put_stateowner(sop); - } - for (i = 0; i < CLIENT_HASH_SIZE; i++) { while (!list_empty(&conf_id_hashtbl[i])) { clp = list_entry(conf_id_hashtbl[i].next, struct nfs4_client, cl_idhash); @@ -3244,8 +3242,6 @@ __nfs4_state_shutdown(void) } cancel_delayed_work(&laundromat_work); - 
flush_workqueue(laundry_wq); - destroy_workqueue(laundry_wq); nfsd4_shutdown_recdir(); nfs4_init = 0; } @@ -3253,6 +3249,8 @@ __nfs4_state_shutdown(void) void nfs4_state_shutdown(void) { + cancel_rearming_delayed_workqueue(laundry_wq, &laundromat_work); + destroy_workqueue(laundry_wq); nfs4_lock_state(); nfs4_release_reclaim(); __nfs4_state_shutdown(); diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 03857fd8112..de3998f15f1 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -299,11 +299,10 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *ia buf, dummy32, &ace.who); if (status) goto out_nfserr; - if (nfs4_acl_add_ace(*acl, ace.type, ace.flag, - ace.access_mask, ace.whotype, ace.who) != 0) { - status = -ENOMEM; + status = nfs4_acl_add_ace(*acl, ace.type, ace.flag, + ace.access_mask, ace.whotype, ace.who); + if (status) goto out_nfserr; - } } } else *acl = NULL; @@ -2085,27 +2084,20 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_read WRITE32(eof); WRITE32(maxcount); ADJUST_ARGS(); - resp->xbuf->head[0].iov_len = ((char*)resp->p) - (char*)resp->xbuf->head[0].iov_base; - + resp->xbuf->head[0].iov_len = (char*)p + - (char*)resp->xbuf->head[0].iov_base; resp->xbuf->page_len = maxcount; - /* read zero bytes -> don't set up tail */ - if(!maxcount) - return 0; - - /* set up page for remaining responses */ - svc_take_page(resp->rqstp); - resp->xbuf->tail[0].iov_base = - page_address(resp->rqstp->rq_respages[resp->rqstp->rq_resused-1]); - resp->rqstp->rq_restailpage = resp->rqstp->rq_resused-1; + /* Use rest of head for padding and remaining ops: */ + resp->rqstp->rq_restailpage = 0; + resp->xbuf->tail[0].iov_base = p; resp->xbuf->tail[0].iov_len = 0; - resp->p = resp->xbuf->tail[0].iov_base; - resp->end = resp->p + PAGE_SIZE/4; - if (maxcount&3) { - *(resp->p)++ = 0; + RESERVE_SPACE(4); + WRITE32(0); resp->xbuf->tail[0].iov_base += maxcount&3; resp->xbuf->tail[0].iov_len = 4 - (maxcount&3); + ADJUST_ARGS(); } return 0; } @@ -2142,21 +2134,20 @@ nfsd4_encode_readlink(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_r WRITE32(maxcount); ADJUST_ARGS(); - resp->xbuf->head[0].iov_len = ((char*)resp->p) - (char*)resp->xbuf->head[0].iov_base; + resp->xbuf->head[0].iov_len = (char*)p + - (char*)resp->xbuf->head[0].iov_base; + resp->xbuf->page_len = maxcount; - svc_take_page(resp->rqstp); - resp->xbuf->tail[0].iov_base = - page_address(resp->rqstp->rq_respages[resp->rqstp->rq_resused-1]); - resp->rqstp->rq_restailpage = resp->rqstp->rq_resused-1; + /* Use rest of head for padding and remaining ops: */ + resp->rqstp->rq_restailpage = 0; + resp->xbuf->tail[0].iov_base = p; resp->xbuf->tail[0].iov_len = 0; - resp->p = resp->xbuf->tail[0].iov_base; - resp->end = resp->p + PAGE_SIZE/4; - - resp->xbuf->page_len = maxcount; if (maxcount&3) { - *(resp->p)++ = 0; + RESERVE_SPACE(4); + WRITE32(0); resp->xbuf->tail[0].iov_base += maxcount&3; resp->xbuf->tail[0].iov_len = 4 - (maxcount&3); + ADJUST_ARGS(); } return 0; } @@ -2166,7 +2157,7 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_re { int maxcount; loff_t offset; - u32 *page, *savep; + u32 *page, *savep, *tailbase; ENCODE_HEAD; if (nfserr) @@ -2182,6 +2173,7 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_re WRITE32(0); ADJUST_ARGS(); resp->xbuf->head[0].iov_len = ((char*)resp->p) - (char*)resp->xbuf->head[0].iov_base; + tailbase = p; maxcount = PAGE_SIZE; if (maxcount > readdir->rd_maxcount) @@ -2226,14 +2218,12 
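The encode_read/readlink/readdir rewrites above stop calling svc_take_page() for a tail that holds at most three pad bytes plus any trailing ops; they point the tail iovec at the unused remainder of the head page instead, and size resp->end to match. The space computation mirrors the last line of the readdir hunk; a toy example (PAGE_SIZE and the consumed head length are assumed):

#include <stdio.h>

#define PAGE_SIZE	4096u

int main(void)
{
	unsigned head_used = 200;	/* bytes of the head page already
					 * consumed by earlier ops (assumed) */

	/* the tail now points into the head page, so the encode limit is */
	unsigned end_words = (PAGE_SIZE - head_used) / 4;
	printf("%u 32-bit words left for padding and remaining ops\n",
	       end_words);
	return 0;
}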
@@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_re *p++ = htonl(readdir->common.err == nfserr_eof); resp->xbuf->page_len = ((char*)p) - (char*)page_address(resp->rqstp->rq_respages[resp->rqstp->rq_resused-1]); - /* allocate a page for the tail */ - svc_take_page(resp->rqstp); - resp->xbuf->tail[0].iov_base = - page_address(resp->rqstp->rq_respages[resp->rqstp->rq_resused-1]); - resp->rqstp->rq_restailpage = resp->rqstp->rq_resused-1; + /* Use rest of head for padding and remaining ops: */ + resp->rqstp->rq_restailpage = 0; + resp->xbuf->tail[0].iov_base = tailbase; resp->xbuf->tail[0].iov_len = 0; resp->p = resp->xbuf->tail[0].iov_base; - resp->end = resp->p + PAGE_SIZE/4; + resp->end = resp->p + (PAGE_SIZE - resp->xbuf->head[0].iov_len)/4; return 0; err_no_verf: diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 3e6b75cd90f..06cd0db0f32 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -553,7 +553,7 @@ static struct svc_procedure nfsd_procedures2[18] = { PROC(none, void, void, none, RC_NOCACHE, ST), PROC(lookup, diropargs, diropres, fhandle, RC_NOCACHE, ST+FH+AT), PROC(readlink, readlinkargs, readlinkres, none, RC_NOCACHE, ST+1+NFS_MAXPATHLEN/4), - PROC(read, readargs, readres, fhandle, RC_NOCACHE, ST+AT+1+NFSSVC_MAXBLKSIZE), + PROC(read, readargs, readres, fhandle, RC_NOCACHE, ST+AT+1+NFSSVC_MAXBLKSIZE/4), PROC(none, void, void, none, RC_NOCACHE, ST), PROC(write, writeargs, attrstat, fhandle, RC_REPLBUFF, ST+AT), PROC(create, createargs, diropres, fhandle, RC_REPLBUFF, ST+FH+AT), diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 31018333dc3..1d65f13f458 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -371,7 +371,6 @@ out_nfserr: static ssize_t nfsd_getxattr(struct dentry *dentry, char *key, void **buf) { ssize_t buflen; - int error; buflen = vfs_getxattr(dentry, key, NULL, 0); if (buflen <= 0) @@ -381,10 +380,7 @@ static ssize_t nfsd_getxattr(struct dentry *dentry, char *key, void **buf) if (!*buf) return -ENOMEM; - error = vfs_getxattr(dentry, key, *buf, buflen); - if (error < 0) - return error; - return buflen; + return vfs_getxattr(dentry, key, *buf, buflen); } #endif @@ -1926,11 +1922,10 @@ nfsd_set_posix_acl(struct svc_fh *fhp, int type, struct posix_acl *acl) value = kmalloc(size, GFP_KERNEL); if (!value) return -ENOMEM; - size = posix_acl_to_xattr(acl, value, size); - if (size < 0) { - error = size; + error = posix_acl_to_xattr(acl, value, size); + if (error < 0) goto getout; - } + size = error; } else size = 0; diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 0d858d0b25b..47152bf9a7f 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -276,13 +276,29 @@ static int ocfs2_writepage(struct page *page, struct writeback_control *wbc) return ret; } +/* This can also be called from ocfs2_write_zero_page() which has done + * its own cluster locking. */ +int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page, + unsigned from, unsigned to) +{ + int ret; + + down_read(&OCFS2_I(inode)->ip_alloc_sem); + + ret = block_prepare_write(page, from, to, ocfs2_get_block); + + up_read(&OCFS2_I(inode)->ip_alloc_sem); + + return ret; +} + /* * ocfs2_prepare_write() can be an outer-most ocfs2 call when it is called * from loopback. It must be able to perform its own locking around * ocfs2_get_block(). 
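The aops.c hunk that continues below splits ocfs2_prepare_write() into a locking wrapper plus ocfs2_prepare_write_nolock(), so that ocfs2_write_zero_page(), which already holds the cluster lock, can call the body directly. The generic locked/_nolock pattern, sketched with a mutex standing in for the cluster lock (names are illustrative, bodies elided):

#include <pthread.h>

static pthread_mutex_t cluster_lock = PTHREAD_MUTEX_INITIALIZER;

/* variant for callers that already hold the lock */
static int prepare_write_nolock(unsigned from, unsigned to)
{
	/* ... the shared body; the lock is assumed held ... */
	return 0;
}

/* outer-most entry point: takes and drops the lock itself */
static int prepare_write(unsigned from, unsigned to)
{
	int ret;

	pthread_mutex_lock(&cluster_lock);
	ret = prepare_write_nolock(from, to);
	pthread_mutex_unlock(&cluster_lock);
	return ret;
}

int main(void)
{
	return prepare_write(0, 4096);
}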
*/ -int ocfs2_prepare_write(struct file *file, struct page *page, - unsigned from, unsigned to) +static int ocfs2_prepare_write(struct file *file, struct page *page, + unsigned from, unsigned to) { struct inode *inode = page->mapping->host; int ret; @@ -295,11 +311,7 @@ int ocfs2_prepare_write(struct file *file, struct page *page, goto out; } - down_read(&OCFS2_I(inode)->ip_alloc_sem); - - ret = block_prepare_write(page, from, to, ocfs2_get_block); - - up_read(&OCFS2_I(inode)->ip_alloc_sem); + ret = ocfs2_prepare_write_nolock(inode, page, from, to); ocfs2_meta_unlock(inode, 0); out: @@ -625,11 +637,31 @@ static ssize_t ocfs2_direct_IO(int rw, int ret; mlog_entry_void(); + + /* + * We get PR data locks even for O_DIRECT. This allows + * concurrent O_DIRECT I/O but doesn't let O_DIRECT with + * extending and buffered zeroing writes race. If they did + * race then the buffered zeroing could be written back after + * the O_DIRECT I/O. It's one thing to tell people not to mix + * buffered and O_DIRECT writes, but expecting them to + * understand that file extension is also an implicit buffered + * write is too much. By getting the PR we force writeback of + * the buffered zeroing before proceeding. + */ + ret = ocfs2_data_lock(inode, 0); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + ocfs2_data_unlock(inode, 0); + ret = blockdev_direct_IO_no_locking(rw, iocb, inode, inode->i_sb->s_bdev, iov, offset, nr_segs, ocfs2_direct_IO_get_blocks, ocfs2_dio_end_io); +out: mlog_exit(ret); return ret; } diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h index d40456d509a..e88c3f0b8fa 100644 --- a/fs/ocfs2/aops.h +++ b/fs/ocfs2/aops.h @@ -22,8 +22,8 @@ #ifndef OCFS2_AOPS_H #define OCFS2_AOPS_H -int ocfs2_prepare_write(struct file *file, struct page *page, - unsigned from, unsigned to); +int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page, + unsigned from, unsigned to); struct ocfs2_journal_handle *ocfs2_start_walk_page_trans(struct inode *inode, struct page *page, diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index bff0f0d0686..21f38accd03 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -153,6 +153,7 @@ struct o2hb_region { struct o2hb_bio_wait_ctxt { atomic_t wc_num_reqs; struct completion wc_io_complete; + int wc_error; }; static void o2hb_write_timeout(void *arg) @@ -186,6 +187,7 @@ static inline void o2hb_bio_wait_init(struct o2hb_bio_wait_ctxt *wc, { atomic_set(&wc->wc_num_reqs, num_ios); init_completion(&wc->wc_io_complete); + wc->wc_error = 0; } /* Used in error paths too */ @@ -218,8 +220,10 @@ static int o2hb_bio_end_io(struct bio *bio, { struct o2hb_bio_wait_ctxt *wc = bio->bi_private; - if (error) + if (error) { mlog(ML_ERROR, "IO Error %d\n", error); + wc->wc_error = error; + } if (bio->bi_size) return 1; @@ -390,6 +394,8 @@ static int o2hb_read_slots(struct o2hb_region *reg, bail_and_wait: o2hb_wait_on_io(reg, &wc); + if (wc.wc_error && !status) + status = wc.wc_error; if (bios) { for(i = 0; i < num_bios; i++) @@ -790,20 +796,24 @@ static int o2hb_highest_node(unsigned long *nodes, return highest; } -static void o2hb_do_disk_heartbeat(struct o2hb_region *reg) +static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) { int i, ret, highest_node, change = 0; unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)]; struct bio *write_bio; struct o2hb_bio_wait_ctxt write_wc; - if (o2nm_configured_node_map(configured_nodes, sizeof(configured_nodes))) - return; + ret = o2nm_configured_node_map(configured_nodes, + 
sizeof(configured_nodes)); + if (ret) { + mlog_errno(ret); + return ret; + } highest_node = o2hb_highest_node(configured_nodes, O2NM_MAX_NODES); if (highest_node >= O2NM_MAX_NODES) { mlog(ML_NOTICE, "ocfs2_heartbeat: no configured nodes found!\n"); - return; + return -EINVAL; } /* No sense in reading the slots of nodes that don't exist @@ -813,7 +823,7 @@ static void o2hb_do_disk_heartbeat(struct o2hb_region *reg) ret = o2hb_read_slots(reg, highest_node + 1); if (ret < 0) { mlog_errno(ret); - return; + return ret; } /* With an up to date view of the slots, we can check that no @@ -831,7 +841,7 @@ static void o2hb_do_disk_heartbeat(struct o2hb_region *reg) ret = o2hb_issue_node_write(reg, &write_bio, &write_wc); if (ret < 0) { mlog_errno(ret); - return; + return ret; } i = -1; @@ -847,6 +857,15 @@ static void o2hb_do_disk_heartbeat(struct o2hb_region *reg) */ o2hb_wait_on_io(reg, &write_wc); bio_put(write_bio); + if (write_wc.wc_error) { + /* Do not re-arm the write timeout on I/O error - we + * can't be sure that the new block ever made it to + * disk */ + mlog(ML_ERROR, "Write error %d on device \"%s\"\n", + write_wc.wc_error, reg->hr_dev_name); + return write_wc.wc_error; + } + o2hb_arm_write_timeout(reg); /* let the person who launched us know when things are steady */ @@ -854,6 +873,8 @@ static void o2hb_do_disk_heartbeat(struct o2hb_region *reg) if (atomic_dec_and_test(®->hr_steady_iterations)) wake_up(&o2hb_steady_queue); } + + return 0; } /* Subtract b from a, storing the result in a. a *must* have a larger @@ -913,7 +934,10 @@ static int o2hb_thread(void *data) * likely to time itself out. */ do_gettimeofday(&before_hb); - o2hb_do_disk_heartbeat(reg); + i = 0; + do { + ret = o2hb_do_disk_heartbeat(reg); + } while (ret && ++i < 2); do_gettimeofday(&after_hb); elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb); diff --git a/fs/ocfs2/dlm/userdlm.c b/fs/ocfs2/dlm/userdlm.c index c3764f4744e..74ca4e5f976 100644 --- a/fs/ocfs2/dlm/userdlm.c +++ b/fs/ocfs2/dlm/userdlm.c @@ -139,6 +139,10 @@ static void user_ast(void *opaque) return; } + mlog_bug_on_msg(lockres->l_requested == LKM_IVMODE, + "Lockres %s, requested ivmode. flags 0x%x\n", + lockres->l_name, lockres->l_flags); + /* we're downconverting. */ if (lockres->l_requested < lockres->l_level) { if (lockres->l_requested <= @@ -229,23 +233,42 @@ static void user_unlock_ast(void *opaque, enum dlm_status status) mlog(0, "UNLOCK AST called on lock %s\n", lockres->l_name); - if (status != DLM_NORMAL) + if (status != DLM_NORMAL && status != DLM_CANCELGRANT) mlog(ML_ERROR, "Dlm returns status %d\n", status); spin_lock(&lockres->l_lock); - if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) + /* The teardown flag gets set early during the unlock process, + * so test the cancel flag to make sure that this ast isn't + * for a concurrent cancel. */ + if (lockres->l_flags & USER_LOCK_IN_TEARDOWN + && !(lockres->l_flags & USER_LOCK_IN_CANCEL)) { lockres->l_level = LKM_IVMODE; - else { + } else if (status == DLM_CANCELGRANT) { + mlog(0, "Lock %s, cancel fails, flags 0x%x\n", + lockres->l_name, lockres->l_flags); + /* We tried to cancel a convert request, but it was + * already granted. Don't clear the busy flag - the + * ast should've done this already. 
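In the heartbeat hunks above, the bio completion handler previously had nowhere to record a failure, so a failed heartbeat write would still re-arm the write timeout. Adding wc_error to o2hb_bio_wait_ctxt carries the errno from completion context back to the waiter, which can then refuse to re-arm and let the caller retry. A condensed userspace sketch of the pattern (one lock instead of atomics; names invented):

#include <pthread.h>
#include <stdio.h>

struct bio_wait_ctxt {
	pthread_mutex_t lock;
	pthread_cond_t done;
	int pending;	/* outstanding requests */
	int error;	/* first errno seen by any completion */
};

static void complete_one(struct bio_wait_ctxt *wc, int err)
{
	pthread_mutex_lock(&wc->lock);
	if (err && !wc->error)
		wc->error = err;	/* remember it for the waiter */
	if (--wc->pending == 0)
		pthread_cond_signal(&wc->done);
	pthread_mutex_unlock(&wc->lock);
}

static int wait_on_io(struct bio_wait_ctxt *wc)
{
	pthread_mutex_lock(&wc->lock);
	while (wc->pending)
		pthread_cond_wait(&wc->done, &wc->lock);
	pthread_mutex_unlock(&wc->lock);
	return wc->error;	/* caller decides whether to re-arm */
}

int main(void)
{
	struct bio_wait_ctxt wc = {
		PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, 1, 0
	};
	complete_one(&wc, -5);	/* an -EIO from the completion path */
	printf("status %d\n", wait_on_io(&wc));
	return 0;
}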
*/ + BUG_ON(!(lockres->l_flags & USER_LOCK_IN_CANCEL)); + lockres->l_flags &= ~USER_LOCK_IN_CANCEL; + goto out_noclear; + } else { + BUG_ON(!(lockres->l_flags & USER_LOCK_IN_CANCEL)); + /* Cancel succeeded, we want to re-queue */ + mlog(0, "Lock %s, cancel succeeds, flags 0x%x\n", + lockres->l_name, lockres->l_flags); lockres->l_requested = LKM_IVMODE; /* cancel an * upconvert * request. */ lockres->l_flags &= ~USER_LOCK_IN_CANCEL; /* we want the unblock thread to look at it again * now. */ - __user_dlm_queue_lockres(lockres); + if (lockres->l_flags & USER_LOCK_BLOCKED) + __user_dlm_queue_lockres(lockres); } lockres->l_flags &= ~USER_LOCK_BUSY; +out_noclear: spin_unlock(&lockres->l_lock); wake_up(&lockres->l_event); @@ -268,13 +291,26 @@ static void user_dlm_unblock_lock(void *opaque) spin_lock(&lockres->l_lock); - BUG_ON(!(lockres->l_flags & USER_LOCK_BLOCKED)); - BUG_ON(!(lockres->l_flags & USER_LOCK_QUEUED)); + mlog_bug_on_msg(!(lockres->l_flags & USER_LOCK_QUEUED), + "Lockres %s, flags 0x%x\n", + lockres->l_name, lockres->l_flags); - /* notice that we don't clear USER_LOCK_BLOCKED here. That's - * for user_ast to do. */ + /* notice that we don't clear USER_LOCK_BLOCKED here. If it's + * set, we want user_ast to clear it. */ lockres->l_flags &= ~USER_LOCK_QUEUED; + /* It's valid to get here and no longer be blocked - if we get + * several basts in a row, we might be queued by the first + * one, the unblock thread might run and clear the queued + * flag, and finally we might get another bast which re-queues + * us before our ast for the downconvert is called. */ + if (!(lockres->l_flags & USER_LOCK_BLOCKED)) { + mlog(0, "Lockres %s, flags 0x%x: queued but not blocking\n", + lockres->l_name, lockres->l_flags); + spin_unlock(&lockres->l_lock); + goto drop_ref; + } + if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) { mlog(0, "lock is in teardown so we do nothing\n"); spin_unlock(&lockres->l_lock); @@ -282,7 +318,9 @@ static void user_dlm_unblock_lock(void *opaque) } if (lockres->l_flags & USER_LOCK_BUSY) { - mlog(0, "BUSY flag detected...\n"); + mlog(0, "Cancel lock %s, flags 0x%x\n", + lockres->l_name, lockres->l_flags); + if (lockres->l_flags & USER_LOCK_IN_CANCEL) { spin_unlock(&lockres->l_lock); goto drop_ref; @@ -296,14 +334,7 @@ static void user_dlm_unblock_lock(void *opaque) LKM_CANCEL, user_unlock_ast, lockres); - if (status == DLM_CANCELGRANT) { - /* If we got this, then the ast was fired - * before we could cancel. We cleanup our - * state, and restart the function. 
*/ - spin_lock(&lockres->l_lock); - lockres->l_flags &= ~USER_LOCK_IN_CANCEL; - spin_unlock(&lockres->l_lock); - } else if (status != DLM_NORMAL) + if (status != DLM_NORMAL) user_log_dlm_error("dlmunlock", status, lockres); goto drop_ref; } @@ -581,6 +612,14 @@ int user_dlm_destroy_lock(struct user_lock_res *lockres) mlog(0, "asked to destroy %s\n", lockres->l_name); spin_lock(&lockres->l_lock); + if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) { + mlog(0, "Lock is already torn down\n"); + spin_unlock(&lockres->l_lock); + return 0; + } + + lockres->l_flags |= USER_LOCK_IN_TEARDOWN; + while (lockres->l_flags & USER_LOCK_BUSY) { spin_unlock(&lockres->l_lock); @@ -606,7 +645,6 @@ int user_dlm_destroy_lock(struct user_lock_res *lockres) lockres->l_flags &= ~USER_LOCK_ATTACHED; lockres->l_flags |= USER_LOCK_BUSY; - lockres->l_flags |= USER_LOCK_IN_TEARDOWN; spin_unlock(&lockres->l_lock); mlog(0, "unlocking lockres %s\n", lockres->l_name); diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index 4601fc256f1..1a5c69071df 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c @@ -569,7 +569,7 @@ static int ocfs2_extent_map_insert(struct inode *inode, ret = -ENOMEM; ctxt.new_ent = kmem_cache_alloc(ocfs2_em_ent_cachep, - GFP_KERNEL); + GFP_NOFS); if (!ctxt.new_ent) { mlog_errno(ret); return ret; @@ -583,14 +583,14 @@ static int ocfs2_extent_map_insert(struct inode *inode, if (ctxt.need_left && !ctxt.left_ent) { ctxt.left_ent = kmem_cache_alloc(ocfs2_em_ent_cachep, - GFP_KERNEL); + GFP_NOFS); if (!ctxt.left_ent) break; } if (ctxt.need_right && !ctxt.right_ent) { ctxt.right_ent = kmem_cache_alloc(ocfs2_em_ent_cachep, - GFP_KERNEL); + GFP_NOFS); if (!ctxt.right_ent) break; } diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 34e903a6a46..a9559c87453 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -260,6 +260,17 @@ static int ocfs2_truncate_file(struct inode *inode, if (new_i_size == le64_to_cpu(fe->i_size)) goto bail; + /* This forces other nodes to sync and drop their pages. Do + * this even if we have a truncate without allocation change - + * ocfs2 cluster sizes can be much greater than page size, so + * we have to truncate them anyway. */ + status = ocfs2_data_lock(inode, 1); + if (status < 0) { + mlog_errno(status); + goto bail; + } + ocfs2_data_unlock(inode, 1); + if (le32_to_cpu(fe->i_clusters) == ocfs2_clusters_for_bytes(osb->sb, new_i_size)) { mlog(0, "fe->i_clusters = %u, so we do a simple truncate\n", @@ -272,14 +283,6 @@ static int ocfs2_truncate_file(struct inode *inode, goto bail; } - /* This forces other nodes to sync and drop their pages */ - status = ocfs2_data_lock(inode, 1); - if (status < 0) { - mlog_errno(status); - goto bail; - } - ocfs2_data_unlock(inode, 1); - /* alright, we're going to need to do a full blown alloc size * change. Orphan the inode so that recovery can complete the * truncate if necessary. This does the task of marking @@ -610,7 +613,8 @@ leave: /* Some parts of this taken from generic_cont_expand, which turned out * to be too fragile to do exactly what we need without us having to - * worry about recursive locking in ->commit_write(). */ + * worry about recursive locking in ->prepare_write() and + * ->commit_write(). 
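The extent_map.c hunks above (and the journal.c and uptodate.c ones further down) switch allocations from GFP_KERNEL to GFP_NOFS for the usual reason: these allocations happen while ocfs2 holds cluster or journal locks, and a GFP_KERNEL allocation may recurse into filesystem writeback to reclaim memory, deadlocking on those same locks. GFP_NOFS is GFP_KERNEL minus permission for that re-entry; a demo of the relationship (the bit values are illustrative, the real ones live in gfp.h):

#include <stdio.h>

#define __GFP_WAIT	0x10u	/* may sleep */
#define __GFP_IO	0x40u	/* may start disk I/O */
#define __GFP_FS	0x80u	/* may call back into the filesystem */

#define GFP_KERNEL	(__GFP_WAIT | __GFP_IO | __GFP_FS)
#define GFP_NOFS	(__GFP_WAIT | __GFP_IO)

int main(void)
{
	printf("GFP_NOFS == GFP_KERNEL without __GFP_FS: %s\n",
	       GFP_NOFS == (GFP_KERNEL & ~__GFP_FS) ? "yes" : "no");
	return 0;
}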
*/ static int ocfs2_write_zero_page(struct inode *inode, u64 size) { @@ -638,7 +642,7 @@ static int ocfs2_write_zero_page(struct inode *inode, goto out; } - ret = ocfs2_prepare_write(NULL, page, offset, offset); + ret = ocfs2_prepare_write_nolock(inode, page, offset, offset); if (ret < 0) { mlog_errno(ret); goto out_unlock; @@ -692,13 +696,26 @@ out: return ret; } +/* + * A tail_to_skip value > 0 indicates that we're being called from + * ocfs2_file_aio_write(). This has the following implications: + * + * - we don't want to update i_size + * - di_bh will be NULL, which is fine because it's only used in the + * case where we want to update i_size. + * - ocfs2_zero_extend() will then only be filling the hole created + * between i_size and the start of the write. + */ static int ocfs2_extend_file(struct inode *inode, struct buffer_head *di_bh, - u64 new_i_size) + u64 new_i_size, + size_t tail_to_skip) { int ret = 0; u32 clusters_to_add; + BUG_ON(!tail_to_skip && !di_bh); + /* setattr sometimes calls us like this. */ if (new_i_size == 0) goto out; @@ -711,27 +728,44 @@ static int ocfs2_extend_file(struct inode *inode, OCFS2_I(inode)->ip_clusters; if (clusters_to_add) { - ret = ocfs2_extend_allocation(inode, clusters_to_add); + /* + * protect the pages that ocfs2_zero_extend is going to + * be pulling into the page cache.. we do this before the + * metadata extend so that we don't get into the situation + * where we've extended the metadata but can't get the data + * lock to zero. + */ + ret = ocfs2_data_lock(inode, 1); if (ret < 0) { mlog_errno(ret); goto out; } - ret = ocfs2_zero_extend(inode, new_i_size); + ret = ocfs2_extend_allocation(inode, clusters_to_add); if (ret < 0) { mlog_errno(ret); - goto out; + goto out_unlock; } - } - /* No allocation required, we just use this helper to - * do a trivial update of i_size. */ - ret = ocfs2_simple_size_update(inode, di_bh, new_i_size); - if (ret < 0) { - mlog_errno(ret); - goto out; + ret = ocfs2_zero_extend(inode, (u64)new_i_size - tail_to_skip); + if (ret < 0) { + mlog_errno(ret); + goto out_unlock; + } } + if (!tail_to_skip) { + /* We're being called from ocfs2_setattr() which wants + * us to update i_size */ + ret = ocfs2_simple_size_update(inode, di_bh, new_i_size); + if (ret < 0) + mlog_errno(ret); + } + +out_unlock: + if (clusters_to_add) /* this is the only case in which we lock */ + ocfs2_data_unlock(inode, 1); + out: return ret; } @@ -790,7 +824,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) if (i_size_read(inode) > attr->ia_size) status = ocfs2_truncate_file(inode, bh, attr->ia_size); else - status = ocfs2_extend_file(inode, bh, attr->ia_size); + status = ocfs2_extend_file(inode, bh, attr->ia_size, 0); if (status < 0) { if (status != -ENOSPC) mlog_errno(status); @@ -1046,21 +1080,12 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, if (!clusters) break; - ret = ocfs2_extend_allocation(inode, clusters); + ret = ocfs2_extend_file(inode, NULL, newsize, count); if (ret < 0) { if (ret != -ENOSPC) mlog_errno(ret); goto out; } - - /* Fill any holes which would've been created by this - * write. If we're O_APPEND, this will wind up - * (correctly) being a noop. 
*/ - ret = ocfs2_zero_extend(inode, (u64) newsize - count); - if (ret < 0) { - mlog_errno(ret); - goto out; - } break; } @@ -1143,6 +1168,22 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, ocfs2_iocb_set_rw_locked(iocb); } + /* + * We're fine letting folks race truncates and extending + * writes with read across the cluster, just like they can + * locally. Hence no rw_lock during read. + * + * Take and drop the meta data lock to update inode fields + * like i_size. This allows the checks down below + * generic_file_aio_read() a chance of actually working. + */ + ret = ocfs2_meta_lock(inode, NULL, NULL, 0); + if (ret < 0) { + mlog_errno(ret); + goto bail; + } + ocfs2_meta_unlock(inode, 0); + ret = generic_file_aio_read(iocb, buf, count, iocb->ki_pos); if (ret == -EINVAL) mlog(ML_ERROR, "generic_file_aio_read returned -EINVAL\n"); diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 6a610ae5358..eebc3cfa6be 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -117,7 +117,7 @@ struct ocfs2_journal_handle *ocfs2_alloc_handle(struct ocfs2_super *osb) { struct ocfs2_journal_handle *retval = NULL; - retval = kcalloc(1, sizeof(*retval), GFP_KERNEL); + retval = kcalloc(1, sizeof(*retval), GFP_NOFS); if (!retval) { mlog(ML_ERROR, "Failed to allocate memory for journal " "handle!\n"); @@ -870,9 +870,11 @@ static int ocfs2_force_read_journal(struct inode *inode) if (p_blocks > CONCURRENT_JOURNAL_FILL) p_blocks = CONCURRENT_JOURNAL_FILL; + /* We are reading journal data which should not + * be put in the uptodate cache */ status = ocfs2_read_blocks(OCFS2_SB(inode->i_sb), p_blkno, p_blocks, bhs, 0, - inode); + NULL); if (status < 0) { mlog_errno(status); goto bail; @@ -982,7 +984,7 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal, { struct ocfs2_la_recovery_item *item; - item = kmalloc(sizeof(struct ocfs2_la_recovery_item), GFP_KERNEL); + item = kmalloc(sizeof(struct ocfs2_la_recovery_item), GFP_NOFS); if (!item) { /* Though we wish to avoid it, we are in fact safe in * skipping local alloc cleanup as fsck.ocfs2 is more diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c index 04a684dfdd9..b8a00a79332 100644 --- a/fs/ocfs2/uptodate.c +++ b/fs/ocfs2/uptodate.c @@ -337,7 +337,7 @@ static void __ocfs2_set_buffer_uptodate(struct ocfs2_inode_info *oi, (unsigned long long)oi->ip_blkno, (unsigned long long)block, expand_tree); - new = kmem_cache_alloc(ocfs2_uptodate_cachep, GFP_KERNEL); + new = kmem_cache_alloc(ocfs2_uptodate_cachep, GFP_NOFS); if (!new) { mlog_errno(-ENOMEM); return; @@ -349,7 +349,7 @@ static void __ocfs2_set_buffer_uptodate(struct ocfs2_inode_info *oi, * has no way of tracking that. 
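
The GFP_KERNEL -> GFP_NOFS conversions here and in the surrounding ocfs2 files all close the same deadlock shape: an allocation made while journal or cluster-lock state is held must not let direct reclaim re-enter filesystem writeout. A minimal kernel-style sketch of the rule (fs_safe_alloc is our illustrative name, not a function from the patch):

#include <linux/slab.h>

/*
 * GFP_NOFS is GFP_KERNEL with __GFP_FS masked off: reclaim may still
 * run, but it will not call back into filesystem code that could
 * block on the locks or the journal handle this caller holds.
 */
static void *fs_safe_alloc(size_t size)
{
	return kmalloc(size, GFP_NOFS);
}
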
*/ for(i = 0; i < OCFS2_INODE_MAX_CACHE_ARRAY; i++) { tree[i] = kmem_cache_alloc(ocfs2_uptodate_cachep, - GFP_KERNEL); + GFP_NOFS); if (!tree[i]) { mlog_errno(-ENOMEM); goto out_free; diff --git a/fs/ocfs2/vote.c b/fs/ocfs2/vote.c index 53049a20419..ee42765a855 100644 --- a/fs/ocfs2/vote.c +++ b/fs/ocfs2/vote.c @@ -586,7 +586,7 @@ static struct ocfs2_net_wait_ctxt *ocfs2_new_net_wait_ctxt(unsigned int response { struct ocfs2_net_wait_ctxt *w; - w = kcalloc(1, sizeof(*w), GFP_KERNEL); + w = kcalloc(1, sizeof(*w), GFP_NOFS); if (!w) { mlog_errno(-ENOMEM); goto bail; @@ -749,7 +749,7 @@ static struct ocfs2_vote_msg * ocfs2_new_vote_request(struct ocfs2_super *osb, BUG_ON(!ocfs2_is_valid_vote_request(type)); - request = kcalloc(1, sizeof(*request), GFP_KERNEL); + request = kcalloc(1, sizeof(*request), GFP_NOFS); if (!request) { mlog_errno(-ENOMEM); } else { @@ -1129,7 +1129,7 @@ static int ocfs2_handle_vote_message(struct o2net_msg *msg, struct ocfs2_super *osb = data; struct ocfs2_vote_work *work; - work = kmalloc(sizeof(struct ocfs2_vote_work), GFP_KERNEL); + work = kmalloc(sizeof(struct ocfs2_vote_work), GFP_NOFS); if (!work) { status = -ENOMEM; mlog_errno(status); diff --git a/fs/open.c b/fs/open.c index c32c89d6d8d..317b7c7f38a 100644 --- a/fs/open.c +++ b/fs/open.c @@ -331,7 +331,10 @@ out: asmlinkage long sys_ftruncate(unsigned int fd, unsigned long length) { - return do_sys_ftruncate(fd, length, 1); + long ret = do_sys_ftruncate(fd, length, 1); + /* avoid REGPARM breakage on x86: */ + prevent_tail_call(ret); + return ret; } /* LFS versions of truncate are only needed on 32 bit machines */ @@ -343,7 +346,10 @@ asmlinkage long sys_truncate64(const char __user * path, loff_t length) asmlinkage long sys_ftruncate64(unsigned int fd, loff_t length) { - return do_sys_ftruncate(fd, length, 0); + long ret = do_sys_ftruncate(fd, length, 0); + /* avoid REGPARM breakage on x86: */ + prevent_tail_call(ret); + return ret; } #endif @@ -1093,22 +1099,31 @@ long do_sys_open(int dfd, const char __user *filename, int flags, int mode) asmlinkage long sys_open(const char __user *filename, int flags, int mode) { + long ret; + if (force_o_largefile()) flags |= O_LARGEFILE; - return do_sys_open(AT_FDCWD, filename, flags, mode); + ret = do_sys_open(AT_FDCWD, filename, flags, mode); + /* avoid REGPARM breakage on x86: */ + prevent_tail_call(ret); + return ret; } EXPORT_SYMBOL_GPL(sys_open); asmlinkage long sys_openat(int dfd, const char __user *filename, int flags, int mode) { + long ret; + if (force_o_largefile()) flags |= O_LARGEFILE; - return do_sys_open(dfd, filename, flags, mode); + ret = do_sys_open(dfd, filename, flags, mode); + /* avoid REGPARM breakage on x86: */ + prevent_tail_call(ret); + return ret; } -EXPORT_SYMBOL_GPL(sys_openat); #ifndef __alpha__ diff --git a/fs/partitions/check.c b/fs/partitions/check.c index af0cb4b9e78..7ef1f094de9 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -331,7 +331,9 @@ void delete_partition(struct gendisk *disk, int part) devfs_remove("%s/part%d", disk->devfs_name, part); if (p->holder_dir) kobject_unregister(p->holder_dir); - kobject_unregister(&p->kobj); + kobject_uevent(&p->kobj, KOBJ_REMOVE); + kobject_del(&p->kobj); + kobject_put(&p->kobj); } void add_partition(struct gendisk *disk, int part, sector_t start, sector_t len) @@ -357,7 +359,10 @@ void add_partition(struct gendisk *disk, int part, sector_t start, sector_t len) snprintf(p->kobj.name,KOBJ_NAME_LEN,"%s%d",disk->kobj.name,part); p->kobj.parent = &disk->kobj; p->kobj.ktype = 
&ktype_part; - kobject_register(&p->kobj); + kobject_init(&p->kobj); + kobject_add(&p->kobj); + if (!disk->part_uevent_suppress) + kobject_uevent(&p->kobj, KOBJ_ADD); partition_sysfs_add_subdir(p); disk->part[part-1] = p; } @@ -367,6 +372,7 @@ static char *make_block_name(struct gendisk *disk) char *name; static char *block_str = "block:"; int size; + char *s; size = strlen(block_str) + strlen(disk->disk_name) + 1; name = kmalloc(size, GFP_KERNEL); @@ -374,6 +380,10 @@ static char *make_block_name(struct gendisk *disk) return NULL; strcpy(name, block_str); strcat(name, disk->disk_name); + /* ewww... some of these buggers have / in name... */ + s = strchr(name, '/'); + if (s) + *s = '!'; return name; } @@ -395,6 +405,8 @@ void register_disk(struct gendisk *disk) { struct block_device *bdev; char *s; + int i; + struct hd_struct *p; int err; strlcpy(disk->kobj.name,disk->disk_name,KOBJ_NAME_LEN); @@ -406,13 +418,12 @@ void register_disk(struct gendisk *disk) return; disk_sysfs_symlinks(disk); disk_sysfs_add_subdirs(disk); - kobject_uevent(&disk->kobj, KOBJ_ADD); /* No minors to use for partitions */ if (disk->minors == 1) { if (disk->devfs_name[0] != '\0') devfs_add_disk(disk); - return; + goto exit; } /* always add handle for the whole disk */ @@ -420,16 +431,32 @@ void register_disk(struct gendisk *disk) /* No such device (e.g., media were just removed) */ if (!get_capacity(disk)) - return; + goto exit; bdev = bdget_disk(disk, 0); if (!bdev) - return; + goto exit; + /* scan partition table, but suppress uevents */ bdev->bd_invalidated = 1; - if (blkdev_get(bdev, FMODE_READ, 0) < 0) - return; + disk->part_uevent_suppress = 1; + err = blkdev_get(bdev, FMODE_READ, 0); + disk->part_uevent_suppress = 0; + if (err < 0) + goto exit; blkdev_put(bdev); + +exit: + /* announce disk after possible partitions are already created */ + kobject_uevent(&disk->kobj, KOBJ_ADD); + + /* announce possible partitions */ + for (i = 1; i < disk->minors; i++) { + p = disk->part[i-1]; + if (!p || !p->nr_sects) + continue; + kobject_uevent(&p->kobj, KOBJ_ADD); + } } int rescan_partitions(struct gendisk *disk, struct block_device *bdev) @@ -506,6 +533,7 @@ void del_gendisk(struct gendisk *disk) devfs_remove_disk(disk); + kobject_uevent(&disk->kobj, KOBJ_REMOVE); if (disk->holder_dir) kobject_unregister(disk->holder_dir); if (disk->slave_dir) @@ -518,7 +546,7 @@ void del_gendisk(struct gendisk *disk) kfree(disk_name); } put_device(disk->driverfs_dev); + disk->driverfs_dev = NULL; } - kobject_uevent(&disk->kobj, KOBJ_REMOVE); kobject_del(&disk->kobj); } diff --git a/fs/pipe.c b/fs/pipe.c index 109a102c150..5acd8954aaa 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -36,7 +36,7 @@ */ /* Drop the inode semaphore and wait for a pipe event, atomically */ -void pipe_wait(struct inode * inode) +void pipe_wait(struct pipe_inode_info *pipe) { DEFINE_WAIT(wait); @@ -44,15 +44,19 @@ void pipe_wait(struct inode * inode) * Pipes are system-local resources, so sleeping on them * is considered a noninteractive wait: */ - prepare_to_wait(PIPE_WAIT(*inode), &wait, TASK_INTERRUPTIBLE|TASK_NONINTERACTIVE); - mutex_unlock(PIPE_MUTEX(*inode)); + prepare_to_wait(&pipe->wait, &wait, + TASK_INTERRUPTIBLE | TASK_NONINTERACTIVE); + if (pipe->inode) + mutex_unlock(&pipe->inode->i_mutex); schedule(); - finish_wait(PIPE_WAIT(*inode), &wait); - mutex_lock(PIPE_MUTEX(*inode)); + finish_wait(&pipe->wait, &wait); + if (pipe->inode) + mutex_lock(&pipe->inode->i_mutex); } static int -pipe_iov_copy_from_user(void *to, struct iovec *iov, unsigned long len) 
+pipe_iov_copy_from_user(void *to, struct iovec *iov, unsigned long len, + int atomic) { unsigned long copy; @@ -61,8 +65,13 @@ pipe_iov_copy_from_user(void *to, struct iovec *iov, unsigned long len) iov++; copy = min_t(unsigned long, len, iov->iov_len); - if (copy_from_user(to, iov->iov_base, copy)) - return -EFAULT; + if (atomic) { + if (__copy_from_user_inatomic(to, iov->iov_base, copy)) + return -EFAULT; + } else { + if (copy_from_user(to, iov->iov_base, copy)) + return -EFAULT; + } to += copy; len -= copy; iov->iov_base += copy; @@ -72,7 +81,8 @@ pipe_iov_copy_from_user(void *to, struct iovec *iov, unsigned long len) } static int -pipe_iov_copy_to_user(struct iovec *iov, const void *from, unsigned long len) +pipe_iov_copy_to_user(struct iovec *iov, const void *from, unsigned long len, + int atomic) { unsigned long copy; @@ -81,8 +91,13 @@ pipe_iov_copy_to_user(struct iovec *iov, const void *from, unsigned long len) iov++; copy = min_t(unsigned long, len, iov->iov_len); - if (copy_to_user(iov->iov_base, from, copy)) - return -EFAULT; + if (atomic) { + if (__copy_to_user_inatomic(iov->iov_base, from, copy)) + return -EFAULT; + } else { + if (copy_to_user(iov->iov_base, from, copy)) + return -EFAULT; + } from += copy; len -= copy; iov->iov_base += copy; @@ -91,49 +106,115 @@ pipe_iov_copy_to_user(struct iovec *iov, const void *from, unsigned long len) return 0; } -static void anon_pipe_buf_release(struct pipe_inode_info *info, struct pipe_buffer *buf) +/* + * Attempt to pre-fault in the user memory, so we can use atomic copies. + * Returns the number of bytes not faulted in. + */ +static int iov_fault_in_pages_write(struct iovec *iov, unsigned long len) +{ + while (!iov->iov_len) + iov++; + + while (len > 0) { + unsigned long this_len; + + this_len = min_t(unsigned long, len, iov->iov_len); + if (fault_in_pages_writeable(iov->iov_base, this_len)) + break; + + len -= this_len; + iov++; + } + + return len; +} + +/* + * Pre-fault in the user memory, so we can use atomic copies. + */ +static void iov_fault_in_pages_read(struct iovec *iov, unsigned long len) +{ + while (!iov->iov_len) + iov++; + + while (len > 0) { + unsigned long this_len; + + this_len = min_t(unsigned long, len, iov->iov_len); + fault_in_pages_readable(iov->iov_base, this_len); + len -= this_len; + iov++; + } +} + +static void anon_pipe_buf_release(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) { struct page *page = buf->page; /* * If nobody else uses this page, and we don't already have a * temporary page, let's keep track of it as a one-deep - * allocation cache + * allocation cache. 
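
Taken together, the pipe buffer hooks this patch reshapes form a small contract; the summary below is our condensation of the generic_pipe_buf_* implementations that follow, not a comment from the patch:

/*
 * pipe_buf_operations after this patch (condensed summary):
 *   ->pin()     make the buffer's data uptodate; may sleep
 *   ->map()     kmap() or kmap_atomic() the page, per the atomic flag
 *   ->unmap()   undo ->map(), given the cookie map() returned
 *   ->steal()   try to take sole ownership of the page; returns 0 on
 *               success with the page locked
 *   ->get()     take an extra page reference (buffer duplication)
 *   ->release() drop the buffer's reference to the page
 */
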
(Otherwise just release our reference to it) */ - if (page_count(page) == 1 && !info->tmp_page) { - info->tmp_page = page; - return; + if (page_count(page) == 1 && !pipe->tmp_page) + pipe->tmp_page = page; + else + page_cache_release(page); +} + +void *generic_pipe_buf_map(struct pipe_inode_info *pipe, + struct pipe_buffer *buf, int atomic) +{ + if (atomic) { + buf->flags |= PIPE_BUF_FLAG_ATOMIC; + return kmap_atomic(buf->page, KM_USER0); } - /* - * Otherwise just release our reference to it - */ - page_cache_release(page); + return kmap(buf->page); } -static void *anon_pipe_buf_map(struct file *file, struct pipe_inode_info *info, struct pipe_buffer *buf) +void generic_pipe_buf_unmap(struct pipe_inode_info *pipe, + struct pipe_buffer *buf, void *map_data) { - return kmap(buf->page); + if (buf->flags & PIPE_BUF_FLAG_ATOMIC) { + buf->flags &= ~PIPE_BUF_FLAG_ATOMIC; + kunmap_atomic(map_data, KM_USER0); + } else + kunmap(buf->page); } -static void anon_pipe_buf_unmap(struct pipe_inode_info *info, struct pipe_buffer *buf) +int generic_pipe_buf_steal(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) { - kunmap(buf->page); + struct page *page = buf->page; + + if (page_count(page) == 1) { + lock_page(page); + return 0; + } + + return 1; +} + +void generic_pipe_buf_get(struct pipe_inode_info *info, struct pipe_buffer *buf) +{ + page_cache_get(buf->page); } -static int anon_pipe_buf_steal(struct pipe_inode_info *info, - struct pipe_buffer *buf) +int generic_pipe_buf_pin(struct pipe_inode_info *info, struct pipe_buffer *buf) { - buf->stolen = 1; return 0; } static struct pipe_buf_operations anon_pipe_buf_ops = { .can_merge = 1, - .map = anon_pipe_buf_map, - .unmap = anon_pipe_buf_unmap, + .map = generic_pipe_buf_map, + .unmap = generic_pipe_buf_unmap, + .pin = generic_pipe_buf_pin, .release = anon_pipe_buf_release, - .steal = anon_pipe_buf_steal, + .steal = generic_pipe_buf_steal, + .get = generic_pipe_buf_get, }; static ssize_t @@ -141,7 +222,7 @@ pipe_readv(struct file *filp, const struct iovec *_iov, unsigned long nr_segs, loff_t *ppos) { struct inode *inode = filp->f_dentry->d_inode; - struct pipe_inode_info *info; + struct pipe_inode_info *pipe; int do_wakeup; ssize_t ret; struct iovec *iov = (struct iovec *)_iov; @@ -154,31 +235,43 @@ pipe_readv(struct file *filp, const struct iovec *_iov, do_wakeup = 0; ret = 0; - mutex_lock(PIPE_MUTEX(*inode)); - info = inode->i_pipe; + mutex_lock(&inode->i_mutex); + pipe = inode->i_pipe; for (;;) { - int bufs = info->nrbufs; + int bufs = pipe->nrbufs; if (bufs) { - int curbuf = info->curbuf; - struct pipe_buffer *buf = info->bufs + curbuf; + int curbuf = pipe->curbuf; + struct pipe_buffer *buf = pipe->bufs + curbuf; struct pipe_buf_operations *ops = buf->ops; void *addr; size_t chars = buf->len; - int error; + int error, atomic; if (chars > total_len) chars = total_len; - addr = ops->map(filp, info, buf); - if (IS_ERR(addr)) { + error = ops->pin(pipe, buf); + if (error) { if (!ret) - ret = PTR_ERR(addr); + error = ret; break; } - error = pipe_iov_copy_to_user(iov, addr + buf->offset, chars); - ops->unmap(info, buf); + + atomic = !iov_fault_in_pages_write(iov, chars); +redo: + addr = ops->map(pipe, buf, atomic); + error = pipe_iov_copy_to_user(iov, addr + buf->offset, chars, atomic); + ops->unmap(pipe, buf, addr); if (unlikely(error)) { - if (!ret) ret = -EFAULT; + /* + * Just retry with the slow path if we failed. 
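
The retry noted in the comment above is the central idiom of the atomic-copy rework: map atomically first (sleeping faults are forbidden under kmap_atomic), and only re-map with the sleeping kmap() if the copy faults. A sketch of the same idiom written as a loop rather than the patch's goto (copy_buf_to_iov is our name; this condenses, and is not, the patch's code):

static int copy_buf_to_iov(struct pipe_inode_info *pipe,
			   struct pipe_buffer *buf,
			   struct iovec *iov, size_t chars)
{
	struct pipe_buf_operations *ops = buf->ops;
	/* pre-fault so the atomic path almost always succeeds */
	int atomic = !iov_fault_in_pages_write(iov, chars);
	void *addr;
	int error;

	for (;;) {
		addr = ops->map(pipe, buf, atomic);
		error = pipe_iov_copy_to_user(iov, addr + buf->offset,
					      chars, atomic);
		ops->unmap(pipe, buf, addr);
		if (!error || !atomic)
			return error;
		atomic = 0;	/* faulted: retry with the sleeping map */
	}
}
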
+ */ + if (atomic) { + atomic = 0; + goto redo; + } + if (!ret) + ret = error; break; } ret += chars; @@ -186,10 +279,10 @@ pipe_readv(struct file *filp, const struct iovec *_iov, buf->len -= chars; if (!buf->len) { buf->ops = NULL; - ops->release(info, buf); + ops->release(pipe, buf); curbuf = (curbuf + 1) & (PIPE_BUFFERS-1); - info->curbuf = curbuf; - info->nrbufs = --bufs; + pipe->curbuf = curbuf; + pipe->nrbufs = --bufs; do_wakeup = 1; } total_len -= chars; @@ -198,9 +291,9 @@ pipe_readv(struct file *filp, const struct iovec *_iov, } if (bufs) /* More to do? */ continue; - if (!PIPE_WRITERS(*inode)) + if (!pipe->writers) break; - if (!PIPE_WAITING_WRITERS(*inode)) { + if (!pipe->waiting_writers) { /* syscall merging: Usually we must not sleep * if O_NONBLOCK is set, or if we got some data. * But if a writer sleeps in kernel space, then @@ -214,20 +307,22 @@ pipe_readv(struct file *filp, const struct iovec *_iov, } } if (signal_pending(current)) { - if (!ret) ret = -ERESTARTSYS; + if (!ret) + ret = -ERESTARTSYS; break; } if (do_wakeup) { - wake_up_interruptible_sync(PIPE_WAIT(*inode)); - kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT); + wake_up_interruptible_sync(&pipe->wait); + kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } - pipe_wait(inode); + pipe_wait(pipe); } - mutex_unlock(PIPE_MUTEX(*inode)); - /* Signal writers asynchronously that there is more room. */ + mutex_unlock(&inode->i_mutex); + + /* Signal writers asynchronously that there is more room. */ if (do_wakeup) { - wake_up_interruptible(PIPE_WAIT(*inode)); - kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT); + wake_up_interruptible(&pipe->wait); + kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } if (ret > 0) file_accessed(filp); @@ -238,6 +333,7 @@ static ssize_t pipe_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos) { struct iovec iov = { .iov_base = buf, .iov_len = count }; + return pipe_readv(filp, &iov, 1, ppos); } @@ -246,7 +342,7 @@ pipe_writev(struct file *filp, const struct iovec *_iov, unsigned long nr_segs, loff_t *ppos) { struct inode *inode = filp->f_dentry->d_inode; - struct pipe_inode_info *info; + struct pipe_inode_info *pipe; ssize_t ret; int do_wakeup; struct iovec *iov = (struct iovec *)_iov; @@ -260,10 +356,10 @@ pipe_writev(struct file *filp, const struct iovec *_iov, do_wakeup = 0; ret = 0; - mutex_lock(PIPE_MUTEX(*inode)); - info = inode->i_pipe; + mutex_lock(&inode->i_mutex); + pipe = inode->i_pipe; - if (!PIPE_READERS(*inode)) { + if (!pipe->readers) { send_sig(SIGPIPE, current, 0); ret = -EPIPE; goto out; @@ -271,27 +367,36 @@ pipe_writev(struct file *filp, const struct iovec *_iov, /* We try to merge small writes */ chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */ - if (info->nrbufs && chars != 0) { - int lastbuf = (info->curbuf + info->nrbufs - 1) & (PIPE_BUFFERS-1); - struct pipe_buffer *buf = info->bufs + lastbuf; + if (pipe->nrbufs && chars != 0) { + int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) & + (PIPE_BUFFERS-1); + struct pipe_buffer *buf = pipe->bufs + lastbuf; struct pipe_buf_operations *ops = buf->ops; int offset = buf->offset + buf->len; + if (ops->can_merge && offset + chars <= PAGE_SIZE) { + int error, atomic = 1; void *addr; - int error; - addr = ops->map(filp, info, buf); - if (IS_ERR(addr)) { - error = PTR_ERR(addr); + error = ops->pin(pipe, buf); + if (error) goto out; - } + + iov_fault_in_pages_read(iov, chars); +redo1: + addr = ops->map(pipe, buf, atomic); error = pipe_iov_copy_from_user(offset + addr, iov, - 
chars); - ops->unmap(info, buf); + chars, atomic); + ops->unmap(pipe, buf, addr); ret = error; do_wakeup = 1; - if (error) + if (error) { + if (atomic) { + atomic = 0; + goto redo1; + } goto out; + } buf->len += chars; total_len -= chars; ret = chars; @@ -302,17 +407,20 @@ pipe_writev(struct file *filp, const struct iovec *_iov, for (;;) { int bufs; - if (!PIPE_READERS(*inode)) { + + if (!pipe->readers) { send_sig(SIGPIPE, current, 0); - if (!ret) ret = -EPIPE; + if (!ret) + ret = -EPIPE; break; } - bufs = info->nrbufs; + bufs = pipe->nrbufs; if (bufs < PIPE_BUFFERS) { - int newbuf = (info->curbuf + bufs) & (PIPE_BUFFERS-1); - struct pipe_buffer *buf = info->bufs + newbuf; - struct page *page = info->tmp_page; - int error; + int newbuf = (pipe->curbuf + bufs) & (PIPE_BUFFERS-1); + struct pipe_buffer *buf = pipe->bufs + newbuf; + struct page *page = pipe->tmp_page; + char *src; + int error, atomic = 1; if (!page) { page = alloc_page(GFP_HIGHUSER); @@ -320,9 +428,9 @@ pipe_writev(struct file *filp, const struct iovec *_iov, ret = ret ? : -ENOMEM; break; } - info->tmp_page = page; + pipe->tmp_page = page; } - /* Always wakeup, even if the copy fails. Otherwise + /* Always wake up, even if the copy fails. Otherwise * we lock up (O_NONBLOCK-)readers that sleep due to * syscall merging. * FIXME! Is this really true? @@ -332,10 +440,27 @@ pipe_writev(struct file *filp, const struct iovec *_iov, if (chars > total_len) chars = total_len; - error = pipe_iov_copy_from_user(kmap(page), iov, chars); - kunmap(page); + iov_fault_in_pages_read(iov, chars); +redo2: + if (atomic) + src = kmap_atomic(page, KM_USER0); + else + src = kmap(page); + + error = pipe_iov_copy_from_user(src, iov, chars, + atomic); + if (atomic) + kunmap_atomic(src, KM_USER0); + else + kunmap(page); + if (unlikely(error)) { - if (!ret) ret = -EFAULT; + if (atomic) { + atomic = 0; + goto redo2; + } + if (!ret) + ret = error; break; } ret += chars; @@ -345,8 +470,8 @@ pipe_writev(struct file *filp, const struct iovec *_iov, buf->ops = &anon_pipe_buf_ops; buf->offset = 0; buf->len = chars; - info->nrbufs = ++bufs; - info->tmp_page = NULL; + pipe->nrbufs = ++bufs; + pipe->tmp_page = NULL; total_len -= chars; if (!total_len) @@ -355,27 +480,29 @@ pipe_writev(struct file *filp, const struct iovec *_iov, if (bufs < PIPE_BUFFERS) continue; if (filp->f_flags & O_NONBLOCK) { - if (!ret) ret = -EAGAIN; + if (!ret) + ret = -EAGAIN; break; } if (signal_pending(current)) { - if (!ret) ret = -ERESTARTSYS; + if (!ret) + ret = -ERESTARTSYS; break; } if (do_wakeup) { - wake_up_interruptible_sync(PIPE_WAIT(*inode)); - kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN); + wake_up_interruptible_sync(&pipe->wait); + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); do_wakeup = 0; } - PIPE_WAITING_WRITERS(*inode)++; - pipe_wait(inode); - PIPE_WAITING_WRITERS(*inode)--; + pipe->waiting_writers++; + pipe_wait(pipe); + pipe->waiting_writers--; } out: - mutex_unlock(PIPE_MUTEX(*inode)); + mutex_unlock(&inode->i_mutex); if (do_wakeup) { - wake_up_interruptible(PIPE_WAIT(*inode)); - kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN); + wake_up_interruptible(&pipe->wait); + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); } if (ret > 0) file_update_time(filp); @@ -387,6 +514,7 @@ pipe_write(struct file *filp, const char __user *buf, size_t count, loff_t *ppos) { struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = count }; + return pipe_writev(filp, &iov, 1, ppos); } @@ -397,7 +525,8 @@ bad_pipe_r(struct file *filp, char __user *buf, 
size_t count, loff_t *ppos) } static ssize_t -bad_pipe_w(struct file *filp, const char __user *buf, size_t count, loff_t *ppos) +bad_pipe_w(struct file *filp, const char __user *buf, size_t count, + loff_t *ppos) { return -EBADF; } @@ -407,21 +536,22 @@ pipe_ioctl(struct inode *pino, struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = filp->f_dentry->d_inode; - struct pipe_inode_info *info; + struct pipe_inode_info *pipe; int count, buf, nrbufs; switch (cmd) { case FIONREAD: - mutex_lock(PIPE_MUTEX(*inode)); - info = inode->i_pipe; + mutex_lock(&inode->i_mutex); + pipe = inode->i_pipe; count = 0; - buf = info->curbuf; - nrbufs = info->nrbufs; + buf = pipe->curbuf; + nrbufs = pipe->nrbufs; while (--nrbufs >= 0) { - count += info->bufs[buf].len; + count += pipe->bufs[buf].len; buf = (buf+1) & (PIPE_BUFFERS-1); } - mutex_unlock(PIPE_MUTEX(*inode)); + mutex_unlock(&inode->i_mutex); + return put_user(count, (int __user *)arg); default: return -EINVAL; @@ -434,17 +564,17 @@ pipe_poll(struct file *filp, poll_table *wait) { unsigned int mask; struct inode *inode = filp->f_dentry->d_inode; - struct pipe_inode_info *info = inode->i_pipe; + struct pipe_inode_info *pipe = inode->i_pipe; int nrbufs; - poll_wait(filp, PIPE_WAIT(*inode), wait); + poll_wait(filp, &pipe->wait, wait); /* Reading only -- no need for acquiring the semaphore. */ - nrbufs = info->nrbufs; + nrbufs = pipe->nrbufs; mask = 0; if (filp->f_mode & FMODE_READ) { mask = (nrbufs > 0) ? POLLIN | POLLRDNORM : 0; - if (!PIPE_WRITERS(*inode) && filp->f_version != PIPE_WCOUNTER(*inode)) + if (!pipe->writers && filp->f_version != pipe->w_counter) mask |= POLLHUP; } @@ -454,7 +584,7 @@ pipe_poll(struct file *filp, poll_table *wait) * Most Unices do not set POLLERR for FIFOs but on Linux they * behave exactly like pipes for poll(). 
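
The FIONREAD branch above answers by summing the lengths of the occupied pipe buffers; from user space the effect is directly observable. Plain userspace C, nothing patch-specific:

#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>

int main(void)
{
	int fds[2], avail = 0;

	if (pipe(fds) < 0)
		return 1;
	if (write(fds[1], "hello", 5) != 5)
		return 1;
	/* the kernel walks curbuf..curbuf+nrbufs and adds up buf.len */
	if (ioctl(fds[0], FIONREAD, &avail) < 0)
		return 1;
	printf("%d bytes buffered\n", avail);	/* prints: 5 bytes buffered */
	return 0;
}
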
*/ - if (!PIPE_READERS(*inode)) + if (!pipe->readers) mask |= POLLERR; } @@ -464,17 +594,21 @@ pipe_poll(struct file *filp, poll_table *wait) static int pipe_release(struct inode *inode, int decr, int decw) { - mutex_lock(PIPE_MUTEX(*inode)); - PIPE_READERS(*inode) -= decr; - PIPE_WRITERS(*inode) -= decw; - if (!PIPE_READERS(*inode) && !PIPE_WRITERS(*inode)) { + struct pipe_inode_info *pipe; + + mutex_lock(&inode->i_mutex); + pipe = inode->i_pipe; + pipe->readers -= decr; + pipe->writers -= decw; + + if (!pipe->readers && !pipe->writers) { free_pipe_info(inode); } else { - wake_up_interruptible(PIPE_WAIT(*inode)); - kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN); - kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT); + wake_up_interruptible(&pipe->wait); + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); + kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } - mutex_unlock(PIPE_MUTEX(*inode)); + mutex_unlock(&inode->i_mutex); return 0; } @@ -485,9 +619,9 @@ pipe_read_fasync(int fd, struct file *filp, int on) struct inode *inode = filp->f_dentry->d_inode; int retval; - mutex_lock(PIPE_MUTEX(*inode)); - retval = fasync_helper(fd, filp, on, PIPE_FASYNC_READERS(*inode)); - mutex_unlock(PIPE_MUTEX(*inode)); + mutex_lock(&inode->i_mutex); + retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_readers); + mutex_unlock(&inode->i_mutex); if (retval < 0) return retval; @@ -502,9 +636,9 @@ pipe_write_fasync(int fd, struct file *filp, int on) struct inode *inode = filp->f_dentry->d_inode; int retval; - mutex_lock(PIPE_MUTEX(*inode)); - retval = fasync_helper(fd, filp, on, PIPE_FASYNC_WRITERS(*inode)); - mutex_unlock(PIPE_MUTEX(*inode)); + mutex_lock(&inode->i_mutex); + retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_writers); + mutex_unlock(&inode->i_mutex); if (retval < 0) return retval; @@ -517,16 +651,17 @@ static int pipe_rdwr_fasync(int fd, struct file *filp, int on) { struct inode *inode = filp->f_dentry->d_inode; + struct pipe_inode_info *pipe = inode->i_pipe; int retval; - mutex_lock(PIPE_MUTEX(*inode)); + mutex_lock(&inode->i_mutex); - retval = fasync_helper(fd, filp, on, PIPE_FASYNC_READERS(*inode)); + retval = fasync_helper(fd, filp, on, &pipe->fasync_readers); if (retval >= 0) - retval = fasync_helper(fd, filp, on, PIPE_FASYNC_WRITERS(*inode)); + retval = fasync_helper(fd, filp, on, &pipe->fasync_writers); - mutex_unlock(PIPE_MUTEX(*inode)); + mutex_unlock(&inode->i_mutex); if (retval < 0) return retval; @@ -565,9 +700,9 @@ pipe_read_open(struct inode *inode, struct file *filp) { /* We could have perhaps used atomic_t, but this and friends below are the only places. So it doesn't seem worthwhile. 
*/ - mutex_lock(PIPE_MUTEX(*inode)); - PIPE_READERS(*inode)++; - mutex_unlock(PIPE_MUTEX(*inode)); + mutex_lock(&inode->i_mutex); + inode->i_pipe->readers++; + mutex_unlock(&inode->i_mutex); return 0; } @@ -575,9 +710,9 @@ pipe_read_open(struct inode *inode, struct file *filp) static int pipe_write_open(struct inode *inode, struct file *filp) { - mutex_lock(PIPE_MUTEX(*inode)); - PIPE_WRITERS(*inode)++; - mutex_unlock(PIPE_MUTEX(*inode)); + mutex_lock(&inode->i_mutex); + inode->i_pipe->writers++; + mutex_unlock(&inode->i_mutex); return 0; } @@ -585,12 +720,12 @@ pipe_write_open(struct inode *inode, struct file *filp) static int pipe_rdwr_open(struct inode *inode, struct file *filp) { - mutex_lock(PIPE_MUTEX(*inode)); + mutex_lock(&inode->i_mutex); if (filp->f_mode & FMODE_READ) - PIPE_READERS(*inode)++; + inode->i_pipe->readers++; if (filp->f_mode & FMODE_WRITE) - PIPE_WRITERS(*inode)++; - mutex_unlock(PIPE_MUTEX(*inode)); + inode->i_pipe->writers++; + mutex_unlock(&inode->i_mutex); return 0; } @@ -673,37 +808,38 @@ static struct file_operations rdwr_pipe_fops = { .fasync = pipe_rdwr_fasync, }; -void free_pipe_info(struct inode *inode) +struct pipe_inode_info * alloc_pipe_info(struct inode *inode) +{ + struct pipe_inode_info *pipe; + + pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL); + if (pipe) { + init_waitqueue_head(&pipe->wait); + pipe->r_counter = pipe->w_counter = 1; + pipe->inode = inode; + } + + return pipe; +} + +void __free_pipe_info(struct pipe_inode_info *pipe) { int i; - struct pipe_inode_info *info = inode->i_pipe; - inode->i_pipe = NULL; for (i = 0; i < PIPE_BUFFERS; i++) { - struct pipe_buffer *buf = info->bufs + i; + struct pipe_buffer *buf = pipe->bufs + i; if (buf->ops) - buf->ops->release(info, buf); + buf->ops->release(pipe, buf); } - if (info->tmp_page) - __free_page(info->tmp_page); - kfree(info); + if (pipe->tmp_page) + __free_page(pipe->tmp_page); + kfree(pipe); } -struct inode* pipe_new(struct inode* inode) +void free_pipe_info(struct inode *inode) { - struct pipe_inode_info *info; - - info = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL); - if (!info) - goto fail_page; - inode->i_pipe = info; - - init_waitqueue_head(PIPE_WAIT(*inode)); - PIPE_RCOUNTER(*inode) = PIPE_WCOUNTER(*inode) = 1; - - return inode; -fail_page: - return NULL; + __free_pipe_info(inode->i_pipe); + inode->i_pipe = NULL; } static struct vfsmount *pipe_mnt __read_mostly; @@ -711,6 +847,7 @@ static int pipefs_delete_dentry(struct dentry *dentry) { return 1; } + static struct dentry_operations pipefs_dentry_operations = { .d_delete = pipefs_delete_dentry, }; @@ -718,13 +855,17 @@ static struct dentry_operations pipefs_dentry_operations = { static struct inode * get_pipe_inode(void) { struct inode *inode = new_inode(pipe_mnt->mnt_sb); + struct pipe_inode_info *pipe; if (!inode) goto fail_inode; - if(!pipe_new(inode)) + pipe = alloc_pipe_info(inode); + if (!pipe) goto fail_iput; - PIPE_READERS(*inode) = PIPE_WRITERS(*inode) = 1; + inode->i_pipe = pipe; + + pipe->readers = pipe->writers = 1; inode->i_fop = &rdwr_pipe_fops; /* @@ -739,10 +880,12 @@ static struct inode * get_pipe_inode(void) inode->i_gid = current->fsgid; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; inode->i_blksize = PAGE_SIZE; + return inode; fail_iput: iput(inode); + fail_inode: return NULL; } @@ -755,7 +898,7 @@ int do_pipe(int *fd) struct inode * inode; struct file *f1, *f2; int error; - int i,j; + int i, j; error = -ENFILE; f1 = get_empty_filp(); @@ -788,6 +931,7 @@ int do_pipe(int *fd) dentry 
= d_alloc(pipe_mnt->mnt_sb->s_root, &this); if (!dentry) goto close_f12_inode_i_j; + dentry->d_op = &pipefs_dentry_operations; d_add(dentry, inode); f1->f_vfsmnt = f2->f_vfsmnt = mntget(mntget(pipe_mnt)); @@ -811,6 +955,7 @@ int do_pipe(int *fd) fd_install(j, f2); fd[0] = i; fd[1] = j; + return 0; close_f12_inode_i_j: @@ -835,8 +980,9 @@ no_files: * d_name - pipe: will go nicely and kill the special-casing in procfs. */ -static struct super_block *pipefs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static struct super_block * +pipefs_get_sb(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data) { return get_sb_pseudo(fs_type, "pipe:", NULL, PIPEFS_MAGIC); } @@ -850,6 +996,7 @@ static struct file_system_type pipe_fs_type = { static int __init init_pipe_fs(void) { int err = register_filesystem(&pipe_fs_type); + if (!err) { pipe_mnt = kern_mount(&pipe_fs_type); if (IS_ERR(pipe_mnt)) { diff --git a/fs/proc/base.c b/fs/proc/base.c index a3a3eecef68..6cc77dc3f3f 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -297,16 +297,20 @@ static int proc_fd_link(struct inode *inode, struct dentry **dentry, struct vfsm files = get_files_struct(task); if (files) { - rcu_read_lock(); + /* + * We are not taking a ref to the file structure, so we must + * hold ->file_lock. + */ + spin_lock(&files->file_lock); file = fcheck_files(files, fd); if (file) { *mnt = mntget(file->f_vfsmnt); *dentry = dget(file->f_dentry); - rcu_read_unlock(); + spin_unlock(&files->file_lock); put_files_struct(files); return 0; } - rcu_read_unlock(); + spin_unlock(&files->file_lock); put_files_struct(files); } return -ENOENT; @@ -1523,7 +1527,12 @@ static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry, if (!files) goto out_unlock; inode->i_mode = S_IFLNK; - rcu_read_lock(); + + /* + * We are not taking a ref to the file structure, so we must + * hold ->file_lock. + */ + spin_lock(&files->file_lock); file = fcheck_files(files, fd); if (!file) goto out_unlock2; @@ -1531,7 +1540,7 @@ static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry, inode->i_mode |= S_IRUSR | S_IXUSR; if (file->f_mode & 2) inode->i_mode |= S_IWUSR | S_IXUSR; - rcu_read_unlock(); + spin_unlock(&files->file_lock); put_files_struct(files); inode->i_op = &proc_pid_link_inode_operations; inode->i_size = 64; @@ -1541,7 +1550,7 @@ static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry, return NULL; out_unlock2: - rcu_read_unlock(); + spin_unlock(&files->file_lock); put_files_struct(files); out_unlock: iput(inode); diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 7efa73d44c9..20d4b2237fc 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -103,8 +103,8 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) { ssize_t acc = 0, tmp; - size_t tsz, nr_bytes; - u64 start; + size_t tsz; + u64 start, nr_bytes; struct vmcore *curr_m = NULL; if (buflen == 0 || *fpos >= vmcore_size) diff --git a/fs/read_write.c b/fs/read_write.c index 6256ca81a71..5bc0e9234f9 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -202,7 +202,7 @@ int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count goto Einval; inode = file->f_dentry->d_inode; - if (inode->i_flock && MANDATORY_LOCK(inode)) { + if (unlikely(inode->i_flock && MANDATORY_LOCK(inode))) { int retval = locks_mandatory_area( read_write == READ ? 
FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE, inode, file, pos, count); diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index 58c418fbca2..97ae1b92bc4 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -408,8 +408,9 @@ int reiserfs_cache_default_acl(struct inode *inode) acl = reiserfs_get_acl(inode, ACL_TYPE_DEFAULT); reiserfs_read_unlock_xattrs(inode->i_sb); reiserfs_read_unlock_xattr_i(inode); - ret = acl ? 1 : 0; - posix_acl_release(acl); + ret = (acl && !IS_ERR(acl)); + if (ret) + posix_acl_release(acl); } return ret; diff --git a/fs/select.c b/fs/select.c index 071660fa7b0..a8109baa5e4 100644 --- a/fs/select.c +++ b/fs/select.c @@ -310,8 +310,9 @@ static int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, s64 *timeout) { fd_set_bits fds; - char *bits; - int ret, size, max_fdset; + void *bits; + int ret, max_fdset; + unsigned int size; struct fdtable *fdt; /* Allocate small arguments on the stack to save memory and be faster */ long stack_fds[SELECT_STACK_ALLOC/sizeof(long)]; @@ -333,20 +334,21 @@ static int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, * since we used fdset we need to allocate memory in units of * long-words. */ - ret = -ENOMEM; size = FDS_BYTES(n); - if (6*size < SELECT_STACK_ALLOC) - bits = stack_fds; - else + bits = stack_fds; + if (size > sizeof(stack_fds) / 6) { + /* Not enough space in on-stack array; must use kmalloc */ + ret = -ENOMEM; bits = kmalloc(6 * size, GFP_KERNEL); - if (!bits) - goto out_nofds; - fds.in = (unsigned long *) bits; - fds.out = (unsigned long *) (bits + size); - fds.ex = (unsigned long *) (bits + 2*size); - fds.res_in = (unsigned long *) (bits + 3*size); - fds.res_out = (unsigned long *) (bits + 4*size); - fds.res_ex = (unsigned long *) (bits + 5*size); + if (!bits) + goto out_nofds; + } + fds.in = bits; + fds.out = bits + size; + fds.ex = bits + 2*size; + fds.res_in = bits + 3*size; + fds.res_out = bits + 4*size; + fds.res_ex = bits + 5*size; if ((ret = get_fd_set(n, inp, fds.in)) || (ret = get_fd_set(n, outp, fds.out)) || diff --git a/fs/smbfs/dir.c b/fs/smbfs/dir.c index 34c7a11d91f..70d9c5a37f5 100644 --- a/fs/smbfs/dir.c +++ b/fs/smbfs/dir.c @@ -434,6 +434,11 @@ smb_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) if (dentry->d_name.len > SMB_MAXNAMELEN) goto out; + /* Do not allow lookup of names with backslashes in */ + error = -EINVAL; + if (memchr(dentry->d_name.name, '\\', dentry->d_name.len)) + goto out; + lock_kernel(); error = smb_proc_getattr(dentry, &finfo); #ifdef SMBFS_PARANOIA diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c index c56bd99a970..ed9a24d19d7 100644 --- a/fs/smbfs/file.c +++ b/fs/smbfs/file.c @@ -178,11 +178,9 @@ smb_writepage(struct page *page, struct writeback_control *wbc) unsigned offset = PAGE_CACHE_SIZE; int err; - if (!mapping) - BUG(); + BUG_ON(!mapping); inode = mapping->host; - if (!inode) - BUG(); + BUG_ON(!inode); end_index = inode->i_size >> PAGE_CACHE_SHIFT; diff --git a/fs/smbfs/request.c b/fs/smbfs/request.c index c71c375863c..c71dd2760d3 100644 --- a/fs/smbfs/request.c +++ b/fs/smbfs/request.c @@ -339,9 +339,11 @@ int smb_add_request(struct smb_request *req) /* * On timeout or on interrupt we want to try and remove the * request from the recvq/xmitq. + * First check if the request is still part of a queue. 
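
The core_sys_select() rework above replaces six casts with pointer arithmetic on a single void * region, heap-allocating only when the six bitmaps outgrow the on-stack array. A userspace model of the same carving (the names and the 256-byte stack size are ours; arithmetic on void * is the GCC extension the kernel already relies on):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define STACK_BYTES 256

int main(void)
{
	long stack_buf[STACK_BYTES / sizeof(long)];
	size_t size = 64;	/* bytes per bitmap: FDS_BYTES(n) in the kernel */
	void *bits = stack_buf;
	unsigned long *in, *out, *ex, *res_in, *res_out, *res_ex;

	/* six equal regions must fit on the stack, else kmalloc/malloc */
	if (size > sizeof(stack_buf) / 6) {
		bits = malloc(6 * size);
		if (!bits)
			return 1;
	}
	in      = bits;
	out     = bits + size;
	ex      = bits + 2 * size;
	res_in  = bits + 3 * size;
	res_out = bits + 4 * size;
	res_ex  = bits + 5 * size;
	memset(bits, 0, 6 * size);
	printf("in=%p res_ex=%p\n", (void *)in, (void *)res_ex);
	if (bits != (void *)stack_buf)
		free(bits);
	return 0;
}
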
(May + * have been removed by some error condition) */ smb_lock_server(server); - if (!(req->rq_flags & SMB_REQ_RECEIVED)) { + if (!list_empty(&req->rq_queue)) { list_del_init(&req->rq_queue); smb_rput(req); } diff --git a/fs/splice.c b/fs/splice.c index 7c2bbf18d7a..a285fd746dc 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -9,11 +9,12 @@ * that transfers data buffers to or from a pipe buffer. * * Named by Larry McVoy, original implementation from Linus, extended by - * Jens to support splicing to files and fixing the initial implementation - * bugs. + * Jens to support splicing to files, network, direct splicing, etc and + * fixing lots of bugs. * - * Copyright (C) 2005 Jens Axboe <axboe@suse.de> - * Copyright (C) 2005 Linus Torvalds <torvalds@osdl.org> + * Copyright (C) 2005-2006 Jens Axboe <axboe@suse.de> + * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org> + * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu> * */ #include <linux/fs.h> @@ -22,143 +23,196 @@ #include <linux/pipe_fs_i.h> #include <linux/mm_inline.h> #include <linux/swap.h> +#include <linux/writeback.h> +#include <linux/buffer_head.h> #include <linux/module.h> +#include <linux/syscalls.h> +#include <linux/uio.h> + +struct partial_page { + unsigned int offset; + unsigned int len; +}; /* - * Passed to the actors + * Passed to splice_to_pipe */ -struct splice_desc { - unsigned int len, total_len; /* current and remaining length */ +struct splice_pipe_desc { + struct page **pages; /* page map */ + struct partial_page *partial; /* pages[] may not be contig */ + int nr_pages; /* number of pages in map */ unsigned int flags; /* splice flags */ - struct file *file; /* file to read/write */ - loff_t pos; /* file position */ + struct pipe_buf_operations *ops;/* ops associated with output pipe */ }; -static int page_cache_pipe_buf_steal(struct pipe_inode_info *info, +/* + * Attempt to steal a page from a pipe buffer. This should perhaps go into + * a vm helper function, it's already simplified quite a bit by the + * addition of remove_mapping(). If success is returned, the caller may + * attempt to reuse this page for another destination. + */ +static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { struct page *page = buf->page; + struct address_space *mapping = page_mapping(page); + + lock_page(page); - WARN_ON(!PageLocked(page)); WARN_ON(!PageUptodate(page)); - if (!remove_mapping(page_mapping(page), page)) - return 1; + /* + * At least for ext2 with nobh option, we need to wait on writeback + * completing on this page, since we'll remove it from the pagecache. + * Otherwise truncate wont wait on the page, allowing the disk + * blocks to be reused by someone else before we actually wrote our + * data to them. fs corruption ensues. 
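
page_cache_pipe_buf_steal() above is what makes SPLICE_F_MOVE possible. Its preconditions, condensed (our summary, not a comment from the patch):

/*
 * A page can be stolen from the pipe only if:
 *   1. writeback has completed -- otherwise truncate on the source
 *      file would not wait on the page and its disk blocks could be
 *      reused while our data is still in flight;
 *   2. fs-private state (e.g. buffer heads) can be released; and
 *   3. remove_mapping() succeeds, i.e. nobody else still holds a
 *      reference keeping the page in the source page cache.
 * On success the caller owns a locked page it may insert into the
 * destination file's address_space instead of copying the data.
 */
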
+ */ + wait_on_page_writeback(page); - if (PageLRU(page)) { - struct zone *zone = page_zone(page); + if (PagePrivate(page)) + try_to_release_page(page, mapping_gfp_mask(mapping)); - spin_lock_irq(&zone->lru_lock); - BUG_ON(!PageLRU(page)); - __ClearPageLRU(page); - del_page_from_lru(zone, page); - spin_unlock_irq(&zone->lru_lock); + if (!remove_mapping(mapping, page)) { + unlock_page(page); + return 1; } - buf->stolen = 1; + buf->flags |= PIPE_BUF_FLAG_LRU; return 0; } -static void page_cache_pipe_buf_release(struct pipe_inode_info *info, +static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { page_cache_release(buf->page); - buf->page = NULL; - buf->stolen = 0; + buf->flags &= ~PIPE_BUF_FLAG_LRU; } -static void *page_cache_pipe_buf_map(struct file *file, - struct pipe_inode_info *info, - struct pipe_buffer *buf) +static int page_cache_pipe_buf_pin(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) { struct page *page = buf->page; - - lock_page(page); + int err; if (!PageUptodate(page)) { - unlock_page(page); - return ERR_PTR(-EIO); - } + lock_page(page); + + /* + * Page got truncated/unhashed. This will cause a 0-byte + * splice, if this is the first page. + */ + if (!page->mapping) { + err = -ENODATA; + goto error; + } + + /* + * Uh oh, read-error from disk. + */ + if (!PageUptodate(page)) { + err = -EIO; + goto error; + } - if (!page->mapping) { + /* + * Page is ok afterall, we are done. + */ unlock_page(page); - return ERR_PTR(-ENODATA); } - return kmap(buf->page); + return 0; +error: + unlock_page(page); + return err; } -static void page_cache_pipe_buf_unmap(struct pipe_inode_info *info, - struct pipe_buffer *buf) +static struct pipe_buf_operations page_cache_pipe_buf_ops = { + .can_merge = 0, + .map = generic_pipe_buf_map, + .unmap = generic_pipe_buf_unmap, + .pin = page_cache_pipe_buf_pin, + .release = page_cache_pipe_buf_release, + .steal = page_cache_pipe_buf_steal, + .get = generic_pipe_buf_get, +}; + +static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) { - if (!buf->stolen) - unlock_page(buf->page); - kunmap(buf->page); + if (!(buf->flags & PIPE_BUF_FLAG_GIFT)) + return 1; + + buf->flags |= PIPE_BUF_FLAG_LRU; + return generic_pipe_buf_steal(pipe, buf); } -static struct pipe_buf_operations page_cache_pipe_buf_ops = { +static struct pipe_buf_operations user_page_pipe_buf_ops = { .can_merge = 0, - .map = page_cache_pipe_buf_map, - .unmap = page_cache_pipe_buf_unmap, + .map = generic_pipe_buf_map, + .unmap = generic_pipe_buf_unmap, + .pin = generic_pipe_buf_pin, .release = page_cache_pipe_buf_release, - .steal = page_cache_pipe_buf_steal, + .steal = user_page_pipe_buf_steal, + .get = generic_pipe_buf_get, }; -static ssize_t move_to_pipe(struct inode *inode, struct page **pages, - int nr_pages, unsigned long offset, - unsigned long len) +/* + * Pipe output worker. This sets up our pipe format with the page cache + * pipe buffer operations. Otherwise very similar to the regular pipe_writev(). 
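
splice_to_pipe() keeps pipe_writev()'s waiting logic but consumes a prepared page/partial map instead of copying from an iovec. Its return contract, summarized (our condensation of the code below):

/*
 * splice_to_pipe() returns:
 *   > 0          bytes queued into pipe buffers
 *   -EPIPE       no readers left (SIGPIPE already sent)
 *   -EAGAIN      pipe full, nothing queued, SPLICE_F_NONBLOCK set
 *   -ERESTARTSYS signal arrived while waiting for room
 * Any source pages not consumed are released before returning.
 */
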
+ */ +static ssize_t splice_to_pipe(struct pipe_inode_info *pipe, + struct splice_pipe_desc *spd) { - struct pipe_inode_info *info; - int ret, do_wakeup, i; + int ret, do_wakeup, page_nr; ret = 0; do_wakeup = 0; - i = 0; + page_nr = 0; - mutex_lock(PIPE_MUTEX(*inode)); + if (pipe->inode) + mutex_lock(&pipe->inode->i_mutex); - info = inode->i_pipe; for (;;) { - int bufs; - - if (!PIPE_READERS(*inode)) { + if (!pipe->readers) { send_sig(SIGPIPE, current, 0); if (!ret) ret = -EPIPE; break; } - bufs = info->nrbufs; - if (bufs < PIPE_BUFFERS) { - int newbuf = (info->curbuf + bufs) & (PIPE_BUFFERS - 1); - struct pipe_buffer *buf = info->bufs + newbuf; - struct page *page = pages[i++]; - unsigned long this_len; - - this_len = PAGE_CACHE_SIZE - offset; - if (this_len > len) - this_len = len; - - buf->page = page; - buf->offset = offset; - buf->len = this_len; - buf->ops = &page_cache_pipe_buf_ops; - info->nrbufs = ++bufs; - do_wakeup = 1; - - ret += this_len; - len -= this_len; - offset = 0; - if (!--nr_pages) - break; - if (!len) + if (pipe->nrbufs < PIPE_BUFFERS) { + int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1); + struct pipe_buffer *buf = pipe->bufs + newbuf; + + buf->page = spd->pages[page_nr]; + buf->offset = spd->partial[page_nr].offset; + buf->len = spd->partial[page_nr].len; + buf->ops = spd->ops; + if (spd->flags & SPLICE_F_GIFT) + buf->flags |= PIPE_BUF_FLAG_GIFT; + + pipe->nrbufs++; + page_nr++; + ret += buf->len; + + if (pipe->inode) + do_wakeup = 1; + + if (!--spd->nr_pages) break; - if (bufs < PIPE_BUFFERS) + if (pipe->nrbufs < PIPE_BUFFERS) continue; break; } + if (spd->flags & SPLICE_F_NONBLOCK) { + if (!ret) + ret = -EAGAIN; + break; + } + if (signal_pending(current)) { if (!ret) ret = -ERESTARTSYS; @@ -166,137 +220,272 @@ static ssize_t move_to_pipe(struct inode *inode, struct page **pages, } if (do_wakeup) { - wake_up_interruptible_sync(PIPE_WAIT(*inode)); - kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, - POLL_IN); + smp_mb(); + if (waitqueue_active(&pipe->wait)) + wake_up_interruptible_sync(&pipe->wait); + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); do_wakeup = 0; } - PIPE_WAITING_WRITERS(*inode)++; - pipe_wait(inode); - PIPE_WAITING_WRITERS(*inode)--; + pipe->waiting_writers++; + pipe_wait(pipe); + pipe->waiting_writers--; } - mutex_unlock(PIPE_MUTEX(*inode)); + if (pipe->inode) + mutex_unlock(&pipe->inode->i_mutex); if (do_wakeup) { - wake_up_interruptible(PIPE_WAIT(*inode)); - kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN); + smp_mb(); + if (waitqueue_active(&pipe->wait)) + wake_up_interruptible(&pipe->wait); + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); } - while (i < nr_pages) - page_cache_release(pages[i++]); + while (page_nr < spd->nr_pages) + page_cache_release(spd->pages[page_nr++]); return ret; } -static int __generic_file_splice_read(struct file *in, struct inode *pipe, - size_t len) +static int +__generic_file_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) { struct address_space *mapping = in->f_mapping; - unsigned int offset, nr_pages; - struct page *pages[PIPE_BUFFERS], *shadow[PIPE_BUFFERS]; + unsigned int loff, nr_pages; + struct page *pages[PIPE_BUFFERS]; + struct partial_page partial[PIPE_BUFFERS]; struct page *page; - pgoff_t index, pidx; - int i, j; - - index = in->f_pos >> PAGE_CACHE_SHIFT; - offset = in->f_pos & ~PAGE_CACHE_MASK; - nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + pgoff_t index, end_index; + loff_t isize; + 
size_t total_len; + int error, page_nr; + struct splice_pipe_desc spd = { + .pages = pages, + .partial = partial, + .flags = flags, + .ops = &page_cache_pipe_buf_ops, + }; + + index = *ppos >> PAGE_CACHE_SHIFT; + loff = *ppos & ~PAGE_CACHE_MASK; + nr_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; if (nr_pages > PIPE_BUFFERS) nr_pages = PIPE_BUFFERS; /* - * initiate read-ahead on this page range + * Initiate read-ahead on this page range. however, don't call into + * read-ahead if this is a non-zero offset (we are likely doing small + * chunk splice and the page is already there) for a single page. */ - do_page_cache_readahead(mapping, in, index, nr_pages); + if (!loff || nr_pages > 1) + page_cache_readahead(mapping, &in->f_ra, in, index, nr_pages); /* - * Get as many pages from the page cache as possible.. - * Start IO on the page cache entries we create (we - * can assume that any pre-existing ones we find have - * already had IO started on them). + * Now fill in the holes: */ - i = find_get_pages(mapping, index, nr_pages, pages); + error = 0; + total_len = 0; /* - * common case - we found all pages and they are contiguous, - * kick them off + * Lookup the (hopefully) full range of pages we need. */ - if (i && (pages[i - 1]->index == index + i - 1)) - goto splice_them; + spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, pages); /* - * fill shadow[] with pages at the right locations, so we only - * have to fill holes + * If find_get_pages_contig() returned fewer pages than we needed, + * allocate the rest. */ - memset(shadow, 0, i * sizeof(struct page *)); - for (j = 0, pidx = index; j < i; pidx++, j++) - shadow[pages[j]->index - pidx] = pages[j]; + index += spd.nr_pages; + while (spd.nr_pages < nr_pages) { + /* + * Page could be there, find_get_pages_contig() breaks on + * the first hole. + */ + page = find_get_page(mapping, index); + if (!page) { + /* + * Make sure the read-ahead engine is notified + * about this failure. + */ + handle_ra_miss(mapping, &in->f_ra, index); + + /* + * page didn't exist, allocate one. + */ + page = page_cache_alloc_cold(mapping); + if (!page) + break; + + error = add_to_page_cache_lru(page, mapping, index, + mapping_gfp_mask(mapping)); + if (unlikely(error)) { + page_cache_release(page); + if (error == -EEXIST) + continue; + break; + } + /* + * add_to_page_cache() locks the page, unlock it + * to avoid convoluting the logic below even more. + */ + unlock_page(page); + } + + pages[spd.nr_pages++] = page; + index++; + } /* - * now fill in the holes + * Now loop over the map and see if we need to start IO on any + * pages, fill in the partial map, etc. 
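
The partial[] map lets pipe buffers describe sub-page ranges of the source file. A worked example with assumed numbers (ours, illustrative only): *ppos = 5000 and len = 9000 on 4096-byte pages gives nr_pages = 3 and three map entries:

/*
 * index = 5000 >> 12 = 1, loff = 5000 & 4095 = 904
 *   partial[0] = { .offset = 904, .len = 3192 }   page 1
 *   partial[1] = { .offset = 0,   .len = 4096 }   page 2
 *   partial[2] = { .offset = 0,   .len = 1712 }   page 3
 * 3192 + 4096 + 1712 == 9000 == len
 */
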
*/ - for (i = 0, pidx = index; i < nr_pages; pidx++, i++) { - int error; + index = *ppos >> PAGE_CACHE_SHIFT; + nr_pages = spd.nr_pages; + spd.nr_pages = 0; + for (page_nr = 0; page_nr < nr_pages; page_nr++) { + unsigned int this_len; - if (shadow[i]) - continue; + if (!len) + break; /* - * no page there, look one up / create it + * this_len is the max we'll use from this page */ - page = find_or_create_page(mapping, pidx, - mapping_gfp_mask(mapping)); - if (!page) - break; + this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff); + page = pages[page_nr]; - if (PageUptodate(page)) - unlock_page(page); - else { - error = mapping->a_ops->readpage(in, page); + /* + * If the page isn't uptodate, we may need to start io on it + */ + if (!PageUptodate(page)) { + /* + * If in nonblock mode then dont block on waiting + * for an in-flight io page + */ + if (flags & SPLICE_F_NONBLOCK) + break; + lock_page(page); + + /* + * page was truncated, stop here. if this isn't the + * first page, we'll just complete what we already + * added + */ + if (!page->mapping) { + unlock_page(page); + break; + } + /* + * page was already under io and is now done, great + */ + if (PageUptodate(page)) { + unlock_page(page); + goto fill_it; + } + + /* + * need to read in the page + */ + error = mapping->a_ops->readpage(in, page); if (unlikely(error)) { - page_cache_release(page); + /* + * We really should re-lookup the page here, + * but it complicates things a lot. Instead + * lets just do what we already stored, and + * we'll get it the next time we are called. + */ + if (error == AOP_TRUNCATED_PAGE) + error = 0; + break; } - } - shadow[i] = page; - } - if (!i) { - for (i = 0; i < nr_pages; i++) { - if (shadow[i]) - page_cache_release(shadow[i]); + /* + * i_size must be checked after ->readpage(). + */ + isize = i_size_read(mapping->host); + end_index = (isize - 1) >> PAGE_CACHE_SHIFT; + if (unlikely(!isize || index > end_index)) + break; + + /* + * if this is the last page, see if we need to shrink + * the length and stop + */ + if (end_index == index) { + loff = PAGE_CACHE_SIZE - (isize & ~PAGE_CACHE_MASK); + if (total_len + loff > isize) + break; + /* + * force quit after adding this page + */ + len = this_len; + this_len = min(this_len, loff); + loff = 0; + } } - return 0; +fill_it: + partial[page_nr].offset = loff; + partial[page_nr].len = this_len; + len -= this_len; + total_len += this_len; + loff = 0; + spd.nr_pages++; + index++; } - memcpy(pages, shadow, i * sizeof(struct page *)); - /* - * Now we splice them into the pipe.. + * Release any pages at the end, if we quit early. 'i' is how far + * we got, 'nr_pages' is how many pages are in the map. */ -splice_them: - return move_to_pipe(pipe, pages, i, offset, len); + while (page_nr < nr_pages) + page_cache_release(pages[page_nr++]); + + if (spd.nr_pages) + return splice_to_pipe(pipe, &spd); + + return error; } -ssize_t generic_file_splice_read(struct file *in, struct inode *pipe, - size_t len, unsigned int flags) +/** + * generic_file_splice_read - splice data from file to a pipe + * @in: file to splice from + * @pipe: pipe to splice to + * @len: number of bytes to splice + * @flags: splice modifier flags + * + * Will read pages from given file and fill them into a pipe. 
+ */ +ssize_t generic_file_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) { ssize_t spliced; int ret; ret = 0; spliced = 0; + while (len) { - ret = __generic_file_splice_read(in, pipe, len); + ret = __generic_file_splice_read(in, ppos, pipe, len, flags); - if (ret <= 0) + if (ret < 0) break; + else if (!ret) { + if (spliced) + break; + if (flags & SPLICE_F_NONBLOCK) { + ret = -EAGAIN; + break; + } + } - in->f_pos += ret; + *ppos += ret; len -= ret; spliced += ret; } @@ -307,38 +496,28 @@ ssize_t generic_file_splice_read(struct file *in, struct inode *pipe, return ret; } +EXPORT_SYMBOL(generic_file_splice_read); + /* - * Send 'len' bytes to socket from 'file' at position 'pos' using sendpage(). + * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos' + * using sendpage(). Return the number of bytes sent. */ -static int pipe_to_sendpage(struct pipe_inode_info *info, +static int pipe_to_sendpage(struct pipe_inode_info *pipe, struct pipe_buffer *buf, struct splice_desc *sd) { struct file *file = sd->file; loff_t pos = sd->pos; - unsigned int offset; - ssize_t ret; - void *ptr; - - /* - * sub-optimal, but we are limited by the pipe ->map. we don't - * need a kmap'ed buffer here, we just want to make sure we - * have the page pinned if the pipe page originates from the - * page cache - */ - ptr = buf->ops->map(file, info, buf); - if (IS_ERR(ptr)) - return PTR_ERR(ptr); + int ret, more; - offset = pos & ~PAGE_CACHE_MASK; + ret = buf->ops->pin(pipe, buf); + if (!ret) { + more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len; - ret = file->f_op->sendpage(file, buf->page, offset, sd->len, &pos, - sd->len < sd->total_len); - - buf->ops->unmap(info, buf); - if (ret == sd->len) - return 0; + ret = file->f_op->sendpage(file, buf->page, buf->offset, + sd->len, &pos, more); + } - return -EIO; + return ret; } /* @@ -354,58 +533,87 @@ static int pipe_to_sendpage(struct pipe_inode_info *info, * - Destination page does not exist, we can add the pipe page to * the page cache and avoid the copy. * - * For now we just do the slower thing and always copy pages over, it's - * easier than migrating pages from the pipe to the target file. For the - * case of doing file | file splicing, the migrate approach had some LRU - * nastiness... + * If asked to move pages to the output file (SPLICE_F_MOVE is set in + * sd->flags), we attempt to migrate pages from the pipe to the output + * file address space page cache. This is possible if no one else has + * the pipe page referenced outside of the pipe and page cache. If + * SPLICE_F_MOVE isn't set, or we cannot move the page, we simply create + * a new page in the output file page cache and fill/dirty that. 
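
The comment above describes pipe_to_file()'s two paths; condensed to their decision shape (our summary, not code from the patch):

/*
 * pipe_to_file(), in outline:
 *
 *   if (SPLICE_F_MOVE && this_len covers the whole page
 *       && ops->steal() succeeds)
 *           insert the stolen page into the destination page cache
 *           (zero-copy path)
 *   else
 *           find or create the destination cache page, bring it
 *           uptodate if we only partially overwrite it, then memcpy
 *   ->prepare_write() ... ->commit_write() bracket either path
 */
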
*/ -static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf, +static int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf, struct splice_desc *sd) { struct file *file = sd->file; struct address_space *mapping = file->f_mapping; - unsigned int offset; + gfp_t gfp_mask = mapping_gfp_mask(mapping); + unsigned int offset, this_len; struct page *page; pgoff_t index; - char *src; int ret; /* - * after this, page will be locked and unmapped + * make sure the data in this buffer is uptodate */ - src = buf->ops->map(file, info, buf); - if (IS_ERR(src)) - return PTR_ERR(src); + ret = buf->ops->pin(pipe, buf); + if (unlikely(ret)) + return ret; index = sd->pos >> PAGE_CACHE_SHIFT; offset = sd->pos & ~PAGE_CACHE_MASK; + this_len = sd->len; + if (this_len + offset > PAGE_CACHE_SIZE) + this_len = PAGE_CACHE_SIZE - offset; + /* - * reuse buf page, if SPLICE_F_MOVE is set + * Reuse buf page, if SPLICE_F_MOVE is set and we are doing a full + * page. */ - if (sd->flags & SPLICE_F_MOVE) { - if (buf->ops->steal(info, buf)) + if ((sd->flags & SPLICE_F_MOVE) && this_len == PAGE_CACHE_SIZE) { + /* + * If steal succeeds, buf->page is now pruned from the + * pagecache and we can reuse it. The page will also be + * locked on successful return. + */ + if (buf->ops->steal(pipe, buf)) goto find_page; page = buf->page; - if (add_to_page_cache_lru(page, mapping, index, - mapping_gfp_mask(mapping))) + if (add_to_page_cache(page, mapping, index, gfp_mask)) { + unlock_page(page); goto find_page; + } + + page_cache_get(page); + + if (!(buf->flags & PIPE_BUF_FLAG_LRU)) + lru_cache_add(page); } else { find_page: - ret = -ENOMEM; - page = find_or_create_page(mapping, index, - mapping_gfp_mask(mapping)); - if (!page) - goto out; + page = find_lock_page(mapping, index); + if (!page) { + ret = -ENOMEM; + page = page_cache_alloc_cold(mapping); + if (unlikely(!page)) + goto out_nomem; + + /* + * This will also lock the page + */ + ret = add_to_page_cache_lru(page, mapping, index, + gfp_mask); + if (unlikely(ret)) + goto out; + } /* - * If the page is uptodate, it is also locked. If it isn't - * uptodate, we can mark it uptodate if we are filling the - * full page. Otherwise we need to read it in first... + * We get here with the page locked. If the page is also + * uptodate, we don't need to do more. If it isn't, we + * may need to bring it in if we are not going to overwrite + * the full page. */ if (!PageUptodate(page)) { - if (sd->len < PAGE_CACHE_SIZE) { + if (this_len < PAGE_CACHE_SIZE) { ret = mapping->a_ops->readpage(file, page); if (unlikely(ret)) goto out; @@ -414,7 +622,7 @@ find_page: if (!PageUptodate(page)) { /* - * page got invalidated, repeat + * Page got invalidated, repeat. */ if (!page->mapping) { unlock_page(page); @@ -424,48 +632,73 @@ find_page: ret = -EIO; goto out; } - } else { - WARN_ON(!PageLocked(page)); + } else SetPageUptodate(page); - } } } - ret = mapping->a_ops->prepare_write(file, page, 0, sd->len); - if (ret) + ret = mapping->a_ops->prepare_write(file, page, offset, offset+this_len); + if (unlikely(ret)) { + loff_t isize = i_size_read(mapping->host); + + if (ret != AOP_TRUNCATED_PAGE) + unlock_page(page); + page_cache_release(page); + if (ret == AOP_TRUNCATED_PAGE) + goto find_page; + + /* + * prepare_write() may have instantiated a few blocks + * outside i_size. Trim these off again. 
+ */ + if (sd->pos + this_len > isize) + vmtruncate(mapping->host, isize); + goto out; + } - if (!buf->stolen) { - char *dst = kmap_atomic(page, KM_USER0); + if (buf->page != page) { + /* + * Careful, ->map() uses KM_USER0! + */ + char *src = buf->ops->map(pipe, buf, 1); + char *dst = kmap_atomic(page, KM_USER1); - memcpy(dst + offset, src + buf->offset, sd->len); + memcpy(dst + offset, src + buf->offset, this_len); flush_dcache_page(page); - kunmap_atomic(dst, KM_USER0); + kunmap_atomic(dst, KM_USER1); + buf->ops->unmap(pipe, buf, src); } - ret = mapping->a_ops->commit_write(file, page, 0, sd->len); - if (ret < 0) - goto out; - - set_page_dirty(page); - ret = write_one_page(page, 0); -out: - if (ret < 0) - unlock_page(page); - if (!buf->stolen) + ret = mapping->a_ops->commit_write(file, page, offset, offset+this_len); + if (!ret) { + /* + * Return the number of bytes written and mark page as + * accessed, we are now done! + */ + ret = this_len; + mark_page_accessed(page); + balance_dirty_pages_ratelimited(mapping); + } else if (ret == AOP_TRUNCATED_PAGE) { page_cache_release(page); - buf->ops->unmap(info, buf); + goto find_page; + } +out: + page_cache_release(page); + unlock_page(page); +out_nomem: return ret; } -typedef int (splice_actor)(struct pipe_inode_info *, struct pipe_buffer *, - struct splice_desc *); - -static ssize_t move_from_pipe(struct inode *inode, struct file *out, - size_t len, unsigned int flags, - splice_actor *actor) +/* + * Pipe input worker. Most of this logic works like a regular pipe, the + * key here is the 'actor' worker passed in that actually moves the data + * to the wanted destination. See pipe_to_file/pipe_to_sendpage above. + */ +ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out, + loff_t *ppos, size_t len, unsigned int flags, + splice_actor *actor) { - struct pipe_inode_info *info; int ret, do_wakeup, err; struct splice_desc sd; @@ -475,58 +708,66 @@ static ssize_t move_from_pipe(struct inode *inode, struct file *out, sd.total_len = len; sd.flags = flags; sd.file = out; - sd.pos = out->f_pos; + sd.pos = *ppos; - mutex_lock(PIPE_MUTEX(*inode)); + if (pipe->inode) + mutex_lock(&pipe->inode->i_mutex); - info = inode->i_pipe; for (;;) { - int bufs = info->nrbufs; - - if (bufs) { - int curbuf = info->curbuf; - struct pipe_buffer *buf = info->bufs + curbuf; + if (pipe->nrbufs) { + struct pipe_buffer *buf = pipe->bufs + pipe->curbuf; struct pipe_buf_operations *ops = buf->ops; sd.len = buf->len; if (sd.len > sd.total_len) sd.len = sd.total_len; - err = actor(info, buf, &sd); - if (err) { + err = actor(pipe, buf, &sd); + if (err <= 0) { if (!ret && err != -ENODATA) ret = err; break; } - ret += sd.len; - buf->offset += sd.len; - buf->len -= sd.len; + ret += err; + buf->offset += err; + buf->len -= err; + + sd.len -= err; + sd.pos += err; + sd.total_len -= err; + if (sd.len) + continue; + if (!buf->len) { buf->ops = NULL; - ops->release(info, buf); - curbuf = (curbuf + 1) & (PIPE_BUFFERS - 1); - info->curbuf = curbuf; - info->nrbufs = --bufs; - do_wakeup = 1; + ops->release(pipe, buf); + pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1); + pipe->nrbufs--; + if (pipe->inode) + do_wakeup = 1; } - sd.pos += sd.len; - sd.total_len -= sd.len; if (!sd.total_len) break; } - if (bufs) + if (pipe->nrbufs) continue; - if (!PIPE_WRITERS(*inode)) + if (!pipe->writers) break; - if (!PIPE_WAITING_WRITERS(*inode)) { + if (!pipe->waiting_writers) { if (ret) break; } + if (flags & SPLICE_F_NONBLOCK) { + if (!ret) + ret = -EAGAIN; + break; + } + if 
(signal_pending(current)) { if (!ret) ret = -ERESTARTSYS; @@ -534,108 +775,497 @@ static ssize_t move_from_pipe(struct inode *inode, struct file *out, } if (do_wakeup) { - wake_up_interruptible_sync(PIPE_WAIT(*inode)); - kill_fasync(PIPE_FASYNC_WRITERS(*inode),SIGIO,POLL_OUT); + smp_mb(); + if (waitqueue_active(&pipe->wait)) + wake_up_interruptible_sync(&pipe->wait); + kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); do_wakeup = 0; } - pipe_wait(inode); + pipe_wait(pipe); } - mutex_unlock(PIPE_MUTEX(*inode)); + if (pipe->inode) + mutex_unlock(&pipe->inode->i_mutex); if (do_wakeup) { - wake_up_interruptible(PIPE_WAIT(*inode)); - kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT); + smp_mb(); + if (waitqueue_active(&pipe->wait)) + wake_up_interruptible(&pipe->wait); + kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } - mutex_lock(&out->f_mapping->host->i_mutex); - out->f_pos = sd.pos; - mutex_unlock(&out->f_mapping->host->i_mutex); return ret; - } -ssize_t generic_file_splice_write(struct inode *inode, struct file *out, - size_t len, unsigned int flags) +/** + * generic_file_splice_write - splice data from a pipe to a file + * @pipe: pipe info + * @out: file to write to + * @len: number of bytes to splice + * @flags: splice modifier flags + * + * Will either move or copy pages (determined by @flags options) from + * the given pipe inode to the given file. + * + */ +ssize_t +generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, + loff_t *ppos, size_t len, unsigned int flags) { - return move_from_pipe(inode, out, len, flags, pipe_to_file); + struct address_space *mapping = out->f_mapping; + ssize_t ret; + + ret = splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_file); + if (ret > 0) { + struct inode *inode = mapping->host; + + *ppos += ret; + + /* + * If file or inode is SYNC and we actually wrote some data, + * sync it. + */ + if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) { + int err; + + mutex_lock(&inode->i_mutex); + err = generic_osync_inode(inode, mapping, + OSYNC_METADATA|OSYNC_DATA); + mutex_unlock(&inode->i_mutex); + + if (err) + ret = err; + } + } + + return ret; } -ssize_t generic_splice_sendpage(struct inode *inode, struct file *out, - size_t len, unsigned int flags) +EXPORT_SYMBOL(generic_file_splice_write); + +/** + * generic_splice_sendpage - splice data from a pipe to a socket + * @inode: pipe inode + * @out: socket to write to + * @len: number of bytes to splice + * @flags: splice modifier flags + * + * Will send @len bytes from the pipe to a network socket. No data copying + * is involved. + * + */ +ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out, + loff_t *ppos, size_t len, unsigned int flags) { - return move_from_pipe(inode, out, len, flags, pipe_to_sendpage); + return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage); } -EXPORT_SYMBOL(generic_file_splice_write); -EXPORT_SYMBOL(generic_file_splice_read); +EXPORT_SYMBOL(generic_splice_sendpage); -static long do_splice_from(struct inode *pipe, struct file *out, size_t len, - unsigned int flags) +/* + * Attempt to initiate a splice from pipe to file. 
+ */ +static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, + loff_t *ppos, size_t len, unsigned int flags) { - loff_t pos; int ret; - if (!out->f_op || !out->f_op->splice_write) + if (unlikely(!out->f_op || !out->f_op->splice_write)) return -EINVAL; - if (!(out->f_mode & FMODE_WRITE)) + if (unlikely(!(out->f_mode & FMODE_WRITE))) return -EBADF; - pos = out->f_pos; - ret = rw_verify_area(WRITE, out, &pos, len); + ret = rw_verify_area(WRITE, out, ppos, len); if (unlikely(ret < 0)) return ret; - return out->f_op->splice_write(pipe, out, len, flags); + return out->f_op->splice_write(pipe, out, ppos, len, flags); } -static long do_splice_to(struct file *in, struct inode *pipe, size_t len, +/* + * Attempt to initiate a splice from a file to a pipe. + */ +static long do_splice_to(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, unsigned int flags) { - loff_t pos, isize, left; + loff_t isize, left; int ret; - if (!in->f_op || !in->f_op->splice_read) + if (unlikely(!in->f_op || !in->f_op->splice_read)) return -EINVAL; - if (!(in->f_mode & FMODE_READ)) + if (unlikely(!(in->f_mode & FMODE_READ))) return -EBADF; - pos = in->f_pos; - ret = rw_verify_area(READ, in, &pos, len); + ret = rw_verify_area(READ, in, ppos, len); if (unlikely(ret < 0)) return ret; isize = i_size_read(in->f_mapping->host); - if (unlikely(in->f_pos >= isize)) + if (unlikely(*ppos >= isize)) return 0; - left = isize - in->f_pos; - if (left < len) + left = isize - *ppos; + if (unlikely(left < len)) len = left; - return in->f_op->splice_read(in, pipe, len, flags); + return in->f_op->splice_read(in, ppos, pipe, len, flags); } -static long do_splice(struct file *in, struct file *out, size_t len, - unsigned int flags) +long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, + size_t len, unsigned int flags) { - struct inode *pipe; + struct pipe_inode_info *pipe; + long ret, bytes; + loff_t out_off; + umode_t i_mode; + int i; - pipe = in->f_dentry->d_inode; - if (pipe->i_pipe) - return do_splice_from(pipe, out, len, flags); + /* + * We require the input being a regular file, as we don't want to + * randomly drop data for eg socket -> socket splicing. Use the + * piped splicing for that! + */ + i_mode = in->f_dentry->d_inode->i_mode; + if (unlikely(!S_ISREG(i_mode) && !S_ISBLK(i_mode))) + return -EINVAL; + + /* + * neither in nor out is a pipe, setup an internal pipe attached to + * 'out' and transfer the wanted data from 'in' to 'out' through that + */ + pipe = current->splice_pipe; + if (unlikely(!pipe)) { + pipe = alloc_pipe_info(NULL); + if (!pipe) + return -ENOMEM; - pipe = out->f_dentry->d_inode; - if (pipe->i_pipe) - return do_splice_to(in, pipe, len, flags); + /* + * We don't have an immediate reader, but we'll read the stuff + * out of the pipe right after the splice_to_pipe(). So set + * PIPE_READERS appropriately. + */ + pipe->readers = 1; + + current->splice_pipe = pipe; + } + + /* + * Do the splice. + */ + ret = 0; + bytes = 0; + out_off = 0; + + while (len) { + size_t read_len, max_read_len; + + /* + * Do at most PIPE_BUFFERS pages worth of transfer: + */ + max_read_len = min(len, (size_t)(PIPE_BUFFERS*PAGE_SIZE)); + + ret = do_splice_to(in, ppos, pipe, max_read_len, flags); + if (unlikely(ret < 0)) + goto out_release; + + read_len = ret; + + /* + * NOTE: nonblocking mode only applies to the input. 
We + * must not do the output in nonblocking mode as then we + * could get stuck data in the internal pipe: + */ + ret = do_splice_from(pipe, out, &out_off, read_len, + flags & ~SPLICE_F_NONBLOCK); + if (unlikely(ret < 0)) + goto out_release; + + bytes += ret; + len -= ret; + + /* + * In nonblocking mode, if we got back a short read then + * that was due to either an IO error or due to the + * pagecache entry not being there. In the IO error case + * the _next_ splice attempt will produce a clean IO error + * return value (not a short read), so in both cases it's + * correct to break out of the loop here: + */ + if ((flags & SPLICE_F_NONBLOCK) && (read_len < max_read_len)) + break; + } + + pipe->nrbufs = pipe->curbuf = 0; + + return bytes; + +out_release: + /* + * If we did an incomplete transfer we must release + * the pipe buffers in question: + */ + for (i = 0; i < PIPE_BUFFERS; i++) { + struct pipe_buffer *buf = pipe->bufs + i; + + if (buf->ops) { + buf->ops->release(pipe, buf); + buf->ops = NULL; + } + } + pipe->nrbufs = pipe->curbuf = 0; + + /* + * If we transferred some data, return the number of bytes: + */ + if (bytes > 0) + return bytes; + + return ret; +} + +EXPORT_SYMBOL(do_splice_direct); + +/* + * Determine where to splice to/from. + */ +static long do_splice(struct file *in, loff_t __user *off_in, + struct file *out, loff_t __user *off_out, + size_t len, unsigned int flags) +{ + struct pipe_inode_info *pipe; + loff_t offset, *off; + long ret; + + pipe = in->f_dentry->d_inode->i_pipe; + if (pipe) { + if (off_in) + return -ESPIPE; + if (off_out) { + if (out->f_op->llseek == no_llseek) + return -EINVAL; + if (copy_from_user(&offset, off_out, sizeof(loff_t))) + return -EFAULT; + off = &offset; + } else + off = &out->f_pos; + + ret = do_splice_from(pipe, out, off, len, flags); + + if (off_out && copy_to_user(off_out, off, sizeof(loff_t))) + ret = -EFAULT; + + return ret; + } + + pipe = out->f_dentry->d_inode->i_pipe; + if (pipe) { + if (off_out) + return -ESPIPE; + if (off_in) { + if (in->f_op->llseek == no_llseek) + return -EINVAL; + if (copy_from_user(&offset, off_in, sizeof(loff_t))) + return -EFAULT; + off = &offset; + } else + off = &in->f_pos; + + ret = do_splice_to(in, off, pipe, len, flags); + + if (off_in && copy_to_user(off_in, off, sizeof(loff_t))) + ret = -EFAULT; + + return ret; + } return -EINVAL; } -asmlinkage long sys_splice(int fdin, int fdout, size_t len, unsigned int flags) +/* + * Map an iov into an array of pages and offset/length tuples. With the + * partial_page structure, we can map several non-contiguous ranges into + * our one pages[] map instead of splitting that operation into pieces. + * Could easily be exported as a generic helper for other users, in which + * case one would probably want to add a 'max_nr_pages' parameter as well. + */ +static int get_iovec_page_array(const struct iovec __user *iov, + unsigned int nr_vecs, struct page **pages, + struct partial_page *partial, int aligned) +{ + int buffers = 0, error = 0; + + /* + * It's ok to take the mmap_sem for reading, even + * across a "get_user()". + */ + down_read(&current->mm->mmap_sem); + + while (nr_vecs) { + unsigned long off, npages; + void __user *base; + size_t len; + int i; + + /* + * Get user address base and length for this iovec. + */ + error = get_user(base, &iov->iov_base); + if (unlikely(error)) + break; + error = get_user(len, &iov->iov_len); + if (unlikely(error)) + break; + + /* + * Sanity check this iovec. 0 read succeeds. 
+ */ + if (unlikely(!len)) + break; + error = -EFAULT; + if (unlikely(!base)) + break; + + /* + * Get this base offset and number of pages, then map + * in the user pages. + */ + off = (unsigned long) base & ~PAGE_MASK; + + /* + * If asked for alignment, the offset must be zero and the + * length a multiple of the PAGE_SIZE. + */ + error = -EINVAL; + if (aligned && (off || len & ~PAGE_MASK)) + break; + + npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (npages > PIPE_BUFFERS - buffers) + npages = PIPE_BUFFERS - buffers; + + error = get_user_pages(current, current->mm, + (unsigned long) base, npages, 0, 0, + &pages[buffers], NULL); + + if (unlikely(error <= 0)) + break; + + /* + * Fill this contiguous range into the partial page map. + */ + for (i = 0; i < error; i++) { + const int plen = min_t(size_t, len, PAGE_SIZE - off); + + partial[buffers].offset = off; + partial[buffers].len = plen; + + off = 0; + len -= plen; + buffers++; + } + + /* + * We didn't complete this iov, stop here since it probably + * means we have to move some of this into a pipe to + * be able to continue. + */ + if (len) + break; + + /* + * Don't continue if we mapped fewer pages than we asked for, + * or if we mapped the max number of pages that we have + * room for. + */ + if (error < npages || buffers == PIPE_BUFFERS) + break; + + nr_vecs--; + iov++; + } + + up_read(&current->mm->mmap_sem); + + if (buffers) + return buffers; + + return error; +} + +/* + * vmsplice splices a user address range into a pipe. It can be thought of + * as splice-from-memory, where the regular splice is splice-from-file (or + * to file). In both cases the output is a pipe, naturally. + * + * Note that vmsplice only supports splicing _from_ user memory to a pipe, + * not the other way around. Splicing from user memory is a simple operation + * that can be supported without any funky alignment restrictions or nasty + * vm tricks. We simply map in the user memory and fill it into a pipe. + * The reverse isn't quite as easy, though. There are two possible solutions + * for that: + * + * - memcpy() the data internally, at which point we might as well just + * do a regular read() on the buffer anyway. + * - Lots of nasty vm tricks, that are neither fast nor flexible (it + * has restrictions on both ends of the pipe). + * + * Alas, it isn't here. 
+ * + */ +static long do_vmsplice(struct file *file, const struct iovec __user *iov, + unsigned long nr_segs, unsigned int flags) +{ + struct pipe_inode_info *pipe = file->f_dentry->d_inode->i_pipe; + struct page *pages[PIPE_BUFFERS]; + struct partial_page partial[PIPE_BUFFERS]; + struct splice_pipe_desc spd = { + .pages = pages, + .partial = partial, + .flags = flags, + .ops = &user_page_pipe_buf_ops, + }; + + if (unlikely(!pipe)) + return -EBADF; + if (unlikely(nr_segs > UIO_MAXIOV)) + return -EINVAL; + else if (unlikely(!nr_segs)) + return 0; + + spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial, + flags & SPLICE_F_GIFT); + if (spd.nr_pages <= 0) + return spd.nr_pages; + + return splice_to_pipe(pipe, &spd); +} + +asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov, + unsigned long nr_segs, unsigned int flags) +{ + struct file *file; + long error; + int fput; + + error = -EBADF; + file = fget_light(fd, &fput); + if (file) { + if (file->f_mode & FMODE_WRITE) + error = do_vmsplice(file, iov, nr_segs, flags); + + fput_light(file, fput); + } + + return error; +} + +asmlinkage long sys_splice(int fd_in, loff_t __user *off_in, + int fd_out, loff_t __user *off_out, + size_t len, unsigned int flags) { long error; struct file *in, *out; @@ -645,13 +1275,15 @@ asmlinkage long sys_splice(int fdin, int fdout, size_t len, unsigned int flags) return 0; error = -EBADF; - in = fget_light(fdin, &fput_in); + in = fget_light(fd_in, &fput_in); if (in) { if (in->f_mode & FMODE_READ) { - out = fget_light(fdout, &fput_out); + out = fget_light(fd_out, &fput_out); if (out) { if (out->f_mode & FMODE_WRITE) - error = do_splice(in, out, len, flags); + error = do_splice(in, off_in, + out, off_out, + len, flags); fput_light(out, fput_out); } } @@ -661,3 +1293,198 @@ asmlinkage long sys_splice(int fdin, int fdout, size_t len, unsigned int flags) return error; } + +/* + * Link contents of ipipe to opipe. + */ +static int link_pipe(struct pipe_inode_info *ipipe, + struct pipe_inode_info *opipe, + size_t len, unsigned int flags) +{ + struct pipe_buffer *ibuf, *obuf; + int ret, do_wakeup, i, ipipe_first; + + ret = do_wakeup = ipipe_first = 0; + + /* + * Potential ABBA deadlock, work around it by ordering lock + * grabbing by inode address. Otherwise two different processes + * could deadlock (one doing tee from A -> B, the other from B -> A). + */ + if (ipipe->inode < opipe->inode) { + ipipe_first = 1; + mutex_lock(&ipipe->inode->i_mutex); + mutex_lock(&opipe->inode->i_mutex); + } else { + mutex_lock(&opipe->inode->i_mutex); + mutex_lock(&ipipe->inode->i_mutex); + } + + for (i = 0;; i++) { + if (!opipe->readers) { + send_sig(SIGPIPE, current, 0); + if (!ret) + ret = -EPIPE; + break; + } + if (ipipe->nrbufs - i) { + ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (PIPE_BUFFERS - 1)); + + /* + * If we have room, fill this buffer + */ + if (opipe->nrbufs < PIPE_BUFFERS) { + int nbuf = (opipe->curbuf + opipe->nrbufs) & (PIPE_BUFFERS - 1); + + /* + * Get a reference to this pipe buffer, + * so we can copy the contents over. + */ + ibuf->ops->get(ipipe, ibuf); + + obuf = opipe->bufs + nbuf; + *obuf = *ibuf; + + /* + * Don't inherit the gift flag, we need to + * prevent multiple steals of this page. + */ + obuf->flags &= ~PIPE_BUF_FLAG_GIFT; + + if (obuf->len > len) + obuf->len = len; + + opipe->nrbufs++; + do_wakeup = 1; + ret += obuf->len; + len -= obuf->len; + + if (!len) + break; + if (opipe->nrbufs < PIPE_BUFFERS) + continue; + } + + /* + * We have input available, but no output room. 
+ * If we already copied data, return that. If we + * need to drop the opipe lock, it must be ordered + * last to avoid deadlocks. + */ + if ((flags & SPLICE_F_NONBLOCK) || !ipipe_first) { + if (!ret) + ret = -EAGAIN; + break; + } + if (signal_pending(current)) { + if (!ret) + ret = -ERESTARTSYS; + break; + } + if (do_wakeup) { + smp_mb(); + if (waitqueue_active(&opipe->wait)) + wake_up_interruptible(&opipe->wait); + kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN); + do_wakeup = 0; + } + + opipe->waiting_writers++; + pipe_wait(opipe); + opipe->waiting_writers--; + continue; + } + + /* + * No input buffers, do the usual checks for available + * writers and blocking and wait if necessary + */ + if (!ipipe->writers) + break; + if (!ipipe->waiting_writers) { + if (ret) + break; + } + /* + * pipe_wait() drops the ipipe mutex. To avoid deadlocks + * with another process, we can only safely do that if + * the ipipe lock is ordered last. + */ + if ((flags & SPLICE_F_NONBLOCK) || ipipe_first) { + if (!ret) + ret = -EAGAIN; + break; + } + if (signal_pending(current)) { + if (!ret) + ret = -ERESTARTSYS; + break; + } + + if (waitqueue_active(&ipipe->wait)) + wake_up_interruptible_sync(&ipipe->wait); + kill_fasync(&ipipe->fasync_writers, SIGIO, POLL_OUT); + + pipe_wait(ipipe); + } + + mutex_unlock(&ipipe->inode->i_mutex); + mutex_unlock(&opipe->inode->i_mutex); + + if (do_wakeup) { + smp_mb(); + if (waitqueue_active(&opipe->wait)) + wake_up_interruptible(&opipe->wait); + kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN); + } + + return ret; +} + +/* + * This is a tee(1) implementation that works on pipes. It doesn't copy + * any data, it simply references the 'in' pages on the 'out' pipe. + * The 'flags' used are the SPLICE_F_* variants, currently the only + * applicable one is SPLICE_F_NONBLOCK. + */ +static long do_tee(struct file *in, struct file *out, size_t len, + unsigned int flags) +{ + struct pipe_inode_info *ipipe = in->f_dentry->d_inode->i_pipe; + struct pipe_inode_info *opipe = out->f_dentry->d_inode->i_pipe; + + /* + * Link ipipe to the two output pipes, consuming as we go along. + */ + if (ipipe && opipe) + return link_pipe(ipipe, opipe, len, flags); + + return -EINVAL; +} + +asmlinkage long sys_tee(int fdin, int fdout, size_t len, unsigned int flags) +{ + struct file *in; + int error, fput_in; + + if (unlikely(!len)) + return 0; + + error = -EBADF; + in = fget_light(fdin, &fput_in); + if (in) { + if (in->f_mode & FMODE_READ) { + int fput_out; + struct file *out = fget_light(fdout, &fput_out); + + if (out) { + if (out->f_mode & FMODE_WRITE) + error = do_tee(in, out, len, flags); + fput_light(out, fput_out); + } + } + fput_light(in, fput_in); + } + + return error; +} diff --git a/fs/stat.c b/fs/stat.c index 9948cc1685a..0f282face32 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -261,7 +261,7 @@ asmlinkage long sys_newlstat(char __user *filename, struct stat __user *statbuf) return error; } -#ifndef __ARCH_WANT_STAT64 +#if !defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_SYS_NEWFSTATAT) asmlinkage long sys_newfstatat(int dfd, char __user *filename, struct stat __user *statbuf, int flag) { diff --git a/fs/sync.c b/fs/sync.c index 8616006d209..aab5ffe77e9 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -61,7 +61,7 @@ * will be available after a crash. 
*/ asmlinkage long sys_sync_file_range(int fd, loff_t offset, loff_t nbytes, - int flags) + unsigned int flags) { int ret; struct file *file; @@ -126,7 +126,7 @@ out: * `endbyte' is inclusive */ int do_sync_file_range(struct file *file, loff_t offset, loff_t endbyte, - int flags) + unsigned int flags) { int ret; struct address_space *mapping; diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index f26880a4785..610b5bdbe75 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -43,6 +43,7 @@ static struct sysfs_dirent * sysfs_new_dirent(struct sysfs_dirent * parent_sd, memset(sd, 0, sizeof(*sd)); atomic_set(&sd->s_count, 1); + atomic_set(&sd->s_event, 0); INIT_LIST_HEAD(&sd->s_children); list_add(&sd->s_sibling, &parent_sd->s_children); sd->s_element = element; @@ -50,7 +51,7 @@ static struct sysfs_dirent * sysfs_new_dirent(struct sysfs_dirent * parent_sd, return sd; } -/** +/* * * Return -EEXIST if there is already a sysfs element with the same name for * the same parent. diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 830f76fa098..cf3786625bf 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -6,6 +6,7 @@ #include <linux/fsnotify.h> #include <linux/kobject.h> #include <linux/namei.h> +#include <linux/poll.h> #include <asm/uaccess.h> #include <asm/semaphore.h> @@ -57,6 +58,7 @@ struct sysfs_buffer { struct sysfs_ops * ops; struct semaphore sem; int needs_read_fill; + int event; }; @@ -72,6 +74,7 @@ struct sysfs_buffer { */ static int fill_read_buffer(struct dentry * dentry, struct sysfs_buffer * buffer) { + struct sysfs_dirent * sd = dentry->d_fsdata; struct attribute * attr = to_attr(dentry); struct kobject * kobj = to_kobj(dentry->d_parent); struct sysfs_ops * ops = buffer->ops; @@ -83,6 +86,7 @@ static int fill_read_buffer(struct dentry * dentry, struct sysfs_buffer * buffer if (!buffer->page) return -ENOMEM; + buffer->event = atomic_read(&sd->s_event); count = ops->show(kobj,attr,buffer->page); buffer->needs_read_fill = 0; BUG_ON(count > (ssize_t)PAGE_SIZE); @@ -183,7 +187,7 @@ fill_write_buffer(struct sysfs_buffer * buffer, const char __user * buf, size_t return -ENOMEM; if (count >= PAGE_SIZE) - count = PAGE_SIZE; + count = PAGE_SIZE - 1; error = copy_from_user(buffer->page,buf,count); buffer->needs_read_fill = 1; return error ? -EFAULT : count; @@ -348,12 +352,84 @@ static int sysfs_release(struct inode * inode, struct file * filp) return 0; } +/* Sysfs attribute files are pollable. The idea is that you read + * the content and then you use 'poll' or 'select' to wait for + * the content to change. When the content changes (assuming the + * manager for the kobject supports notification), poll will + * return POLLERR|POLLPRI, and select will return the fd whether + * it is waiting for read, write, or exceptions. + * Once poll/select indicates that the value has changed, you + * need to close and re-open the file, as simply seeking and reading + * again will not get new data or reset the state of 'poll'. + * Reminder: this only works for attributes which actively support + * it, and it is not possible to test an attribute from userspace + * to see if it supports poll (neither 'poll' nor 'select' returns + * an appropriate error code). When in doubt, set a suitable timeout value. 
+ */ +static unsigned int sysfs_poll(struct file *filp, poll_table *wait) +{ + struct sysfs_buffer * buffer = filp->private_data; + struct kobject * kobj = to_kobj(filp->f_dentry->d_parent); + struct sysfs_dirent * sd = filp->f_dentry->d_fsdata; + int res = 0; + + poll_wait(filp, &kobj->poll, wait); + + if (buffer->event != atomic_read(&sd->s_event)) { + res = POLLERR|POLLPRI; + buffer->needs_read_fill = 1; + } + + return res; +} + + +static struct dentry *step_down(struct dentry *dir, const char * name) +{ + struct dentry * de; + + if (dir == NULL || dir->d_inode == NULL) + return NULL; + + mutex_lock(&dir->d_inode->i_mutex); + de = lookup_one_len(name, dir, strlen(name)); + mutex_unlock(&dir->d_inode->i_mutex); + dput(dir); + if (IS_ERR(de)) + return NULL; + if (de->d_inode == NULL) { + dput(de); + return NULL; + } + return de; +} + +void sysfs_notify(struct kobject * k, char *dir, char *attr) +{ + struct dentry *de = k->dentry; + if (de) + dget(de); + if (de && dir) + de = step_down(de, dir); + if (de && attr) + de = step_down(de, attr); + if (de) { + struct sysfs_dirent * sd = de->d_fsdata; + if (sd) + atomic_inc(&sd->s_event); + wake_up_interruptible(&k->poll); + dput(de); + } +} +EXPORT_SYMBOL_GPL(sysfs_notify); + const struct file_operations sysfs_file_operations = { .read = sysfs_read_file, .write = sysfs_write_file, .llseek = generic_file_llseek, .open = sysfs_open_file, .release = sysfs_release, + .poll = sysfs_poll, }; diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index 4c29ac41ac3..f0b347bd12c 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -175,8 +175,7 @@ const unsigned char * sysfs_get_name(struct sysfs_dirent *sd) struct bin_attribute * bin_attr; struct sysfs_symlink * sl; - if (!sd || !sd->s_element) - BUG(); + BUG_ON(!sd || !sd->s_element); switch (sd->s_type) { case SYSFS_DIR: diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index 32958a7c50e..3651ffb5ec0 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -11,6 +11,7 @@ extern int sysfs_make_dirent(struct sysfs_dirent *, struct dentry *, void *, extern int sysfs_add_file(struct dentry *, const struct attribute *, int); extern void sysfs_hash_and_remove(struct dentry * dir, const char * name); +extern struct sysfs_dirent *sysfs_find(struct sysfs_dirent *dir, const char * name); extern int sysfs_create_subdir(struct kobject *, const char *, struct dentry **); extern void sysfs_remove_subdir(struct dentry *); diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c index 8c66e9270dd..d7074341ee8 100644 --- a/fs/sysv/dir.c +++ b/fs/sysv/dir.c @@ -253,8 +253,7 @@ int sysv_delete_entry(struct sysv_dir_entry *de, struct page *page) lock_page(page); err = mapping->a_ops->prepare_write(NULL, page, from, to); - if (err) - BUG(); + BUG_ON(err); de->inode = 0; err = dir_commit_chunk(page, from, to); dir_put_page(page); @@ -353,8 +352,7 @@ void sysv_set_link(struct sysv_dir_entry *de, struct page *page, lock_page(page); err = page->mapping->a_ops->prepare_write(NULL, page, from, to); - if (err) - BUG(); + BUG_ON(err); de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino); err = dir_commit_chunk(page, from, to); dir_put_page(page); diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 81e0e8459af..2983afd5e7f 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -312,12 +312,10 @@ static int udf_get_block(struct inode *inode, sector_t block, struct buffer_head err = 0; bh = inode_getblk(inode, block, &err, &phys, &new); - if (bh) - BUG(); + BUG_ON(bh); if (err) goto abort; - if (!phys) - BUG(); + BUG_ON(!phys); if (new) 
set_buffer_new(bh_result); diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index 6cbbd165c60..4d191ef39b6 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -870,12 +870,14 @@ xfs_page_state_convert( pgoff_t end_index, last_index, tlast; ssize_t size, len; int flags, err, iomap_valid = 0, uptodate = 1; - int page_dirty, count = 0, trylock_flag = 0; + int page_dirty, count = 0; + int trylock = 0; int all_bh = unmapped; - /* wait for other IO threads? */ - if (startio && (wbc->sync_mode == WB_SYNC_NONE && wbc->nonblocking)) - trylock_flag |= BMAPI_TRYLOCK; + if (startio) { + if (wbc->sync_mode == WB_SYNC_NONE && wbc->nonblocking) + trylock |= BMAPI_TRYLOCK; + } /* Is this page beyond the end of the file? */ offset = i_size_read(inode); @@ -956,15 +958,13 @@ xfs_page_state_convert( if (buffer_unwritten(bh)) { type = IOMAP_UNWRITTEN; - flags = BMAPI_WRITE|BMAPI_IGNSTATE; + flags = BMAPI_WRITE | BMAPI_IGNSTATE; } else if (buffer_delay(bh)) { type = IOMAP_DELAY; - flags = BMAPI_ALLOCATE; - if (!startio) - flags |= trylock_flag; + flags = BMAPI_ALLOCATE | trylock; } else { type = IOMAP_NEW; - flags = BMAPI_WRITE|BMAPI_MMAP; + flags = BMAPI_WRITE | BMAPI_MMAP; } if (!iomap_valid) { diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 9fb0312665c..26fed0756f0 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -182,7 +182,7 @@ free_address( { a_list_t *aentry; - aentry = kmalloc(sizeof(a_list_t), GFP_ATOMIC & ~__GFP_HIGH); + aentry = kmalloc(sizeof(a_list_t), GFP_NOWAIT); if (likely(aentry)) { spin_lock(&as_lock); aentry->next = as_free_head; diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index 85997b1205f..c847416f6d1 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -69,7 +69,6 @@ __xfs_file_read( return rval; } - STATIC ssize_t xfs_file_aio_read( struct kiocb *iocb, @@ -90,7 +89,6 @@ xfs_file_aio_read_invis( return __xfs_file_read(iocb, buf, IO_ISAIO|IO_INVIS, count, pos); } - STATIC inline ssize_t __xfs_file_write( struct kiocb *iocb, @@ -113,7 +111,6 @@ __xfs_file_write( return rval; } - STATIC ssize_t xfs_file_aio_write( struct kiocb *iocb, @@ -134,7 +131,6 @@ xfs_file_aio_write_invis( return __xfs_file_write(iocb, buf, IO_ISAIO|IO_INVIS, count, pos); } - STATIC inline ssize_t __xfs_file_readv( struct file *file, @@ -179,7 +175,6 @@ xfs_file_readv_invis( return __xfs_file_readv(file, iov, IO_INVIS, nr_segs, ppos); } - STATIC inline ssize_t __xfs_file_writev( struct file *file, @@ -204,7 +199,6 @@ __xfs_file_writev( return rval; } - STATIC ssize_t xfs_file_writev( struct file *file, @@ -228,7 +222,7 @@ xfs_file_writev_invis( STATIC ssize_t xfs_file_sendfile( struct file *filp, - loff_t *ppos, + loff_t *pos, size_t count, read_actor_t actor, void *target) @@ -236,10 +230,84 @@ xfs_file_sendfile( vnode_t *vp = vn_from_inode(filp->f_dentry->d_inode); ssize_t rval; - VOP_SENDFILE(vp, filp, ppos, 0, count, actor, target, NULL, rval); + VOP_SENDFILE(vp, filp, pos, 0, count, actor, target, NULL, rval); + return rval; +} + +STATIC ssize_t +xfs_file_sendfile_invis( + struct file *filp, + loff_t *pos, + size_t count, + read_actor_t actor, + void *target) +{ + vnode_t *vp = vn_from_inode(filp->f_dentry->d_inode); + ssize_t rval; + + VOP_SENDFILE(vp, filp, pos, IO_INVIS, count, actor, target, NULL, rval); + return rval; +} + +STATIC ssize_t +xfs_file_splice_read( + struct file *infilp, + loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, + unsigned 
int flags) +{ + vnode_t *vp = vn_from_inode(infilp->f_dentry->d_inode); + ssize_t rval; + + VOP_SPLICE_READ(vp, infilp, ppos, pipe, len, flags, 0, NULL, rval); + return rval; +} + +STATIC ssize_t +xfs_file_splice_read_invis( + struct file *infilp, + loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, + unsigned int flags) +{ + vnode_t *vp = vn_from_inode(infilp->f_dentry->d_inode); + ssize_t rval; + + VOP_SPLICE_READ(vp, infilp, ppos, pipe, len, flags, IO_INVIS, NULL, rval); return rval; } +STATIC ssize_t +xfs_file_splice_write( + struct pipe_inode_info *pipe, + struct file *outfilp, + loff_t *ppos, + size_t len, + unsigned int flags) +{ + vnode_t *vp = vn_from_inode(outfilp->f_dentry->d_inode); + ssize_t rval; + + VOP_SPLICE_WRITE(vp, pipe, outfilp, ppos, len, flags, 0, NULL, rval); + return rval; +} + +STATIC ssize_t +xfs_file_splice_write_invis( + struct pipe_inode_info *pipe, + struct file *outfilp, + loff_t *ppos, + size_t len, + unsigned int flags) +{ + vnode_t *vp = vn_from_inode(outfilp->f_dentry->d_inode); + ssize_t rval; + + VOP_SPLICE_WRITE(vp, pipe, outfilp, ppos, len, flags, IO_INVIS, NULL, rval); + return rval; +} STATIC int xfs_file_open( @@ -251,13 +319,10 @@ xfs_file_open( if (!(filp->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS) return -EFBIG; - - ASSERT(vp); VOP_OPEN(vp, NULL, error); return -error; } - STATIC int xfs_file_release( struct inode *inode, @@ -271,7 +336,6 @@ xfs_file_release( return -error; } - STATIC int xfs_file_fsync( struct file *filp, @@ -285,21 +349,11 @@ xfs_file_fsync( if (datasync) flags |= FSYNC_DATA; - - ASSERT(vp); VOP_FSYNC(vp, flags, NULL, (xfs_off_t)0, (xfs_off_t)-1, error); return -error; } -/* - * xfs_file_readdir maps to VOP_READDIR(). - * We need to build a uio, cred, ... - */ - -#define nextdp(dp) ((struct xfs_dirent *)((char *)(dp) + (dp)->d_reclen)) - #ifdef CONFIG_XFS_DMAPI - STATIC struct page * xfs_vm_nopage( struct vm_area_struct *area, @@ -319,10 +373,8 @@ xfs_vm_nopage( return filemap_nopage(area, address, type); } - #endif /* CONFIG_XFS_DMAPI */ - STATIC int xfs_file_readdir( struct file *filp, @@ -330,7 +382,7 @@ xfs_file_readdir( filldir_t filldir) { int error = 0; - vnode_t *vp; + vnode_t *vp = vn_from_inode(filp->f_dentry->d_inode); uio_t uio; iovec_t iov; int eof = 0; @@ -340,9 +392,6 @@ xfs_file_readdir( xfs_off_t start_offset, curr_offset; xfs_dirent_t *dbp = NULL; - vp = vn_from_inode(filp->f_dentry->d_inode); - ASSERT(vp); - /* Try fairly hard to get memory */ do { if ((read_buf = (caddr_t)kmalloc(rlen, GFP_KERNEL))) @@ -387,7 +436,7 @@ xfs_file_readdir( } size -= dbp->d_reclen; curr_offset = (loff_t)dbp->d_off /* & 0x7fffffff */; - dbp = nextdp(dbp); + dbp = (xfs_dirent_t *)((char *)dbp + dbp->d_reclen); } } done: @@ -402,7 +451,6 @@ done: return -error; } - STATIC int xfs_file_mmap( struct file *filp, @@ -457,11 +505,10 @@ xfs_file_ioctl_invis( unsigned int cmd, unsigned long arg) { - int error; struct inode *inode = filp->f_dentry->d_inode; vnode_t *vp = vn_from_inode(inode); + int error; - ASSERT(vp); VOP_IOCTL(vp, inode, filp, IO_INVIS, cmd, (void __user *)arg, error); VMODIFY(vp); @@ -537,6 +584,8 @@ const struct file_operations xfs_file_operations = { .aio_read = xfs_file_aio_read, .aio_write = xfs_file_aio_write, .sendfile = xfs_file_sendfile, + .splice_read = xfs_file_splice_read, + .splice_write = xfs_file_splice_write, .unlocked_ioctl = xfs_file_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = xfs_file_compat_ioctl, @@ -558,7 +607,9 @@ const struct file_operations xfs_invis_file_operations = 
{ .writev = xfs_file_writev_invis, .aio_read = xfs_file_aio_read_invis, .aio_write = xfs_file_aio_write_invis, - .sendfile = xfs_file_sendfile, + .sendfile = xfs_file_sendfile_invis, + .splice_read = xfs_file_splice_read_invis, + .splice_write = xfs_file_splice_write_invis, .unlocked_ioctl = xfs_file_ioctl_invis, #ifdef CONFIG_COMPAT .compat_ioctl = xfs_file_compat_invis_ioctl, diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index 149237304fb..2e2e275c786 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -673,8 +673,7 @@ xfs_vn_setattr( if (ia_valid & ATTR_ATIME) { vattr.va_mask |= XFS_AT_ATIME; vattr.va_atime = attr->ia_atime; - if (ia_valid & ATTR_ATIME_SET) - inode->i_atime = attr->ia_atime; + inode->i_atime = attr->ia_atime; } if (ia_valid & ATTR_MTIME) { vattr.va_mask |= XFS_AT_MTIME; diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h index 1fe09f2d651..e9fe43d7476 100644 --- a/fs/xfs/linux-2.6/xfs_linux.h +++ b/fs/xfs/linux-2.6/xfs_linux.h @@ -103,6 +103,7 @@ */ #undef HAVE_REFCACHE /* reference cache not needed for NFS in 2.6 */ #define HAVE_SENDFILE /* sendfile(2) exists in 2.6, but not in 2.4 */ +#define HAVE_SPLICE /* a splice(2) exists in 2.6, but not in 2.4 */ #ifdef CONFIG_SMP #define HAVE_PERCPU_SB /* per cpu superblock counters are a 2.6 feature */ #else diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c index 84ddf189389..67efe330898 100644 --- a/fs/xfs/linux-2.6/xfs_lrw.c +++ b/fs/xfs/linux-2.6/xfs_lrw.c @@ -301,36 +301,23 @@ xfs_sendfile( void *target, cred_t *credp) { + xfs_inode_t *ip = XFS_BHVTOI(bdp); + xfs_mount_t *mp = ip->i_mount; ssize_t ret; - xfs_fsize_t n; - xfs_inode_t *ip; - xfs_mount_t *mp; - vnode_t *vp; - - ip = XFS_BHVTOI(bdp); - vp = BHV_TO_VNODE(bdp); - mp = ip->i_mount; XFS_STATS_INC(xs_read_calls); - - n = XFS_MAXIOFFSET(mp) - *offset; - if ((n <= 0) || (count == 0)) - return 0; - - if (n < count) - count = n; - - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; xfs_ilock(ip, XFS_IOLOCK_SHARED); - if (DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_READ) && + if (DM_EVENT_ENABLED(BHV_TO_VNODE(bdp)->v_vfsp, ip, DM_EVENT_READ) && (!(ioflags & IO_INVIS))) { vrwlock_t locktype = VRWLOCK_READ; int error; - error = XFS_SEND_DATA(mp, DM_EVENT_READ, BHV_TO_VNODE(bdp), *offset, count, + error = XFS_SEND_DATA(mp, DM_EVENT_READ, BHV_TO_VNODE(bdp), + *offset, count, FILP_DELAY_FLAG(filp), &locktype); if (error) { xfs_iunlock(ip, XFS_IOLOCK_SHARED); @@ -340,12 +327,98 @@ xfs_sendfile( xfs_rw_enter_trace(XFS_SENDFILE_ENTER, &ip->i_iocore, (void *)(unsigned long)target, count, *offset, ioflags); ret = generic_file_sendfile(filp, offset, count, actor, target); + if (ret > 0) + XFS_STATS_ADD(xs_read_bytes, ret); xfs_iunlock(ip, XFS_IOLOCK_SHARED); + return ret; +} +ssize_t +xfs_splice_read( + bhv_desc_t *bdp, + struct file *infilp, + loff_t *ppos, + struct pipe_inode_info *pipe, + size_t count, + int flags, + int ioflags, + cred_t *credp) +{ + xfs_inode_t *ip = XFS_BHVTOI(bdp); + xfs_mount_t *mp = ip->i_mount; + ssize_t ret; + + XFS_STATS_INC(xs_read_calls); + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + return -EIO; + + xfs_ilock(ip, XFS_IOLOCK_SHARED); + + if (DM_EVENT_ENABLED(BHV_TO_VNODE(bdp)->v_vfsp, ip, DM_EVENT_READ) && + (!(ioflags & IO_INVIS))) { + vrwlock_t locktype = VRWLOCK_READ; + int error; + + error = XFS_SEND_DATA(mp, DM_EVENT_READ, BHV_TO_VNODE(bdp), + *ppos, count, + FILP_DELAY_FLAG(infilp), &locktype); + if (error) { + xfs_iunlock(ip, 
XFS_IOLOCK_SHARED); + return -error; + } + } + xfs_rw_enter_trace(XFS_SPLICE_READ_ENTER, &ip->i_iocore, + pipe, count, *ppos, ioflags); + ret = generic_file_splice_read(infilp, ppos, pipe, count, flags); if (ret > 0) XFS_STATS_ADD(xs_read_bytes, ret); + xfs_iunlock(ip, XFS_IOLOCK_SHARED); + return ret; +} + +ssize_t +xfs_splice_write( + bhv_desc_t *bdp, + struct pipe_inode_info *pipe, + struct file *outfilp, + loff_t *ppos, + size_t count, + int flags, + int ioflags, + cred_t *credp) +{ + xfs_inode_t *ip = XFS_BHVTOI(bdp); + xfs_mount_t *mp = ip->i_mount; + ssize_t ret; + + XFS_STATS_INC(xs_write_calls); + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + return -EIO; + + xfs_ilock(ip, XFS_IOLOCK_EXCL); + + if (DM_EVENT_ENABLED(BHV_TO_VNODE(bdp)->v_vfsp, ip, DM_EVENT_WRITE) && + (!(ioflags & IO_INVIS))) { + vrwlock_t locktype = VRWLOCK_WRITE; + int error; + + error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, BHV_TO_VNODE(bdp), + *ppos, count, + FILP_DELAY_FLAG(outfilp), &locktype); + if (error) { + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + return -error; + } + } + xfs_rw_enter_trace(XFS_SPLICE_WRITE_ENTER, &ip->i_iocore, + pipe, count, *ppos, ioflags); + ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags); + if (ret > 0) + XFS_STATS_ADD(xs_write_bytes, ret); + + xfs_iunlock(ip, XFS_IOLOCK_EXCL); return ret; } @@ -363,7 +436,7 @@ xfs_zero_last_block( xfs_fsize_t end_size) { xfs_fileoff_t last_fsb; - xfs_mount_t *mp; + xfs_mount_t *mp = io->io_mount; int nimaps; int zero_offset; int zero_len; @@ -373,8 +446,6 @@ xfs_zero_last_block( ASSERT(ismrlocked(io->io_lock, MR_UPDATE) != 0); - mp = io->io_mount; - zero_offset = XFS_B_FSB_OFFSET(mp, isize); if (zero_offset == 0) { /* @@ -405,10 +476,9 @@ xfs_zero_last_block( * don't deadlock when the buffer cache calls back to us. */ XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL| XFS_EXTSIZE_RD); - loff = XFS_FSB_TO_B(mp, last_fsb); + loff = XFS_FSB_TO_B(mp, last_fsb); zero_len = mp->m_sb.sb_blocksize - zero_offset; - error = xfs_iozero(ip, loff + zero_offset, zero_len, end_size); XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD); @@ -441,7 +511,7 @@ xfs_zero_eof( xfs_fileoff_t zero_count_fsb; xfs_fileoff_t last_fsb; xfs_extlen_t buf_len_fsb; - xfs_mount_t *mp; + xfs_mount_t *mp = io->io_mount; int nimaps; int error = 0; xfs_bmbt_irec_t imap; @@ -450,8 +520,6 @@ xfs_zero_eof( ASSERT(ismrlocked(io->io_iolock, MR_UPDATE)); ASSERT(offset > isize); - mp = io->io_mount; - /* * First handle zeroing the block on which isize resides. * We only zero a part of that block so it is handled specially. 
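The xfs_splice_read/xfs_splice_write pair above follows one template: take the inode iolock, post the optional DMAPI event, hand off to the generic splice helper, account the bytes, unlock. For a filesystem with no DMAPI events or per-mount statistics, hooking the new ->splice_read file operation shrinks to a thin locking wrapper around generic_file_splice_read(). The sketch below shows that minimal shape against the splice API introduced in this patch; the myfs_* names, the myfs_inode container and its io_lock rw-semaphore are illustrative assumptions, not part of the patch.

#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/pipe_fs_i.h>
#include <linux/rwsem.h>

/* Assumed per-inode structure; only io_lock matters for this example. */
struct myfs_inode {
	struct rw_semaphore	io_lock;	/* serialises pagecache readers vs writers */
	struct inode		vfs_inode;
};

static inline struct myfs_inode *MYFS_I(struct inode *inode)
{
	return container_of(inode, struct myfs_inode, vfs_inode);
}

/*
 * Thin ->splice_read hook: hold the read side of the filesystem's own
 * io lock while the generic helper maps pagecache pages into the pipe,
 * which is exactly what xfs_splice_read() does around its DMAPI and
 * statistics work.
 */
static ssize_t myfs_file_splice_read(struct file *in, loff_t *ppos,
				     struct pipe_inode_info *pipe,
				     size_t len, unsigned int flags)
{
	struct myfs_inode *ip = MYFS_I(in->f_dentry->d_inode);
	ssize_t ret;

	down_read(&ip->io_lock);
	ret = generic_file_splice_read(in, ppos, pipe, len, flags);
	up_read(&ip->io_lock);

	return ret;
}

It is wired up the same way as the xfs_file_operations change earlier in this diff, with a .splice_read = myfs_file_splice_read initialiser, plus a matching down_write()-protected wrapper around generic_file_splice_write() for .splice_write.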
diff --git a/fs/xfs/linux-2.6/xfs_lrw.h b/fs/xfs/linux-2.6/xfs_lrw.h index 38864a88d42..8f453995235 100644 --- a/fs/xfs/linux-2.6/xfs_lrw.h +++ b/fs/xfs/linux-2.6/xfs_lrw.h @@ -60,6 +60,8 @@ struct xfs_iomap; #define XFS_IOMAP_ALLOC_ENTER 25 #define XFS_IOMAP_ALLOC_MAP 26 #define XFS_IOMAP_UNWRITTEN 27 +#define XFS_SPLICE_READ_ENTER 28 +#define XFS_SPLICE_WRITE_ENTER 29 extern void xfs_rw_enter_trace(int, struct xfs_iocore *, void *, size_t, loff_t, int); extern void xfs_inval_cached_trace(struct xfs_iocore *, @@ -78,6 +80,7 @@ extern int xfs_bmap(struct bhv_desc *, xfs_off_t, ssize_t, int, struct xfs_iomap *, int *); extern int xfsbdstrat(struct xfs_mount *, struct xfs_buf *); extern int xfs_bdstrat_cb(struct xfs_buf *); +extern int xfs_dev_is_read_only(struct xfs_mount *, char *); extern int xfs_zero_eof(struct vnode *, struct xfs_iocore *, xfs_off_t, xfs_fsize_t, xfs_fsize_t); @@ -90,7 +93,11 @@ extern ssize_t xfs_write(struct bhv_desc *, struct kiocb *, extern ssize_t xfs_sendfile(struct bhv_desc *, struct file *, loff_t *, int, size_t, read_actor_t, void *, struct cred *); - -extern int xfs_dev_is_read_only(struct xfs_mount *, char *); +extern ssize_t xfs_splice_read(struct bhv_desc *, struct file *, loff_t *, + struct pipe_inode_info *, size_t, int, int, + struct cred *); +extern ssize_t xfs_splice_write(struct bhv_desc *, struct pipe_inode_info *, + struct file *, loff_t *, size_t, int, int, + struct cred *); #endif /* __XFS_LRW_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 1884300417e..68f4793e8a1 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -67,7 +67,8 @@ mempool_t *xfs_ioend_pool; STATIC struct xfs_mount_args * xfs_args_allocate( - struct super_block *sb) + struct super_block *sb, + int silent) { struct xfs_mount_args *args; @@ -80,8 +81,8 @@ xfs_args_allocate( args->flags |= XFSMNT_DIRSYNC; if (sb->s_flags & MS_SYNCHRONOUS) args->flags |= XFSMNT_WSYNC; - - /* Default to 32 bit inodes on Linux all the time */ + if (silent) + args->flags |= XFSMNT_QUIET; args->flags |= XFSMNT_32BITINODES; return args; @@ -719,7 +720,7 @@ xfs_fs_remount( char *options) { vfs_t *vfsp = vfs_from_sb(sb); - struct xfs_mount_args *args = xfs_args_allocate(sb); + struct xfs_mount_args *args = xfs_args_allocate(sb, 0); int error; VFS_PARSEARGS(vfsp, options, args, 1, error); @@ -825,7 +826,7 @@ xfs_fs_fill_super( { vnode_t *rootvp; struct vfs *vfsp = vfs_allocate(sb); - struct xfs_mount_args *args = xfs_args_allocate(sb); + struct xfs_mount_args *args = xfs_args_allocate(sb, silent); struct kstatfs statvfs; int error, error2; diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h index 06f5845e956..2a8e16c2235 100644 --- a/fs/xfs/linux-2.6/xfs_vnode.h +++ b/fs/xfs/linux-2.6/xfs_vnode.h @@ -173,6 +173,12 @@ typedef ssize_t (*vop_write_t)(bhv_desc_t *, struct kiocb *, typedef ssize_t (*vop_sendfile_t)(bhv_desc_t *, struct file *, loff_t *, int, size_t, read_actor_t, void *, struct cred *); +typedef ssize_t (*vop_splice_read_t)(bhv_desc_t *, struct file *, loff_t *, + struct pipe_inode_info *, size_t, int, int, + struct cred *); +typedef ssize_t (*vop_splice_write_t)(bhv_desc_t *, struct pipe_inode_info *, + struct file *, loff_t *, size_t, int, int, + struct cred *); typedef int (*vop_ioctl_t)(bhv_desc_t *, struct inode *, struct file *, int, unsigned int, void __user *); typedef int (*vop_getattr_t)(bhv_desc_t *, struct vattr *, int, @@ -231,6 +237,8 @@ typedef struct vnodeops { vop_read_t vop_read; vop_write_t 
vop_write; vop_sendfile_t vop_sendfile; + vop_splice_read_t vop_splice_read; + vop_splice_write_t vop_splice_write; vop_ioctl_t vop_ioctl; vop_getattr_t vop_getattr; vop_setattr_t vop_setattr; @@ -276,6 +284,10 @@ typedef struct vnodeops { rv = _VOP_(vop_write, vp)((vp)->v_fbhv,file,iov,segs,offset,ioflags,cr) #define VOP_SENDFILE(vp,f,off,ioflags,cnt,act,targ,cr,rv) \ rv = _VOP_(vop_sendfile, vp)((vp)->v_fbhv,f,off,ioflags,cnt,act,targ,cr) +#define VOP_SPLICE_READ(vp,f,o,pipe,cnt,fl,iofl,cr,rv) \ + rv = _VOP_(vop_splice_read, vp)((vp)->v_fbhv,f,o,pipe,cnt,fl,iofl,cr) +#define VOP_SPLICE_WRITE(vp,f,o,pipe,cnt,fl,iofl,cr,rv) \ + rv = _VOP_(vop_splice_write, vp)((vp)->v_fbhv,f,o,pipe,cnt,fl,iofl,cr) #define VOP_BMAP(vp,of,sz,rw,b,n,rv) \ rv = _VOP_(vop_bmap, vp)((vp)->v_fbhv,of,sz,rw,b,n) #define VOP_OPEN(vp, cr, rv) \ diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c index 73c1e5e80c0..7fb5eca9bd5 100644 --- a/fs/xfs/quota/xfs_qm.c +++ b/fs/xfs/quota/xfs_qm.c @@ -2624,7 +2624,7 @@ xfs_qm_vop_chown_reserve( { int error; xfs_mount_t *mp; - uint delblks, blkflags; + uint delblks, blkflags, prjflags = 0; xfs_dquot_t *unresudq, *unresgdq, *delblksudq, *delblksgdq; ASSERT(XFS_ISLOCKED_INODE(ip)); @@ -2650,10 +2650,13 @@ xfs_qm_vop_chown_reserve( } } if (XFS_IS_OQUOTA_ON(ip->i_mount) && gdqp) { - if ((XFS_IS_GQUOTA_ON(ip->i_mount) && - ip->i_d.di_gid != be32_to_cpu(gdqp->q_core.d_id)) || - (XFS_IS_PQUOTA_ON(ip->i_mount) && - ip->i_d.di_projid != be32_to_cpu(gdqp->q_core.d_id))) { + if (XFS_IS_PQUOTA_ON(ip->i_mount) && + ip->i_d.di_projid != be32_to_cpu(gdqp->q_core.d_id)) + prjflags = XFS_QMOPT_ENOSPC; + + if (prjflags || + (XFS_IS_GQUOTA_ON(ip->i_mount) && + ip->i_d.di_gid != be32_to_cpu(gdqp->q_core.d_id))) { delblksgdq = gdqp; if (delblks) { ASSERT(ip->i_gdquot); @@ -2664,7 +2667,7 @@ xfs_qm_vop_chown_reserve( if ((error = xfs_trans_reserve_quota_bydquots(tp, ip->i_mount, delblksudq, delblksgdq, ip->i_d.di_nblocks, 1, - flags | blkflags))) + flags | blkflags | prjflags))) return (error); /* @@ -2681,7 +2684,7 @@ xfs_qm_vop_chown_reserve( ASSERT(unresudq || unresgdq); if ((error = xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount, delblksudq, delblksgdq, (xfs_qcnt_t)delblks, 0, - flags | blkflags))) + flags | blkflags | prjflags))) return (error); xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount, unresudq, unresgdq, -((xfs_qcnt_t)delblks), 0, diff --git a/fs/xfs/quota/xfs_trans_dquot.c b/fs/xfs/quota/xfs_trans_dquot.c index d8e131ec0aa..9168918db25 100644 --- a/fs/xfs/quota/xfs_trans_dquot.c +++ b/fs/xfs/quota/xfs_trans_dquot.c @@ -595,12 +595,19 @@ xfs_trans_unreserve_and_mod_dquots( } } +STATIC int +xfs_quota_error(uint flags) +{ + if (flags & XFS_QMOPT_ENOSPC) + return ENOSPC; + return EDQUOT; +} + /* * This reserves disk blocks and inodes against a dquot. * Flags indicate if the dquot is to be locked here and also * if the blk reservation is for RT or regular blocks. * Sending in XFS_QMOPT_FORCE_RES flag skips the quota check. - * Returns EDQUOT if quota is exceeded. 
*/ STATIC int xfs_trans_dqresv( @@ -666,19 +673,15 @@ xfs_trans_dqresv( */ if (hardlimit > 0ULL && (hardlimit <= nblks + *resbcountp)) { - error = EDQUOT; + error = xfs_quota_error(flags); goto error_return; } if (softlimit > 0ULL && (softlimit <= nblks + *resbcountp)) { - /* - * If timer or warnings has expired, - * return EDQUOT - */ if ((timer != 0 && get_seconds() > timer) || (warns != 0 && warns >= warnlimit)) { - error = EDQUOT; + error = xfs_quota_error(flags); goto error_return; } } @@ -695,16 +698,12 @@ xfs_trans_dqresv( if (!softlimit) softlimit = q->qi_isoftlimit; if (hardlimit > 0ULL && count >= hardlimit) { - error = EDQUOT; + error = xfs_quota_error(flags); goto error_return; } else if (softlimit > 0ULL && count >= softlimit) { - /* - * If timer or warnings has expired, - * return EDQUOT - */ if ((timer != 0 && get_seconds() > timer) || (warns != 0 && warns >= warnlimit)) { - error = EDQUOT; + error = xfs_quota_error(flags); goto error_return; } } @@ -751,13 +750,14 @@ error_return: /* - * Given a dquot(s), make disk block and/or inode reservations against them. + * Given dquot(s), make disk block and/or inode reservations against them. * The fact that this does the reservation against both the usr and - * grp quotas is important, because this follows a both-or-nothing + * grp/prj quotas is important, because this follows a both-or-nothing * approach. * * flags = XFS_QMOPT_DQLOCK indicate if dquot(s) need to be locked. * XFS_QMOPT_FORCE_RES evades limit enforcement. Used by chown. + * XFS_QMOPT_ENOSPC returns ENOSPC not EDQUOT. Used by pquota. * XFS_TRANS_DQ_RES_BLKS reserves regular disk blocks * XFS_TRANS_DQ_RES_RTBLKS reserves realtime disk blocks * dquots are unlocked on return, if they were not locked by caller. @@ -772,25 +772,27 @@ xfs_trans_reserve_quota_bydquots( long ninos, uint flags) { - int resvd; + int resvd = 0, error; - if (! XFS_IS_QUOTA_ON(mp)) - return (0); + if (!XFS_IS_QUOTA_ON(mp)) + return 0; if (tp && tp->t_dqinfo == NULL) xfs_trans_alloc_dqinfo(tp); ASSERT(flags & XFS_QMOPT_RESBLK_MASK); - resvd = 0; if (udqp) { - if (xfs_trans_dqresv(tp, mp, udqp, nblks, ninos, flags)) - return (EDQUOT); + error = xfs_trans_dqresv(tp, mp, udqp, nblks, ninos, + (flags & ~XFS_QMOPT_ENOSPC)); + if (error) + return error; resvd = 1; } if (gdqp) { - if (xfs_trans_dqresv(tp, mp, gdqp, nblks, ninos, flags)) { + error = xfs_trans_dqresv(tp, mp, gdqp, nblks, ninos, flags); + if (error) { /* * can't do it, so backout previous reservation */ @@ -799,14 +801,14 @@ xfs_trans_reserve_quota_bydquots( xfs_trans_dqresv(tp, mp, udqp, -nblks, -ninos, flags); } - return (EDQUOT); + return error; } } /* * Didn't change anything critical, so, no need to log */ - return (0); + return 0; } @@ -814,8 +816,6 @@ xfs_trans_reserve_quota_bydquots( * Lock the dquot and change the reservation if we can. * This doesn't change the actual usage, just the reservation. * The inode sent in is locked. 
- * - * Returns 0 on success, EDQUOT or other errors otherwise */ STATIC int xfs_trans_reserve_quota_nblks( @@ -824,20 +824,24 @@ xfs_trans_reserve_quota_nblks( xfs_inode_t *ip, long nblks, long ninos, - uint type) + uint flags) { int error; if (!XFS_IS_QUOTA_ON(mp)) - return (0); + return 0; + if (XFS_IS_PQUOTA_ON(mp)) + flags |= XFS_QMOPT_ENOSPC; ASSERT(ip->i_ino != mp->m_sb.sb_uquotino); ASSERT(ip->i_ino != mp->m_sb.sb_gquotino); ASSERT(XFS_ISLOCKED_INODE_EXCL(ip)); ASSERT(XFS_IS_QUOTA_RUNNING(ip->i_mount)); - ASSERT((type & ~XFS_QMOPT_FORCE_RES) == XFS_TRANS_DQ_RES_RTBLKS || - (type & ~XFS_QMOPT_FORCE_RES) == XFS_TRANS_DQ_RES_BLKS); + ASSERT((flags & ~(XFS_QMOPT_FORCE_RES | XFS_QMOPT_ENOSPC)) == + XFS_TRANS_DQ_RES_RTBLKS || + (flags & ~(XFS_QMOPT_FORCE_RES | XFS_QMOPT_ENOSPC)) == + XFS_TRANS_DQ_RES_BLKS); /* * Reserve nblks against these dquots, with trans as the mediator. @@ -845,8 +849,8 @@ xfs_trans_reserve_quota_nblks( error = xfs_trans_reserve_quota_bydquots(tp, mp, ip->i_udquot, ip->i_gdquot, nblks, ninos, - type); - return (error); + flags); + return error; } /* diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 64ee07db0d5..8558226281c 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -1942,8 +1942,10 @@ xfs_alloc_fix_freelist( /* * Allocate as many blocks as possible at once. */ - if ((error = xfs_alloc_ag_vextent(&targs))) + if ((error = xfs_alloc_ag_vextent(&targs))) { + xfs_trans_brelse(tp, agflbp); return error; + } /* * Stop if we run out. Won't happen if callers are obeying * the restrictions correctly. Can happen for free calls @@ -1960,6 +1962,7 @@ xfs_alloc_fix_freelist( return error; } } + xfs_trans_brelse(tp, agflbp); args->agbp = agbp; return 0; } diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index d384e489705..26939d364bc 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -4719,18 +4719,17 @@ xfs_bmapi( /* * Make a transaction-less quota reservation for * delayed allocation blocks. This number gets - * adjusted later. - * We return EDQUOT if we haven't allocated - * blks already inside this loop; + * adjusted later. We return if we haven't + * allocated blocks already inside this loop. */ - if (XFS_TRANS_RESERVE_QUOTA_NBLKS( + if ((error = XFS_TRANS_RESERVE_QUOTA_NBLKS( mp, NULL, ip, (long)alen, 0, rt ? XFS_QMOPT_RES_RTBLKS : - XFS_QMOPT_RES_REGBLKS)) { + XFS_QMOPT_RES_REGBLKS))) { if (n == 0) { *nmap = 0; ASSERT(cur == NULL); - return XFS_ERROR(EDQUOT); + return error; } break; } diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h index f83399c89ce..8e0d73d9ccc 100644 --- a/fs/xfs/xfs_bmap.h +++ b/fs/xfs/xfs_bmap.h @@ -353,10 +353,11 @@ xfs_check_nostate_extents( xfs_extnum_t num); /* - * Call xfs_bmap_do_search_extents() to search for the extent - * record containing block bno. If in multi-level in-core extent - * allocation mode, find and extract the target extent buffer, - * otherwise just use the direct extent list. + * Search the extent records for the entry containing block bno. + * If bno lies in a hole, point to the next entry. If bno lies + * past eof, *eofp will be set, and *prevp will contain the last + * entry (null if none). Else, *lastxp will be set to the index + * of the found entry; *gotp will contain the entry. 
diff --git a/fs/xfs/xfs_clnt.h b/fs/xfs/xfs_clnt.h index 022fff62085..5b7eb81453b 100644 --- a/fs/xfs/xfs_clnt.h +++ b/fs/xfs/xfs_clnt.h @@ -68,6 +68,7 @@ struct xfs_mount_args { * enforcement */ #define XFSMNT_PQUOTAENF 0x00000040 /* IRIX project quota limit * enforcement */ +#define XFSMNT_QUIET 0x00000080 /* don't report mount errors */ #define XFSMNT_NOALIGN 0x00000200 /* don't allocate at * stripe boundaries*/ #define XFSMNT_RETERR 0x00000400 /* return error to user */ diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index 26b8e709a56..bc43163456e 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -186,4 +186,7 @@ extern void xfs_fs_cmn_err(int level, struct xfs_mount *mp, char *fmt, ...); #define xfs_fs_repair_cmn_err(level, mp, fmt, args...) \ xfs_fs_cmn_err(level, mp, fmt " Unmount and run xfs_repair.", ## args) +#define xfs_fs_mount_cmn_err(f, fmt, args...) \ + ((f & XFS_MFSI_QUIET)? (void)0 : cmn_err(CE_WARN, "XFS: " fmt, ## args)) + #endif /* __XFS_ERROR_H__ */ diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index 4eeb856183b..deddbd03c16 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -158,9 +158,10 @@ xfs_ialloc_ag_alloc( */ agi = XFS_BUF_TO_AGI(agbp); newino = be32_to_cpu(agi->agi_newino); - if(likely(newino != NULLAGINO)) { - args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) + - XFS_IALLOC_BLOCKS(args.mp); + args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) + + XFS_IALLOC_BLOCKS(args.mp); + if (likely(newino != NULLAGINO && + (args.agbno < be32_to_cpu(agi->agi_length)))) { args.fsbno = XFS_AGB_TO_FSB(args.mp, be32_to_cpu(agi->agi_seqno), args.agbno); args.type = XFS_ALLOCTYPE_THIS_BNO; @@ -182,8 +183,8 @@ xfs_ialloc_ag_alloc( * Set the alignment for the allocation. * If stripe alignment is turned on then align at stripe unit * boundary. - * If the cluster size is smaller than a filesystem block - * then we're doing I/O for inodes in filesystem block size + * If the cluster size is smaller than a filesystem block + * then we're doing I/O for inodes in filesystem block size * pieces, so don't need alignment anyway. */ isaligned = 0; @@ -192,7 +193,7 @@ args.alignment = args.mp->m_dalign; isaligned = 1; } else if (XFS_SB_VERSION_HASALIGN(&args.mp->m_sb) && - args.mp->m_sb.sb_inoalignmt >= + args.mp->m_sb.sb_inoalignmt >= XFS_B_TO_FSBT(args.mp, XFS_INODE_CLUSTER_SIZE(args.mp))) args.alignment = args.mp->m_sb.sb_inoalignmt; @@ -220,7 +221,7 @@ if ((error = xfs_alloc_vextent(&args))) return error; } - + /* * If stripe alignment is turned on, then try again with cluster * alignment.
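xfs_fs_mount_cmn_err() above warns only when the caller did not pass XFS_MFSI_QUIET ("Be silent if mount errors found"), so probe mounts, for example mount(8) trying each filesystem type in turn, do not spam the log. A compilable userspace model of the same idiom; QUIET_FLAG and mount_warn are stand-ins for XFS_MFSI_QUIET and cmn_err(CE_WARN, ...), and the ## varargs deletion is the same GNU extension the kernel macro relies on:

#include <stdio.h>

#define QUIET_FLAG      0x40

#define mount_warn(f, fmt, ...) \
        ((f) & QUIET_FLAG ? (void)0 : \
         (void)fprintf(stderr, "XFS: " fmt "\n", ##__VA_ARGS__))

int main(void)
{
        mount_warn(0, "bad magic number");          /* printed */
        mount_warn(QUIET_FLAG, "bad magic number"); /* silent probe mount */
        return 0;
}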
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index bb33113eef9..b5385432526 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -421,7 +421,10 @@ finish_inode: ip->i_chash = chlnew; chlnew->chl_ip = ip; chlnew->chl_blkno = ip->i_blkno; + if (ch->ch_list) + ch->ch_list->chl_prev = chlnew; chlnew->chl_next = ch->ch_list; + chlnew->chl_prev = NULL; ch->ch_list = chlnew; chlnew = NULL; } @@ -723,23 +726,15 @@ xfs_iextract( ASSERT(ip->i_cnext == ip && ip->i_cprev == ip); ASSERT(ip->i_chash != NULL); chm=NULL; - for (chl = ch->ch_list; chl != NULL; chl = chl->chl_next) { - if (chl->chl_blkno == ip->i_blkno) { - if (chm == NULL) { - /* first item on the list */ - ch->ch_list = chl->chl_next; - } else { - chm->chl_next = chl->chl_next; - } - kmem_zone_free(xfs_chashlist_zone, chl); - break; - } else { - ASSERT(chl->chl_ip != ip); - chm = chl; - } - } - ASSERT_ALWAYS(chl != NULL); - } else { + chl = ip->i_chash; + if (chl->chl_prev) + chl->chl_prev->chl_next = chl->chl_next; + else + ch->ch_list = chl->chl_next; + if (chl->chl_next) + chl->chl_next->chl_prev = chl->chl_prev; + kmem_zone_free(xfs_chashlist_zone, chl); + } else { /* delete one inode from a non-empty list */ iq = ip->i_cnext; iq->i_cprev = ip->i_cprev; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 48146bdc6bd..94b60dd0380 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2732,16 +2732,29 @@ xfs_iunpin( ASSERT(atomic_read(&ip->i_pincount) > 0); if (atomic_dec_and_test(&ip->i_pincount)) { - vnode_t *vp = XFS_ITOV_NULL(ip); + /* + * If the inode is currently being reclaimed, the + * linux inode _and_ the xfs vnode may have been + * freed so we cannot reference either of them safely. + * Hence we should not try to do anything to them + * if the xfs inode is currently in the reclaim + * path. + * + * However, we still need to issue the unpin wakeup + * call as the inode reclaim may be blocked waiting for + * the inode to become unpinned. + */ + if (!(ip->i_flags & (XFS_IRECLAIM|XFS_IRECLAIMABLE))) { + vnode_t *vp = XFS_ITOV_NULL(ip); - /* make sync come back and flush this inode */ - if (vp) { - struct inode *inode = vn_to_inode(vp); + /* make sync come back and flush this inode */ + if (vp) { + struct inode *inode = vn_to_inode(vp); - if (!(inode->i_state & I_NEW)) - mark_inode_dirty_sync(inode); + if (!(inode->i_state & I_NEW)) + mark_inode_dirty_sync(inode); + } } - wake_up(&ip->i_ipin_wait); } } diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 39ef9c36ea5..3b544db1790 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -189,6 +189,7 @@ typedef struct xfs_ihash { */ typedef struct xfs_chashlist { struct xfs_chashlist *chl_next; + struct xfs_chashlist *chl_prev; struct xfs_inode *chl_ip; xfs_daddr_t chl_blkno; /* starting block number of * the cluster */
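The chl_prev back pointer added above is what lets xfs_iextract() drop its list walk: removal becomes a constant-time unlink instead of an O(n) search for the predecessor. A standalone model of both halves, where push_node mirrors the maintenance added in the chlnew hunk and unlink_node mirrors the new removal code:

#include <stdio.h>
#include <stdlib.h>

struct node { struct node *next, *prev; int key; };

static void push_node(struct node **head, struct node *n)
{
        if (*head)
                (*head)->prev = n;      /* the step the chlnew hunk adds */
        n->next = *head;
        n->prev = NULL;
        *head = n;
}

static void unlink_node(struct node **head, struct node *n)
{
        if (n->prev)
                n->prev->next = n->next;
        else
                *head = n->next;        /* n was first on the chain */
        if (n->next)
                n->next->prev = n->prev;
        free(n);
}

int main(void)
{
        struct node *head = NULL, *a, *b;

        a = calloc(1, sizeof(*a)); a->key = 1;
        b = calloc(1, sizeof(*b)); b->key = 2;
        push_node(&head, a);
        push_node(&head, b);            /* list: b, a */
        unlink_node(&head, a);          /* O(1), no walk needed */
        printf("head key: %d\n", head->key);    /* 2 */
        unlink_node(&head, b);
        return 0;
}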
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 72e7e78bfff..c0b1c290688 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -213,7 +213,8 @@ xfs_mount_free( STATIC int xfs_mount_validate_sb( xfs_mount_t *mp, - xfs_sb_t *sbp) + xfs_sb_t *sbp, + int flags) { /* * If the log device and data device have the @@ -223,33 +224,29 @@ xfs_mount_validate_sb( * a volume filesystem in a non-volume manner. */ if (sbp->sb_magicnum != XFS_SB_MAGIC) { - cmn_err(CE_WARN, "XFS: bad magic number"); + xfs_fs_mount_cmn_err(flags, "bad magic number"); return XFS_ERROR(EWRONGFS); } if (!XFS_SB_GOOD_VERSION(sbp)) { - cmn_err(CE_WARN, "XFS: bad version"); + xfs_fs_mount_cmn_err(flags, "bad version"); return XFS_ERROR(EWRONGFS); } if (unlikely( sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) { - cmn_err(CE_WARN, - "XFS: filesystem is marked as having an external log; " - "specify logdev on the\nmount command line."); - XFS_CORRUPTION_ERROR("xfs_mount_validate_sb(1)", - XFS_ERRLEVEL_HIGH, mp, sbp); - return XFS_ERROR(EFSCORRUPTED); + xfs_fs_mount_cmn_err(flags, + "filesystem is marked as having an external log; " + "specify logdev on the\nmount command line."); + return XFS_ERROR(EINVAL); } if (unlikely( sbp->sb_logstart != 0 && mp->m_logdev_targp != mp->m_ddev_targp)) { - cmn_err(CE_WARN, - "XFS: filesystem is marked as having an internal log; " - "don't specify logdev on\nthe mount command line."); - XFS_CORRUPTION_ERROR("xfs_mount_validate_sb(2)", - XFS_ERRLEVEL_HIGH, mp, sbp); - return XFS_ERROR(EFSCORRUPTED); + xfs_fs_mount_cmn_err(flags, + "filesystem is marked as having an internal log; " + "do not specify logdev on\nthe mount command line."); + return XFS_ERROR(EINVAL); } /* @@ -273,10 +270,8 @@ xfs_mount_validate_sb( (sbp->sb_blocklog - sbp->sb_inodelog != sbp->sb_inopblog) || (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE) || (sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) || - (sbp->sb_imax_pct > 100 || sbp->sb_imax_pct < 1))) { - cmn_err(CE_WARN, "XFS: SB sanity check 1 failed"); - XFS_CORRUPTION_ERROR("xfs_mount_validate_sb(3)", - XFS_ERRLEVEL_LOW, mp, sbp); + (sbp->sb_imax_pct > 100 /* zero sb_imax_pct is valid */))) { + xfs_fs_mount_cmn_err(flags, "SB sanity check 1 failed"); return XFS_ERROR(EFSCORRUPTED); } @@ -289,9 +284,7 @@ xfs_mount_validate_sb( (xfs_drfsbno_t)sbp->sb_agcount * sbp->sb_agblocks || sbp->sb_dblocks < (xfs_drfsbno_t)(sbp->sb_agcount - 1) * sbp->sb_agblocks + XFS_MIN_AG_BLOCKS)) { - cmn_err(CE_WARN, "XFS: SB sanity check 2 failed"); - XFS_ERROR_REPORT("xfs_mount_validate_sb(4)", - XFS_ERRLEVEL_LOW, mp); + xfs_fs_mount_cmn_err(flags, "SB sanity check 2 failed"); return XFS_ERROR(EFSCORRUPTED); } @@ -307,15 +300,13 @@ xfs_mount_validate_sb( (sbp->sb_dblocks << (sbp->sb_blocklog - BBSHIFT)) > UINT_MAX || (sbp->sb_rblocks << (sbp->sb_blocklog - BBSHIFT)) > UINT_MAX)) { #endif - cmn_err(CE_WARN, - "XFS: File system is too large to be mounted on this system."); + xfs_fs_mount_cmn_err(flags, + "file system too large to be mounted on this system."); return XFS_ERROR(E2BIG); } if (unlikely(sbp->sb_inprogress)) { - cmn_err(CE_WARN, "XFS: file system busy"); - XFS_ERROR_REPORT("xfs_mount_validate_sb(5)", - XFS_ERRLEVEL_LOW, mp); + xfs_fs_mount_cmn_err(flags, "file system busy"); return XFS_ERROR(EFSCORRUPTED); } @@ -323,8 +314,8 @@ xfs_mount_validate_sb( * Version 1 directory format has never worked on Linux. */ if (unlikely(!XFS_SB_VERSION_HASDIRV2(sbp))) { - cmn_err(CE_WARN, - "XFS: Attempted to mount file system using version 1 directory format"); + xfs_fs_mount_cmn_err(flags, + "file system using version 1 directory format"); return XFS_ERROR(ENOSYS); }
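Note the error-code triage in the validation rewrite above: a wrong logdev is a user-correctable mistake and now returns EINVAL instead of raising a corruption report, a bad magic number stays EWRONGFS so mount can try the next filesystem type, and only genuinely inconsistent metadata returns EFSCORRUPTED. A compressed userspace sketch of that policy; the fields are simplified, and EWRONGFS/EFSCORRUPTED are defined here the way the XFS headers alias them (EUCLEAN is Linux-specific):

#include <errno.h>
#include <stdio.h>

#define EWRONGFS        EINVAL          /* as the XFS headers define it */
#define EFSCORRUPTED    EUCLEAN         /* likewise */

struct sb { unsigned magic, blocksize, inprogress; };
#define SB_MAGIC 0x58465342u            /* "XFSB" */

static int validate_sb(const struct sb *sbp, unsigned page_size)
{
        if (sbp->magic != SB_MAGIC)
                return EWRONGFS;        /* not XFS: let mount try another fs */
        if (sbp->blocksize > page_size)
                return ENOSYS;          /* valid fs, unsupported on this host */
        if (sbp->inprogress)
                return EFSCORRUPTED;    /* mkfs/growfs never completed */
        return 0;
}

int main(void)
{
        struct sb good = { SB_MAGIC, 4096, 0 };

        printf("validate: %d\n", validate_sb(&good, 4096));     /* 0 */
        return 0;
}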
@@ -332,11 +323,11 @@ * Until this is fixed only page-sized or smaller data blocks work. */ if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) { - cmn_err(CE_WARN, - "XFS: Attempted to mount file system with blocksize %d bytes", + xfs_fs_mount_cmn_err(flags, + "file system with blocksize %d bytes", sbp->sb_blocksize); - cmn_err(CE_WARN, - "XFS: Only page-sized (%ld) or less blocksizes currently work.", + xfs_fs_mount_cmn_err(flags, + "only pagesize (%ld) or less will currently work.", PAGE_SIZE); return XFS_ERROR(ENOSYS); } @@ -484,7 +475,7 @@ xfs_xlatesb( * Does the initial read of the superblock. */ int -xfs_readsb(xfs_mount_t *mp) +xfs_readsb(xfs_mount_t *mp, int flags) { unsigned int sector_size; unsigned int extra_flags; @@ -506,7 +497,7 @@ xfs_readsb(xfs_mount_t *mp) bp = xfs_buf_read_flags(mp->m_ddev_targp, XFS_SB_DADDR, BTOBB(sector_size), extra_flags); if (!bp || XFS_BUF_ISERROR(bp)) { - cmn_err(CE_WARN, "XFS: SB read failed"); + xfs_fs_mount_cmn_err(flags, "SB read failed"); error = bp ? XFS_BUF_GETERROR(bp) : ENOMEM; goto fail; } @@ -520,9 +511,9 @@ xfs_readsb(xfs_mount_t *mp) sbp = XFS_BUF_TO_SBP(bp); xfs_xlatesb(XFS_BUF_PTR(bp), &(mp->m_sb), 1, XFS_SB_ALL_BITS); - error = xfs_mount_validate_sb(mp, &(mp->m_sb)); + error = xfs_mount_validate_sb(mp, &(mp->m_sb), flags); if (error) { - cmn_err(CE_WARN, "XFS: SB validate failed"); + xfs_fs_mount_cmn_err(flags, "SB validate failed"); goto fail; } @@ -530,8 +521,8 @@ xfs_readsb(xfs_mount_t *mp) * We must be able to do sector-sized and sector-aligned IO. */ if (sector_size > mp->m_sb.sb_sectsize) { - cmn_err(CE_WARN, - "XFS: device supports only %u byte sectors (not %u)", + xfs_fs_mount_cmn_err(flags, + "device supports only %u byte sectors (not %u)", sector_size, mp->m_sb.sb_sectsize); error = ENOSYS; goto fail; } @@ -548,7 +539,7 @@ bp = xfs_buf_read_flags(mp->m_ddev_targp, XFS_SB_DADDR, BTOBB(sector_size), extra_flags); if (!bp || XFS_BUF_ISERROR(bp)) { - cmn_err(CE_WARN, "XFS: SB re-read failed"); + xfs_fs_mount_cmn_err(flags, "SB re-read failed"); error = bp ? XFS_BUF_GETERROR(bp) : ENOMEM; goto fail; } @@ -678,7 +669,7 @@ xfs_mountfs( int error = 0; if (mp->m_sb_bp == NULL) { - if ((error = xfs_readsb(mp))) { + if ((error = xfs_readsb(mp, mfsi_flags))) { return error; } } diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 66cbee79864..668ad23fd37 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -510,9 +510,12 @@ xfs_preferred_iosize(xfs_mount_t *mp) */ #define XFS_MFSI_SECOND 0x01 /* Secondary mount -- skip stuff */ #define XFS_MFSI_CLIENT 0x02 /* Is a client -- skip lots of stuff */ +/* XFS_MFSI_RRINODES */ #define XFS_MFSI_NOUNLINK 0x08 /* Skip unlinked inode processing in */ /* log recovery */ #define XFS_MFSI_NO_QUOTACHECK 0x10 /* Skip quotacheck processing */ +/* XFS_MFSI_CONVERT_SUNIT */ +#define XFS_MFSI_QUIET 0x40 /* Be silent if mount errors found */ /* * Macros for getting from mount to vfs and back.
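The superblock re-read above exists because the right I/O size for the superblock is recorded in the superblock itself: xfs_readsb() reads once at the device's minimum sector size, checks that the device can do sb_sectsize I/O at all, then re-reads at the proper size. A toy userspace model of the two-pass read; read_dev stands in for xfs_buf_read_flags() and the layout is invented:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct sb { uint32_t magic; uint32_t sectsize; };

static void read_dev(const void *dev, void *buf, size_t len)
{
        memcpy(buf, dev, len);          /* stands in for a disk read */
}

static int readsb(const void *dev, uint32_t dev_sectsize, struct sb *out)
{
        struct sb tmp;

        read_dev(dev, &tmp, sizeof(tmp));       /* pass 1: guessed size */
        if (dev_sectsize > tmp.sectsize)
                return -1;      /* device cannot do sector-sized, aligned I/O */
        read_dev(dev, out, sizeof(*out));       /* pass 2: sb_sectsize */
        return 0;
}

int main(void)
{
        struct sb disk = { 0x58465342u, 512 }, sb;

        printf("readsb: %d\n", readsb(&disk, 512, &sb));        /* 0 */
        return 0;
}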
@@ -581,7 +584,7 @@ extern int xfs_mod_incore_sb_unlocked(xfs_mount_t *, xfs_sb_field_t, extern int xfs_mod_incore_sb_batch(xfs_mount_t *, xfs_mod_sb_t *, uint, int); extern struct xfs_buf *xfs_getsb(xfs_mount_t *, int); -extern int xfs_readsb(xfs_mount_t *mp); +extern int xfs_readsb(xfs_mount_t *, int); extern void xfs_freesb(xfs_mount_t *); extern void xfs_do_force_shutdown(bhv_desc_t *, int, char *, int); extern int xfs_syncsub(xfs_mount_t *, int, int, int *); diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index 4f6a034de7f..7fbef974bce 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -196,10 +196,11 @@ typedef struct xfs_qoff_logformat { #define XFS_QMOPT_QUOTAOFF 0x0000080 /* quotas are being turned off */ #define XFS_QMOPT_UMOUNTING 0x0000100 /* filesys is being unmounted */ #define XFS_QMOPT_DOLOG 0x0000200 /* log buf changes (in quotacheck) */ -#define XFS_QMOPT_DOWARN 0x0000400 /* increase warning cnt if necessary */ +#define XFS_QMOPT_DOWARN 0x0000400 /* increase warning cnt if needed */ #define XFS_QMOPT_ILOCKED 0x0000800 /* inode is already locked (excl) */ -#define XFS_QMOPT_DQREPAIR 0x0001000 /* repair dquot, if damaged. */ +#define XFS_QMOPT_DQREPAIR 0x0001000 /* repair dquot if damaged */ #define XFS_QMOPT_GQUOTA 0x0002000 /* group dquot requested */ +#define XFS_QMOPT_ENOSPC 0x0004000 /* enospc instead of edquot (prj) */ /* * flags to xfs_trans_mod_dquot to indicate which field needs to be diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c index 81a05cfd77d..1f148762eb2 100644 --- a/fs/xfs/xfs_rename.c +++ b/fs/xfs/xfs_rename.c @@ -316,6 +316,18 @@ xfs_rename( } } + /* + * If we are using project inheritance, we only allow renames + * into our tree when the project IDs are the same; else the + * tree quota mechanism would be circumvented. + */ + if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && + (target_dp->i_d.di_projid != src_ip->i_d.di_projid))) { + error = XFS_ERROR(EXDEV); + xfs_rename_unlock4(inodes, XFS_ILOCK_SHARED); + goto rele_return; + } + new_parent = (src_dp != target_dp); src_is_directory = ((src_ip->i_d.di_mode & S_IFMT) == S_IFDIR); diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c index 504d2a80747..36ea1b2094f 100644 --- a/fs/xfs/xfs_vfsops.c +++ b/fs/xfs/xfs_vfsops.c @@ -442,6 +445,9 @@ xfs_mount( p = vfs_bhv_lookup(vfsp, VFS_POSITION_IO); mp->m_io_ops = p ? *(xfs_ioops_t *) vfs_bhv_custom(p) : xfs_iocore_xfs; + if (args->flags & XFSMNT_QUIET) + flags |= XFS_MFSI_QUIET; + /* * Open real time and log devices - order is important. */
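The project-inheritance check above fails the rename with EXDEV, the same error a cross-mount rename produces, so tools like mv(1) fall back to copy-and-unlink and the new copy is charged to the target directory's project quota. A minimal model of the check; the struct and flag are hypothetical stand-ins for the XFS on-disk inode fields:

#include <errno.h>
#include <stdio.h>

struct xinode { unsigned flags, projid; };
#define DIFLAG_PROJINHERIT 0x1  /* stand-in for XFS_DIFLAG_PROJINHERIT */

static int check_project_move(const struct xinode *target_dp,
                              const struct xinode *src_ip)
{
        if ((target_dp->flags & DIFLAG_PROJINHERIT) &&
            target_dp->projid != src_ip->projid)
                return EXDEV;   /* as if the rename crossed a mount point */
        return 0;
}

int main(void)
{
        struct xinode dir = { DIFLAG_PROJINHERIT, 7 }, file = { 0, 3 };

        printf("rename allowed? %s\n",
               check_project_move(&dir, &file) ? "no (EXDEV)" : "yes");
        return 0;
}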
@@ -492,7 +495,7 @@ xfs_mount( error = xfs_start_flags(vfsp, args, mp); if (error) goto error1; - error = xfs_readsb(mp); + error = xfs_readsb(mp, flags); if (error) goto error1; error = xfs_finish_flags(vfsp, args, mp); @@ -666,31 +669,22 @@ xfs_mntupdate( xfs_mount_t *mp = XFS_BHVTOM(bdp); int error; - if (args->flags & XFSMNT_BARRIER) - mp->m_flags |= XFS_MOUNT_BARRIER; - else - mp->m_flags &= ~XFS_MOUNT_BARRIER; - - if ((vfsp->vfs_flag & VFS_RDONLY) && - !(*flags & MS_RDONLY)) { - vfsp->vfs_flag &= ~VFS_RDONLY; - - if (args->flags & XFSMNT_BARRIER) + if (!(*flags & MS_RDONLY)) { /* rw/ro -> rw */ + if (vfsp->vfs_flag & VFS_RDONLY) + vfsp->vfs_flag &= ~VFS_RDONLY; + if (args->flags & XFSMNT_BARRIER) { + mp->m_flags |= XFS_MOUNT_BARRIER; xfs_mountfs_check_barriers(mp); - } - - if (!(vfsp->vfs_flag & VFS_RDONLY) && - (*flags & MS_RDONLY)) { + } else { + mp->m_flags &= ~XFS_MOUNT_BARRIER; + } + } else if (!(vfsp->vfs_flag & VFS_RDONLY)) { /* rw -> ro */ VFS_SYNC(vfsp, SYNC_FSDATA|SYNC_BDFLUSH|SYNC_ATTR, NULL, error); - xfs_quiesce_fs(mp); - - /* Ok now write out an unmount record */ xfs_log_unmount_write(mp); xfs_unmountfs_writesb(mp); vfsp->vfs_flag |= VFS_RDONLY; } - return 0; } @@ -1697,8 +1691,9 @@ xfs_parseargs( int dsunit, dswidth, vol_dsunit, vol_dswidth; int iosize; - args->flags2 |= XFSMNT2_COMPAT_IOSIZE; args->flags |= XFSMNT_IDELETE; + args->flags |= XFSMNT_BARRIER; + args->flags2 |= XFSMNT2_COMPAT_IOSIZE; if (!options) goto done; @@ -1947,8 +1942,6 @@ xfs_showargs( seq_printf(m, "," MNTOPT_IKEEP); if (!(mp->m_flags & XFS_MOUNT_COMPAT_IOSIZE)) seq_printf(m, "," MNTOPT_LARGEIO); - if (mp->m_flags & XFS_MOUNT_BARRIER) - seq_printf(m, "," MNTOPT_BARRIER); if (!(vfsp->vfs_flag & VFS_32BITINODES)) seq_printf(m, "," MNTOPT_64BITINODE); diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index de49601919c..7027ae68ee3 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -2663,7 +2663,7 @@ xfs_link( */ if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && (tdp->i_d.di_projid != sip->i_d.di_projid))) { - error = XFS_ERROR(EPERM); + error = XFS_ERROR(EXDEV); goto error_return; } @@ -4649,6 +4649,10 @@ vnodeops_t xfs_vnodeops = { #ifdef HAVE_SENDFILE .vop_sendfile = xfs_sendfile, #endif +#ifdef HAVE_SPLICE + .vop_splice_read = xfs_splice_read, + .vop_splice_write = xfs_splice_write, +#endif .vop_write = xfs_write, .vop_ioctl = xfs_ioctl, .vop_getattr = xfs_getattr,
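The reworked xfs_mntupdate() above reduces remount to two ordered paths; in the rw to ro direction, data, a clean log record, and the superblock must all reach disk before the read-only flag flips. A compressed userspace model of that ordering, where the three helpers are stubs for VFS_SYNC, xfs_log_unmount_write() and xfs_unmountfs_writesb():

#include <stdbool.h>
#include <stdio.h>

struct mnt { bool rdonly, barrier; };

static void sync_fs(struct mnt *m)           { (void)m; }
static void log_unmount_write(struct mnt *m) { (void)m; }
static void write_sb(struct mnt *m)          { (void)m; }

static void remount(struct mnt *m, bool want_ro, bool want_barrier)
{
        if (!want_ro) {                         /* rw/ro -> rw */
                m->rdonly = false;
                m->barrier = want_barrier;      /* barrier state re-evaluated */
        } else if (!m->rdonly) {                /* rw -> ro */
                sync_fs(m);                     /* data stable first ... */
                log_unmount_write(m);           /* ... then a clean log ... */
                write_sb(m);                    /* ... then the superblock */
                m->rdonly = true;               /* only now flip the flag */
        }
}

int main(void)
{
        struct mnt m = { .rdonly = false, .barrier = true };

        remount(&m, true, true);
        printf("rdonly=%d\n", m.rdonly);        /* 1 */
        return 0;
}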