diff options
Diffstat (limited to 'fs')
157 files changed, 2685 insertions, 1885 deletions
diff --git a/fs/9p/9p.c b/fs/9p/9p.c index 1a6d08761f3..f86a28d1d6a 100644 --- a/fs/9p/9p.c +++ b/fs/9p/9p.c @@ -111,7 +111,6 @@ static void v9fs_t_clunk_cb(void *a, struct v9fs_fcall *tc, if (!rc) return; - dprintk(DEBUG_9P, "tcall id %d rcall id %d\n", tc->id, rc->id); v9ses = a; if (rc->id == RCLUNK) v9fs_put_idpool(fid, &v9ses->fidpool); diff --git a/fs/9p/conv.c b/fs/9p/conv.c index 32a9f99154e..bf1f1006796 100644 --- a/fs/9p/conv.c +++ b/fs/9p/conv.c @@ -116,13 +116,19 @@ static void buf_put_int64(struct cbuf *buf, u64 val) } } -static void buf_put_stringn(struct cbuf *buf, const char *s, u16 slen) +static char *buf_put_stringn(struct cbuf *buf, const char *s, u16 slen) { + char *ret; + + ret = NULL; if (buf_check_size(buf, slen + 2)) { buf_put_int16(buf, slen); + ret = buf->p; memcpy(buf->p, s, slen); buf->p += slen; } + + return ret; } static inline void buf_put_string(struct cbuf *buf, const char *s) @@ -430,15 +436,19 @@ static inline void v9fs_put_int64(struct cbuf *bufp, u64 val, u64 * p) static void v9fs_put_str(struct cbuf *bufp, char *data, struct v9fs_str *str) { - if (data) { - str->len = strlen(data); - str->str = bufp->p; - } else { - str->len = 0; - str->str = NULL; - } + int len; + char *s; + + if (data) + len = strlen(data); + else + len = 0; - buf_put_stringn(bufp, data, str->len); + s = buf_put_stringn(bufp, data, len); + if (str) { + str->len = len; + str->str = s; + } } static int diff --git a/fs/9p/fid.c b/fs/9p/fid.c index eda449778fa..c4d13bf904d 100644 --- a/fs/9p/fid.c +++ b/fs/9p/fid.c @@ -1,7 +1,7 @@ /* * V9FS FID Management * - * Copyright (C) 2005 by Eric Van Hensbergen <ericvh@gmail.com> + * Copyright (C) 2005, 2006 by Eric Van Hensbergen <ericvh@gmail.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -40,7 +40,7 @@ * */ -static int v9fs_fid_insert(struct v9fs_fid *fid, struct dentry *dentry) +int v9fs_fid_insert(struct v9fs_fid *fid, struct dentry *dentry) { struct list_head *fid_list = (struct list_head *)dentry->d_fsdata; dprintk(DEBUG_9P, "fid %d (%p) dentry %s (%p)\n", fid->fid, fid, @@ -57,7 +57,6 @@ static int v9fs_fid_insert(struct v9fs_fid *fid, struct dentry *dentry) } fid->uid = current->uid; - fid->pid = current->pid; list_add(&fid->list, fid_list); return 0; } @@ -68,14 +67,11 @@ static int v9fs_fid_insert(struct v9fs_fid *fid, struct dentry *dentry) * */ -struct v9fs_fid *v9fs_fid_create(struct dentry *dentry, - struct v9fs_session_info *v9ses, int fid, int create) +struct v9fs_fid *v9fs_fid_create(struct v9fs_session_info *v9ses, int fid) { struct v9fs_fid *new; - dprintk(DEBUG_9P, "fid create dentry %p, fid %d, create %d\n", - dentry, fid, create); - + dprintk(DEBUG_9P, "fid create fid %d\n", fid); new = kmalloc(sizeof(struct v9fs_fid), GFP_KERNEL); if (new == NULL) { dprintk(DEBUG_ERROR, "Out of Memory\n"); @@ -85,19 +81,13 @@ struct v9fs_fid *v9fs_fid_create(struct dentry *dentry, new->fid = fid; new->v9ses = v9ses; new->fidopen = 0; - new->fidcreate = create; new->fidclunked = 0; new->iounit = 0; new->rdir_pos = 0; new->rdir_fcall = NULL; + INIT_LIST_HEAD(&new->list); - if (v9fs_fid_insert(new, dentry) == 0) - return new; - else { - dprintk(DEBUG_ERROR, "Problems inserting to dentry\n"); - kfree(new); - return NULL; - } + return new; } /** @@ -113,140 +103,29 @@ void v9fs_fid_destroy(struct v9fs_fid *fid) } /** - * v9fs_fid_walk_up - walks from the process current directory - * up to the specified dentry. - */ -static struct v9fs_fid *v9fs_fid_walk_up(struct dentry *dentry) -{ - int fidnum, cfidnum, err; - struct v9fs_fid *cfid; - struct dentry *cde; - struct v9fs_session_info *v9ses; - - v9ses = v9fs_inode2v9ses(current->fs->pwd->d_inode); - cfid = v9fs_fid_lookup(current->fs->pwd); - if (cfid == NULL) { - dprintk(DEBUG_ERROR, "process cwd doesn't have a fid\n"); - return ERR_PTR(-ENOENT); - } - - cfidnum = cfid->fid; - cde = current->fs->pwd; - /* TODO: take advantage of multiwalk */ - - fidnum = v9fs_get_idpool(&v9ses->fidpool); - if (fidnum < 0) { - dprintk(DEBUG_ERROR, "could not get a new fid num\n"); - err = -ENOENT; - goto clunk_fid; - } - - while (cde != dentry) { - if (cde == cde->d_parent) { - dprintk(DEBUG_ERROR, "can't find dentry\n"); - err = -ENOENT; - goto clunk_fid; - } - - err = v9fs_t_walk(v9ses, cfidnum, fidnum, "..", NULL); - if (err < 0) { - dprintk(DEBUG_ERROR, "problem walking to parent\n"); - goto clunk_fid; - } - - cfidnum = fidnum; - cde = cde->d_parent; - } - - return v9fs_fid_create(dentry, v9ses, fidnum, 0); - -clunk_fid: - v9fs_t_clunk(v9ses, fidnum); - return ERR_PTR(err); -} - -/** * v9fs_fid_lookup - retrieve the right fid from a particular dentry * @dentry: dentry to look for fid in * @type: intent of lookup (operation or traversal) * - * search list of fids associated with a dentry for a fid with a matching - * thread id or uid. If that fails, look up the dentry's parents to see if you - * can find a matching fid. + * find a fid in the dentry + * + * TODO: only match fids that have the same uid as current user * */ struct v9fs_fid *v9fs_fid_lookup(struct dentry *dentry) { struct list_head *fid_list = (struct list_head *)dentry->d_fsdata; - struct v9fs_fid *current_fid = NULL; - struct v9fs_fid *temp = NULL; struct v9fs_fid *return_fid = NULL; dprintk(DEBUG_9P, " dentry: %s (%p)\n", dentry->d_iname, dentry); - if (fid_list) { - list_for_each_entry_safe(current_fid, temp, fid_list, list) { - if (!current_fid->fidcreate) { - return_fid = current_fid; - break; - } - } - - if (!return_fid) - return_fid = current_fid; - } - - /* we are at the root but didn't match */ - if ((!return_fid) && (dentry->d_parent == dentry)) { - /* TODO: clone attach with new uid */ - return_fid = current_fid; - } + if (fid_list) + return_fid = list_entry(fid_list->next, struct v9fs_fid, list); if (!return_fid) { - struct dentry *par = current->fs->pwd->d_parent; - int count = 1; - while (par != NULL) { - if (par == dentry) - break; - count++; - if (par == par->d_parent) { - dprintk(DEBUG_ERROR, - "got to root without finding dentry\n"); - break; - } - par = par->d_parent; - } - -/* XXX - there may be some duplication we can get rid of */ - if (par == dentry) { - return_fid = v9fs_fid_walk_up(dentry); - if (IS_ERR(return_fid)) - return_fid = NULL; - } + dprintk(DEBUG_ERROR, "Couldn't find a fid in dentry\n"); } return return_fid; } - -struct v9fs_fid *v9fs_fid_get_created(struct dentry *dentry) -{ - struct list_head *fid_list; - struct v9fs_fid *fid, *ftmp, *ret; - - dprintk(DEBUG_9P, " dentry: %s (%p)\n", dentry->d_iname, dentry); - fid_list = (struct list_head *)dentry->d_fsdata; - ret = NULL; - if (fid_list) { - list_for_each_entry_safe(fid, ftmp, fid_list, list) { - if (fid->fidcreate && fid->pid == current->pid) { - list_del(&fid->list); - ret = fid; - break; - } - } - } - - dprintk(DEBUG_9P, "return %p\n", ret); - return ret; -} diff --git a/fs/9p/fid.h b/fs/9p/fid.h index 84c673a44c8..1fc2dd08d75 100644 --- a/fs/9p/fid.h +++ b/fs/9p/fid.h @@ -33,7 +33,6 @@ struct v9fs_fid { u32 fid; unsigned char fidopen; /* set when fid is opened */ - unsigned char fidcreate; /* set when fid was just created */ unsigned char fidclunked; /* set when fid has already been clunked */ struct v9fs_qid qid; @@ -45,7 +44,6 @@ struct v9fs_fid { struct v9fs_fcall *rdir_fcall; /* management stuff */ - pid_t pid; /* thread associated with this fid */ uid_t uid; /* user associated with this fid */ /* private data */ @@ -56,5 +54,5 @@ struct v9fs_fid { struct v9fs_fid *v9fs_fid_lookup(struct dentry *dentry); struct v9fs_fid *v9fs_fid_get_created(struct dentry *); void v9fs_fid_destroy(struct v9fs_fid *fid); -struct v9fs_fid *v9fs_fid_create(struct dentry *, - struct v9fs_session_info *v9ses, int fid, int create); +struct v9fs_fid *v9fs_fid_create(struct v9fs_session_info *, int fid); +int v9fs_fid_insert(struct v9fs_fid *fid, struct dentry *dentry); diff --git a/fs/9p/mux.c b/fs/9p/mux.c index 945cb368d45..ea1134eb47c 100644 --- a/fs/9p/mux.c +++ b/fs/9p/mux.c @@ -471,10 +471,13 @@ static void v9fs_write_work(void *a) } spin_lock(&m->lock); - req = - list_entry(m->unsent_req_list.next, struct v9fs_req, +again: + req = list_entry(m->unsent_req_list.next, struct v9fs_req, req_list); list_move_tail(&req->req_list, &m->req_list); + if (req->err == ERREQFLUSH) + goto again; + m->wbuf = req->tcall->sdata; m->wsize = req->tcall->size; m->wpos = 0; @@ -525,7 +528,7 @@ static void process_request(struct v9fs_mux_data *m, struct v9fs_req *req) struct v9fs_str *ename; tag = req->tag; - if (req->rcall->id == RERROR && !req->err) { + if (!req->err && req->rcall->id == RERROR) { ecode = req->rcall->params.rerror.errno; ename = &req->rcall->params.rerror.error; @@ -551,7 +554,10 @@ static void process_request(struct v9fs_mux_data *m, struct v9fs_req *req) req->err = -EIO; } - if (req->cb && req->err != ERREQFLUSH) { + if (req->err == ERREQFLUSH) + return; + + if (req->cb) { dprintk(DEBUG_MUX, "calling callback tcall %p rcall %p\n", req->tcall, req->rcall); @@ -812,6 +818,7 @@ v9fs_mux_rpc_cb(void *a, struct v9fs_fcall *tc, struct v9fs_fcall *rc, int err) struct v9fs_mux_rpc *r; if (err == ERREQFLUSH) { + kfree(rc); dprintk(DEBUG_MUX, "err req flush\n"); return; } diff --git a/fs/9p/trans_fd.c b/fs/9p/trans_fd.c index 1a28ef97a3d..5b2ce21b10f 100644 --- a/fs/9p/trans_fd.c +++ b/fs/9p/trans_fd.c @@ -80,6 +80,7 @@ static int v9fs_fd_send(struct v9fs_transport *trans, void *v, int len) if (!trans || trans->status != Connected || !ts) return -EIO; + oldfs = get_fs(); set_fs(get_ds()); /* The cast to a user pointer is valid due to the set_fs() */ ret = vfs_write(ts->out_file, (void __user *)v, len, &ts->out_file->f_pos); diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 5250c428fc1..61352491ba3 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -66,7 +66,7 @@ static match_table_t tokens = { {Opt_afid, "afid=%u"}, {Opt_rfdno, "rfdno=%u"}, {Opt_wfdno, "wfdno=%u"}, - {Opt_debug, "debug=%u"}, + {Opt_debug, "debug=%x"}, {Opt_name, "name=%s"}, {Opt_remotename, "aname=%s"}, {Opt_unix, "proto=unix"}, @@ -397,6 +397,7 @@ v9fs_session_init(struct v9fs_session_info *v9ses, } if (v9ses->afid != ~0) { + dprintk(DEBUG_ERROR, "afid not equal to ~0\n"); if (v9fs_t_clunk(v9ses, v9ses->afid)) dprintk(DEBUG_ERROR, "clunk failed\n"); } diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h index 69cf2905dc9..a759278acaa 100644 --- a/fs/9p/v9fs_vfs.h +++ b/fs/9p/v9fs_vfs.h @@ -51,3 +51,4 @@ int v9fs_dir_release(struct inode *inode, struct file *filp); int v9fs_file_open(struct inode *inode, struct file *file); void v9fs_inode2stat(struct inode *inode, struct v9fs_stat *stat); void v9fs_dentry_release(struct dentry *); +int v9fs_uflags2omode(int uflags); diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c index 2dd806dac9f..12c9cc926b7 100644 --- a/fs/9p/vfs_dentry.c +++ b/fs/9p/vfs_dentry.c @@ -43,47 +43,18 @@ #include "fid.h" /** - * v9fs_dentry_validate - VFS dcache hook to validate cache - * @dentry: dentry that is being validated - * @nd: path data + * v9fs_dentry_delete - called when dentry refcount equals 0 + * @dentry: dentry in question * - * dcache really shouldn't be used for 9P2000 as at all due to - * potential attached semantics to directory traversal (walk). - * - * FUTURE: look into how to use dcache to allow multi-stage - * walks in Plan 9 & potential for better dcache operation which - * would remain valid for Plan 9 semantics. Older versions - * had validation via stat for those interested. However, since - * stat has the same approximate overhead as walk there really - * is no difference. The only improvement would be from a - * time-decay cache like NFS has and that undermines the - * synchronous nature of 9P2000. + * By returning 1 here we should remove cacheing of unused + * dentry components. * */ -static int v9fs_dentry_validate(struct dentry *dentry, struct nameidata *nd) +int v9fs_dentry_delete(struct dentry *dentry) { - struct dentry *dc = current->fs->pwd; - - dprintk(DEBUG_VFS, "dentry: %s (%p)\n", dentry->d_iname, dentry); - if (v9fs_fid_lookup(dentry)) { - dprintk(DEBUG_VFS, "VALID\n"); - return 1; - } - - while (dc != NULL) { - if (dc == dentry) { - dprintk(DEBUG_VFS, "VALID\n"); - return 1; - } - if (dc == dc->d_parent) - break; - - dc = dc->d_parent; - } - - dprintk(DEBUG_VFS, "INVALID\n"); - return 0; + dprintk(DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_iname, dentry); + return 1; } /** @@ -118,6 +89,6 @@ void v9fs_dentry_release(struct dentry *dentry) } struct dentry_operations v9fs_dentry_operations = { - .d_revalidate = v9fs_dentry_validate, + .d_delete = v9fs_dentry_delete, .d_release = v9fs_dentry_release, }; diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index ae6d032b9b5..cd5eeb032d6 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -202,7 +202,6 @@ int v9fs_dir_release(struct inode *inode, struct file *filp) filp->private_data = NULL; } - d_drop(filp->f_dentry); return 0; } diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index c7e14d91721..de3a129698d 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -53,94 +53,70 @@ int v9fs_file_open(struct inode *inode, struct file *file) { struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode); - struct v9fs_fid *v9fid, *fid; + struct v9fs_fid *vfid; struct v9fs_fcall *fcall = NULL; - int open_mode = 0; - unsigned int iounit = 0; - int newfid = -1; - long result = -1; + int omode; + int fid = V9FS_NOFID; + int err; dprintk(DEBUG_VFS, "inode: %p file: %p \n", inode, file); - v9fid = v9fs_fid_get_created(file->f_dentry); - if (!v9fid) - v9fid = v9fs_fid_lookup(file->f_dentry); - - if (!v9fid) { + vfid = v9fs_fid_lookup(file->f_dentry); + if (!vfid) { dprintk(DEBUG_ERROR, "Couldn't resolve fid from dentry\n"); return -EBADF; } - if (!v9fid->fidcreate) { - fid = kmalloc(sizeof(struct v9fs_fid), GFP_KERNEL); - if (fid == NULL) { - dprintk(DEBUG_ERROR, "Out of Memory\n"); - return -ENOMEM; - } - - fid->fidopen = 0; - fid->fidcreate = 0; - fid->fidclunked = 0; - fid->iounit = 0; - fid->v9ses = v9ses; - - newfid = v9fs_get_idpool(&v9ses->fidpool); - if (newfid < 0) { + fid = v9fs_get_idpool(&v9ses->fidpool); + if (fid < 0) { eprintk(KERN_WARNING, "newfid fails!\n"); return -ENOSPC; } - result = - v9fs_t_walk(v9ses, v9fid->fid, newfid, NULL, NULL); - - if (result < 0) { - v9fs_put_idpool(newfid, &v9ses->fidpool); + err = v9fs_t_walk(v9ses, vfid->fid, fid, NULL, NULL); + if (err < 0) { dprintk(DEBUG_ERROR, "rewalk didn't work\n"); - return -EBADF; + goto put_fid; + } + + vfid = kmalloc(sizeof(struct v9fs_fid), GFP_KERNEL); + if (vfid == NULL) { + dprintk(DEBUG_ERROR, "out of memory\n"); + goto clunk_fid; } - fid->fid = newfid; - v9fid = fid; /* TODO: do special things for O_EXCL, O_NOFOLLOW, O_SYNC */ /* translate open mode appropriately */ - open_mode = file->f_flags & 0x3; + omode = v9fs_uflags2omode(file->f_flags); + err = v9fs_t_open(v9ses, fid, omode, &fcall); + if (err < 0) { + PRINT_FCALL_ERROR("open failed", fcall); + goto destroy_vfid; + } - if (file->f_flags & O_EXCL) - open_mode |= V9FS_OEXCL; + file->private_data = vfid; + vfid->fid = fid; + vfid->fidopen = 1; + vfid->fidclunked = 0; + vfid->iounit = fcall->params.ropen.iounit; + vfid->rdir_pos = 0; + vfid->rdir_fcall = NULL; + vfid->filp = file; + kfree(fcall); - if (v9ses->extended) { - if (file->f_flags & O_TRUNC) - open_mode |= V9FS_OTRUNC; + return 0; - if (file->f_flags & O_APPEND) - open_mode |= V9FS_OAPPEND; - } +destroy_vfid: + v9fs_fid_destroy(vfid); - result = v9fs_t_open(v9ses, newfid, open_mode, &fcall); - if (result < 0) { - PRINT_FCALL_ERROR("open failed", fcall); - kfree(fcall); - return result; - } +clunk_fid: + v9fs_t_clunk(v9ses, fid); - iounit = fcall->params.ropen.iounit; +put_fid: + v9fs_put_idpool(fid, &v9ses->fidpool); kfree(fcall); - } else { - /* create case */ - newfid = v9fid->fid; - iounit = v9fid->iounit; - v9fid->fidcreate = 0; - } - - file->private_data = v9fid; - - v9fid->rdir_pos = 0; - v9fid->rdir_fcall = NULL; - v9fid->fidopen = 1; - v9fid->filp = file; - v9fid->iounit = iounit; - return 0; + return err; } /** @@ -289,9 +265,7 @@ v9fs_file_write(struct file *filp, const char __user * data, total += result; } while (count); - if(inode->i_mapping->nrpages) invalidate_inode_pages2(inode->i_mapping); - return total; } diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 91f552454c7..3ad8455f857 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -125,6 +125,38 @@ static int p9mode2unixmode(struct v9fs_session_info *v9ses, int mode) return res; } +int v9fs_uflags2omode(int uflags) +{ + int ret; + + ret = 0; + switch (uflags&3) { + default: + case O_RDONLY: + ret = V9FS_OREAD; + break; + + case O_WRONLY: + ret = V9FS_OWRITE; + break; + + case O_RDWR: + ret = V9FS_ORDWR; + break; + } + + if (uflags & O_EXCL) + ret |= V9FS_OEXCL; + + if (uflags & O_TRUNC) + ret |= V9FS_OTRUNC; + + if (uflags & O_APPEND) + ret |= V9FS_OAPPEND; + + return ret; +} + /** * v9fs_blank_wstat - helper function to setup a 9P stat structure * @v9ses: 9P session info (for determining extended mode) @@ -163,7 +195,7 @@ v9fs_blank_wstat(struct v9fs_wstat *wstat) struct inode *v9fs_get_inode(struct super_block *sb, int mode) { - struct inode *inode = NULL; + struct inode *inode; struct v9fs_session_info *v9ses = sb->s_fs_info; dprintk(DEBUG_VFS, "super block: %p mode: %o\n", sb, mode); @@ -222,171 +254,133 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode) return inode; } -/** - * v9fs_create - helper function to create files and directories - * @dir: directory inode file is being created in - * @file_dentry: dentry file is being created in - * @perm: permissions file is being created with - * @open_mode: resulting open mode for file - * - */ - static int -v9fs_create(struct inode *dir, - struct dentry *file_dentry, - unsigned int perm, unsigned int open_mode) +v9fs_create(struct v9fs_session_info *v9ses, u32 pfid, char *name, + u32 perm, u8 mode, u32 *fidp, struct v9fs_qid *qid, u32 *iounit) { - struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dir); - struct super_block *sb = dir->i_sb; - struct v9fs_fid *dirfid = - v9fs_fid_lookup(file_dentry->d_parent); - struct v9fs_fid *fid = NULL; - struct inode *file_inode = NULL; - struct v9fs_fcall *fcall = NULL; - struct v9fs_qid qid; - int dirfidnum = -1; - long newfid = -1; - int result = 0; - unsigned int iounit = 0; - int wfidno = -1; + u32 fid; int err; + struct v9fs_fcall *fcall; - perm = unixmode2p9mode(v9ses, perm); - - dprintk(DEBUG_VFS, "dir: %p dentry: %p perm: %o mode: %o\n", dir, - file_dentry, perm, open_mode); - - if (!dirfid) - return -EBADF; - - dirfidnum = dirfid->fid; - if (dirfidnum < 0) { - dprintk(DEBUG_ERROR, "No fid for the directory #%lu\n", - dir->i_ino); - return -EBADF; - } - - if (file_dentry->d_inode) { - dprintk(DEBUG_ERROR, - "Odd. There is an inode for dir %lu, name :%s:\n", - dir->i_ino, file_dentry->d_name.name); - return -EEXIST; - } - - newfid = v9fs_get_idpool(&v9ses->fidpool); - if (newfid < 0) { + fid = v9fs_get_idpool(&v9ses->fidpool); + if (fid < 0) { eprintk(KERN_WARNING, "no free fids available\n"); return -ENOSPC; } - result = v9fs_t_walk(v9ses, dirfidnum, newfid, NULL, &fcall); - if (result < 0) { + err = v9fs_t_walk(v9ses, pfid, fid, NULL, &fcall); + if (err < 0) { PRINT_FCALL_ERROR("clone error", fcall); - v9fs_put_idpool(newfid, &v9ses->fidpool); - newfid = -1; - goto CleanUpFid; + goto error; } - kfree(fcall); - fcall = NULL; - result = v9fs_t_create(v9ses, newfid, (char *)file_dentry->d_name.name, - perm, open_mode, &fcall); - if (result < 0) { + err = v9fs_t_create(v9ses, fid, name, perm, mode, &fcall); + if (err < 0) { PRINT_FCALL_ERROR("create fails", fcall); - goto CleanUpFid; + goto error; } - iounit = fcall->params.rcreate.iounit; - qid = fcall->params.rcreate.qid; + if (iounit) + *iounit = fcall->params.rcreate.iounit; + + if (qid) + *qid = fcall->params.rcreate.qid; + + if (fidp) + *fidp = fid; + kfree(fcall); - fcall = NULL; + return 0; - if (!(perm&V9FS_DMDIR)) { - fid = v9fs_fid_create(file_dentry, v9ses, newfid, 1); - dprintk(DEBUG_VFS, "fid %p %d\n", fid, fid->fidcreate); - if (!fid) { - result = -ENOMEM; - goto CleanUpFid; - } +error: + if (fid >= 0) + v9fs_put_idpool(fid, &v9ses->fidpool); - fid->qid = qid; - fid->iounit = iounit; - } else { - err = v9fs_t_clunk(v9ses, newfid); - newfid = -1; - if (err < 0) - dprintk(DEBUG_ERROR, "clunk for mkdir failed: %d\n", err); - } + kfree(fcall); + return err; +} + +static struct v9fs_fid* +v9fs_clone_walk(struct v9fs_session_info *v9ses, u32 fid, struct dentry *dentry) +{ + int err; + u32 nfid; + struct v9fs_fid *ret; + struct v9fs_fcall *fcall; - /* walk to the newly created file and put the fid in the dentry */ - wfidno = v9fs_get_idpool(&v9ses->fidpool); - if (wfidno < 0) { + nfid = v9fs_get_idpool(&v9ses->fidpool); + if (nfid < 0) { eprintk(KERN_WARNING, "no free fids available\n"); - return -ENOSPC; + return ERR_PTR(-ENOSPC); } - result = v9fs_t_walk(v9ses, dirfidnum, wfidno, - (char *) file_dentry->d_name.name, &fcall); - if (result < 0) { - PRINT_FCALL_ERROR("clone error", fcall); - v9fs_put_idpool(wfidno, &v9ses->fidpool); - wfidno = -1; - goto CleanUpFid; + err = v9fs_t_walk(v9ses, fid, nfid, (char *) dentry->d_name.name, + &fcall); + + if (err < 0) { + PRINT_FCALL_ERROR("walk error", fcall); + v9fs_put_idpool(nfid, &v9ses->fidpool); + goto error; } + kfree(fcall); fcall = NULL; + ret = v9fs_fid_create(v9ses, nfid); + if (!ret) { + err = -ENOMEM; + goto clunk_fid; + } - if (!v9fs_fid_create(file_dentry, v9ses, wfidno, 0)) { - v9fs_put_idpool(wfidno, &v9ses->fidpool); - - goto CleanUpFid; + err = v9fs_fid_insert(ret, dentry); + if (err < 0) { + v9fs_fid_destroy(ret); + goto clunk_fid; } - if ((perm & V9FS_DMSYMLINK) || (perm & V9FS_DMLINK) || - (perm & V9FS_DMNAMEDPIPE) || (perm & V9FS_DMSOCKET) || - (perm & V9FS_DMDEVICE)) - return 0; + return ret; - result = v9fs_t_stat(v9ses, wfidno, &fcall); - if (result < 0) { - PRINT_FCALL_ERROR("stat error", fcall); - goto CleanUpFid; - } +clunk_fid: + v9fs_t_clunk(v9ses, nfid); +error: + kfree(fcall); + return ERR_PTR(err); +} - file_inode = v9fs_get_inode(sb, - p9mode2unixmode(v9ses, fcall->params.rstat.stat.mode)); +struct inode * +v9fs_inode_from_fid(struct v9fs_session_info *v9ses, u32 fid, + struct super_block *sb) +{ + int err, umode; + struct inode *ret; + struct v9fs_fcall *fcall; - if ((!file_inode) || IS_ERR(file_inode)) { - dprintk(DEBUG_ERROR, "create inode failed\n"); - result = -EBADF; - goto CleanUpFid; + ret = NULL; + err = v9fs_t_stat(v9ses, fid, &fcall); + if (err) { + PRINT_FCALL_ERROR("stat error", fcall); + goto error; } - v9fs_stat2inode(&fcall->params.rstat.stat, file_inode, sb); - kfree(fcall); - fcall = NULL; - file_dentry->d_op = &v9fs_dentry_operations; - d_instantiate(file_dentry, file_inode); + umode = p9mode2unixmode(v9ses, fcall->params.rstat.stat.mode); + ret = v9fs_get_inode(sb, umode); + if (IS_ERR(ret)) { + err = PTR_ERR(ret); + ret = NULL; + goto error; + } - return 0; + v9fs_stat2inode(&fcall->params.rstat.stat, ret, sb); + kfree(fcall); + return ret; - CleanUpFid: +error: kfree(fcall); - fcall = NULL; + if (ret) + iput(ret); - if (newfid >= 0) { - err = v9fs_t_clunk(v9ses, newfid); - if (err < 0) - dprintk(DEBUG_ERROR, "clunk failed: %d\n", err); - } - if (wfidno >= 0) { - err = v9fs_t_clunk(v9ses, wfidno); - if (err < 0) - dprintk(DEBUG_ERROR, "clunk failed: %d\n", err); - } - return result; + return ERR_PTR(err); } /** @@ -440,20 +434,97 @@ static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir) return result; } +static int +v9fs_open_created(struct inode *inode, struct file *file) +{ + return 0; +} + /** * v9fs_vfs_create - VFS hook to create files * @inode: directory inode that is being deleted * @dentry: dentry that is being deleted - * @perm: create permissions + * @mode: create permissions * @nd: path information * */ static int -v9fs_vfs_create(struct inode *inode, struct dentry *dentry, int perm, +v9fs_vfs_create(struct inode *dir, struct dentry *dentry, int mode, struct nameidata *nd) { - return v9fs_create(inode, dentry, perm, O_RDWR); + int err; + u32 fid, perm, iounit; + int flags; + struct v9fs_session_info *v9ses; + struct v9fs_fid *dfid, *vfid, *ffid; + struct inode *inode; + struct v9fs_qid qid; + struct file *filp; + + inode = NULL; + vfid = NULL; + v9ses = v9fs_inode2v9ses(dir); + dfid = v9fs_fid_lookup(dentry->d_parent); + perm = unixmode2p9mode(v9ses, mode); + + if (nd && nd->flags & LOOKUP_OPEN) + flags = nd->intent.open.flags - 1; + else + flags = O_RDWR; + + err = v9fs_create(v9ses, dfid->fid, (char *) dentry->d_name.name, + perm, v9fs_uflags2omode(flags), &fid, &qid, &iounit); + + if (err) + goto error; + + vfid = v9fs_clone_walk(v9ses, dfid->fid, dentry); + if (IS_ERR(vfid)) { + err = PTR_ERR(vfid); + vfid = NULL; + goto error; + } + + inode = v9fs_inode_from_fid(v9ses, vfid->fid, dir->i_sb); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + inode = NULL; + goto error; + } + + dentry->d_op = &v9fs_dentry_operations; + d_instantiate(dentry, inode); + + if (nd && nd->flags & LOOKUP_OPEN) { + ffid = v9fs_fid_create(v9ses, fid); + if (!ffid) + return -ENOMEM; + + filp = lookup_instantiate_filp(nd, dentry, v9fs_open_created); + if (IS_ERR(filp)) { + v9fs_fid_destroy(ffid); + return PTR_ERR(filp); + } + + ffid->rdir_pos = 0; + ffid->rdir_fcall = NULL; + ffid->fidopen = 1; + ffid->iounit = iounit; + ffid->filp = filp; + filp->private_data = ffid; + } + + return 0; + +error: + if (vfid) + v9fs_fid_destroy(vfid); + + if (inode) + iput(inode); + + return err; } /** @@ -464,9 +535,57 @@ v9fs_vfs_create(struct inode *inode, struct dentry *dentry, int perm, * */ -static int v9fs_vfs_mkdir(struct inode *inode, struct dentry *dentry, int mode) +static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) { - return v9fs_create(inode, dentry, mode | S_IFDIR, O_RDONLY); + int err; + u32 fid, perm; + struct v9fs_session_info *v9ses; + struct v9fs_fid *dfid, *vfid; + struct inode *inode; + + inode = NULL; + vfid = NULL; + v9ses = v9fs_inode2v9ses(dir); + dfid = v9fs_fid_lookup(dentry->d_parent); + perm = unixmode2p9mode(v9ses, mode | S_IFDIR); + + err = v9fs_create(v9ses, dfid->fid, (char *) dentry->d_name.name, + perm, V9FS_OREAD, &fid, NULL, NULL); + + if (err) { + dprintk(DEBUG_ERROR, "create error %d\n", err); + goto error; + } + + err = v9fs_t_clunk(v9ses, fid); + if (err) { + dprintk(DEBUG_ERROR, "clunk error %d\n", err); + goto error; + } + + vfid = v9fs_clone_walk(v9ses, dfid->fid, dentry); + if (IS_ERR(vfid)) { + err = PTR_ERR(vfid); + vfid = NULL; + goto error; + } + + inode = v9fs_inode_from_fid(v9ses, vfid->fid, dir->i_sb); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + inode = NULL; + goto error; + } + + dentry->d_op = &v9fs_dentry_operations; + d_instantiate(dentry, inode); + return 0; + +error: + if (vfid) + v9fs_fid_destroy(vfid); + + return err; } /** @@ -491,7 +610,7 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, int result = 0; dprintk(DEBUG_VFS, "dir: %p dentry: (%s) %p nameidata: %p\n", - dir, dentry->d_iname, dentry, nameidata); + dir, dentry->d_name.name, dentry, nameidata); sb = dir->i_sb; v9ses = v9fs_inode2v9ses(dir); @@ -516,9 +635,8 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, return ERR_PTR(-ENOSPC); } - result = - v9fs_t_walk(v9ses, dirfidnum, newfid, (char *)dentry->d_name.name, - NULL); + result = v9fs_t_walk(v9ses, dirfidnum, newfid, + (char *)dentry->d_name.name, NULL); if (result < 0) { v9fs_put_idpool(newfid, &v9ses->fidpool); if (result == -ENOENT) { @@ -551,13 +669,17 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, inode->i_ino = v9fs_qid2ino(&fcall->params.rstat.stat.qid); - fid = v9fs_fid_create(dentry, v9ses, newfid, 0); + fid = v9fs_fid_create(v9ses, newfid); if (fid == NULL) { dprintk(DEBUG_ERROR, "couldn't insert\n"); result = -ENOMEM; goto FreeFcall; } + result = v9fs_fid_insert(fid, dentry); + if (result < 0) + goto FreeFcall; + fid->qid = fcall->params.rstat.stat.qid; dentry->d_op = &v9fs_dentry_operations; @@ -983,53 +1105,75 @@ static void v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, void static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry, int mode, const char *extension) { - int err, retval; + int err; + u32 fid, perm; struct v9fs_session_info *v9ses; + struct v9fs_fid *dfid, *vfid; + struct inode *inode; struct v9fs_fcall *fcall; - struct v9fs_fid *fid; struct v9fs_wstat wstat; - v9ses = v9fs_inode2v9ses(dir); - retval = -EPERM; fcall = NULL; + inode = NULL; + vfid = NULL; + v9ses = v9fs_inode2v9ses(dir); + dfid = v9fs_fid_lookup(dentry->d_parent); + perm = unixmode2p9mode(v9ses, mode); if (!v9ses->extended) { dprintk(DEBUG_ERROR, "not extended\n"); - goto free_mem; + return -EPERM; } - /* issue a create */ - retval = v9fs_create(dir, dentry, mode, 0); - if (retval != 0) - goto free_mem; + err = v9fs_create(v9ses, dfid->fid, (char *) dentry->d_name.name, + perm, V9FS_OREAD, &fid, NULL, NULL); - fid = v9fs_fid_get_created(dentry); - if (!fid) { - dprintk(DEBUG_ERROR, "couldn't resolve fid from dentry\n"); - goto free_mem; + if (err) + goto error; + + err = v9fs_t_clunk(v9ses, fid); + if (err) + goto error; + + vfid = v9fs_clone_walk(v9ses, dfid->fid, dentry); + if (IS_ERR(vfid)) { + err = PTR_ERR(vfid); + vfid = NULL; + goto error; + } + + inode = v9fs_inode_from_fid(v9ses, vfid->fid, dir->i_sb); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + inode = NULL; + goto error; } /* issue a Twstat */ v9fs_blank_wstat(&wstat); wstat.muid = v9ses->name; wstat.extension = (char *) extension; - retval = v9fs_t_wstat(v9ses, fid->fid, &wstat, &fcall); - if (retval < 0) { - PRINT_FCALL_ERROR("wstat error", fcall); - goto free_mem; - } - - err = v9fs_t_clunk(v9ses, fid->fid); + err = v9fs_t_wstat(v9ses, vfid->fid, &wstat, &fcall); if (err < 0) { - dprintk(DEBUG_ERROR, "clunk failed: %d\n", err); - goto free_mem; + PRINT_FCALL_ERROR("wstat error", fcall); + goto error; } - d_drop(dentry); /* FID - will this also clunk? */ + kfree(fcall); + dentry->d_op = &v9fs_dentry_operations; + d_instantiate(dentry, inode); + return 0; -free_mem: +error: kfree(fcall); - return retval; + if (vfid) + v9fs_fid_destroy(vfid); + + if (inode) + iput(inode); + + return err; + } /** diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 2c4fa75be02..d05318fa684 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -146,7 +146,6 @@ static struct super_block *v9fs_get_sb(struct file_system_type inode->i_gid = gid; root = d_alloc_root(inode); - if (!root) { retval = -ENOMEM; goto put_back_sb; @@ -158,15 +157,20 @@ static struct super_block *v9fs_get_sb(struct file_system_type if (stat_result < 0) { dprintk(DEBUG_ERROR, "stat error\n"); v9fs_t_clunk(v9ses, newfid); - v9fs_put_idpool(newfid, &v9ses->fidpool); } else { /* Setup the Root Inode */ - root_fid = v9fs_fid_create(root, v9ses, newfid, 0); + root_fid = v9fs_fid_create(v9ses, newfid); if (root_fid == NULL) { retval = -ENOMEM; goto put_back_sb; } + retval = v9fs_fid_insert(root_fid, root); + if (retval < 0) { + kfree(fcall); + goto put_back_sb; + } + root_fid->qid = fcall->params.rstat.stat.qid; root->d_inode->i_ino = v9fs_qid2ino(&fcall->params.rstat.stat.qid); diff --git a/fs/Kconfig b/fs/Kconfig index ef78e3a42d3..e9749b0eecd 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -883,8 +883,6 @@ config CONFIGFS_FS Both sysfs and configfs can and should exist together on the same system. One is not a replacement for the other. - If unsure, say N. - endmenu menu "Miscellaneous filesystems" @@ -1327,7 +1325,7 @@ config UFS_FS config UFS_FS_WRITE bool "UFS file system write support (DANGEROUS)" - depends on UFS_FS && EXPERIMENTAL + depends on UFS_FS && EXPERIMENTAL && BROKEN help Say Y here if you want to try writing to UFS partitions. This is experimental, so you should back up your UFS partitions beforehand. diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 1b117a44129..c2eac2a50bd 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -938,6 +938,11 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs) kfree(elf_interpreter); } else { elf_entry = loc->elf_ex.e_entry; + if (BAD_ADDR(elf_entry)) { + send_sig(SIGSEGV, current, 0); + retval = -ENOEXEC; /* Nobody gets to see this, but.. */ + goto out_free_dentry; + } } kfree(elf_phdata); @@ -411,6 +411,7 @@ static int __bio_add_page(request_queue_t *q, struct bio *bio, struct page /** * bio_add_pc_page - attempt to add page to bio + * @q: the target queue * @bio: destination bio * @page: page to add * @len: vec entry length diff --git a/fs/buffer.c b/fs/buffer.c index 3dc712f29d2..a9b39940200 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1022,6 +1022,7 @@ try_again: bh->b_state = 0; atomic_set(&bh->b_count, 0); + bh->b_private = NULL; bh->b_size = size; /* Link the buffer to its page */ @@ -2866,22 +2867,22 @@ void ll_rw_block(int rw, int nr, struct buffer_head *bhs[]) else if (test_set_buffer_locked(bh)) continue; - get_bh(bh); if (rw == WRITE || rw == SWRITE) { if (test_clear_buffer_dirty(bh)) { bh->b_end_io = end_buffer_write_sync; + get_bh(bh); submit_bh(WRITE, bh); continue; } } else { if (!buffer_uptodate(bh)) { bh->b_end_io = end_buffer_read_sync; + get_bh(bh); submit_bh(rw, bh); continue; } } unlock_buffer(bh); - put_bh(bh); } } @@ -3050,6 +3051,68 @@ asmlinkage long sys_bdflush(int func, long data) } /* + * Migration function for pages with buffers. This function can only be used + * if the underlying filesystem guarantees that no other references to "page" + * exist. + */ +#ifdef CONFIG_MIGRATION +int buffer_migrate_page(struct page *newpage, struct page *page) +{ + struct address_space *mapping = page->mapping; + struct buffer_head *bh, *head; + int rc; + + if (!mapping) + return -EAGAIN; + + if (!page_has_buffers(page)) + return migrate_page(newpage, page); + + head = page_buffers(page); + + rc = migrate_page_remove_references(newpage, page, 3); + if (rc) + return rc; + + bh = head; + do { + get_bh(bh); + lock_buffer(bh); + bh = bh->b_this_page; + + } while (bh != head); + + ClearPagePrivate(page); + set_page_private(newpage, page_private(page)); + set_page_private(page, 0); + put_page(page); + get_page(newpage); + + bh = head; + do { + set_bh_page(bh, newpage, bh_offset(bh)); + bh = bh->b_this_page; + + } while (bh != head); + + SetPagePrivate(newpage); + + migrate_page_copy(newpage, page); + + bh = head; + do { + unlock_buffer(bh); + put_bh(bh); + bh = bh->b_this_page; + + } while (bh != head); + + return 0; +} +EXPORT_SYMBOL(buffer_migrate_page); +#endif + +/* * Buffer-head allocation */ static kmem_cache_t *bh_cachep; diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 3c03aadaff0..7b25463d3c1 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -52,7 +52,7 @@ extern int SendReceive2(const unsigned int /* xid */ , struct cifsSesInfo *, int * /* type of buf returned */ , const int long_op); extern int checkSMBhdr(struct smb_hdr *smb, __u16 mid); extern int checkSMB(struct smb_hdr *smb, __u16 mid, int length); -extern int is_valid_oplock_break(struct smb_hdr *smb); +extern int is_valid_oplock_break(struct smb_hdr *smb, struct TCP_Server_Info *); extern int is_size_safe_to_change(struct cifsInodeInfo *); extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *); extern unsigned int smbCalcSize(struct smb_hdr *ptr); diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 217323b0c89..b41e8b37965 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -1048,13 +1048,14 @@ CIFSSMBRead(const int xid, struct cifsTconInfo *tcon, cifs_small_buf_release(iov[0].iov_base); else if(resp_buf_type == CIFS_LARGE_BUFFER) cifs_buf_release(iov[0].iov_base); - } else /* return buffer to caller to free */ /* BB FIXME how do we tell caller if it is not a large buffer */ { - *buf = iov[0].iov_base; + } else if(resp_buf_type != CIFS_NO_BUFFER) { + /* return buffer to caller to free */ + *buf = iov[0].iov_base; if(resp_buf_type == CIFS_SMALL_BUFFER) *pbuf_type = CIFS_SMALL_BUFFER; else if(resp_buf_type == CIFS_LARGE_BUFFER) *pbuf_type = CIFS_LARGE_BUFFER; - } + } /* else no valid buffer on return - leave as null */ /* Note: On -EAGAIN error only caller can retry on handle based calls since file handle passed in no longer valid */ diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 88f60aa5205..2a0c1f4ca0a 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -630,7 +630,7 @@ multi_t2_fnd: smallbuf = NULL; } wake_up_process(task_to_wake); - } else if ((is_valid_oplock_break(smb_buffer) == FALSE) + } else if ((is_valid_oplock_break(smb_buffer, server) == FALSE) && (isMultiRsp == FALSE)) { cERROR(1, ("No task to wake, unknown frame rcvd!")); cifs_dump_mem("Received Data is: ",(char *)smb_buffer, @@ -1785,11 +1785,20 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb, } else if(volume_info.wsize) cifs_sb->wsize = volume_info.wsize; else - cifs_sb->wsize = CIFSMaxBufSize; /* default */ - if(cifs_sb->rsize < PAGE_CACHE_SIZE) { - cifs_sb->rsize = PAGE_CACHE_SIZE; - /* Windows ME does this */ - cFYI(1,("Attempt to set readsize for mount to less than one page (4096)")); + cifs_sb->wsize = + min_t(const int, PAGEVEC_SIZE * PAGE_CACHE_SIZE, + 127*1024); + /* old default of CIFSMaxBufSize was too small now + that SMB Write2 can send multiple pages in kvec. + RFC1001 does not describe what happens when frame + bigger than 128K is sent so use that as max in + conjunction with 52K kvec constraint on arch with 4K + page size */ + + if(cifs_sb->rsize < 2048) { + cifs_sb->rsize = 2048; + /* Windows ME may prefer this */ + cFYI(1,("readsize set to minimum 2048")); } cifs_sb->mnt_uid = volume_info.linux_uid; cifs_sb->mnt_gid = volume_info.linux_gid; diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 77c990f0cb9..675bd256829 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1190,7 +1190,6 @@ retry: /* BB what if continued retry is requested via mount flags? */ set_bit(AS_EIO, &mapping->flags); - SetPageError(page); } else { cifs_stats_bytes_written(cifs_sb->tcon, bytes_written); @@ -1198,6 +1197,13 @@ retry: } for (i = 0; i < n_iov; i++) { page = pvec.pages[first + i]; + /* Should we also set page error on + success rc but too little data written? */ + /* BB investigate retry logic on temporary + server crash cases and how recovery works + when page marked as error */ + if(rc) + SetPageError(page); kunmap(page); unlock_page(page); page_cache_release(page); @@ -1436,13 +1442,15 @@ ssize_t cifs_user_read(struct file *file, char __user *read_data, &bytes_read, &smb_read_data, &buf_type); pSMBr = (struct smb_com_read_rsp *)smb_read_data; - if (copy_to_user(current_offset, - smb_read_data + 4 /* RFC1001 hdr */ - + le16_to_cpu(pSMBr->DataOffset), - bytes_read)) { - rc = -EFAULT; - } if (smb_read_data) { + if (copy_to_user(current_offset, + smb_read_data + + 4 /* RFC1001 length field */ + + le16_to_cpu(pSMBr->DataOffset), + bytes_read)) { + rc = -EFAULT; + } + if(buf_type == CIFS_SMALL_BUFFER) cifs_small_buf_release(smb_read_data); else if(buf_type == CIFS_LARGE_BUFFER) diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 812c6bb0fe3..432ba15e2c2 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -475,7 +475,7 @@ checkSMB(struct smb_hdr *smb, __u16 mid, int length) return 0; } int -is_valid_oplock_break(struct smb_hdr *buf) +is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv) { struct smb_com_lock_req * pSMB = (struct smb_com_lock_req *)buf; struct list_head *tmp; @@ -535,7 +535,7 @@ is_valid_oplock_break(struct smb_hdr *buf) read_lock(&GlobalSMBSeslock); list_for_each(tmp, &GlobalTreeConnectionList) { tcon = list_entry(tmp, struct cifsTconInfo, cifsConnectionList); - if (tcon->tid == buf->Tid) { + if ((tcon->tid == buf->Tid) && (srv == tcon->ses->server)) { cifs_stats_inc(&tcon->num_oplock_brks); list_for_each(tmp1,&tcon->openFileList){ netfile = list_entry(tmp1,struct cifsFileInfo, diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 7b98792150e..b12cb8a7da7 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -498,7 +498,6 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses, else *pRespBufType = CIFS_SMALL_BUFFER; iov[0].iov_len = receive_len + 4; - iov[1].iov_len = 0; dump_smb(midQ->resp_buf, 80); /* convert the length into a more usable form */ diff --git a/fs/compat.c b/fs/compat.c index ff0bafcff72..5333c7d7427 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -73,17 +73,17 @@ asmlinkage long compat_sys_utime(char __user *filename, struct compat_utimbuf __ return do_utimes(AT_FDCWD, filename, t ? tv : NULL); } -asmlinkage long compat_sys_futimesat(int dfd, char __user *filename, struct compat_timeval __user *t) +asmlinkage long compat_sys_futimesat(unsigned int dfd, char __user *filename, struct compat_timeval __user *t) { struct timeval tv[2]; - if (t) { + if (t) { if (get_user(tv[0].tv_sec, &t[0].tv_sec) || get_user(tv[0].tv_usec, &t[0].tv_usec) || get_user(tv[1].tv_sec, &t[1].tv_sec) || get_user(tv[1].tv_usec, &t[1].tv_usec)) - return -EFAULT; - } + return -EFAULT; + } return do_utimes(dfd, filename, t ? tv : NULL); } @@ -114,7 +114,7 @@ asmlinkage long compat_sys_newlstat(char __user * filename, return error; } -asmlinkage long compat_sys_newfstatat(int dfd, char __user *filename, +asmlinkage long compat_sys_newfstatat(unsigned int dfd, char __user *filename, struct compat_stat __user *statbuf, int flag) { struct kstat stat; @@ -1326,7 +1326,7 @@ compat_sys_open(const char __user *filename, int flags, int mode) * O_LARGEFILE flag. */ asmlinkage long -compat_sys_openat(int dfd, const char __user *filename, int flags, int mode) +compat_sys_openat(unsigned int dfd, const char __user *filename, int flags, int mode) { return do_sys_open(dfd, filename, flags, mode); } @@ -1751,11 +1751,15 @@ asmlinkage long compat_sys_select(int n, compat_ulong_t __user *inp, ret = compat_core_sys_select(n, inp, outp, exp, &timeout); if (tvp) { + struct compat_timeval rtv; + if (current->personality & STICKY_TIMEOUTS) goto sticky; - tv.tv_usec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)); - tv.tv_sec = timeout; - if (copy_to_user(tvp, &tv, sizeof(tv))) { + rtv.tv_usec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)); + rtv.tv_sec = timeout; + if (compat_timeval_compare(&rtv, &tv) >= 0) + rtv = tv; + if (copy_to_user(tvp, &rtv, sizeof(rtv))) { sticky: /* * If an application puts its timeval in read-only @@ -1781,7 +1785,7 @@ asmlinkage long compat_sys_pselect7(int n, compat_ulong_t __user *inp, { compat_sigset_t ss32; sigset_t ksigmask, sigsaved; - long timeout = MAX_SCHEDULE_TIMEOUT; + s64 timeout = MAX_SCHEDULE_TIMEOUT; struct compat_timespec ts; int ret; @@ -1822,13 +1826,17 @@ asmlinkage long compat_sys_pselect7(int n, compat_ulong_t __user *inp, } while (!ret && !timeout && tsp && (ts.tv_sec || ts.tv_nsec)); if (tsp && !(current->personality & STICKY_TIMEOUTS)) { - ts.tv_sec += timeout / HZ; - ts.tv_nsec += (timeout % HZ) * (1000000000/HZ); - if (ts.tv_nsec >= 1000000000) { - ts.tv_sec++; - ts.tv_nsec -= 1000000000; + struct compat_timespec rts; + + rts.tv_sec = timeout / HZ; + rts.tv_nsec = (timeout % HZ) * (NSEC_PER_SEC/HZ); + if (rts.tv_nsec >= NSEC_PER_SEC) { + rts.tv_sec++; + rts.tv_nsec -= NSEC_PER_SEC; } - (void)copy_to_user(tsp, &ts, sizeof(ts)); + if (compat_timespec_compare(&rts, &ts) >= 0) + rts = ts; + copy_to_user(tsp, &rts, sizeof(rts)); } if (ret == -ERESTARTNOHAND) { @@ -1918,12 +1926,17 @@ asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds, sigprocmask(SIG_SETMASK, &sigsaved, NULL); if (tsp && timeout >= 0) { + struct compat_timespec rts; + if (current->personality & STICKY_TIMEOUTS) goto sticky; /* Yes, we know it's actually an s64, but it's also positive. */ - ts.tv_nsec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)) * 1000; - ts.tv_sec = timeout; - if (copy_to_user(tsp, &ts, sizeof(ts))) { + rts.tv_nsec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)) * + 1000; + rts.tv_sec = timeout; + if (compat_timespec_compare(&rts, &ts) >= 0) + rts = ts; + if (copy_to_user(tsp, &rts, sizeof(rts))) { sticky: /* * If an application puts its timeval in read-only diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 5dd0207ffd4..c666769a875 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -446,7 +446,7 @@ static int dev_ifconf(unsigned int fd, unsigned int cmd, unsigned long arg) ifr = ifc.ifc_req; ifr32 = compat_ptr(ifc32.ifcbuf); for (i = 0, j = 0; - i + sizeof (struct ifreq32) < ifc32.ifc_len && j < ifc.ifc_len; + i + sizeof (struct ifreq32) <= ifc32.ifc_len && j < ifc.ifc_len; i += sizeof (struct ifreq32), j += sizeof (struct ifreq)) { if (copy_in_user(ifr32, ifr, sizeof (struct ifreq32))) return -EFAULT; @@ -931,8 +931,8 @@ struct compat_sg_req_info { /* used by SG_GET_REQUEST_TABLE ioctl() */ static int sg_grt_trans(unsigned int fd, unsigned int cmd, unsigned long arg) { int err, i; - sg_req_info_t *r; - struct compat_sg_req_info *o = (struct compat_sg_req_info *)arg; + sg_req_info_t __user *r; + struct compat_sg_req_info __user *o = (void __user *)arg; r = compat_alloc_user_space(sizeof(sg_req_info_t)*SG_MAX_QUEUE); err = sys_ioctl(fd,cmd,(unsigned long)r); if (err < 0) @@ -2531,18 +2531,9 @@ static int rtc_ioctl(unsigned fd, unsigned cmd, unsigned long arg) val32 = kval; return put_user(val32, (unsigned int __user *)arg); case RTC_IRQP_SET32: + return sys_ioctl(fd, RTC_IRQP_SET, arg); case RTC_EPOCH_SET32: - ret = get_user(val32, (unsigned int __user *)arg); - if (ret) - return ret; - kval = val32; - - set_fs(KERNEL_DS); - ret = sys_ioctl(fd, (cmd == RTC_IRQP_SET32) ? - RTC_IRQP_SET : RTC_EPOCH_SET, - (unsigned long)&kval); - set_fs(oldfs); - return ret; + return sys_ioctl(fd, RTC_EPOCH_SET, arg); default: /* unreached */ return -ENOIOCTLCMD; @@ -2739,8 +2730,8 @@ static int do_ncp_setprivatedata(unsigned int fd, unsigned int cmd, unsigned lon static int lp_timeout_trans(unsigned int fd, unsigned int cmd, unsigned long arg) { - struct compat_timeval *tc = (struct compat_timeval *)arg; - struct timeval *tn = compat_alloc_user_space(sizeof(struct timeval)); + struct compat_timeval __user *tc = (struct compat_timeval __user *)arg; + struct timeval __user *tn = compat_alloc_user_space(sizeof(struct timeval)); struct timeval ts; if (get_user(ts.tv_sec, &tc->tv_sec) || get_user(ts.tv_usec, &tc->tv_usec) || diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h index 8899d9c5f6b..f70e46951b3 100644 --- a/fs/configfs/configfs_internal.h +++ b/fs/configfs/configfs_internal.h @@ -36,6 +36,7 @@ struct configfs_dirent { int s_type; umode_t s_mode; struct dentry * s_dentry; + struct iattr * s_iattr; }; #define CONFIGFS_ROOT 0x0001 @@ -48,10 +49,11 @@ struct configfs_dirent { #define CONFIGFS_NOT_PINNED (CONFIGFS_ITEM_ATTR) extern struct vfsmount * configfs_mount; +extern kmem_cache_t *configfs_dir_cachep; extern int configfs_is_root(struct config_item *item); -extern struct inode * configfs_new_inode(mode_t mode); +extern struct inode * configfs_new_inode(mode_t mode, struct configfs_dirent *); extern int configfs_create(struct dentry *, int mode, int (*init)(struct inode *)); extern int configfs_create_file(struct config_item *, const struct configfs_attribute *); @@ -63,6 +65,7 @@ extern void configfs_hash_and_remove(struct dentry * dir, const char * name); extern const unsigned char * configfs_get_name(struct configfs_dirent *sd); extern void configfs_drop_dentry(struct configfs_dirent *sd, struct dentry *parent); +extern int configfs_setattr(struct dentry *dentry, struct iattr *iattr); extern int configfs_pin_fs(void); extern void configfs_release_fs(void); @@ -120,8 +123,10 @@ static inline struct config_item *configfs_get_config_item(struct dentry *dentry static inline void release_configfs_dirent(struct configfs_dirent * sd) { - if (!(sd->s_type & CONFIGFS_ROOT)) - kfree(sd); + if (!(sd->s_type & CONFIGFS_ROOT)) { + kfree(sd->s_iattr); + kmem_cache_free(configfs_dir_cachep, sd); + } } static inline struct configfs_dirent * configfs_get(struct configfs_dirent * sd) diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index b668ec61527..ca60e3abef4 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -72,7 +72,7 @@ static struct configfs_dirent *configfs_new_dirent(struct configfs_dirent * pare { struct configfs_dirent * sd; - sd = kmalloc(sizeof(*sd), GFP_KERNEL); + sd = kmem_cache_alloc(configfs_dir_cachep, GFP_KERNEL); if (!sd) return NULL; @@ -136,13 +136,19 @@ static int create_dir(struct config_item * k, struct dentry * p, int error; umode_t mode = S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO; - error = configfs_create(d, mode, init_dir); + error = configfs_make_dirent(p->d_fsdata, d, k, mode, + CONFIGFS_DIR); if (!error) { - error = configfs_make_dirent(p->d_fsdata, d, k, mode, - CONFIGFS_DIR); + error = configfs_create(d, mode, init_dir); if (!error) { p->d_inode->i_nlink++; (d)->d_op = &configfs_dentry_ops; + } else { + struct configfs_dirent *sd = d->d_fsdata; + if (sd) { + list_del_init(&sd->s_sibling); + configfs_put(sd); + } } } return error; @@ -182,12 +188,19 @@ int configfs_create_link(struct configfs_symlink *sl, int err = 0; umode_t mode = S_IFLNK | S_IRWXUGO; - err = configfs_create(dentry, mode, init_symlink); + err = configfs_make_dirent(parent->d_fsdata, dentry, sl, mode, + CONFIGFS_ITEM_LINK); if (!err) { - err = configfs_make_dirent(parent->d_fsdata, dentry, sl, - mode, CONFIGFS_ITEM_LINK); + err = configfs_create(dentry, mode, init_symlink); if (!err) dentry->d_op = &configfs_dentry_ops; + else { + struct configfs_dirent *sd = dentry->d_fsdata; + if (sd) { + list_del_init(&sd->s_sibling); + configfs_put(sd); + } + } } return err; } @@ -241,13 +254,15 @@ static int configfs_attach_attr(struct configfs_dirent * sd, struct dentry * den struct configfs_attribute * attr = sd->s_element; int error; + dentry->d_fsdata = configfs_get(sd); + sd->s_dentry = dentry; error = configfs_create(dentry, (attr->ca_mode & S_IALLUGO) | S_IFREG, init_file); - if (error) + if (error) { + configfs_put(sd); return error; + } dentry->d_op = &configfs_dentry_ops; - dentry->d_fsdata = configfs_get(sd); - sd->s_dentry = dentry; d_rehash(dentry); return 0; @@ -839,6 +854,7 @@ struct inode_operations configfs_dir_inode_operations = { .symlink = configfs_symlink, .unlink = configfs_unlink, .lookup = configfs_lookup, + .setattr = configfs_setattr, }; #if 0 diff --git a/fs/configfs/file.c b/fs/configfs/file.c index c26cd61f13a..3921920d871 100644 --- a/fs/configfs/file.c +++ b/fs/configfs/file.c @@ -26,7 +26,6 @@ #include <linux/fs.h> #include <linux/module.h> -#include <linux/dnotify.h> #include <linux/slab.h> #include <asm/uaccess.h> #include <asm/semaphore.h> @@ -150,7 +149,7 @@ out: /** * fill_write_buffer - copy buffer from userspace. * @buffer: data buffer for file. - * @userbuf: data from user. + * @buf: data from user. * @count: number of bytes in @userbuf. * * Allocate @buffer->page if it hasn't been already, then @@ -177,8 +176,9 @@ fill_write_buffer(struct configfs_buffer * buffer, const char __user * buf, size /** * flush_write_buffer - push buffer to config_item. - * @file: file pointer. + * @dentry: dentry to the attribute * @buffer: data buffer for file. + * @count: number of bytes * * Get the correct pointers for the config_item and the attribute we're * dealing with, then call the store() method for the attribute, @@ -217,15 +217,16 @@ static ssize_t configfs_write_file(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct configfs_buffer * buffer = file->private_data; + ssize_t len; down(&buffer->sem); - count = fill_write_buffer(buffer,buf,count); - if (count > 0) - count = flush_write_buffer(file->f_dentry,buffer,count); - if (count > 0) - *ppos += count; + len = fill_write_buffer(buffer, buf, count); + if (len > 0) + len = flush_write_buffer(file->f_dentry, buffer, count); + if (len > 0) + *ppos += len; up(&buffer->sem); - return count; + return len; } static int check_perm(struct inode * inode, struct file * file) diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index 6577c588de9..c153bd9534c 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -31,6 +31,7 @@ #include <linux/pagemap.h> #include <linux/namei.h> #include <linux/backing-dev.h> +#include <linux/capability.h> #include <linux/configfs.h> #include "configfs_internal.h" @@ -48,18 +49,107 @@ static struct backing_dev_info configfs_backing_dev_info = { .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK, }; -struct inode * configfs_new_inode(mode_t mode) +static struct inode_operations configfs_inode_operations ={ + .setattr = configfs_setattr, +}; + +int configfs_setattr(struct dentry * dentry, struct iattr * iattr) +{ + struct inode * inode = dentry->d_inode; + struct configfs_dirent * sd = dentry->d_fsdata; + struct iattr * sd_iattr; + unsigned int ia_valid = iattr->ia_valid; + int error; + + if (!sd) + return -EINVAL; + + sd_iattr = sd->s_iattr; + + error = inode_change_ok(inode, iattr); + if (error) + return error; + + error = inode_setattr(inode, iattr); + if (error) + return error; + + if (!sd_iattr) { + /* setting attributes for the first time, allocate now */ + sd_iattr = kmalloc(sizeof(struct iattr), GFP_KERNEL); + if (!sd_iattr) + return -ENOMEM; + /* assign default attributes */ + memset(sd_iattr, 0, sizeof(struct iattr)); + sd_iattr->ia_mode = sd->s_mode; + sd_iattr->ia_uid = 0; + sd_iattr->ia_gid = 0; + sd_iattr->ia_atime = sd_iattr->ia_mtime = sd_iattr->ia_ctime = CURRENT_TIME; + sd->s_iattr = sd_iattr; + } + + /* attributes were changed atleast once in past */ + + if (ia_valid & ATTR_UID) + sd_iattr->ia_uid = iattr->ia_uid; + if (ia_valid & ATTR_GID) + sd_iattr->ia_gid = iattr->ia_gid; + if (ia_valid & ATTR_ATIME) + sd_iattr->ia_atime = timespec_trunc(iattr->ia_atime, + inode->i_sb->s_time_gran); + if (ia_valid & ATTR_MTIME) + sd_iattr->ia_mtime = timespec_trunc(iattr->ia_mtime, + inode->i_sb->s_time_gran); + if (ia_valid & ATTR_CTIME) + sd_iattr->ia_ctime = timespec_trunc(iattr->ia_ctime, + inode->i_sb->s_time_gran); + if (ia_valid & ATTR_MODE) { + umode_t mode = iattr->ia_mode; + + if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) + mode &= ~S_ISGID; + sd_iattr->ia_mode = sd->s_mode = mode; + } + + return error; +} + +static inline void set_default_inode_attr(struct inode * inode, mode_t mode) +{ + inode->i_mode = mode; + inode->i_uid = 0; + inode->i_gid = 0; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; +} + +static inline void set_inode_attr(struct inode * inode, struct iattr * iattr) +{ + inode->i_mode = iattr->ia_mode; + inode->i_uid = iattr->ia_uid; + inode->i_gid = iattr->ia_gid; + inode->i_atime = iattr->ia_atime; + inode->i_mtime = iattr->ia_mtime; + inode->i_ctime = iattr->ia_ctime; +} + +struct inode * configfs_new_inode(mode_t mode, struct configfs_dirent * sd) { struct inode * inode = new_inode(configfs_sb); if (inode) { - inode->i_mode = mode; - inode->i_uid = 0; - inode->i_gid = 0; inode->i_blksize = PAGE_CACHE_SIZE; inode->i_blocks = 0; - inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; inode->i_mapping->a_ops = &configfs_aops; inode->i_mapping->backing_dev_info = &configfs_backing_dev_info; + inode->i_op = &configfs_inode_operations; + + if (sd->s_iattr) { + /* sysfs_dirent has non-default attributes + * get them for the new inode from persistent copy + * in sysfs_dirent + */ + set_inode_attr(inode, sd->s_iattr); + } else + set_default_inode_attr(inode, mode); } return inode; } @@ -70,7 +160,8 @@ int configfs_create(struct dentry * dentry, int mode, int (*init)(struct inode * struct inode * inode = NULL; if (dentry) { if (!dentry->d_inode) { - if ((inode = configfs_new_inode(mode))) { + struct configfs_dirent *sd = dentry->d_fsdata; + if ((inode = configfs_new_inode(mode, sd))) { if (dentry->d_parent && dentry->d_parent->d_inode) { struct inode *p_inode = dentry->d_parent->d_inode; p_inode->i_mtime = p_inode->i_ctime = CURRENT_TIME; @@ -103,10 +194,9 @@ int configfs_create(struct dentry * dentry, int mode, int (*init)(struct inode * */ const unsigned char * configfs_get_name(struct configfs_dirent *sd) { - struct attribute * attr; + struct configfs_attribute *attr; - if (!sd || !sd->s_element) - BUG(); + BUG_ON(!sd || !sd->s_element); /* These always have a dentry, so use that */ if (sd->s_type & (CONFIGFS_DIR | CONFIGFS_ITEM_LINK)) @@ -114,7 +204,7 @@ const unsigned char * configfs_get_name(struct configfs_dirent *sd) if (sd->s_type & CONFIGFS_ITEM_ATTR) { attr = sd->s_element; - return attr->name; + return attr->ca_name; } return NULL; } @@ -130,13 +220,17 @@ void configfs_drop_dentry(struct configfs_dirent * sd, struct dentry * parent) if (dentry) { spin_lock(&dcache_lock); + spin_lock(&dentry->d_lock); if (!(d_unhashed(dentry) && dentry->d_inode)) { dget_locked(dentry); __d_drop(dentry); + spin_unlock(&dentry->d_lock); spin_unlock(&dcache_lock); simple_unlink(parent->d_inode, dentry); - } else + } else { + spin_unlock(&dentry->d_lock); spin_unlock(&dcache_lock); + } } } @@ -145,6 +239,10 @@ void configfs_hash_and_remove(struct dentry * dir, const char * name) struct configfs_dirent * sd; struct configfs_dirent * parent_sd = dir->d_fsdata; + if (dir->d_inode == NULL) + /* no inode means this hasn't been made visible yet */ + return; + mutex_lock(&dir->d_inode->i_mutex); list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { if (!sd->s_element) diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c index 1a2f6f6a4d9..f920d30478e 100644 --- a/fs/configfs/mount.c +++ b/fs/configfs/mount.c @@ -38,6 +38,7 @@ struct vfsmount * configfs_mount = NULL; struct super_block * configfs_sb = NULL; +kmem_cache_t *configfs_dir_cachep; static int configfs_mnt_count = 0; static struct super_operations configfs_ops = { @@ -62,6 +63,7 @@ static struct configfs_dirent configfs_root = { .s_children = LIST_HEAD_INIT(configfs_root.s_children), .s_element = &configfs_root_group.cg_item, .s_type = CONFIGFS_ROOT, + .s_iattr = NULL, }; static int configfs_fill_super(struct super_block *sb, void *data, int silent) @@ -73,9 +75,11 @@ static int configfs_fill_super(struct super_block *sb, void *data, int silent) sb->s_blocksize_bits = PAGE_CACHE_SHIFT; sb->s_magic = CONFIGFS_MAGIC; sb->s_op = &configfs_ops; + sb->s_time_gran = 1; configfs_sb = sb; - inode = configfs_new_inode(S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO); + inode = configfs_new_inode(S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO, + &configfs_root); if (inode) { inode->i_op = &configfs_dir_inode_operations; inode->i_fop = &configfs_dir_operations; @@ -128,19 +132,31 @@ static decl_subsys(config, NULL, NULL); static int __init configfs_init(void) { - int err; + int err = -ENOMEM; + + configfs_dir_cachep = kmem_cache_create("configfs_dir_cache", + sizeof(struct configfs_dirent), + 0, 0, NULL, NULL); + if (!configfs_dir_cachep) + goto out; kset_set_kset_s(&config_subsys, kernel_subsys); err = subsystem_register(&config_subsys); - if (err) - return err; + if (err) { + kmem_cache_destroy(configfs_dir_cachep); + configfs_dir_cachep = NULL; + goto out; + } err = register_filesystem(&configfs_fs_type); if (err) { printk(KERN_ERR "configfs: Unable to register filesystem!\n"); subsystem_unregister(&config_subsys); + kmem_cache_destroy(configfs_dir_cachep); + configfs_dir_cachep = NULL; } +out: return err; } @@ -148,11 +164,13 @@ static void __exit configfs_exit(void) { unregister_filesystem(&configfs_fs_type); subsystem_unregister(&config_subsys); + kmem_cache_destroy(configfs_dir_cachep); + configfs_dir_cachep = NULL; } MODULE_AUTHOR("Oracle"); MODULE_LICENSE("GPL"); -MODULE_VERSION("0.0.1"); +MODULE_VERSION("0.0.2"); MODULE_DESCRIPTION("Simple RAM filesystem for user driven kernel subsystem configuration."); module_init(configfs_init); diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c index 50f5840521a..e5512e295cf 100644 --- a/fs/configfs/symlink.c +++ b/fs/configfs/symlink.c @@ -162,8 +162,7 @@ int configfs_unlink(struct inode *dir, struct dentry *dentry) if (!(sd->s_type & CONFIGFS_ITEM_LINK)) goto out; - if (dentry->d_parent == configfs_sb->s_root) - BUG(); + BUG_ON(dentry->d_parent == configfs_sb->s_root); sl = sd->s_element; @@ -277,5 +276,6 @@ struct inode_operations configfs_symlink_inode_operations = { .follow_link = configfs_follow_link, .readlink = generic_readlink, .put_link = configfs_put_link, + .setattr = configfs_setattr, }; diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index 7fe85415ae7..8ad52f5bf25 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -36,7 +36,7 @@ static DECLARE_MUTEX(read_mutex); /* These two macros may change in future, to provide better st_ino semantics. */ -#define CRAMINO(x) ((x)->offset?(x)->offset<<2:1) +#define CRAMINO(x) (((x)->offset && (x)->size)?(x)->offset<<2:1) #define OFFSET(x) ((x)->i_ino) @@ -66,8 +66,36 @@ static int cramfs_iget5_test(struct inode *inode, void *opaque) static int cramfs_iget5_set(struct inode *inode, void *opaque) { + static struct timespec zerotime; struct cramfs_inode *cramfs_inode = opaque; + inode->i_mode = cramfs_inode->mode; + inode->i_uid = cramfs_inode->uid; + inode->i_size = cramfs_inode->size; + inode->i_blocks = (cramfs_inode->size - 1) / 512 + 1; + inode->i_blksize = PAGE_CACHE_SIZE; + inode->i_gid = cramfs_inode->gid; + /* Struct copy intentional */ + inode->i_mtime = inode->i_atime = inode->i_ctime = zerotime; inode->i_ino = CRAMINO(cramfs_inode); + /* inode->i_nlink is left 1 - arguably wrong for directories, + but it's the best we can do without reading the directory + contents. 1 yields the right result in GNU find, even + without -noleaf option. */ + if (S_ISREG(inode->i_mode)) { + inode->i_fop = &generic_ro_fops; + inode->i_data.a_ops = &cramfs_aops; + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &cramfs_dir_inode_operations; + inode->i_fop = &cramfs_directory_operations; + } else if (S_ISLNK(inode->i_mode)) { + inode->i_op = &page_symlink_inode_operations; + inode->i_data.a_ops = &cramfs_aops; + } else { + inode->i_size = 0; + inode->i_blocks = 0; + init_special_inode(inode, inode->i_mode, + old_decode_dev(cramfs_inode->size)); + } return 0; } @@ -77,37 +105,7 @@ static struct inode *get_cramfs_inode(struct super_block *sb, struct inode *inode = iget5_locked(sb, CRAMINO(cramfs_inode), cramfs_iget5_test, cramfs_iget5_set, cramfs_inode); - static struct timespec zerotime; - if (inode && (inode->i_state & I_NEW)) { - inode->i_mode = cramfs_inode->mode; - inode->i_uid = cramfs_inode->uid; - inode->i_size = cramfs_inode->size; - inode->i_blocks = (cramfs_inode->size - 1) / 512 + 1; - inode->i_blksize = PAGE_CACHE_SIZE; - inode->i_gid = cramfs_inode->gid; - /* Struct copy intentional */ - inode->i_mtime = inode->i_atime = inode->i_ctime = zerotime; - inode->i_ino = CRAMINO(cramfs_inode); - /* inode->i_nlink is left 1 - arguably wrong for directories, - but it's the best we can do without reading the directory - contents. 1 yields the right result in GNU find, even - without -noleaf option. */ - if (S_ISREG(inode->i_mode)) { - inode->i_fop = &generic_ro_fops; - inode->i_data.a_ops = &cramfs_aops; - } else if (S_ISDIR(inode->i_mode)) { - inode->i_op = &cramfs_dir_inode_operations; - inode->i_fop = &cramfs_directory_operations; - } else if (S_ISLNK(inode->i_mode)) { - inode->i_op = &page_symlink_inode_operations; - inode->i_data.a_ops = &cramfs_aops; - } else { - inode->i_size = 0; - inode->i_blocks = 0; - init_special_inode(inode, inode->i_mode, - old_decode_dev(cramfs_inode->size)); - } unlock_new_inode(inode); } return inode; diff --git a/fs/dcache.c b/fs/dcache.c index 86bdb93789c..11dc83092d4 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -743,7 +743,9 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name) dentry->d_op = NULL; dentry->d_fsdata = NULL; dentry->d_mounted = 0; +#ifdef CONFIG_PROFILING dentry->d_cookie = NULL; +#endif INIT_HLIST_NODE(&dentry->d_hash); INIT_LIST_HEAD(&dentry->d_lru); INIT_LIST_HEAD(&dentry->d_subdirs); @@ -1734,7 +1736,7 @@ void __init vfs_caches_init(unsigned long mempages) SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, filp_ctor, filp_dtor); + SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); dcache_init(mempages); inode_init(mempages); diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index efc97d9b786..d575452cd9f 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -56,7 +56,7 @@ static u64 debugfs_u8_get(void *data) DEFINE_SIMPLE_ATTRIBUTE(fops_u8, debugfs_u8_get, debugfs_u8_set, "%llu\n"); /** - * debugfs_create_u8 - create a file in the debugfs filesystem that is used to read and write a unsigned 8 bit value. + * debugfs_create_u8 - create a file in the debugfs filesystem that is used to read and write an unsigned 8 bit value. * * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have @@ -98,7 +98,7 @@ static u64 debugfs_u16_get(void *data) DEFINE_SIMPLE_ATTRIBUTE(fops_u16, debugfs_u16_get, debugfs_u16_set, "%llu\n"); /** - * debugfs_create_u16 - create a file in the debugfs filesystem that is used to read and write a unsigned 8 bit value. + * debugfs_create_u16 - create a file in the debugfs filesystem that is used to read and write an unsigned 16 bit value. * * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have @@ -140,7 +140,7 @@ static u64 debugfs_u32_get(void *data) DEFINE_SIMPLE_ATTRIBUTE(fops_u32, debugfs_u32_get, debugfs_u32_set, "%llu\n"); /** - * debugfs_create_u32 - create a file in the debugfs filesystem that is used to read and write a unsigned 8 bit value. + * debugfs_create_u32 - create a file in the debugfs filesystem that is used to read and write an unsigned 32 bit value. * * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have diff --git a/fs/direct-io.c b/fs/direct-io.c index 30dbbd1df51..27f3e787fac 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -857,6 +857,7 @@ do_holes: /* Handle holes */ if (!buffer_mapped(map_bh)) { char *kaddr; + loff_t i_size_aligned; /* AKPM: eargh, -ENOTBLK is a hack */ if (dio->rw == WRITE) { @@ -864,8 +865,14 @@ do_holes: return -ENOTBLK; } + /* + * Be sure to account for a partial block as the + * last block in the file + */ + i_size_aligned = ALIGN(i_size_read(dio->inode), + 1 << blkbits); if (dio->block_in_file >= - i_size_read(dio->inode)>>blkbits) { + i_size_aligned >> blkbits) { /* We hit eof */ page_cache_release(page); goto out; @@ -1148,15 +1155,16 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, * For writes, i_mutex is not held on entry; it is never taken. * * DIO_LOCKING (simple locking for regular files) - * For writes we are called under i_mutex and return with i_mutex held, even though - * it is internally dropped. + * For writes we are called under i_mutex and return with i_mutex held, even + * though it is internally dropped. * For reads, i_mutex is not held on entry, but it is taken and dropped before * returning. * * DIO_OWN_LOCKING (filesystem provides synchronisation and handling of * uninitialised data, allowing parallel direct readers and writers) * For writes we are called without i_mutex, return without it, never touch it. - * For reads, i_mutex is held on entry and will be released before returning. + * For reads we are called under i_mutex and return with i_mutex held, even + * though it may be internally dropped. * * Additional i_alloc_sem locking requirements described inline below. */ @@ -1175,7 +1183,8 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, ssize_t retval = -EINVAL; loff_t end = offset; struct dio *dio; - int reader_with_isem = (rw == READ && dio_lock_type == DIO_OWN_LOCKING); + int release_i_mutex = 0; + int acquire_i_mutex = 0; if (rw & WRITE) current->flags |= PF_SYNCWRITE; @@ -1218,7 +1227,6 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, * writers need to grab i_alloc_sem only (i_mutex is already held) * For regular files using DIO_OWN_LOCKING, * neither readers nor writers take any locks here - * (i_mutex is already held and release for writers here) */ dio->lock_type = dio_lock_type; if (dio_lock_type != DIO_NO_LOCKING) { @@ -1229,7 +1237,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, mapping = iocb->ki_filp->f_mapping; if (dio_lock_type != DIO_OWN_LOCKING) { mutex_lock(&inode->i_mutex); - reader_with_isem = 1; + release_i_mutex = 1; } retval = filemap_write_and_wait_range(mapping, offset, @@ -1241,7 +1249,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, if (dio_lock_type == DIO_OWN_LOCKING) { mutex_unlock(&inode->i_mutex); - reader_with_isem = 0; + acquire_i_mutex = 1; } } @@ -1262,11 +1270,13 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, nr_segs, blkbits, get_blocks, end_io, dio); if (rw == READ && dio_lock_type == DIO_LOCKING) - reader_with_isem = 0; + release_i_mutex = 0; out: - if (reader_with_isem) + if (release_i_mutex) mutex_unlock(&inode->i_mutex); + else if (acquire_i_mutex) + mutex_lock(&inode->i_mutex); if (rw & WRITE) current->flags &= ~PF_SYNCWRITE; return retval; diff --git a/fs/exec.c b/fs/exec.c index 055378d2513..0b515ac5313 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -885,6 +885,12 @@ int flush_old_exec(struct linux_binprm * bprm) current->flags &= ~PF_RANDOMIZE; flush_thread(); + /* Set the new mm task size. We have to do that late because it may + * depend on TIF_32BIT which is only updated in flush_thread() on + * some architectures like powerpc + */ + current->mm->task_size = TASK_SIZE; + if (bprm->e_uid != current->euid || bprm->e_gid != current->egid || file_permission(bprm->file, MAY_READ) || (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP)) { @@ -1403,7 +1409,7 @@ static void zap_threads (struct mm_struct *mm) do_each_thread(g,p) { if (mm == p->mm && p != tsk && p->ptrace && p->parent->mm == mm) { - __ptrace_unlink(p); + __ptrace_detach(p, 0); } } while_each_thread(g,p); write_unlock_irq(&tasklist_lock); diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index 35acc43b897..da52b4a5db6 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c @@ -220,7 +220,7 @@ ext2_set_acl(struct inode *inode, int type, struct posix_acl *acl) struct ext2_inode_info *ei = EXT2_I(inode); int name_index; void *value = NULL; - size_t size; + size_t size = 0; int error; if (S_ISLNK(inode->i_mode)) diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index 7442bdd1267..b3dbd716cd3 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c @@ -256,11 +256,10 @@ ext2_readdir (struct file * filp, void * dirent, filldir_t filldir) unsigned long npages = dir_pages(inode); unsigned chunk_mask = ~(ext2_chunk_size(inode)-1); unsigned char *types = NULL; - int need_revalidate = (filp->f_version != inode->i_version); - int ret; + int need_revalidate = filp->f_version != inode->i_version; if (pos > inode->i_size - EXT2_DIR_REC_LEN(1)) - goto success; + return 0; if (EXT2_HAS_INCOMPAT_FEATURE(sb, EXT2_FEATURE_INCOMPAT_FILETYPE)) types = ext2_filetype_table; @@ -275,12 +274,15 @@ ext2_readdir (struct file * filp, void * dirent, filldir_t filldir) "bad page in #%lu", inode->i_ino); filp->f_pos += PAGE_CACHE_SIZE - offset; - ret = -EIO; - goto done; + return -EIO; } kaddr = page_address(page); - if (need_revalidate) { - offset = ext2_validate_entry(kaddr, offset, chunk_mask); + if (unlikely(need_revalidate)) { + if (offset) { + offset = ext2_validate_entry(kaddr, offset, chunk_mask); + filp->f_pos = (n<<PAGE_CACHE_SHIFT) + offset; + } + filp->f_version = inode->i_version; need_revalidate = 0; } de = (ext2_dirent *)(kaddr+offset); @@ -289,9 +291,8 @@ ext2_readdir (struct file * filp, void * dirent, filldir_t filldir) if (de->rec_len == 0) { ext2_error(sb, __FUNCTION__, "zero-length directory entry"); - ret = -EIO; ext2_put_page(page); - goto done; + return -EIO; } if (de->inode) { int over; @@ -306,19 +307,14 @@ ext2_readdir (struct file * filp, void * dirent, filldir_t filldir) le32_to_cpu(de->inode), d_type); if (over) { ext2_put_page(page); - goto success; + return 0; } } filp->f_pos += le16_to_cpu(de->rec_len); } ext2_put_page(page); } - -success: - ret = 0; -done: - filp->f_version = inode->i_version; - return ret; + return 0; } /* diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index 74714af4ae6..e52765219e1 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -605,7 +605,7 @@ got: insert_inode_hash(inode); if (DQUOT_ALLOC_INODE(inode)) { - err = -ENOSPC; + err = -EDQUOT; goto fail_drop; } diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index e7d3f0522d0..a717837f272 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -706,6 +706,7 @@ struct address_space_operations ext2_aops = { .bmap = ext2_bmap, .direct_IO = ext2_direct_IO, .writepages = ext2_writepages, + .migratepage = buffer_migrate_page, }; struct address_space_operations ext2_aops_xip = { @@ -723,6 +724,7 @@ struct address_space_operations ext2_nobh_aops = { .bmap = ext2_bmap, .direct_IO = ext2_direct_IO, .writepages = ext2_writepages, + .migratepage = buffer_migrate_page, }; /* diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 8d6819846fc..cb6f9bd658d 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -221,6 +221,11 @@ static int ext2_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_puts(seq, ",grpquota"); #endif +#if defined(CONFIG_EXT2_FS_XIP) + if (sbi->s_mount_opt & EXT2_MOUNT_XIP) + seq_puts(seq, ",xip"); +#endif + return 0; } diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c index a2ca3107d47..86ae8e93adb 100644 --- a/fs/ext2/xattr.c +++ b/fs/ext2/xattr.c @@ -792,18 +792,20 @@ ext2_xattr_delete_inode(struct inode *inode) ext2_free_blocks(inode, EXT2_I(inode)->i_file_acl, 1); get_bh(bh); bforget(bh); + unlock_buffer(bh); } else { HDR(bh)->h_refcount = cpu_to_le32( le32_to_cpu(HDR(bh)->h_refcount) - 1); if (ce) mb_cache_entry_release(ce); + ea_bdebug(bh, "refcount now=%d", + le32_to_cpu(HDR(bh)->h_refcount)); + unlock_buffer(bh); mark_buffer_dirty(bh); if (IS_SYNC(inode)) sync_dirty_buffer(bh); DQUOT_FREE_BLOCK(inode, 1); } - ea_bdebug(bh, "refcount now=%d", le32_to_cpu(HDR(bh)->h_refcount) - 1); - unlock_buffer(bh); EXT2_I(inode)->i_file_acl = 0; cleanup: diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c index 47a9da2dfb4..0d21d558b87 100644 --- a/fs/ext3/acl.c +++ b/fs/ext3/acl.c @@ -226,7 +226,7 @@ ext3_set_acl(handle_t *handle, struct inode *inode, int type, struct ext3_inode_info *ei = EXT3_I(inode); int name_index; void *value = NULL; - size_t size; + size_t size = 0; int error; if (S_ISLNK(inode->i_mode)) diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 8824e84f8a5..0384e539b88 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -1559,6 +1559,7 @@ static struct address_space_operations ext3_ordered_aops = { .invalidatepage = ext3_invalidatepage, .releasepage = ext3_releasepage, .direct_IO = ext3_direct_IO, + .migratepage = buffer_migrate_page, }; static struct address_space_operations ext3_writeback_aops = { @@ -1572,6 +1573,7 @@ static struct address_space_operations ext3_writeback_aops = { .invalidatepage = ext3_invalidatepage, .releasepage = ext3_releasepage, .direct_IO = ext3_direct_IO, + .migratepage = buffer_migrate_page, }; static struct address_space_operations ext3_journalled_aops = { @@ -1622,15 +1624,14 @@ static int ext3_block_truncate_page(handle_t *handle, struct page *page, * For "nobh" option, we can only work if we don't need to * read-in the page - otherwise we create buffers to do the IO. */ - if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH)) { - if (PageUptodate(page)) { - kaddr = kmap_atomic(page, KM_USER0); - memset(kaddr + offset, 0, length); - flush_dcache_page(page); - kunmap_atomic(kaddr, KM_USER0); - set_page_dirty(page); - goto unlock; - } + if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH) && + ext3_should_writeback_data(inode) && PageUptodate(page)) { + kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr + offset, 0, length); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); + set_page_dirty(page); + goto unlock; } if (!page_has_buffers(page)) diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index 8bd8ac07770..b8f5cd1e540 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -2141,7 +2141,8 @@ retry: * We have a transaction open. All is sweetness. It also sets * i_size in generic_commit_write(). */ - err = page_symlink(inode, symname, l); + err = __page_symlink(inode, symname, l, + mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS); if (err) { ext3_dec_count(handle, inode); ext3_mark_inode_dirty(handle, inode); diff --git a/fs/fat/file.c b/fs/fat/file.c index e99c5a73b39..88aa1ae13f9 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -210,10 +210,30 @@ static int fat_free(struct inode *inode, int skip) if (MSDOS_I(inode)->i_start == 0) return 0; - /* - * Write a new EOF, and get the remaining cluster chain for freeing. - */ + fat_cache_inval_inode(inode); + wait = IS_DIRSYNC(inode); + i_start = free_start = MSDOS_I(inode)->i_start; + i_logstart = MSDOS_I(inode)->i_logstart; + + /* First, we write the new file size. */ + if (!skip) { + MSDOS_I(inode)->i_start = 0; + MSDOS_I(inode)->i_logstart = 0; + } + MSDOS_I(inode)->i_attrs |= ATTR_ARCH; + inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC; + if (wait) { + err = fat_sync_inode(inode); + if (err) { + MSDOS_I(inode)->i_start = i_start; + MSDOS_I(inode)->i_logstart = i_logstart; + return err; + } + } else + mark_inode_dirty(inode); + + /* Write a new EOF, and get the remaining cluster chain for freeing. */ if (skip) { struct fat_entry fatent; int ret, fclus, dclus; @@ -244,35 +264,11 @@ static int fat_free(struct inode *inode, int skip) return ret; free_start = ret; - i_start = i_logstart = 0; - fat_cache_inval_inode(inode); - } else { - fat_cache_inval_inode(inode); - - i_start = free_start = MSDOS_I(inode)->i_start; - i_logstart = MSDOS_I(inode)->i_logstart; - MSDOS_I(inode)->i_start = 0; - MSDOS_I(inode)->i_logstart = 0; } - MSDOS_I(inode)->i_attrs |= ATTR_ARCH; - inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC; - if (wait) { - err = fat_sync_inode(inode); - if (err) - goto error; - } else - mark_inode_dirty(inode); inode->i_blocks = skip << (MSDOS_SB(sb)->cluster_bits - 9); /* Freeing the remained cluster chain */ return fat_free_clusters(inode, free_start); - -error: - if (i_start) { - MSDOS_I(inode)->i_start = i_start; - MSDOS_I(inode)->i_logstart = i_logstart; - } - return err; } void fat_truncate(struct inode *inode) diff --git a/fs/fat/misc.c b/fs/fat/misc.c index 32fb0a3f1da..944652e9dde 100644 --- a/fs/fat/misc.c +++ b/fs/fat/misc.c @@ -196,19 +196,9 @@ EXPORT_SYMBOL_GPL(fat_date_unix2dos); int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs) { - int i, e, err = 0; + int i, err = 0; - for (i = 0; i < nr_bhs; i++) { - lock_buffer(bhs[i]); - if (test_clear_buffer_dirty(bhs[i])) { - get_bh(bhs[i]); - bhs[i]->b_end_io = end_buffer_write_sync; - e = submit_bh(WRITE, bhs[i]); - if (!err && e) - err = e; - } else - unlock_buffer(bhs[i]); - } + ll_rw_block(SWRITE, nr_bhs, bhs); for (i = 0; i < nr_bhs; i++) { wait_on_buffer(bhs[i]); if (buffer_eopnotsupp(bhs[i])) { diff --git a/fs/fcntl.c b/fs/fcntl.c index 5f96786d1c7..dc4a7007f4e 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -208,8 +208,11 @@ static int setfl(int fd, struct file * filp, unsigned long arg) struct inode * inode = filp->f_dentry->d_inode; int error = 0; - /* O_APPEND cannot be cleared if the file is marked as append-only */ - if (!(arg & O_APPEND) && IS_APPEND(inode)) + /* + * O_APPEND cannot be cleared if the file is marked as append-only + * and the file is open for write. + */ + if (((arg ^ filp->f_flags) & O_APPEND) && IS_APPEND(inode)) return -EPERM; /* O_NOATIME can only be set by the owner or superuser */ diff --git a/fs/fifo.c b/fs/fifo.c index 923371b753a..d13fcd3ec80 100644 --- a/fs/fifo.c +++ b/fs/fifo.c @@ -34,10 +34,7 @@ static int fifo_open(struct inode *inode, struct file *filp) { int ret; - ret = -ERESTARTSYS; - if (mutex_lock_interruptible(PIPE_MUTEX(*inode))) - goto err_nolock_nocleanup; - + mutex_lock(PIPE_MUTEX(*inode)); if (!inode->i_pipe) { ret = -ENOMEM; if(!pipe_new(inode)) @@ -140,8 +137,6 @@ err: err_nocleanup: mutex_unlock(PIPE_MUTEX(*inode)); - -err_nolock_nocleanup: return ret; } diff --git a/fs/file.c b/fs/file.c index fd066b261c7..cea7cbea11d 100644 --- a/fs/file.c +++ b/fs/file.c @@ -379,7 +379,6 @@ static void __devinit fdtable_defer_list_init(int cpu) void __init files_defer_init(void) { int i; - /* Really early - can't use for_each_cpu */ - for (i = 0; i < NR_CPUS; i++) + for_each_cpu(i) fdtable_defer_list_init(i); } diff --git a/fs/file_table.c b/fs/file_table.c index 768b5816754..44fabeaa941 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -5,6 +5,7 @@ * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu) */ +#include <linux/config.h> #include <linux/string.h> #include <linux/slab.h> #include <linux/file.h> @@ -19,52 +20,67 @@ #include <linux/capability.h> #include <linux/cdev.h> #include <linux/fsnotify.h> +#include <linux/sysctl.h> +#include <linux/percpu_counter.h> + +#include <asm/atomic.h> /* sysctl tunables... */ struct files_stat_struct files_stat = { .max_files = NR_FILE }; -EXPORT_SYMBOL(files_stat); /* Needed by unix.o */ - /* public. Not pretty! */ - __cacheline_aligned_in_smp DEFINE_SPINLOCK(files_lock); +__cacheline_aligned_in_smp DEFINE_SPINLOCK(files_lock); -static DEFINE_SPINLOCK(filp_count_lock); +static struct percpu_counter nr_files __cacheline_aligned_in_smp; -/* slab constructors and destructors are called from arbitrary - * context and must be fully threaded - use a local spinlock - * to protect files_stat.nr_files - */ -void filp_ctor(void *objp, struct kmem_cache *cachep, unsigned long cflags) +static inline void file_free_rcu(struct rcu_head *head) { - if ((cflags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == - SLAB_CTOR_CONSTRUCTOR) { - unsigned long flags; - spin_lock_irqsave(&filp_count_lock, flags); - files_stat.nr_files++; - spin_unlock_irqrestore(&filp_count_lock, flags); - } + struct file *f = container_of(head, struct file, f_u.fu_rcuhead); + kmem_cache_free(filp_cachep, f); } -void filp_dtor(void *objp, struct kmem_cache *cachep, unsigned long dflags) +static inline void file_free(struct file *f) { - unsigned long flags; - spin_lock_irqsave(&filp_count_lock, flags); - files_stat.nr_files--; - spin_unlock_irqrestore(&filp_count_lock, flags); + percpu_counter_dec(&nr_files); + call_rcu(&f->f_u.fu_rcuhead, file_free_rcu); } -static inline void file_free_rcu(struct rcu_head *head) +/* + * Return the total number of open files in the system + */ +static int get_nr_files(void) { - struct file *f = container_of(head, struct file, f_u.fu_rcuhead); - kmem_cache_free(filp_cachep, f); + return percpu_counter_read_positive(&nr_files); } -static inline void file_free(struct file *f) +/* + * Return the maximum number of open files in the system + */ +int get_max_files(void) { - call_rcu(&f->f_u.fu_rcuhead, file_free_rcu); + return files_stat.max_files; } +EXPORT_SYMBOL_GPL(get_max_files); + +/* + * Handle nr_files sysctl + */ +#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS) +int proc_nr_files(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + files_stat.nr_files = get_nr_files(); + return proc_dointvec(table, write, filp, buffer, lenp, ppos); +} +#else +int proc_nr_files(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} +#endif /* Find an unused file structure and return a pointer to it. * Returns NULL, if there are no more free file structures or @@ -78,14 +94,20 @@ struct file *get_empty_filp(void) /* * Privileged users can go above max_files */ - if (files_stat.nr_files >= files_stat.max_files && - !capable(CAP_SYS_ADMIN)) - goto over; + if (get_nr_files() >= files_stat.max_files && !capable(CAP_SYS_ADMIN)) { + /* + * percpu_counters are inaccurate. Do an expensive check before + * we go and fail. + */ + if (percpu_counter_sum(&nr_files) >= files_stat.max_files) + goto over; + } f = kmem_cache_alloc(filp_cachep, GFP_KERNEL); if (f == NULL) goto fail; + percpu_counter_inc(&nr_files); memset(f, 0, sizeof(*f)); if (security_file_alloc(f)) goto fail_sec; @@ -101,10 +123,10 @@ struct file *get_empty_filp(void) over: /* Ran out of filps - report that */ - if (files_stat.nr_files > old_max) { + if (get_nr_files() > old_max) { printk(KERN_INFO "VFS: file-max limit %d reached\n", - files_stat.max_files); - old_max = files_stat.nr_files; + get_max_files()); + old_max = get_nr_files(); } goto fail; @@ -276,4 +298,5 @@ void __init files_init(unsigned long mempages) if (files_stat.max_files < NR_FILE) files_stat.max_files = NR_FILE; files_defer_init(); + percpu_counter_init(&nr_files); } diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 4526da8907c..0c9a2ee54c9 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -66,6 +66,12 @@ static void restore_sigs(sigset_t *oldset) sigprocmask(SIG_SETMASK, oldset, NULL); } +/* + * Reset request, so that it can be reused + * + * The caller must be _very_ careful to make sure, that it is holding + * the only reference to req + */ void fuse_reset_request(struct fuse_req *req) { int preallocated = req->preallocated; @@ -120,9 +126,9 @@ struct fuse_req *fuse_get_request(struct fuse_conn *fc) return do_get_request(fc); } +/* Must be called with fuse_lock held */ static void fuse_putback_request(struct fuse_conn *fc, struct fuse_req *req) { - spin_lock(&fuse_lock); if (req->preallocated) { atomic_dec(&fc->num_waiting); list_add(&req->list, &fc->unused_list); @@ -134,11 +140,19 @@ static void fuse_putback_request(struct fuse_conn *fc, struct fuse_req *req) fc->outstanding_debt--; else up(&fc->outstanding_sem); - spin_unlock(&fuse_lock); } void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req) { + if (atomic_dec_and_test(&req->count)) { + spin_lock(&fuse_lock); + fuse_putback_request(fc, req); + spin_unlock(&fuse_lock); + } +} + +static void fuse_put_request_locked(struct fuse_conn *fc, struct fuse_req *req) +{ if (atomic_dec_and_test(&req->count)) fuse_putback_request(fc, req); } @@ -163,26 +177,36 @@ void fuse_release_background(struct fuse_req *req) * still waiting), the 'end' callback is called if given, else the * reference to the request is released * + * Releasing extra reference for foreground requests must be done + * within the same locked region as setting state to finished. This + * is because fuse_reset_request() may be called after request is + * finished and it must be the sole possessor. If request is + * interrupted and put in the background, it will return with an error + * and hence never be reset and reused. + * * Called with fuse_lock, unlocks it */ static void request_end(struct fuse_conn *fc, struct fuse_req *req) { - void (*end) (struct fuse_conn *, struct fuse_req *) = req->end; - req->end = NULL; list_del(&req->list); req->state = FUSE_REQ_FINISHED; - spin_unlock(&fuse_lock); - if (req->background) { + if (!req->background) { + wake_up(&req->waitq); + fuse_put_request_locked(fc, req); + spin_unlock(&fuse_lock); + } else { + void (*end) (struct fuse_conn *, struct fuse_req *) = req->end; + req->end = NULL; + spin_unlock(&fuse_lock); down_read(&fc->sbput_sem); if (fc->mounted) fuse_release_background(req); up_read(&fc->sbput_sem); + if (end) + end(fc, req); + else + fuse_put_request(fc, req); } - wake_up(&req->waitq); - if (end) - end(fc, req); - else - fuse_put_request(fc, req); } /* diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 21fd59c7bc2..c72a8a97935 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -111,6 +111,8 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd) /* Doesn't hurt to "reset" the validity timeout */ fuse_invalidate_entry_cache(entry); + + /* For negative dentries, always do a fresh lookup */ if (!inode) return 0; @@ -122,6 +124,9 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd) fuse_lookup_init(req, entry->d_parent->d_inode, entry, &outarg); request_send(fc, req); err = req->out.h.error; + /* Zero nodeid is same as -ENOENT */ + if (!err && !outarg.nodeid) + err = -ENOENT; if (!err) { struct fuse_inode *fi = get_fuse_inode(inode); if (outarg.nodeid != get_node_id(inode)) { @@ -190,8 +195,9 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, fuse_lookup_init(req, dir, entry, &outarg); request_send(fc, req); err = req->out.h.error; - if (!err && ((outarg.nodeid && invalid_nodeid(outarg.nodeid)) || - !valid_mode(outarg.attr.mode))) + /* Zero nodeid is same as -ENOENT, but with valid timeout */ + if (!err && outarg.nodeid && + (invalid_nodeid(outarg.nodeid) || !valid_mode(outarg.attr.mode))) err = -EIO; if (!err && outarg.nodeid) { inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation, diff --git a/fs/fuse/file.c b/fs/fuse/file.c index a7ef5e716f3..6f05379b0a0 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -116,9 +116,14 @@ int fuse_open_common(struct inode *inode, struct file *file, int isdir) /* Special case for failed iget in CREATE */ static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req) { - u64 nodeid = req->in.h.nodeid; - fuse_reset_request(req); - fuse_send_forget(fc, req, nodeid, 1); + /* If called from end_io_requests(), req has more than one + reference and fuse_reset_request() cannot work */ + if (fc->connected) { + u64 nodeid = req->in.h.nodeid; + fuse_reset_request(req); + fuse_send_forget(fc, req, nodeid, 1); + } else + fuse_put_request(fc, req); } void fuse_send_release(struct fuse_conn *fc, struct fuse_file *ff, @@ -335,9 +340,14 @@ static void fuse_send_readpages(struct fuse_req *req, struct file *file, loff_t pos = page_offset(req->pages[0]); size_t count = req->num_pages << PAGE_CACHE_SHIFT; req->out.page_zeroing = 1; - req->end = fuse_readpages_end; fuse_read_fill(req, file, inode, pos, count, FUSE_READ); - request_send_background(fc, req); + if (fc->async_read) { + req->end = fuse_readpages_end; + request_send_background(fc, req); + } else { + request_send(fc, req); + fuse_readpages_end(fc, req); + } } struct fuse_readpages_data { diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 46cf933aa3b..4a83adfec96 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -272,6 +272,9 @@ struct fuse_conn { reply, before any other request, and never cleared */ unsigned conn_error : 1; + /** Do readpages asynchronously? Only set in INIT */ + unsigned async_read : 1; + /* * The following bitfields are only for optimization purposes * and hence races in setting them will not cause malfunction diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index c755a0440a6..879e6fba948 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -473,6 +473,16 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) if (req->out.h.error || arg->major != FUSE_KERNEL_VERSION) fc->conn_error = 1; else { + unsigned long ra_pages; + + if (arg->minor >= 6) { + ra_pages = arg->max_readahead / PAGE_CACHE_SIZE; + if (arg->flags & FUSE_ASYNC_READ) + fc->async_read = 1; + } else + ra_pages = fc->max_read / PAGE_CACHE_SIZE; + + fc->bdi.ra_pages = min(fc->bdi.ra_pages, ra_pages); fc->minor = arg->minor; fc->max_write = arg->minor < 5 ? 4096 : arg->max_write; } @@ -496,6 +506,8 @@ static void fuse_send_init(struct fuse_conn *fc) arg->major = FUSE_KERNEL_VERSION; arg->minor = FUSE_KERNEL_MINOR_VERSION; + arg->max_readahead = fc->bdi.ra_pages * PAGE_CACHE_SIZE; + arg->flags |= FUSE_ASYNC_READ; req->in.h.opcode = FUSE_INIT; req->in.numargs = 1; req->in.args[0].size = sizeof(*arg); @@ -552,8 +564,6 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) fc->user_id = d.user_id; fc->group_id = d.group_id; fc->max_read = d.max_read; - if (fc->max_read / PAGE_CACHE_SIZE < fc->bdi.ra_pages) - fc->bdi.ra_pages = fc->max_read / PAGE_CACHE_SIZE; /* Used by get_root_inode() */ sb->s_fs_info = fc; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index f568102da1e..b3519528994 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -72,8 +72,8 @@ huge_pages_needed(struct address_space *mapping, struct vm_area_struct *vma) unsigned long start = vma->vm_start; unsigned long end = vma->vm_end; unsigned long hugepages = (end - start) >> HPAGE_SHIFT; - pgoff_t next = vma->vm_pgoff; - pgoff_t endpg = next + ((end - start) >> PAGE_SHIFT); + pgoff_t next = vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT); + pgoff_t endpg = next + hugepages; pagevec_init(&pvec, 0); while (next < endpg) { diff --git a/fs/inode.c b/fs/inode.c index 108138d4e90..d0be6159eb7 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1179,7 +1179,7 @@ EXPORT_SYMBOL(bmap); /** * touch_atime - update the access time * @mnt: mount the inode is accessed on - * @inode: inode accessed + * @dentry: dentry accessed * * Update the accessed time on an inode and mark it for writeback. * This function automatically handles read only file systems and media, diff --git a/fs/inotify.c b/fs/inotify.c index 878ccca6121..3041503bde0 100644 --- a/fs/inotify.c +++ b/fs/inotify.c @@ -967,7 +967,7 @@ asmlinkage long sys_inotify_add_watch(int fd, const char __user *path, u32 mask) mask_add = 1; /* don't let user-space set invalid bits: we don't want flags set */ - mask &= IN_ALL_EVENTS; + mask &= IN_ALL_EVENTS | IN_ONESHOT; if (unlikely(!mask)) { ret = -EINVAL; goto out; diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c index e6265a0b56b..543ed543d1e 100644 --- a/fs/jbd/checkpoint.c +++ b/fs/jbd/checkpoint.c @@ -24,75 +24,29 @@ #include <linux/slab.h> /* - * Unlink a buffer from a transaction checkpoint list. + * Unlink a buffer from a transaction. * * Called with j_list_lock held. */ -static void __buffer_unlink_first(struct journal_head *jh) +static inline void __buffer_unlink(struct journal_head *jh) { transaction_t *transaction; transaction = jh->b_cp_transaction; + jh->b_cp_transaction = NULL; jh->b_cpnext->b_cpprev = jh->b_cpprev; jh->b_cpprev->b_cpnext = jh->b_cpnext; - if (transaction->t_checkpoint_list == jh) { + if (transaction->t_checkpoint_list == jh) transaction->t_checkpoint_list = jh->b_cpnext; - if (transaction->t_checkpoint_list == jh) - transaction->t_checkpoint_list = NULL; - } -} - -/* - * Unlink a buffer from a transaction checkpoint(io) list. - * - * Called with j_list_lock held. - */ - -static inline void __buffer_unlink(struct journal_head *jh) -{ - transaction_t *transaction; - - transaction = jh->b_cp_transaction; - - __buffer_unlink_first(jh); - if (transaction->t_checkpoint_io_list == jh) { - transaction->t_checkpoint_io_list = jh->b_cpnext; - if (transaction->t_checkpoint_io_list == jh) - transaction->t_checkpoint_io_list = NULL; - } -} - -/* - * Move a buffer from the checkpoint list to the checkpoint io list - * - * Called with j_list_lock held - */ - -static inline void __buffer_relink_io(struct journal_head *jh) -{ - transaction_t *transaction; - - transaction = jh->b_cp_transaction; - __buffer_unlink_first(jh); - - if (!transaction->t_checkpoint_io_list) { - jh->b_cpnext = jh->b_cpprev = jh; - } else { - jh->b_cpnext = transaction->t_checkpoint_io_list; - jh->b_cpprev = transaction->t_checkpoint_io_list->b_cpprev; - jh->b_cpprev->b_cpnext = jh; - jh->b_cpnext->b_cpprev = jh; - } - transaction->t_checkpoint_io_list = jh; + if (transaction->t_checkpoint_list == jh) + transaction->t_checkpoint_list = NULL; } /* * Try to release a checkpointed buffer from its transaction. - * Returns 1 if we released it and 2 if we also released the - * whole transaction. - * + * Returns 1 if we released it. * Requires j_list_lock * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it */ @@ -103,11 +57,12 @@ static int __try_to_free_cp_buf(struct journal_head *jh) if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) { JBUFFER_TRACE(jh, "remove from checkpoint list"); - ret = __journal_remove_checkpoint(jh) + 1; + __journal_remove_checkpoint(jh); jbd_unlock_bh_state(bh); journal_remove_journal_head(bh); BUFFER_TRACE(bh, "release"); __brelse(bh); + ret = 1; } else { jbd_unlock_bh_state(bh); } @@ -162,53 +117,83 @@ static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh) } /* - * Clean up transaction's list of buffers submitted for io. - * We wait for any pending IO to complete and remove any clean - * buffers. Note that we take the buffers in the opposite ordering - * from the one in which they were submitted for IO. + * Clean up a transaction's checkpoint list. + * + * We wait for any pending IO to complete and make sure any clean + * buffers are removed from the transaction. + * + * Return 1 if we performed any actions which might have destroyed the + * checkpoint. (journal_remove_checkpoint() deletes the transaction when + * the last checkpoint buffer is cleansed) * * Called with j_list_lock held. */ - -static void __wait_cp_io(journal_t *journal, transaction_t *transaction) +static int __cleanup_transaction(journal_t *journal, transaction_t *transaction) { - struct journal_head *jh; + struct journal_head *jh, *next_jh, *last_jh; struct buffer_head *bh; - tid_t this_tid; - int released = 0; - - this_tid = transaction->t_tid; -restart: - /* Didn't somebody clean up the transaction in the meanwhile */ - if (journal->j_checkpoint_transactions != transaction || - transaction->t_tid != this_tid) - return; - while (!released && transaction->t_checkpoint_io_list) { - jh = transaction->t_checkpoint_io_list; + int ret = 0; + + assert_spin_locked(&journal->j_list_lock); + jh = transaction->t_checkpoint_list; + if (!jh) + return 0; + + last_jh = jh->b_cpprev; + next_jh = jh; + do { + jh = next_jh; bh = jh2bh(jh); - if (!jbd_trylock_bh_state(bh)) { - jbd_sync_bh(journal, bh); - spin_lock(&journal->j_list_lock); - goto restart; - } if (buffer_locked(bh)) { atomic_inc(&bh->b_count); spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); wait_on_buffer(bh); /* the journal_head may have gone by now */ BUFFER_TRACE(bh, "brelse"); __brelse(bh); - spin_lock(&journal->j_list_lock); - goto restart; + goto out_return_1; } + /* - * Now in whatever state the buffer currently is, we know that - * it has been written out and so we can drop it from the list + * This is foul */ - released = __journal_remove_checkpoint(jh); - jbd_unlock_bh_state(bh); - } + if (!jbd_trylock_bh_state(bh)) { + jbd_sync_bh(journal, bh); + goto out_return_1; + } + + if (jh->b_transaction != NULL) { + transaction_t *t = jh->b_transaction; + tid_t tid = t->t_tid; + + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh); + log_start_commit(journal, tid); + log_wait_commit(journal, tid); + goto out_return_1; + } + + /* + * AKPM: I think the buffer_jbddirty test is redundant - it + * shouldn't have NULL b_transaction? + */ + next_jh = jh->b_cpnext; + if (!buffer_dirty(bh) && !buffer_jbddirty(bh)) { + BUFFER_TRACE(bh, "remove from checkpoint"); + __journal_remove_checkpoint(jh); + jbd_unlock_bh_state(bh); + journal_remove_journal_head(bh); + __brelse(bh); + ret = 1; + } else { + jbd_unlock_bh_state(bh); + } + } while (jh != last_jh); + + return ret; +out_return_1: + spin_lock(&journal->j_list_lock); + return 1; } #define NR_BATCH 64 @@ -218,7 +203,9 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count) { int i; + spin_unlock(&journal->j_list_lock); ll_rw_block(SWRITE, *batch_count, bhs); + spin_lock(&journal->j_list_lock); for (i = 0; i < *batch_count; i++) { struct buffer_head *bh = bhs[i]; clear_buffer_jwrite(bh); @@ -234,46 +221,19 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count) * Return 1 if something happened which requires us to abort the current * scan of the checkpoint list. * - * Called with j_list_lock held and drops it if 1 is returned + * Called with j_list_lock held. * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it */ -static int __process_buffer(journal_t *journal, struct journal_head *jh, - struct buffer_head **bhs, int *batch_count) +static int __flush_buffer(journal_t *journal, struct journal_head *jh, + struct buffer_head **bhs, int *batch_count, + int *drop_count) { struct buffer_head *bh = jh2bh(jh); int ret = 0; - if (buffer_locked(bh)) { - get_bh(bh); - spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); - wait_on_buffer(bh); - /* the journal_head may have gone by now */ - BUFFER_TRACE(bh, "brelse"); - put_bh(bh); - ret = 1; - } - else if (jh->b_transaction != NULL) { - transaction_t *t = jh->b_transaction; - tid_t tid = t->t_tid; + if (buffer_dirty(bh) && !buffer_locked(bh) && jh->b_jlist == BJ_None) { + J_ASSERT_JH(jh, jh->b_transaction == NULL); - spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); - log_start_commit(journal, tid); - log_wait_commit(journal, tid); - ret = 1; - } - else if (!buffer_dirty(bh)) { - J_ASSERT_JH(jh, !buffer_jbddirty(bh)); - BUFFER_TRACE(bh, "remove from checkpoint"); - __journal_remove_checkpoint(jh); - spin_unlock(&journal->j_list_lock); - jbd_unlock_bh_state(bh); - journal_remove_journal_head(bh); - put_bh(bh); - ret = 1; - } - else { /* * Important: we are about to write the buffer, and * possibly block, while still holding the journal lock. @@ -286,30 +246,45 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh, J_ASSERT_BH(bh, !buffer_jwrite(bh)); set_buffer_jwrite(bh); bhs[*batch_count] = bh; - __buffer_relink_io(jh); jbd_unlock_bh_state(bh); (*batch_count)++; if (*batch_count == NR_BATCH) { - spin_unlock(&journal->j_list_lock); __flush_batch(journal, bhs, batch_count); ret = 1; } + } else { + int last_buffer = 0; + if (jh->b_cpnext == jh) { + /* We may be about to drop the transaction. Tell the + * caller that the lists have changed. + */ + last_buffer = 1; + } + if (__try_to_free_cp_buf(jh)) { + (*drop_count)++; + ret = last_buffer; + } } return ret; } /* - * Perform an actual checkpoint. We take the first transaction on the - * list of transactions to be checkpointed and send all its buffers - * to disk. We submit larger chunks of data at once. + * Perform an actual checkpoint. We don't write out only enough to + * satisfy the current blocked requests: rather we submit a reasonably + * sized chunk of the outstanding data to disk at once for + * efficiency. __log_wait_for_space() will retry if we didn't free enough. * + * However, we _do_ take into account the amount requested so that once + * the IO has been queued, we can return as soon as enough of it has + * completed to disk. + * * The journal should be locked before calling this function. */ int log_do_checkpoint(journal_t *journal) { - transaction_t *transaction; - tid_t this_tid; int result; + int batch_count = 0; + struct buffer_head *bhs[NR_BATCH]; jbd_debug(1, "Start checkpoint\n"); @@ -324,70 +299,79 @@ int log_do_checkpoint(journal_t *journal) return result; /* - * OK, we need to start writing disk blocks. Take one transaction - * and write it. + * OK, we need to start writing disk blocks. Try to free up a + * quarter of the log in a single checkpoint if we can. */ - spin_lock(&journal->j_list_lock); - if (!journal->j_checkpoint_transactions) - goto out; - transaction = journal->j_checkpoint_transactions; - this_tid = transaction->t_tid; -restart: /* - * If someone cleaned up this transaction while we slept, we're - * done (maybe it's a new transaction, but it fell at the same - * address). + * AKPM: check this code. I had a feeling a while back that it + * degenerates into a busy loop at unmount time. */ - if (journal->j_checkpoint_transactions == transaction && - transaction->t_tid == this_tid) { - int batch_count = 0; - struct buffer_head *bhs[NR_BATCH]; - struct journal_head *jh; - int retry = 0; - - while (!retry && transaction->t_checkpoint_list) { + spin_lock(&journal->j_list_lock); + while (journal->j_checkpoint_transactions) { + transaction_t *transaction; + struct journal_head *jh, *last_jh, *next_jh; + int drop_count = 0; + int cleanup_ret, retry = 0; + tid_t this_tid; + + transaction = journal->j_checkpoint_transactions; + this_tid = transaction->t_tid; + jh = transaction->t_checkpoint_list; + last_jh = jh->b_cpprev; + next_jh = jh; + do { struct buffer_head *bh; - jh = transaction->t_checkpoint_list; + jh = next_jh; + next_jh = jh->b_cpnext; bh = jh2bh(jh); if (!jbd_trylock_bh_state(bh)) { jbd_sync_bh(journal, bh); + spin_lock(&journal->j_list_lock); retry = 1; break; } - retry = __process_buffer(journal, jh, bhs, - &batch_count); - if (!retry && - lock_need_resched(&journal->j_list_lock)) { - spin_unlock(&journal->j_list_lock); + retry = __flush_buffer(journal, jh, bhs, &batch_count, &drop_count); + if (cond_resched_lock(&journal->j_list_lock)) { retry = 1; break; } - } + } while (jh != last_jh && !retry); if (batch_count) { - if (!retry) { - spin_unlock(&journal->j_list_lock); - retry = 1; - } __flush_batch(journal, bhs, &batch_count); + retry = 1; } - if (retry) { - spin_lock(&journal->j_list_lock); - goto restart; - } /* - * Now we have cleaned up the first transaction's checkpoint - * list. Let's clean up the second one. + * If someone cleaned up this transaction while we slept, we're + * done + */ + if (journal->j_checkpoint_transactions != transaction) + break; + if (retry) + continue; + /* + * Maybe it's a new transaction, but it fell at the same + * address */ - __wait_cp_io(journal, transaction); + if (transaction->t_tid != this_tid) + continue; + /* + * We have walked the whole transaction list without + * finding anything to write to disk. We had better be + * able to make some progress or we are in trouble. + */ + cleanup_ret = __cleanup_transaction(journal, transaction); + J_ASSERT(drop_count != 0 || cleanup_ret != 0); + if (journal->j_checkpoint_transactions != transaction) + break; } -out: spin_unlock(&journal->j_list_lock); result = cleanup_journal_tail(journal); if (result < 0) return result; + return 0; } @@ -472,91 +456,52 @@ int cleanup_journal_tail(journal_t *journal) /* Checkpoint list management */ /* - * journal_clean_one_cp_list - * - * Find all the written-back checkpoint buffers in the given list and release them. - * - * Called with the journal locked. - * Called with j_list_lock held. - * Returns number of bufers reaped (for debug) - */ - -static int journal_clean_one_cp_list(struct journal_head *jh, int *released) -{ - struct journal_head *last_jh; - struct journal_head *next_jh = jh; - int ret, freed = 0; - - *released = 0; - if (!jh) - return 0; - - last_jh = jh->b_cpprev; - do { - jh = next_jh; - next_jh = jh->b_cpnext; - /* Use trylock because of the ranking */ - if (jbd_trylock_bh_state(jh2bh(jh))) { - ret = __try_to_free_cp_buf(jh); - if (ret) { - freed++; - if (ret == 2) { - *released = 1; - return freed; - } - } - } - /* - * This function only frees up some memory if possible so we - * dont have an obligation to finish processing. Bail out if - * preemption requested: - */ - if (need_resched()) - return freed; - } while (jh != last_jh); - - return freed; -} - -/* * journal_clean_checkpoint_list * * Find all the written-back checkpoint buffers in the journal and release them. * * Called with the journal locked. * Called with j_list_lock held. - * Returns number of buffers reaped (for debug) + * Returns number of bufers reaped (for debug) */ int __journal_clean_checkpoint_list(journal_t *journal) { transaction_t *transaction, *last_transaction, *next_transaction; - int ret = 0, released; + int ret = 0; transaction = journal->j_checkpoint_transactions; - if (!transaction) + if (transaction == 0) goto out; last_transaction = transaction->t_cpprev; next_transaction = transaction; do { + struct journal_head *jh; + transaction = next_transaction; next_transaction = transaction->t_cpnext; - ret += journal_clean_one_cp_list(transaction-> - t_checkpoint_list, &released); - if (need_resched()) - goto out; - if (released) - continue; - /* - * It is essential that we are as careful as in the case of - * t_checkpoint_list with removing the buffer from the list as - * we can possibly see not yet submitted buffers on io_list - */ - ret += journal_clean_one_cp_list(transaction-> - t_checkpoint_io_list, &released); - if (need_resched()) - goto out; + jh = transaction->t_checkpoint_list; + if (jh) { + struct journal_head *last_jh = jh->b_cpprev; + struct journal_head *next_jh = jh; + + do { + jh = next_jh; + next_jh = jh->b_cpnext; + /* Use trylock because of the ranknig */ + if (jbd_trylock_bh_state(jh2bh(jh))) + ret += __try_to_free_cp_buf(jh); + /* + * This function only frees up some memory + * if possible so we dont have an obligation + * to finish processing. Bail out if preemption + * requested: + */ + if (need_resched()) + goto out; + } while (jh != last_jh); + } } while (transaction != last_transaction); out: return ret; @@ -571,22 +516,18 @@ out: * buffer updates committed in that transaction have safely been stored * elsewhere on disk. To achieve this, all of the buffers in a * transaction need to be maintained on the transaction's checkpoint - * lists until they have been rewritten, at which point this function is + * list until they have been rewritten, at which point this function is * called to remove the buffer from the existing transaction's - * checkpoint lists. - * - * The function returns 1 if it frees the transaction, 0 otherwise. + * checkpoint list. * * This function is called with the journal locked. * This function is called with j_list_lock held. - * This function is called with jbd_lock_bh_state(jh2bh(jh)) */ -int __journal_remove_checkpoint(struct journal_head *jh) +void __journal_remove_checkpoint(struct journal_head *jh) { transaction_t *transaction; journal_t *journal; - int ret = 0; JBUFFER_TRACE(jh, "entry"); @@ -597,10 +538,8 @@ int __journal_remove_checkpoint(struct journal_head *jh) journal = transaction->t_journal; __buffer_unlink(jh); - jh->b_cp_transaction = NULL; - if (transaction->t_checkpoint_list != NULL || - transaction->t_checkpoint_io_list != NULL) + if (transaction->t_checkpoint_list != NULL) goto out; JBUFFER_TRACE(jh, "transaction has no more buffers"); @@ -626,10 +565,8 @@ int __journal_remove_checkpoint(struct journal_head *jh) /* Just in case anybody was waiting for more transactions to be checkpointed... */ wake_up(&journal->j_wait_logspace); - ret = 1; out: JBUFFER_TRACE(jh, "exit"); - return ret; } /* @@ -691,7 +628,6 @@ void __journal_drop_transaction(journal_t *journal, transaction_t *transaction) J_ASSERT(transaction->t_shadow_list == NULL); J_ASSERT(transaction->t_log_list == NULL); J_ASSERT(transaction->t_checkpoint_list == NULL); - J_ASSERT(transaction->t_checkpoint_io_list == NULL); J_ASSERT(transaction->t_updates == 0); J_ASSERT(journal->j_committing_transaction != transaction); J_ASSERT(journal->j_running_transaction != transaction); diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index 29e62d98bae..002ad2bbc76 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -829,8 +829,7 @@ restart_loop: journal->j_committing_transaction = NULL; spin_unlock(&journal->j_state_lock); - if (commit_transaction->t_checkpoint_list == NULL && - commit_transaction->t_checkpoint_io_list == NULL) { + if (commit_transaction->t_checkpoint_list == NULL) { __journal_drop_transaction(journal, commit_transaction); } else { if (journal->j_checkpoint_transactions == NULL) { diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index 429f4b263cf..ca917973c2c 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -1308,6 +1308,7 @@ int journal_stop(handle_t *handle) transaction_t *transaction = handle->h_transaction; journal_t *journal = transaction->t_journal; int old_handle_count, err; + pid_t pid; J_ASSERT(transaction->t_updates > 0); J_ASSERT(journal_current_handle() == handle); @@ -1333,8 +1334,15 @@ int journal_stop(handle_t *handle) * It doesn't cost much - we're about to run a commit and sleep * on IO anyway. Speeds up many-threaded, many-dir operations * by 30x or more... + * + * But don't do this if this process was the most recent one to + * perform a synchronous write. We do this to detect the case where a + * single process is doing a stream of sync writes. No point in waiting + * for joiners in that case. */ - if (handle->h_sync) { + pid = current->pid; + if (handle->h_sync && journal->j_last_sync_writer != pid) { + journal->j_last_sync_writer = pid; do { old_handle_count = transaction->t_handle_count; schedule_timeout_uninterruptible(1); diff --git a/fs/jffs/intrep.c b/fs/jffs/intrep.c index b2e95421d93..ce7b54b0b2b 100644 --- a/fs/jffs/intrep.c +++ b/fs/jffs/intrep.c @@ -1965,7 +1965,7 @@ retry: iovec_cnt++; if (JFFS_GET_PAD_BYTES(raw_inode->nsize)) { - static char allff[3]={255,255,255}; + static unsigned char allff[3]={255,255,255}; /* Add some extra padding if necessary */ node_iovec[iovec_cnt].iov_base = allff; node_iovec[iovec_cnt].iov_len = diff --git a/fs/jffs2/nodelist.c b/fs/jffs2/nodelist.c index b635e167a3f..d4d0c41490c 100644 --- a/fs/jffs2/nodelist.c +++ b/fs/jffs2/nodelist.c @@ -406,7 +406,8 @@ static int check_node_data(struct jffs2_sb_info *c, struct jffs2_tmp_dnode_info int err = 0, pointed = 0; struct jffs2_eraseblock *jeb; unsigned char *buffer; - uint32_t crc, ofs, retlen, len; + uint32_t crc, ofs, len; + size_t retlen; BUG_ON(tn->csize == 0); diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c index 5f0652df5d4..f1695642d0f 100644 --- a/fs/jffs2/readinode.c +++ b/fs/jffs2/readinode.c @@ -112,7 +112,7 @@ static struct jffs2_raw_node_ref *jffs2_first_valid_node(struct jffs2_raw_node_r * negative error code on failure. */ static inline int read_direntry(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *ref, - struct jffs2_raw_dirent *rd, uint32_t read, struct jffs2_full_dirent **fdp, + struct jffs2_raw_dirent *rd, size_t read, struct jffs2_full_dirent **fdp, uint32_t *latest_mctime, uint32_t *mctime_ver) { struct jffs2_full_dirent *fd; diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c index 3e51dd1da8a..cf55b221fc2 100644 --- a/fs/jffs2/scan.c +++ b/fs/jffs2/scan.c @@ -233,7 +233,7 @@ int jffs2_scan_medium(struct jffs2_sb_info *c) c->nextblock->dirty_size = 0; } #ifdef CONFIG_JFFS2_FS_WRITEBUFFER - if (!jffs2_can_mark_obsolete(c) && c->nextblock && (c->nextblock->free_size % c->wbuf_pagesize)) { + if (!jffs2_can_mark_obsolete(c) && c->wbuf_pagesize && c->nextblock && (c->nextblock->free_size % c->wbuf_pagesize)) { /* If we're going to start writing into a block which already contains data, and the end of the data isn't page-aligned, skip a little and align it. */ diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c index 2967b739341..79b5404db10 100644 --- a/fs/jfs/jfs_dmap.c +++ b/fs/jfs/jfs_dmap.c @@ -532,10 +532,10 @@ dbUpdatePMap(struct inode *ipbmap, lastlblkno = lblkno; + LOGSYNC_LOCK(log, flags); if (mp->lsn != 0) { /* inherit older/smaller lsn */ logdiff(diffp, mp->lsn, log); - LOGSYNC_LOCK(log, flags); if (difft < diffp) { mp->lsn = lsn; @@ -548,20 +548,17 @@ dbUpdatePMap(struct inode *ipbmap, logdiff(diffp, mp->clsn, log); if (difft > diffp) mp->clsn = tblk->clsn; - LOGSYNC_UNLOCK(log, flags); } else { mp->log = log; mp->lsn = lsn; /* insert bp after tblock in logsync list */ - LOGSYNC_LOCK(log, flags); - log->count++; list_add(&mp->synclist, &tblk->synclist); mp->clsn = tblk->clsn; - LOGSYNC_UNLOCK(log, flags); } + LOGSYNC_UNLOCK(log, flags); } /* write the last buffer. */ diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c index 31b4aa13dd4..4efa0d0eec3 100644 --- a/fs/jfs/jfs_imap.c +++ b/fs/jfs/jfs_imap.c @@ -2844,11 +2844,11 @@ diUpdatePMap(struct inode *ipimap, */ lsn = tblk->lsn; log = JFS_SBI(tblk->sb)->log; + LOGSYNC_LOCK(log, flags); if (mp->lsn != 0) { /* inherit older/smaller lsn */ logdiff(difft, lsn, log); logdiff(diffp, mp->lsn, log); - LOGSYNC_LOCK(log, flags); if (difft < diffp) { mp->lsn = lsn; /* move mp after tblock in logsync list */ @@ -2860,17 +2860,15 @@ diUpdatePMap(struct inode *ipimap, logdiff(diffp, mp->clsn, log); if (difft > diffp) mp->clsn = tblk->clsn; - LOGSYNC_UNLOCK(log, flags); } else { mp->log = log; mp->lsn = lsn; /* insert mp after tblock in logsync list */ - LOGSYNC_LOCK(log, flags); log->count++; list_add(&mp->synclist, &tblk->synclist); mp->clsn = tblk->clsn; - LOGSYNC_UNLOCK(log, flags); } + LOGSYNC_UNLOCK(log, flags); write_metapage(mp); return (0); } diff --git a/fs/libfs.c b/fs/libfs.c index 63c020e6589..71fd08fa410 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -388,6 +388,7 @@ int simple_fill_super(struct super_block *s, int magic, struct tree_descr *files inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; inode->i_op = &simple_dir_inode_operations; inode->i_fop = &simple_dir_operations; + inode->i_nlink = 2; root = d_alloc_root(inode); if (!root) { iput(inode); diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c index 3eaf6e70108..da6354baa0b 100644 --- a/fs/lockd/clntlock.c +++ b/fs/lockd/clntlock.c @@ -111,9 +111,10 @@ long nlmclnt_block(struct nlm_rqst *req, long timeout) /* * The server lockd has called us back to tell us the lock was granted */ -u32 -nlmclnt_grant(struct nlm_lock *lock) +u32 nlmclnt_grant(const struct sockaddr_in *addr, const struct nlm_lock *lock) { + const struct file_lock *fl = &lock->fl; + const struct nfs_fh *fh = &lock->fh; struct nlm_wait *block; u32 res = nlm_lck_denied; @@ -122,14 +123,20 @@ nlmclnt_grant(struct nlm_lock *lock) * Warning: must not use cookie to match it! */ list_for_each_entry(block, &nlm_blocked, b_list) { - if (nlm_compare_locks(block->b_lock, &lock->fl)) { - /* Alright, we found a lock. Set the return status - * and wake up the caller - */ - block->b_status = NLM_LCK_GRANTED; - wake_up(&block->b_wait); - res = nlm_granted; - } + struct file_lock *fl_blocked = block->b_lock; + + if (!nlm_compare_locks(fl_blocked, fl)) + continue; + if (!nlm_cmp_addr(&block->b_host->h_addr, addr)) + continue; + if (nfs_compare_fh(NFS_FH(fl_blocked->fl_file->f_dentry->d_inode) ,fh) != 0) + continue; + /* Alright, we found a lock. Set the return status + * and wake up the caller + */ + block->b_status = NLM_LCK_GRANTED; + wake_up(&block->b_wait); + res = nlm_granted; } return res; } diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index 14552403957..970b6a6aa33 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -22,12 +22,14 @@ #define NLMDBG_FACILITY NLMDBG_CLIENT #define NLMCLNT_GRACE_WAIT (5*HZ) #define NLMCLNT_POLL_TIMEOUT (30*HZ) +#define NLMCLNT_MAX_RETRIES 3 static int nlmclnt_test(struct nlm_rqst *, struct file_lock *); static int nlmclnt_lock(struct nlm_rqst *, struct file_lock *); static int nlmclnt_unlock(struct nlm_rqst *, struct file_lock *); static int nlm_stat_to_errno(u32 stat); static void nlmclnt_locks_init_private(struct file_lock *fl, struct nlm_host *host); +static int nlmclnt_cancel(struct nlm_host *, int , struct file_lock *); static const struct rpc_call_ops nlmclnt_unlock_ops; static const struct rpc_call_ops nlmclnt_cancel_ops; @@ -598,7 +600,7 @@ out_unblock: nlmclnt_finish_block(req); /* Cancel the blocked request if it is still pending */ if (resp->status == NLM_LCK_BLOCKED) - nlmclnt_cancel(host, fl); + nlmclnt_cancel(host, req->a_args.block, fl); out: nlmclnt_release_lockargs(req); return status; @@ -660,12 +662,18 @@ nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl) * reclaimed while we're stuck in the unlock call. */ fl->fl_u.nfs_fl.flags &= ~NFS_LCK_GRANTED; + /* + * Note: the server is supposed to either grant us the unlock + * request, or to deny it with NLM_LCK_DENIED_GRACE_PERIOD. In either + * case, we want to unlock. + */ + do_vfs_lock(fl); + if (req->a_flags & RPC_TASK_ASYNC) { status = nlmclnt_async_call(req, NLMPROC_UNLOCK, &nlmclnt_unlock_ops); /* Hrmf... Do the unlock early since locks_remove_posix() * really expects us to free the lock synchronously */ - do_vfs_lock(fl); if (status < 0) { nlmclnt_release_lockargs(req); kfree(req); @@ -678,7 +686,6 @@ nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl) if (status < 0) return status; - do_vfs_lock(fl); if (resp->status == NLM_LCK_GRANTED) return 0; @@ -728,8 +735,7 @@ static const struct rpc_call_ops nlmclnt_unlock_ops = { * We always use an async RPC call for this in order not to hang a * process that has been Ctrl-C'ed. */ -int -nlmclnt_cancel(struct nlm_host *host, struct file_lock *fl) +static int nlmclnt_cancel(struct nlm_host *host, int block, struct file_lock *fl) { struct nlm_rqst *req; unsigned long flags; @@ -750,6 +756,7 @@ nlmclnt_cancel(struct nlm_host *host, struct file_lock *fl) req->a_flags = RPC_TASK_ASYNC; nlmclnt_setlockargs(req, fl); + req->a_args.block = block; status = nlmclnt_async_call(req, NLMPROC_CANCEL, &nlmclnt_cancel_ops); if (status < 0) { @@ -801,6 +808,9 @@ die: return; retry_cancel: + /* Don't ever retry more than 3 times */ + if (req->a_retries++ >= NLMCLNT_MAX_RETRIES) + goto die; nlm_rebind_host(req->a_host); rpc_restart_call(task); rpc_delay(task, 30 * HZ); diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index 4063095d849..b10f913aa06 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -228,7 +228,7 @@ nlm4svc_proc_granted(struct svc_rqst *rqstp, struct nlm_args *argp, resp->cookie = argp->cookie; dprintk("lockd: GRANTED called\n"); - resp->status = nlmclnt_grant(&argp->lock); + resp->status = nlmclnt_grant(&rqstp->rq_addr, &argp->lock); dprintk("lockd: GRANTED status %d\n", ntohl(resp->status)); return rpc_success; } diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index 3bc437e0cf5..35681d9cf1f 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c @@ -256,7 +256,7 @@ nlmsvc_proc_granted(struct svc_rqst *rqstp, struct nlm_args *argp, resp->cookie = argp->cookie; dprintk("lockd: GRANTED called\n"); - resp->status = nlmclnt_grant(&argp->lock); + resp->status = nlmclnt_grant(&rqstp->rq_addr, &argp->lock); dprintk("lockd: GRANTED status %d\n", ntohl(resp->status)); return rpc_success; } diff --git a/fs/namei.c b/fs/namei.c index 4acdac043b6..8dc2b038d5d 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -790,7 +790,7 @@ static fastcall int __link_path_walk(const char * name, struct nameidata *nd) inode = nd->dentry->d_inode; if (nd->depth) - lookup_flags = LOOKUP_FOLLOW; + lookup_flags = LOOKUP_FOLLOW | (nd->flags & LOOKUP_CONTINUE); /* At this point we know we have a real path component. */ for(;;) { @@ -885,7 +885,8 @@ static fastcall int __link_path_walk(const char * name, struct nameidata *nd) last_with_slashes: lookup_flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; last_component: - nd->flags &= ~LOOKUP_CONTINUE; + /* Clear LOOKUP_CONTINUE iff it was previously unset */ + nd->flags &= lookup_flags | ~LOOKUP_CONTINUE; if (lookup_flags & LOOKUP_PARENT) goto lookup_parent; if (this.name[0] == '.') switch (this.len) { @@ -1069,6 +1070,8 @@ static int fastcall do_path_lookup(int dfd, const char *name, unsigned int flags, struct nameidata *nd) { int retval = 0; + int fput_needed; + struct file *file; nd->last_type = LAST_ROOT; /* if there are only slashes... */ nd->flags = flags; @@ -1090,29 +1093,22 @@ static int fastcall do_path_lookup(int dfd, const char *name, nd->mnt = mntget(current->fs->pwdmnt); nd->dentry = dget(current->fs->pwd); } else { - struct file *file; - int fput_needed; struct dentry *dentry; file = fget_light(dfd, &fput_needed); - if (!file) { - retval = -EBADF; - goto out_fail; - } + retval = -EBADF; + if (!file) + goto unlock_fail; dentry = file->f_dentry; - if (!S_ISDIR(dentry->d_inode->i_mode)) { - retval = -ENOTDIR; - fput_light(file, fput_needed); - goto out_fail; - } + retval = -ENOTDIR; + if (!S_ISDIR(dentry->d_inode->i_mode)) + goto fput_unlock_fail; retval = file_permission(file, MAY_EXEC); - if (retval) { - fput_light(file, fput_needed); - goto out_fail; - } + if (retval) + goto fput_unlock_fail; nd->mnt = mntget(file->f_vfsmnt); nd->dentry = dget(dentry); @@ -1123,10 +1119,17 @@ static int fastcall do_path_lookup(int dfd, const char *name, current->total_link_count = 0; retval = link_path_walk(name, nd); out: - if (unlikely(current->audit_context - && nd && nd->dentry && nd->dentry->d_inode)) + if (likely(retval == 0)) { + if (unlikely(current->audit_context && nd && nd->dentry && + nd->dentry->d_inode)) audit_inode(name, nd->dentry->d_inode, flags); -out_fail: + } + return retval; + +fput_unlock_fail: + fput_light(file, fput_needed); +unlock_fail: + read_unlock(¤t->fs->lock); return retval; } @@ -1161,6 +1164,7 @@ static int __path_lookup_intent_open(int dfd, const char *name, /** * path_lookup_open - lookup a file path with open intent + * @dfd: the directory to use as base, or AT_FDCWD * @name: pointer to file name * @lookup_flags: lookup intent flags * @nd: pointer to nameidata @@ -1175,6 +1179,7 @@ int path_lookup_open(int dfd, const char *name, unsigned int lookup_flags, /** * path_lookup_create - lookup a file path with open + create intent + * @dfd: the directory to use as base, or AT_FDCWD * @name: pointer to file name * @lookup_flags: lookup intent flags * @nd: pointer to nameidata @@ -2219,13 +2224,17 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de * and other special files. --ADM */ asmlinkage long sys_linkat(int olddfd, const char __user *oldname, - int newdfd, const char __user *newname) + int newdfd, const char __user *newname, + int flags) { struct dentry *new_dentry; struct nameidata nd, old_nd; int error; char * to; + if (flags != 0) + return -EINVAL; + to = getname(newname); if (IS_ERR(to)) return PTR_ERR(to); @@ -2258,7 +2267,7 @@ exit: asmlinkage long sys_link(const char __user *oldname, const char __user *newname) { - return sys_linkat(AT_FDCWD, oldname, AT_FDCWD, newname); + return sys_linkat(AT_FDCWD, oldname, AT_FDCWD, newname, 0); } /* @@ -2604,13 +2613,15 @@ void page_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie) } } -int page_symlink(struct inode *inode, const char *symname, int len) +int __page_symlink(struct inode *inode, const char *symname, int len, + gfp_t gfp_mask) { struct address_space *mapping = inode->i_mapping; - struct page *page = grab_cache_page(mapping, 0); + struct page *page; int err = -ENOMEM; char *kaddr; + page = find_or_create_page(mapping, 0, gfp_mask); if (!page) goto fail; err = mapping->a_ops->prepare_write(NULL, page, 0, len-1); @@ -2645,6 +2656,12 @@ fail: return err; } +int page_symlink(struct inode *inode, const char *symname, int len) +{ + return __page_symlink(inode, symname, len, + mapping_gfp_mask(inode->i_mapping)); +} + struct inode_operations page_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = page_follow_link_light, @@ -2663,6 +2680,7 @@ EXPORT_SYMBOL(lookup_one_len); EXPORT_SYMBOL(page_follow_link_light); EXPORT_SYMBOL(page_put_link); EXPORT_SYMBOL(page_readlink); +EXPORT_SYMBOL(__page_symlink); EXPORT_SYMBOL(page_symlink); EXPORT_SYMBOL(page_symlink_inode_operations); EXPORT_SYMBOL(path_lookup); diff --git a/fs/namespace.c b/fs/namespace.c index ce97becff46..39c81a8d631 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -494,7 +494,7 @@ void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill) p->mnt_namespace = NULL; list_del_init(&p->mnt_child); if (p->mnt_parent != p) - mnt->mnt_mountpoint->d_mounted--; + p->mnt_mountpoint->d_mounted--; change_mnt_propagation(p, MS_PRIVATE); } } @@ -1325,30 +1325,20 @@ dput_out: return retval; } -int copy_namespace(int flags, struct task_struct *tsk) +/* + * Allocate a new namespace structure and populate it with contents + * copied from the namespace of the passed in task structure. + */ +struct namespace *dup_namespace(struct task_struct *tsk, struct fs_struct *fs) { struct namespace *namespace = tsk->namespace; struct namespace *new_ns; struct vfsmount *rootmnt = NULL, *pwdmnt = NULL, *altrootmnt = NULL; - struct fs_struct *fs = tsk->fs; struct vfsmount *p, *q; - if (!namespace) - return 0; - - get_namespace(namespace); - - if (!(flags & CLONE_NEWNS)) - return 0; - - if (!capable(CAP_SYS_ADMIN)) { - put_namespace(namespace); - return -EPERM; - } - new_ns = kmalloc(sizeof(struct namespace), GFP_KERNEL); if (!new_ns) - goto out; + return NULL; atomic_set(&new_ns->count, 1); INIT_LIST_HEAD(&new_ns->list); @@ -1362,7 +1352,7 @@ int copy_namespace(int flags, struct task_struct *tsk) if (!new_ns->root) { up_write(&namespace_sem); kfree(new_ns); - goto out; + return NULL; } spin_lock(&vfsmount_lock); list_add_tail(&new_ns->list, &new_ns->root->mnt_list); @@ -1396,8 +1386,6 @@ int copy_namespace(int flags, struct task_struct *tsk) } up_write(&namespace_sem); - tsk->namespace = new_ns; - if (rootmnt) mntput(rootmnt); if (pwdmnt) @@ -1405,12 +1393,39 @@ int copy_namespace(int flags, struct task_struct *tsk) if (altrootmnt) mntput(altrootmnt); - put_namespace(namespace); - return 0; + return new_ns; +} + +int copy_namespace(int flags, struct task_struct *tsk) +{ + struct namespace *namespace = tsk->namespace; + struct namespace *new_ns; + int err = 0; + + if (!namespace) + return 0; + + get_namespace(namespace); + + if (!(flags & CLONE_NEWNS)) + return 0; + + if (!capable(CAP_SYS_ADMIN)) { + err = -EPERM; + goto out; + } + + new_ns = dup_namespace(tsk, tsk->fs); + if (!new_ns) { + err = -ENOMEM; + goto out; + } + + tsk->namespace = new_ns; out: put_namespace(namespace); - return -ENOMEM; + return err; } asmlinkage long sys_mount(char __user * dev_name, char __user * dir_name, diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 10ae377e68f..4e9b3a1b36c 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -57,6 +57,7 @@ #define NFSDBG_FACILITY NFSDBG_VFS #define MAX_DIRECTIO_SIZE (4096UL << PAGE_SHIFT) +static void nfs_free_user_pages(struct page **pages, int npages, int do_dirty); static kmem_cache_t *nfs_direct_cachep; /* @@ -107,6 +108,15 @@ nfs_get_user_pages(int rw, unsigned long user_addr, size_t size, page_count, (rw == READ), 0, *pages, NULL); up_read(¤t->mm->mmap_sem); + /* + * If we got fewer pages than expected from get_user_pages(), + * the user buffer runs off the end of a mapping; return EFAULT. + */ + if (result >= 0 && result < page_count) { + nfs_free_user_pages(*pages, result, 0); + *pages = NULL; + result = -EFAULT; + } } return result; } @@ -481,7 +491,7 @@ retry: if (wdata->verf.committed != NFS_FILE_SYNC) { need_commit = 1; if (memcmp(&first_verf.verifier, &wdata->verf.verifier, - sizeof(first_verf.verifier))); + sizeof(first_verf.verifier))) goto sync_retry; } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 984ca3454d0..f8c0066e02e 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1430,7 +1430,7 @@ static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle, if (status == 0) status = nfs4_do_fsinfo(server, fhandle, info); out: - return status; + return nfs4_map_errors(status); } static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr) diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c index e897e00c2c9..c0a754ecdee 100644 --- a/fs/nfs/nfsroot.c +++ b/fs/nfs/nfsroot.c @@ -465,10 +465,11 @@ static int __init root_nfs_ports(void) "number from server, using default\n"); port = nfsd_port; } - nfs_port = htons(port); + nfs_port = port; dprintk("Root-NFS: Portmapper on server returned %d " "as nfsd port\n", port); } + nfs_port = htons(nfs_port); if ((port = root_nfs_getport(NFS_MNT_PROGRAM, mountd_ver, proto)) < 0) { printk(KERN_ERR "Root-NFS: Unable to get mountd port " diff --git a/fs/nfsctl.c b/fs/nfsctl.c index 0d4cf948606..1c72c7f85dd 100644 --- a/fs/nfsctl.c +++ b/fs/nfsctl.c @@ -98,10 +98,8 @@ asmlinkage sys_nfsservctl(int cmd, struct nfsctl_arg __user *arg, void __user *r if (copy_from_user(&version, &arg->ca_version, sizeof(int))) return -EFAULT; - if (version != NFSCTL_VERSION) { - printk(KERN_WARNING "nfsd: incompatible version in syscall.\n"); + if (version != NFSCTL_VERSION) return -EINVAL; - } if (cmd < 0 || cmd >= sizeof(map)/sizeof(map[0]) || !map[cmd].name) return -EINVAL; diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index a00fe868629..6d63f1d9e5f 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -195,10 +195,12 @@ nfsd4_open(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open /* Openowner is now set, so sequence id will get bumped. Now we need * these checks before we do any creates: */ + status = nfserr_grace; if (nfs4_in_grace() && open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS) - return nfserr_grace; + goto out; + status = nfserr_no_grace; if (!nfs4_in_grace() && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS) - return nfserr_no_grace; + goto out; switch (open->op_claim_type) { case NFS4_OPEN_CLAIM_DELEGATE_CUR: diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 89ed0469686..1d163b61691 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -64,6 +64,32 @@ struct nfsd_list { }; static struct list_head nfsd_list = LIST_HEAD_INIT(nfsd_list); +#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) +static struct svc_stat nfsd_acl_svcstats; +static struct svc_version * nfsd_acl_version[] = { + [2] = &nfsd_acl_version2, + [3] = &nfsd_acl_version3, +}; + +#define NFSD_ACL_MINVERS 2 +#define NFSD_ACL_NRVERS (sizeof(nfsd_acl_version)/sizeof(nfsd_acl_version[0])) +static struct svc_version *nfsd_acl_versions[NFSD_ACL_NRVERS]; + +static struct svc_program nfsd_acl_program = { + .pg_prog = NFS_ACL_PROGRAM, + .pg_nvers = NFSD_ACL_NRVERS, + .pg_vers = nfsd_acl_versions, + .pg_name = "nfsd", + .pg_class = "nfsd", + .pg_stats = &nfsd_acl_svcstats, + .pg_authenticate = &svc_set_client, +}; + +static struct svc_stat nfsd_acl_svcstats = { + .program = &nfsd_acl_program, +}; +#endif /* defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) */ + static struct svc_version * nfsd_version[] = { [2] = &nfsd_version2, #if defined(CONFIG_NFSD_V3) @@ -79,6 +105,9 @@ static struct svc_version * nfsd_version[] = { static struct svc_version *nfsd_versions[NFSD_NRVERS]; struct svc_program nfsd_program = { +#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) + .pg_next = &nfsd_acl_program, +#endif .pg_prog = NFS_PROGRAM, /* program number */ .pg_nvers = NFSD_NRVERS, /* nr of entries in nfsd_version */ .pg_vers = nfsd_versions, /* version table */ @@ -147,6 +176,26 @@ nfsd_svc(unsigned short port, int nrservs) nfsd_program.pg_vers[i] = nfsd_version[i]; } + +#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) + found_one = 0; + + for (i = NFSD_ACL_MINVERS; i < NFSD_ACL_NRVERS; i++) { + if (NFSCTL_VERISSET(nfsd_versbits, i)) { + nfsd_acl_program.pg_vers[i] = + nfsd_acl_version[i]; + found_one = 1; + } else + nfsd_acl_program.pg_vers[i] = NULL; + } + + if (!found_one) { + for (i = NFSD_ACL_MINVERS; i < NFSD_ACL_NRVERS; i++) + nfsd_acl_program.pg_vers[i] = + nfsd_acl_version[i]; + } +#endif + atomic_set(&nfsd_busy, 0); error = -ENOMEM; nfsd_serv = svc_create(&nfsd_program, NFSD_BUFSIZE); @@ -411,30 +460,3 @@ nfsd_dispatch(struct svc_rqst *rqstp, u32 *statp) nfsd_cache_update(rqstp, proc->pc_cachetype, statp + 1); return 1; } - -#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) -static struct svc_stat nfsd_acl_svcstats; -static struct svc_version * nfsd_acl_version[] = { - [2] = &nfsd_acl_version2, - [3] = &nfsd_acl_version3, -}; - -#define NFSD_ACL_NRVERS (sizeof(nfsd_acl_version)/sizeof(nfsd_acl_version[0])) -static struct svc_program nfsd_acl_program = { - .pg_prog = NFS_ACL_PROGRAM, - .pg_nvers = NFSD_ACL_NRVERS, - .pg_vers = nfsd_acl_version, - .pg_name = "nfsd", - .pg_class = "nfsd", - .pg_stats = &nfsd_acl_svcstats, - .pg_authenticate = &svc_set_client, -}; - -static struct svc_stat nfsd_acl_svcstats = { - .program = &nfsd_acl_program, -}; - -#define nfsd_acl_program_p &nfsd_acl_program -#else -#define nfsd_acl_program_p NULL -#endif /* defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) */ diff --git a/fs/ntfs/ChangeLog b/fs/ntfs/ChangeLog index 02f44094bda..9d8ffa89e2c 100644 --- a/fs/ntfs/ChangeLog +++ b/fs/ntfs/ChangeLog @@ -1,9 +1,9 @@ ToDo/Notes: - Find and fix bugs. - The only places in the kernel where a file is resized are - ntfs_file_write*() and ntfs_truncate() for both of which i_sem is + ntfs_file_write*() and ntfs_truncate() for both of which i_mutex is held. Just have to be careful in read-/writepage and other helpers - not running under i_sem that we play nice... Also need to be careful + not running under i_mutex that we play nice. Also need to be careful with initialized_size extension in ntfs_file_write*() and writepage. UPDATE: The only things that need to be checked are the compressed write and the other attribute resize/write cases like index @@ -19,6 +19,24 @@ ToDo/Notes: - Enable the code for setting the NT4 compatibility flag when we start making NTFS 1.2 specific modifications. +2.1.26 - Minor bug fixes and updates. + + - Fix a potential overflow in file.c where a cast to s64 was missing in + a left shift of a page index. + - The struct inode has had its i_sem semaphore changed to a mutex named + i_mutex. + - We have struct kmem_cache now so use it instead of the typedef + kmem_cache_t. (Pekka Enberg) + - Implement support for sector sizes above 512 bytes (up to the maximum + supported by NTFS which is 4096 bytes). + - Do more detailed reporting of why we cannot mount read-write by + special casing the VOLUME_MODIFIED_BY_CHKDSK flag. + - Miscellaneous updates to layout.h. + - Cope with attribute list attribute having invalid flags. Windows + copes with this and even chkdsk does not detect or fix this so we + have to cope with it, too. Thanks to Pawel Kot for reporting the + problem. + 2.1.25 - (Almost) fully implement write(2) and truncate(2). - Change ntfs_map_runlist_nolock(), ntfs_attr_find_vcn_nolock() and @@ -373,7 +391,7 @@ ToDo/Notes: single one of them had an mst error. (Thanks to Ken MacFerrin for the bug report.) - Fix error handling in fs/ntfs/quota.c::ntfs_mark_quotas_out_of_date() - where we failed to release i_sem on the $Quota/$Q attribute inode. + where we failed to release i_mutex on the $Quota/$Q attribute inode. - Fix bug in handling of bad inodes in fs/ntfs/namei.c::ntfs_lookup(). - Add mapping of unmapped buffers to all remaining code paths, i.e. fs/ntfs/aops.c::ntfs_write_mst_block(), mft.c::ntfs_sync_mft_mirror(), @@ -874,7 +892,7 @@ ToDo/Notes: clusters. (Philipp Thomas) - attrib.c::load_attribute_list(): Fix bug when initialized_size is a multiple of the block_size but not the cluster size. (Szabolcs - Szakacsits <szaka@sienet.hu>) + Szakacsits) 2.1.2 - Important bug fixes aleviating the hangs in statfs. @@ -884,7 +902,7 @@ ToDo/Notes: - Add handling for initialized_size != data_size in compressed files. - Reduce function local stack usage from 0x3d4 bytes to just noise in - fs/ntfs/upcase.c. (Randy Dunlap <rdunlap@xenotime.net>) + fs/ntfs/upcase.c. (Randy Dunlap) - Remove compiler warnings for newer gcc. - Pages are no longer kmapped by mm/filemap.c::generic_file_write() around calls to ->{prepare,commit}_write. Adapt NTFS appropriately @@ -1201,11 +1219,11 @@ ToDo/Notes: the kernel. We probably want a kernel generic init_address_space() function... - Drop BKL from ntfs_readdir() after consultation with Al Viro. The - only caller of ->readdir() is vfs_readdir() which holds i_sem during - the call, and i_sem is sufficient protection against changes in the - directory inode (including ->i_size). + only caller of ->readdir() is vfs_readdir() which holds i_mutex + during the call, and i_mutex is sufficient protection against changes + in the directory inode (including ->i_size). - Use generic_file_llseek() for directories (as opposed to - default_llseek()) as this downs i_sem instead of the BKL which is + default_llseek()) as this downs i_mutex instead of the BKL which is what we now need for exclusion against ->f_pos changes considering we no longer take the BKL in ntfs_readdir(). diff --git a/fs/ntfs/Makefile b/fs/ntfs/Makefile index d0d45d1c853..d95fac7fdeb 100644 --- a/fs/ntfs/Makefile +++ b/fs/ntfs/Makefile @@ -6,7 +6,7 @@ ntfs-objs := aops.o attrib.o collate.o compress.o debug.o dir.o file.o \ index.o inode.o mft.o mst.o namei.o runlist.o super.o sysctl.o \ unistr.o upcase.o -EXTRA_CFLAGS = -DNTFS_VERSION=\"2.1.25\" +EXTRA_CFLAGS = -DNTFS_VERSION=\"2.1.26\" ifeq ($(CONFIG_NTFS_DEBUG),y) EXTRA_CFLAGS += -DDEBUG diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c index 1c0a4315876..7e361da770b 100644 --- a/fs/ntfs/aops.c +++ b/fs/ntfs/aops.c @@ -2,7 +2,7 @@ * aops.c - NTFS kernel address space operations and page cache handling. * Part of the Linux-NTFS project. * - * Copyright (c) 2001-2005 Anton Altaparmakov + * Copyright (c) 2001-2006 Anton Altaparmakov * Copyright (c) 2002 Richard Russon * * This program/include file is free software; you can redistribute it and/or @@ -200,8 +200,8 @@ static int ntfs_read_block(struct page *page) /* $MFT/$DATA must have its complete runlist in memory at all times. */ BUG_ON(!ni->runlist.rl && !ni->mft_no && !NInoAttr(ni)); - blocksize_bits = VFS_I(ni)->i_blkbits; - blocksize = 1 << blocksize_bits; + blocksize = vol->sb->s_blocksize; + blocksize_bits = vol->sb->s_blocksize_bits; if (!page_has_buffers(page)) { create_empty_buffers(page, blocksize, 0); @@ -569,10 +569,8 @@ static int ntfs_write_block(struct page *page, struct writeback_control *wbc) BUG_ON(!NInoNonResident(ni)); BUG_ON(NInoMstProtected(ni)); - - blocksize_bits = vi->i_blkbits; - blocksize = 1 << blocksize_bits; - + blocksize = vol->sb->s_blocksize; + blocksize_bits = vol->sb->s_blocksize_bits; if (!page_has_buffers(page)) { BUG_ON(!PageUptodate(page)); create_empty_buffers(page, blocksize, @@ -949,8 +947,8 @@ static int ntfs_write_mst_block(struct page *page, */ BUG_ON(!(is_mft || S_ISDIR(vi->i_mode) || (NInoAttr(ni) && ni->type == AT_INDEX_ALLOCATION))); - bh_size_bits = vi->i_blkbits; - bh_size = 1 << bh_size_bits; + bh_size = vol->sb->s_blocksize; + bh_size_bits = vol->sb->s_blocksize_bits; max_bhs = PAGE_CACHE_SIZE / bh_size; BUG_ON(!max_bhs); BUG_ON(max_bhs > MAX_BUF_PER_PAGE); @@ -1596,7 +1594,7 @@ void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs) { BUG_ON(!PageUptodate(page)); end = ofs + ni->itype.index.block_size; - bh_size = 1 << VFS_I(ni)->i_blkbits; + bh_size = VFS_I(ni)->i_sb->s_blocksize; spin_lock(&mapping->private_lock); if (unlikely(!page_has_buffers(page))) { spin_unlock(&mapping->private_lock); diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index fb413d3d861..5027d3d1b3f 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -1,7 +1,7 @@ /* * file.c - NTFS kernel file operations. Part of the Linux-NTFS project. * - * Copyright (c) 2001-2005 Anton Altaparmakov + * Copyright (c) 2001-2006 Anton Altaparmakov * * This program/include file is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as published @@ -248,7 +248,7 @@ do_non_resident_extend: * enough to make ntfs_writepage() work. */ write_lock_irqsave(&ni->size_lock, flags); - ni->initialized_size = (index + 1) << PAGE_CACHE_SHIFT; + ni->initialized_size = (s64)(index + 1) << PAGE_CACHE_SHIFT; if (ni->initialized_size > new_init_size) ni->initialized_size = new_init_size; write_unlock_irqrestore(&ni->size_lock, flags); @@ -529,8 +529,8 @@ static int ntfs_prepare_pages_for_non_resident_write(struct page **pages, "index 0x%lx, nr_pages 0x%x, pos 0x%llx, bytes 0x%zx.", vi->i_ino, ni->type, pages[0]->index, nr_pages, (long long)pos, bytes); - blocksize_bits = vi->i_blkbits; - blocksize = 1 << blocksize_bits; + blocksize = vol->sb->s_blocksize; + blocksize_bits = vol->sb->s_blocksize_bits; u = 0; do { struct page *page = pages[u]; @@ -1525,7 +1525,7 @@ static inline int ntfs_commit_pages_after_non_resident_write( vi = pages[0]->mapping->host; ni = NTFS_I(vi); - blocksize = 1 << vi->i_blkbits; + blocksize = vi->i_sb->s_blocksize; end = pos + bytes; u = 0; do { diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index ea1bd3feea1..55263b7de9c 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -677,13 +677,28 @@ static int ntfs_read_locked_inode(struct inode *vi) ntfs_debug("Attribute list found in inode 0x%lx.", vi->i_ino); NInoSetAttrList(ni); a = ctx->attr; - if (a->flags & ATTR_IS_ENCRYPTED || - a->flags & ATTR_COMPRESSION_MASK || - a->flags & ATTR_IS_SPARSE) { + if (a->flags & ATTR_COMPRESSION_MASK) { ntfs_error(vi->i_sb, "Attribute list attribute is " - "compressed/encrypted/sparse."); + "compressed."); goto unm_err_out; } + if (a->flags & ATTR_IS_ENCRYPTED || + a->flags & ATTR_IS_SPARSE) { + if (a->non_resident) { + ntfs_error(vi->i_sb, "Non-resident attribute " + "list attribute is encrypted/" + "sparse."); + goto unm_err_out; + } + ntfs_warning(vi->i_sb, "Resident attribute list " + "attribute in inode 0x%lx is marked " + "encrypted/sparse which is not true. " + "However, Windows allows this and " + "chkdsk does not detect or correct it " + "so we will just ignore the invalid " + "flags and pretend they are not set.", + vi->i_ino); + } /* Now allocate memory for the attribute list. */ ni->attr_list_size = (u32)ntfs_attr_size(a); ni->attr_list = ntfs_malloc_nofs(ni->attr_list_size); @@ -1809,19 +1824,33 @@ int ntfs_read_inode_mount(struct inode *vi) } else /* if (!err) */ { ATTR_LIST_ENTRY *al_entry, *next_al_entry; u8 *al_end; + static const char *es = " Not allowed. $MFT is corrupt. " + "You should run chkdsk."; ntfs_debug("Attribute list attribute found in $MFT."); NInoSetAttrList(ni); a = ctx->attr; - if (a->flags & ATTR_IS_ENCRYPTED || - a->flags & ATTR_COMPRESSION_MASK || - a->flags & ATTR_IS_SPARSE) { + if (a->flags & ATTR_COMPRESSION_MASK) { ntfs_error(sb, "Attribute list attribute is " - "compressed/encrypted/sparse. Not " - "allowed. $MFT is corrupt. You should " - "run chkdsk."); + "compressed.%s", es); goto put_err_out; } + if (a->flags & ATTR_IS_ENCRYPTED || + a->flags & ATTR_IS_SPARSE) { + if (a->non_resident) { + ntfs_error(sb, "Non-resident attribute list " + "attribute is encrypted/" + "sparse.%s", es); + goto put_err_out; + } + ntfs_warning(sb, "Resident attribute list attribute " + "in $MFT system file is marked " + "encrypted/sparse which is not true. " + "However, Windows allows this and " + "chkdsk does not detect or correct it " + "so we will just ignore the invalid " + "flags and pretend they are not set."); + } /* Now allocate memory for the attribute list. */ ni->attr_list_size = (u32)ntfs_attr_size(a); ni->attr_list = ntfs_malloc_nofs(ni->attr_list_size); diff --git a/fs/ntfs/layout.h b/fs/ntfs/layout.h index f5678d5d791..bb408d4dcbb 100644 --- a/fs/ntfs/layout.h +++ b/fs/ntfs/layout.h @@ -838,15 +838,19 @@ enum { F_A_DEVICE, F_A_DIRECTORY, F_A_SPARSE_FILE, F_A_REPARSE_POINT, F_A_COMPRESSED, and F_A_ENCRYPTED and preserves the rest. This mask is used to to obtain all flags that are valid for setting. */ - /* - * The following flags are only present in the FILE_NAME attribute (in + * The following flag is only present in the FILE_NAME attribute (in * the field file_attributes). */ FILE_ATTR_DUP_FILE_NAME_INDEX_PRESENT = const_cpu_to_le32(0x10000000), /* Note, this is a copy of the corresponding bit from the mft record, telling us whether this is a directory or not, i.e. whether it has an index root attribute or not. */ + /* + * The following flag is present both in the STANDARD_INFORMATION + * attribute and in the FILE_NAME attribute (in the field + * file_attributes). + */ FILE_ATTR_DUP_VIEW_INDEX_PRESENT = const_cpu_to_le32(0x20000000), /* Note, this is a copy of the corresponding bit from the mft record, telling us whether this file has a view index present (eg. object id @@ -1071,9 +1075,15 @@ typedef struct { modified. */ /* 20*/ sle64 last_access_time; /* Time this mft record was last accessed. */ -/* 28*/ sle64 allocated_size; /* Byte size of allocated space for the - data attribute. NOTE: Is a multiple - of the cluster size. */ +/* 28*/ sle64 allocated_size; /* Byte size of on-disk allocated space + for the data attribute. So for + normal $DATA, this is the + allocated_size from the unnamed + $DATA attribute and for compressed + and/or sparse $DATA, this is the + compressed_size from the unnamed + $DATA attribute. NOTE: This is a + multiple of the cluster size. */ /* 30*/ sle64 data_size; /* Byte size of actual data in data attribute. */ /* 38*/ FILE_ATTR_FLAGS file_attributes; /* Flags describing the file. */ @@ -1904,12 +1914,13 @@ enum { VOLUME_DELETE_USN_UNDERWAY = const_cpu_to_le16(0x0010), VOLUME_REPAIR_OBJECT_ID = const_cpu_to_le16(0x0020), + VOLUME_CHKDSK_UNDERWAY = const_cpu_to_le16(0x4000), VOLUME_MODIFIED_BY_CHKDSK = const_cpu_to_le16(0x8000), - VOLUME_FLAGS_MASK = const_cpu_to_le16(0x803f), + VOLUME_FLAGS_MASK = const_cpu_to_le16(0xc03f), /* To make our life easier when checking if we must mount read-only. */ - VOLUME_MUST_MOUNT_RO_MASK = const_cpu_to_le16(0x8027), + VOLUME_MUST_MOUNT_RO_MASK = const_cpu_to_le16(0xc027), } __attribute__ ((__packed__)); typedef le16 VOLUME_FLAGS; diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c index 0c65cbb8c5c..6499aafc225 100644 --- a/fs/ntfs/mft.c +++ b/fs/ntfs/mft.c @@ -1,7 +1,7 @@ /** * mft.c - NTFS kernel mft record operations. Part of the Linux-NTFS project. * - * Copyright (c) 2001-2005 Anton Altaparmakov + * Copyright (c) 2001-2006 Anton Altaparmakov * Copyright (c) 2002 Richard Russon * * This program/include file is free software; you can redistribute it and/or @@ -473,7 +473,7 @@ int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no, runlist_element *rl; unsigned int block_start, block_end, m_start, m_end, page_ofs; int i_bhs, nr_bhs, err = 0; - unsigned char blocksize_bits = vol->mftmirr_ino->i_blkbits; + unsigned char blocksize_bits = vol->sb->s_blocksize_bits; ntfs_debug("Entering for inode 0x%lx.", mft_no); BUG_ON(!max_bhs); @@ -672,8 +672,8 @@ int write_mft_record_nolock(ntfs_inode *ni, MFT_RECORD *m, int sync) { ntfs_volume *vol = ni->vol; struct page *page = ni->page; - unsigned char blocksize_bits = vol->mft_ino->i_blkbits; - unsigned int blocksize = 1 << blocksize_bits; + unsigned int blocksize = vol->sb->s_blocksize; + unsigned char blocksize_bits = vol->sb->s_blocksize_bits; int max_bhs = vol->mft_record_size / blocksize; struct buffer_head *bhs[max_bhs]; struct buffer_head *bh, *head; diff --git a/fs/ntfs/ntfs.h b/fs/ntfs/ntfs.h index 446b5014115..653d2a5c489 100644 --- a/fs/ntfs/ntfs.h +++ b/fs/ntfs/ntfs.h @@ -50,11 +50,11 @@ typedef enum { /* Global variables. */ /* Slab caches (from super.c). */ -extern kmem_cache_t *ntfs_name_cache; -extern kmem_cache_t *ntfs_inode_cache; -extern kmem_cache_t *ntfs_big_inode_cache; -extern kmem_cache_t *ntfs_attr_ctx_cache; -extern kmem_cache_t *ntfs_index_ctx_cache; +extern struct kmem_cache *ntfs_name_cache; +extern struct kmem_cache *ntfs_inode_cache; +extern struct kmem_cache *ntfs_big_inode_cache; +extern struct kmem_cache *ntfs_attr_ctx_cache; +extern struct kmem_cache *ntfs_index_ctx_cache; /* The various operations structs defined throughout the driver files. */ extern struct address_space_operations ntfs_aops; diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index c3a3f1a8310..368a8ec1066 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -1,7 +1,7 @@ /* * super.c - NTFS kernel super block handling. Part of the Linux-NTFS project. * - * Copyright (c) 2001-2005 Anton Altaparmakov + * Copyright (c) 2001-2006 Anton Altaparmakov * Copyright (c) 2001,2002 Richard Russon * * This program/include file is free software; you can redistribute it and/or @@ -22,6 +22,7 @@ #include <linux/stddef.h> #include <linux/init.h> +#include <linux/slab.h> #include <linux/string.h> #include <linux/spinlock.h> #include <linux/blkdev.h> /* For bdev_hardsect_size(). */ @@ -471,9 +472,16 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt) ntfs_error(sb, "Volume is dirty and read-only%s", es); return -EROFS; } + if (vol->vol_flags & VOLUME_MODIFIED_BY_CHKDSK) { + ntfs_error(sb, "Volume has been modified by chkdsk " + "and is read-only%s", es); + return -EROFS; + } if (vol->vol_flags & VOLUME_MUST_MOUNT_RO_MASK) { - ntfs_error(sb, "Volume has unsupported flags set and " - "is read-only%s", es); + ntfs_error(sb, "Volume has unsupported flags set " + "(0x%x) and is read-only%s", + (unsigned)le16_to_cpu(vol->vol_flags), + es); return -EROFS; } if (ntfs_set_volume_flags(vol, VOLUME_IS_DIRTY)) { @@ -641,7 +649,7 @@ static struct buffer_head *read_ntfs_boot_sector(struct super_block *sb, { const char *read_err_str = "Unable to read %s boot sector."; struct buffer_head *bh_primary, *bh_backup; - long nr_blocks = NTFS_SB(sb)->nr_blocks; + sector_t nr_blocks = NTFS_SB(sb)->nr_blocks; /* Try to read primary boot sector. */ if ((bh_primary = sb_bread(sb, 0))) { @@ -688,13 +696,18 @@ hotfix_primary_boot_sector: /* * If we managed to read sector zero and the volume is not * read-only, copy the found, valid backup boot sector to the - * primary boot sector. + * primary boot sector. Note we only copy the actual boot + * sector structure, not the actual whole device sector as that + * may be bigger and would potentially damage the $Boot system + * file (FIXME: Would be nice to know if the backup boot sector + * on a large sector device contains the whole boot loader or + * just the first 512 bytes). */ if (!(sb->s_flags & MS_RDONLY)) { ntfs_warning(sb, "Hot-fix: Recovering invalid primary " "boot sector from backup copy."); memcpy(bh_primary->b_data, bh_backup->b_data, - sb->s_blocksize); + NTFS_BLOCK_SIZE); mark_buffer_dirty(bh_primary); sync_dirty_buffer(bh_primary); if (buffer_uptodate(bh_primary)) { @@ -733,9 +746,13 @@ static BOOL parse_ntfs_boot_sector(ntfs_volume *vol, const NTFS_BOOT_SECTOR *b) vol->sector_size); ntfs_debug("vol->sector_size_bits = %i (0x%x)", vol->sector_size_bits, vol->sector_size_bits); - if (vol->sector_size != vol->sb->s_blocksize) - ntfs_warning(vol->sb, "The boot sector indicates a sector size " - "different from the device sector size."); + if (vol->sector_size < vol->sb->s_blocksize) { + ntfs_error(vol->sb, "Sector size (%i) is smaller than the " + "device block size (%lu). This is not " + "supported. Sorry.", vol->sector_size, + vol->sb->s_blocksize); + return FALSE; + } ntfs_debug("sectors_per_cluster = 0x%x", b->bpb.sectors_per_cluster); sectors_per_cluster_bits = ffs(b->bpb.sectors_per_cluster) - 1; ntfs_debug("sectors_per_cluster_bits = 0x%x", @@ -748,16 +765,11 @@ static BOOL parse_ntfs_boot_sector(ntfs_volume *vol, const NTFS_BOOT_SECTOR *b) ntfs_debug("vol->cluster_size = %i (0x%x)", vol->cluster_size, vol->cluster_size); ntfs_debug("vol->cluster_size_mask = 0x%x", vol->cluster_size_mask); - ntfs_debug("vol->cluster_size_bits = %i (0x%x)", - vol->cluster_size_bits, vol->cluster_size_bits); - if (vol->sector_size > vol->cluster_size) { - ntfs_error(vol->sb, "Sector sizes above the cluster size are " - "not supported. Sorry."); - return FALSE; - } - if (vol->sb->s_blocksize > vol->cluster_size) { - ntfs_error(vol->sb, "Cluster sizes smaller than the device " - "sector size are not supported. Sorry."); + ntfs_debug("vol->cluster_size_bits = %i", vol->cluster_size_bits); + if (vol->cluster_size < vol->sector_size) { + ntfs_error(vol->sb, "Cluster size (%i) is smaller than the " + "sector size (%i). This is not supported. " + "Sorry.", vol->cluster_size, vol->sector_size); return FALSE; } clusters_per_mft_record = b->clusters_per_mft_record; @@ -786,11 +798,18 @@ static BOOL parse_ntfs_boot_sector(ntfs_volume *vol, const NTFS_BOOT_SECTOR *b) * we store $MFT/$DATA, the table of mft records in the page cache. */ if (vol->mft_record_size > PAGE_CACHE_SIZE) { - ntfs_error(vol->sb, "Mft record size %i (0x%x) exceeds the " - "page cache size on your system %lu (0x%lx). " + ntfs_error(vol->sb, "Mft record size (%i) exceeds the " + "PAGE_CACHE_SIZE on your system (%lu). " "This is not supported. Sorry.", - vol->mft_record_size, vol->mft_record_size, - PAGE_CACHE_SIZE, PAGE_CACHE_SIZE); + vol->mft_record_size, PAGE_CACHE_SIZE); + return FALSE; + } + /* We cannot support mft record sizes below the sector size. */ + if (vol->mft_record_size < vol->sector_size) { + ntfs_error(vol->sb, "Mft record size (%i) is smaller than the " + "sector size (%i). This is not supported. " + "Sorry.", vol->mft_record_size, + vol->sector_size); return FALSE; } clusters_per_index_record = b->clusters_per_index_record; @@ -816,6 +835,14 @@ static BOOL parse_ntfs_boot_sector(ntfs_volume *vol, const NTFS_BOOT_SECTOR *b) ntfs_debug("vol->index_record_size_bits = %i (0x%x)", vol->index_record_size_bits, vol->index_record_size_bits); + /* We cannot support index record sizes below the sector size. */ + if (vol->index_record_size < vol->sector_size) { + ntfs_error(vol->sb, "Index record size (%i) is smaller than " + "the sector size (%i). This is not " + "supported. Sorry.", vol->index_record_size, + vol->sector_size); + return FALSE; + } /* * Get the size of the volume in clusters and check for 64-bit-ness. * Windows currently only uses 32 bits to save the clusters so we do @@ -845,15 +872,18 @@ static BOOL parse_ntfs_boot_sector(ntfs_volume *vol, const NTFS_BOOT_SECTOR *b) } ll = sle64_to_cpu(b->mft_lcn); if (ll >= vol->nr_clusters) { - ntfs_error(vol->sb, "MFT LCN is beyond end of volume. Weird."); + ntfs_error(vol->sb, "MFT LCN (%lli, 0x%llx) is beyond end of " + "volume. Weird.", (unsigned long long)ll, + (unsigned long long)ll); return FALSE; } vol->mft_lcn = ll; ntfs_debug("vol->mft_lcn = 0x%llx", (long long)vol->mft_lcn); ll = sle64_to_cpu(b->mftmirr_lcn); if (ll >= vol->nr_clusters) { - ntfs_error(vol->sb, "MFTMirr LCN is beyond end of volume. " - "Weird."); + ntfs_error(vol->sb, "MFTMirr LCN (%lli, 0x%llx) is beyond end " + "of volume. Weird.", (unsigned long long)ll, + (unsigned long long)ll); return FALSE; } vol->mftmirr_lcn = ll; @@ -1822,11 +1852,24 @@ get_ctx_vol_failed: /* Make sure that no unsupported volume flags are set. */ if (vol->vol_flags & VOLUME_MUST_MOUNT_RO_MASK) { static const char *es1a = "Volume is dirty"; - static const char *es1b = "Volume has unsupported flags set"; - static const char *es2 = ". Run chkdsk and mount in Windows."; - const char *es1; - - es1 = vol->vol_flags & VOLUME_IS_DIRTY ? es1a : es1b; + static const char *es1b = "Volume has been modified by chkdsk"; + static const char *es1c = "Volume has unsupported flags set"; + static const char *es2a = ". Run chkdsk and mount in Windows."; + static const char *es2b = ". Mount in Windows."; + const char *es1, *es2; + + es2 = es2a; + if (vol->vol_flags & VOLUME_IS_DIRTY) + es1 = es1a; + else if (vol->vol_flags & VOLUME_MODIFIED_BY_CHKDSK) { + es1 = es1b; + es2 = es2b; + } else { + es1 = es1c; + ntfs_warning(sb, "Unsupported volume flags 0x%x " + "encountered.", + (unsigned)le16_to_cpu(vol->vol_flags)); + } /* If a read-write mount, convert it to a read-only mount. */ if (!(sb->s_flags & MS_RDONLY)) { if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO | @@ -2685,7 +2728,7 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent) ntfs_volume *vol; struct buffer_head *bh; struct inode *tmp_ino; - int result; + int blocksize, result; ntfs_debug("Entering."); #ifndef NTFS_RW @@ -2724,60 +2767,85 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent) if (!parse_options(vol, (char*)opt)) goto err_out_now; + /* We support sector sizes up to the PAGE_CACHE_SIZE. */ + if (bdev_hardsect_size(sb->s_bdev) > PAGE_CACHE_SIZE) { + if (!silent) + ntfs_error(sb, "Device has unsupported sector size " + "(%i). The maximum supported sector " + "size on this architecture is %lu " + "bytes.", + bdev_hardsect_size(sb->s_bdev), + PAGE_CACHE_SIZE); + goto err_out_now; + } /* - * TODO: Fail safety check. In the future we should really be able to - * cope with this being the case, but for now just bail out. + * Setup the device access block size to NTFS_BLOCK_SIZE or the hard + * sector size, whichever is bigger. */ - if (bdev_hardsect_size(sb->s_bdev) > NTFS_BLOCK_SIZE) { + blocksize = sb_min_blocksize(sb, NTFS_BLOCK_SIZE); + if (blocksize < NTFS_BLOCK_SIZE) { if (!silent) - ntfs_error(sb, "Device has unsupported hardsect_size."); + ntfs_error(sb, "Unable to set device block size."); goto err_out_now; } - - /* Setup the device access block size to NTFS_BLOCK_SIZE. */ - if (sb_set_blocksize(sb, NTFS_BLOCK_SIZE) != NTFS_BLOCK_SIZE) { + BUG_ON(blocksize != sb->s_blocksize); + ntfs_debug("Set device block size to %i bytes (block size bits %i).", + blocksize, sb->s_blocksize_bits); + /* Determine the size of the device in units of block_size bytes. */ + if (!i_size_read(sb->s_bdev->bd_inode)) { if (!silent) - ntfs_error(sb, "Unable to set block size."); + ntfs_error(sb, "Unable to determine device size."); goto err_out_now; } - - /* Get the size of the device in units of NTFS_BLOCK_SIZE bytes. */ vol->nr_blocks = i_size_read(sb->s_bdev->bd_inode) >> - NTFS_BLOCK_SIZE_BITS; - + sb->s_blocksize_bits; /* Read the boot sector and return unlocked buffer head to it. */ if (!(bh = read_ntfs_boot_sector(sb, silent))) { if (!silent) ntfs_error(sb, "Not an NTFS volume."); goto err_out_now; } - /* - * Extract the data from the boot sector and setup the ntfs super block + * Extract the data from the boot sector and setup the ntfs volume * using it. */ result = parse_ntfs_boot_sector(vol, (NTFS_BOOT_SECTOR*)bh->b_data); - - /* Initialize the cluster and mft allocators. */ - ntfs_setup_allocators(vol); - brelse(bh); - if (!result) { if (!silent) ntfs_error(sb, "Unsupported NTFS filesystem."); goto err_out_now; } - /* - * TODO: When we start coping with sector sizes different from - * NTFS_BLOCK_SIZE, we now probably need to set the blocksize of the - * device (probably to NTFS_BLOCK_SIZE). + * If the boot sector indicates a sector size bigger than the current + * device block size, switch the device block size to the sector size. + * TODO: It may be possible to support this case even when the set + * below fails, we would just be breaking up the i/o for each sector + * into multiple blocks for i/o purposes but otherwise it should just + * work. However it is safer to leave disabled until someone hits this + * error message and then we can get them to try it without the setting + * so we know for sure that it works. */ - + if (vol->sector_size > blocksize) { + blocksize = sb_set_blocksize(sb, vol->sector_size); + if (blocksize != vol->sector_size) { + if (!silent) + ntfs_error(sb, "Unable to set device block " + "size to sector size (%i).", + vol->sector_size); + goto err_out_now; + } + BUG_ON(blocksize != sb->s_blocksize); + vol->nr_blocks = i_size_read(sb->s_bdev->bd_inode) >> + sb->s_blocksize_bits; + ntfs_debug("Changed device block size to %i bytes (block size " + "bits %i) to match volume sector size.", + blocksize, sb->s_blocksize_bits); + } + /* Initialize the cluster and mft allocators. */ + ntfs_setup_allocators(vol); /* Setup remaining fields in the super block. */ sb->s_magic = NTFS_SB_MAGIC; - /* * Ntfs allows 63 bits for the file size, i.e. correct would be: * sb->s_maxbytes = ~0ULL >> 1; @@ -2787,9 +2855,8 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent) * without overflowing the index or to 2^63 - 1, whichever is smaller. */ sb->s_maxbytes = MAX_LFS_FILESIZE; - + /* Ntfs measures time in 100ns intervals. */ sb->s_time_gran = 100; - /* * Now load the metadata required for the page cache and our address * space operations to function. We do this by setting up a specialised @@ -2987,14 +3054,14 @@ err_out_now: * strings of the maximum length allowed by NTFS, which is NTFS_MAX_NAME_LEN * (255) Unicode characters + a terminating NULL Unicode character. */ -kmem_cache_t *ntfs_name_cache; +struct kmem_cache *ntfs_name_cache; /* Slab caches for efficient allocation/deallocation of inodes. */ -kmem_cache_t *ntfs_inode_cache; -kmem_cache_t *ntfs_big_inode_cache; +struct kmem_cache *ntfs_inode_cache; +struct kmem_cache *ntfs_big_inode_cache; /* Init once constructor for the inode slab cache. */ -static void ntfs_big_inode_init_once(void *foo, kmem_cache_t *cachep, +static void ntfs_big_inode_init_once(void *foo, struct kmem_cache *cachep, unsigned long flags) { ntfs_inode *ni = (ntfs_inode *)foo; @@ -3008,8 +3075,8 @@ static void ntfs_big_inode_init_once(void *foo, kmem_cache_t *cachep, * Slab caches to optimize allocations and deallocations of attribute search * contexts and index contexts, respectively. */ -kmem_cache_t *ntfs_attr_ctx_cache; -kmem_cache_t *ntfs_index_ctx_cache; +struct kmem_cache *ntfs_attr_ctx_cache; +struct kmem_cache *ntfs_index_ctx_cache; /* Driver wide semaphore. */ DECLARE_MUTEX(ntfs_lock); diff --git a/fs/ntfs/upcase.c b/fs/ntfs/upcase.c index 879cdf1d5bd..9101807dc81 100644 --- a/fs/ntfs/upcase.c +++ b/fs/ntfs/upcase.c @@ -3,10 +3,7 @@ * Part of the Linux-NTFS project. * * Copyright (c) 2001 Richard Russon <ntfs@flatcap.org> - * Copyright (c) 2001-2004 Anton Altaparmakov - * - * Modified for mkntfs inclusion 9 June 2001 by Anton Altaparmakov. - * Modified for kernel inclusion 10 September 2001 by Anton Altparmakov. + * Copyright (c) 2001-2006 Anton Altaparmakov * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the Free @@ -75,12 +72,13 @@ ntfschar *generate_default_upcase(void) if (!uc) return uc; memset(uc, 0, default_upcase_len * sizeof(ntfschar)); + /* Generate the little endian Unicode upcase table used by ntfs. */ for (i = 0; i < default_upcase_len; i++) uc[i] = cpu_to_le16(i); for (r = 0; uc_run_table[r][0]; r++) for (i = uc_run_table[r][0]; i < uc_run_table[r][1]; i++) - uc[i] = cpu_to_le16((le16_to_cpu(uc[i]) + - uc_run_table[r][2])); + uc[i] = cpu_to_le16(le16_to_cpu(uc[i]) + + uc_run_table[r][2]); for (r = 0; uc_dup_table[r][0]; r++) for (i = uc_dup_table[r][0]; i < uc_dup_table[r][1]; i += 2) uc[i + 1] = cpu_to_le16(le16_to_cpu(uc[i + 1]) - 1); diff --git a/fs/ntfs/volume.h b/fs/ntfs/volume.h index 375cd20a9f6..406ab55dfb3 100644 --- a/fs/ntfs/volume.h +++ b/fs/ntfs/volume.h @@ -2,7 +2,7 @@ * volume.h - Defines for volume structures in NTFS Linux kernel driver. Part * of the Linux-NTFS project. * - * Copyright (c) 2001-2005 Anton Altaparmakov + * Copyright (c) 2001-2006 Anton Altaparmakov * Copyright (c) 2002 Richard Russon * * This program/include file is free software; you can redistribute it and/or @@ -41,10 +41,8 @@ typedef struct { * structure has stabilized... (AIA) */ /* Device specifics. */ - struct super_block *sb; /* Pointer back to the super_block, - so we don't have to get the offset - every time. */ - LCN nr_blocks; /* Number of NTFS_BLOCK_SIZE bytes + struct super_block *sb; /* Pointer back to the super_block. */ + LCN nr_blocks; /* Number of sb->s_blocksize bytes sized blocks on the device. */ /* Configuration provided by user at mount time. */ unsigned long flags; /* Miscellaneous flags, see below. */ @@ -141,8 +139,8 @@ typedef enum { NV_ShowSystemFiles, /* 1: Return system files in ntfs_readdir(). */ NV_CaseSensitive, /* 1: Treat file names as case sensitive and create filenames in the POSIX namespace. - Otherwise be case insensitive and create - file names in WIN32 namespace. */ + Otherwise be case insensitive but still + create file names in POSIX namespace. */ NV_LogFileEmpty, /* 1: $LogFile journal is empty. */ NV_QuotaOutOfDate, /* 1: $Quota is out of date. */ NV_UsnJrnlStamped, /* 1: $UsnJrnl has been stamped. */ @@ -153,7 +151,7 @@ typedef enum { * Macro tricks to expand the NVolFoo(), NVolSetFoo(), and NVolClearFoo() * functions. */ -#define NVOL_FNS(flag) \ +#define DEFINE_NVOL_BIT_OPS(flag) \ static inline int NVol##flag(ntfs_volume *vol) \ { \ return test_bit(NV_##flag, &(vol)->flags); \ @@ -168,12 +166,12 @@ static inline void NVolClear##flag(ntfs_volume *vol) \ } /* Emit the ntfs volume bitops functions. */ -NVOL_FNS(Errors) -NVOL_FNS(ShowSystemFiles) -NVOL_FNS(CaseSensitive) -NVOL_FNS(LogFileEmpty) -NVOL_FNS(QuotaOutOfDate) -NVOL_FNS(UsnJrnlStamped) -NVOL_FNS(SparseEnabled) +DEFINE_NVOL_BIT_OPS(Errors) +DEFINE_NVOL_BIT_OPS(ShowSystemFiles) +DEFINE_NVOL_BIT_OPS(CaseSensitive) +DEFINE_NVOL_BIT_OPS(LogFileEmpty) +DEFINE_NVOL_BIT_OPS(QuotaOutOfDate) +DEFINE_NVOL_BIT_OPS(UsnJrnlStamped) +DEFINE_NVOL_BIT_OPS(SparseEnabled) #endif /* _LINUX_NTFS_VOLUME_H */ diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c index d424041b38e..bae3d7548be 100644 --- a/fs/ocfs2/buffer_head_io.c +++ b/fs/ocfs2/buffer_head_io.c @@ -58,7 +58,7 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh, goto out; } - down(&OCFS2_I(inode)->ip_io_sem); + mutex_lock(&OCFS2_I(inode)->ip_io_mutex); lock_buffer(bh); set_buffer_uptodate(bh); @@ -82,7 +82,7 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh, brelse(bh); } - up(&OCFS2_I(inode)->ip_io_sem); + mutex_unlock(&OCFS2_I(inode)->ip_io_mutex); out: mlog_exit(ret); return ret; @@ -125,13 +125,13 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr, flags &= ~OCFS2_BH_CACHED; if (inode) - down(&OCFS2_I(inode)->ip_io_sem); + mutex_lock(&OCFS2_I(inode)->ip_io_mutex); for (i = 0 ; i < nr ; i++) { if (bhs[i] == NULL) { bhs[i] = sb_getblk(sb, block++); if (bhs[i] == NULL) { if (inode) - up(&OCFS2_I(inode)->ip_io_sem); + mutex_unlock(&OCFS2_I(inode)->ip_io_mutex); status = -EIO; mlog_errno(status); goto bail; @@ -220,7 +220,7 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr, ocfs2_set_buffer_uptodate(inode, bh); } if (inode) - up(&OCFS2_I(inode)->ip_io_sem); + mutex_unlock(&OCFS2_I(inode)->ip_io_mutex); mlog(ML_BH_IO, "block=(%"MLFu64"), nr=(%d), cached=%s\n", block, nr, (!(flags & OCFS2_BH_CACHED) || ignore_cache) ? "no" : "yes"); diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 7307ba52891..d08971d29b6 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -917,8 +917,9 @@ static int o2hb_thread(void *data) elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb); mlog(0, "start = %lu.%lu, end = %lu.%lu, msec = %u\n", - before_hb.tv_sec, before_hb.tv_usec, - after_hb.tv_sec, after_hb.tv_usec, elapsed_msec); + before_hb.tv_sec, (unsigned long) before_hb.tv_usec, + after_hb.tv_sec, (unsigned long) after_hb.tv_usec, + elapsed_msec); if (elapsed_msec < reg->hr_timeout_ms) { /* the kthread api has blocked signals for us so no diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c index fd741cea570..636593bf4d1 100644 --- a/fs/ocfs2/cluster/masklog.c +++ b/fs/ocfs2/cluster/masklog.c @@ -74,6 +74,7 @@ struct mlog_attribute { #define define_mask(_name) { \ .attr = { \ .name = #_name, \ + .owner = THIS_MODULE, \ .mode = S_IRUGO | S_IWUSR, \ }, \ .mask = ML_##_name, \ diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h index e8c56a3d9c6..2cadc3009c8 100644 --- a/fs/ocfs2/cluster/masklog.h +++ b/fs/ocfs2/cluster/masklog.h @@ -256,7 +256,7 @@ extern struct mlog_bits mlog_and_bits, mlog_not_bits; } \ } while (0) -#if (BITS_PER_LONG == 32) || defined(CONFIG_X86_64) +#if (BITS_PER_LONG == 32) || defined(CONFIG_X86_64) || (defined(CONFIG_UML_X86) && defined(CONFIG_64BIT)) #define MLFi64 "lld" #define MLFu64 "llu" #define MLFx64 "llx" diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c index cf7828f2336..e1fceb8aa32 100644 --- a/fs/ocfs2/cluster/nodemanager.c +++ b/fs/ocfs2/cluster/nodemanager.c @@ -756,7 +756,7 @@ static int __init init_o2nm(void) if (!ocfs2_table_header) { printk(KERN_ERR "nodemanager: unable to register sysctl\n"); ret = -ENOMEM; /* or something. */ - goto out; + goto out_o2net; } ret = o2net_register_hb_callbacks(); @@ -780,6 +780,8 @@ out_callbacks: o2net_unregister_hb_callbacks(); out_sysctl: unregister_sysctl_table(ocfs2_table_header); +out_o2net: + o2net_exit(); out: return ret; } diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 35d92c01a97..0f60cc0d398 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -1285,14 +1285,16 @@ static void o2net_idle_timer(unsigned long data) mlog(ML_NOTICE, "here are some times that might help debug the " "situation: (tmr %ld.%ld now %ld.%ld dr %ld.%ld adv " "%ld.%ld:%ld.%ld func (%08x:%u) %ld.%ld:%ld.%ld)\n", - sc->sc_tv_timer.tv_sec, sc->sc_tv_timer.tv_usec, - now.tv_sec, now.tv_usec, - sc->sc_tv_data_ready.tv_sec, sc->sc_tv_data_ready.tv_usec, - sc->sc_tv_advance_start.tv_sec, sc->sc_tv_advance_start.tv_usec, - sc->sc_tv_advance_stop.tv_sec, sc->sc_tv_advance_stop.tv_usec, + sc->sc_tv_timer.tv_sec, (long) sc->sc_tv_timer.tv_usec, + now.tv_sec, (long) now.tv_usec, + sc->sc_tv_data_ready.tv_sec, (long) sc->sc_tv_data_ready.tv_usec, + sc->sc_tv_advance_start.tv_sec, + (long) sc->sc_tv_advance_start.tv_usec, + sc->sc_tv_advance_stop.tv_sec, + (long) sc->sc_tv_advance_stop.tv_usec, sc->sc_msg_key, sc->sc_msg_type, - sc->sc_tv_func_start.tv_sec, sc->sc_tv_func_start.tv_usec, - sc->sc_tv_func_stop.tv_sec, sc->sc_tv_func_stop.tv_usec); + sc->sc_tv_func_start.tv_sec, (long) sc->sc_tv_func_start.tv_usec, + sc->sc_tv_func_stop.tv_sec, (long) sc->sc_tv_func_stop.tv_usec); o2net_sc_queue_work(sc, &sc->sc_shutdown_work); } @@ -1316,7 +1318,7 @@ static void o2net_start_connect(void *arg) { struct o2net_node *nn = arg; struct o2net_sock_container *sc = NULL; - struct o2nm_node *node = NULL; + struct o2nm_node *node = NULL, *mynode = NULL; struct socket *sock = NULL; struct sockaddr_in myaddr = {0, }, remoteaddr = {0, }; int ret = 0; @@ -1332,6 +1334,12 @@ static void o2net_start_connect(void *arg) goto out; } + mynode = o2nm_get_node_by_num(o2nm_this_node()); + if (mynode == NULL) { + ret = 0; + goto out; + } + spin_lock(&nn->nn_lock); /* see if we already have one pending or have given up */ if (nn->nn_sc || nn->nn_persistent_error) @@ -1359,12 +1367,14 @@ static void o2net_start_connect(void *arg) sock->sk->sk_allocation = GFP_ATOMIC; myaddr.sin_family = AF_INET; + myaddr.sin_addr.s_addr = (__force u32)mynode->nd_ipv4_address; myaddr.sin_port = (__force u16)htons(0); /* any port */ ret = sock->ops->bind(sock, (struct sockaddr *)&myaddr, sizeof(myaddr)); if (ret) { - mlog(0, "bind failed: %d\n", ret); + mlog(ML_ERROR, "bind failed with %d at address %u.%u.%u.%u\n", + ret, NIPQUAD(mynode->nd_ipv4_address)); goto out; } @@ -1405,6 +1415,8 @@ out: sc_put(sc); if (node) o2nm_node_put(node); + if (mynode) + o2nm_node_put(mynode); return; } diff --git a/fs/ocfs2/cluster/tcp.h b/fs/ocfs2/cluster/tcp.h index a6f4585501c..616ff2b8434 100644 --- a/fs/ocfs2/cluster/tcp.h +++ b/fs/ocfs2/cluster/tcp.h @@ -85,13 +85,10 @@ enum { O2NET_DRIVER_READY, }; -int o2net_init_tcp_sock(struct inode *inode); int o2net_send_message(u32 msg_type, u32 key, void *data, u32 len, u8 target_node, int *status); int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *vec, size_t veclen, u8 target_node, int *status); -int o2net_broadcast_message(u32 msg_type, u32 key, void *data, u32 len, - struct inode *group); int o2net_register_handler(u32 msg_type, u32 key, u32 max_len, o2net_msg_handler_func *func, void *data, @@ -107,7 +104,5 @@ void o2net_disconnect_node(struct o2nm_node *node); int o2net_init(void); void o2net_exit(void); -int o2net_proc_init(struct proc_dir_entry *parent); -void o2net_proc_exit(struct proc_dir_entry *parent); #endif /* O2CLUSTER_TCP_H */ diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index 3fecba0a602..9c772583744 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -37,9 +37,7 @@ #define DLM_THREAD_SHUFFLE_INTERVAL 5 // flush everything every 5 passes #define DLM_THREAD_MS 200 // flush at least every 200 ms -#define DLM_HASH_BITS 7 -#define DLM_HASH_SIZE (1 << DLM_HASH_BITS) -#define DLM_HASH_MASK (DLM_HASH_SIZE - 1) +#define DLM_HASH_BUCKETS (PAGE_SIZE / sizeof(struct hlist_head)) enum dlm_ast_type { DLM_AST = 0, @@ -87,7 +85,7 @@ enum dlm_ctxt_state { struct dlm_ctxt { struct list_head list; - struct list_head *resources; + struct hlist_head *lockres_hash; struct list_head dirty_list; struct list_head purge_list; struct list_head pending_asts; @@ -208,13 +206,16 @@ static inline void __dlm_set_joining_node(struct dlm_ctxt *dlm, #define DLM_LOCK_RES_IN_PROGRESS 0x00000010 #define DLM_LOCK_RES_MIGRATING 0x00000020 +/* max milliseconds to wait to sync up a network failure with a node death */ +#define DLM_NODE_DEATH_WAIT_MAX (5 * 1000) + #define DLM_PURGE_INTERVAL_MS (8 * 1000) struct dlm_lock_resource { /* WARNING: Please see the comment in dlm_init_lockres before * adding fields here. */ - struct list_head list; + struct hlist_node hash_node; struct kref refs; /* please keep these next 3 in this order @@ -657,6 +658,8 @@ void dlm_complete_thread(struct dlm_ctxt *dlm); int dlm_launch_recovery_thread(struct dlm_ctxt *dlm); void dlm_complete_recovery_thread(struct dlm_ctxt *dlm); void dlm_wait_for_recovery(struct dlm_ctxt *dlm); +int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node); +int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout); void dlm_put(struct dlm_ctxt *dlm); struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm); diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c index 6001b22a997..f66e2d818cc 100644 --- a/fs/ocfs2/dlm/dlmconvert.c +++ b/fs/ocfs2/dlm/dlmconvert.c @@ -392,6 +392,11 @@ static enum dlm_status dlm_send_remote_convert_request(struct dlm_ctxt *dlm, } else { mlog_errno(tmpret); if (dlm_is_host_down(tmpret)) { + /* instead of logging the same network error over + * and over, sleep here and wait for the heartbeat + * to notice the node is dead. times out after 5s. */ + dlm_wait_for_node_death(dlm, res->owner, + DLM_NODE_DEATH_WAIT_MAX); ret = DLM_RECOVERING; mlog(0, "node %u died so returning DLM_RECOVERING " "from convert message!\n", res->owner); @@ -421,7 +426,7 @@ int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data) struct dlm_lockstatus *lksb; enum dlm_status status = DLM_NORMAL; u32 flags; - int call_ast = 0, kick_thread = 0; + int call_ast = 0, kick_thread = 0, ast_reserved = 0; if (!dlm_grab(dlm)) { dlm_error(DLM_REJECTED); @@ -490,6 +495,7 @@ int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data) status = __dlm_lockres_state_to_status(res); if (status == DLM_NORMAL) { __dlm_lockres_reserve_ast(res); + ast_reserved = 1; res->state |= DLM_LOCK_RES_IN_PROGRESS; status = __dlmconvert_master(dlm, res, lock, flags, cnv->requested_type, @@ -512,10 +518,10 @@ leave: else dlm_lock_put(lock); - /* either queue the ast or release it */ + /* either queue the ast or release it, if reserved */ if (call_ast) dlm_queue_ast(dlm, lock); - else + else if (ast_reserved) dlm_lockres_release_ast(dlm, res); if (kick_thread) diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index f339fe27975..54f61b76ab5 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c @@ -117,8 +117,8 @@ EXPORT_SYMBOL_GPL(dlm_print_one_lock); void dlm_dump_lock_resources(struct dlm_ctxt *dlm) { struct dlm_lock_resource *res; - struct list_head *iter; - struct list_head *bucket; + struct hlist_node *iter; + struct hlist_head *bucket; int i; mlog(ML_NOTICE, "struct dlm_ctxt: %s, node=%u, key=%u\n", @@ -129,12 +129,10 @@ void dlm_dump_lock_resources(struct dlm_ctxt *dlm) } spin_lock(&dlm->spinlock); - for (i=0; i<DLM_HASH_SIZE; i++) { - bucket = &(dlm->resources[i]); - list_for_each(iter, bucket) { - res = list_entry(iter, struct dlm_lock_resource, list); + for (i=0; i<DLM_HASH_BUCKETS; i++) { + bucket = &(dlm->lockres_hash[i]); + hlist_for_each_entry(res, iter, bucket, hash_node) dlm_print_one_lock_resource(res); - } } spin_unlock(&dlm->spinlock); } diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index da3c22045f8..8f3a9e3106f 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -77,26 +77,26 @@ static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm); void __dlm_unhash_lockres(struct dlm_lock_resource *lockres) { - list_del_init(&lockres->list); + hlist_del_init(&lockres->hash_node); dlm_lockres_put(lockres); } void __dlm_insert_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) { - struct list_head *bucket; + struct hlist_head *bucket; struct qstr *q; assert_spin_locked(&dlm->spinlock); q = &res->lockname; q->hash = full_name_hash(q->name, q->len); - bucket = &(dlm->resources[q->hash & DLM_HASH_MASK]); + bucket = &(dlm->lockres_hash[q->hash % DLM_HASH_BUCKETS]); /* get a reference for our hashtable */ dlm_lockres_get(res); - list_add_tail(&res->list, bucket); + hlist_add_head(&res->hash_node, bucket); } struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm, @@ -104,9 +104,9 @@ struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm, unsigned int len) { unsigned int hash; - struct list_head *iter; + struct hlist_node *iter; struct dlm_lock_resource *tmpres=NULL; - struct list_head *bucket; + struct hlist_head *bucket; mlog_entry("%.*s\n", len, name); @@ -114,11 +114,11 @@ struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm, hash = full_name_hash(name, len); - bucket = &(dlm->resources[hash & DLM_HASH_MASK]); + bucket = &(dlm->lockres_hash[hash % DLM_HASH_BUCKETS]); /* check for pre-existing lock */ - list_for_each(iter, bucket) { - tmpres = list_entry(iter, struct dlm_lock_resource, list); + hlist_for_each(iter, bucket) { + tmpres = hlist_entry(iter, struct dlm_lock_resource, hash_node); if (tmpres->lockname.len == len && memcmp(tmpres->lockname.name, name, len) == 0) { dlm_lockres_get(tmpres); @@ -193,8 +193,8 @@ static int dlm_wait_on_domain_helper(const char *domain) static void dlm_free_ctxt_mem(struct dlm_ctxt *dlm) { - if (dlm->resources) - free_page((unsigned long) dlm->resources); + if (dlm->lockres_hash) + free_page((unsigned long) dlm->lockres_hash); if (dlm->name) kfree(dlm->name); @@ -303,10 +303,10 @@ static void dlm_migrate_all_locks(struct dlm_ctxt *dlm) mlog(0, "Migrating locks from domain %s\n", dlm->name); restart: spin_lock(&dlm->spinlock); - for (i=0; i<DLM_HASH_SIZE; i++) { - while (!list_empty(&dlm->resources[i])) { - res = list_entry(dlm->resources[i].next, - struct dlm_lock_resource, list); + for (i = 0; i < DLM_HASH_BUCKETS; i++) { + while (!hlist_empty(&dlm->lockres_hash[i])) { + res = hlist_entry(dlm->lockres_hash[i].first, + struct dlm_lock_resource, hash_node); /* need reference when manually grabbing lockres */ dlm_lockres_get(res); /* this should unhash the lockres @@ -573,8 +573,11 @@ static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data) spin_lock(&dlm_domain_lock); dlm = __dlm_lookup_domain_full(query->domain, query->name_len); /* Once the dlm ctxt is marked as leaving then we don't want - * to be put in someone's domain map. */ + * to be put in someone's domain map. + * Also, explicitly disallow joining at certain troublesome + * times (ie. during recovery). */ if (dlm && dlm->dlm_state != DLM_CTXT_LEAVING) { + int bit = query->node_idx; spin_lock(&dlm->spinlock); if (dlm->dlm_state == DLM_CTXT_NEW && @@ -586,6 +589,19 @@ static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data) } else if (dlm->joining_node != DLM_LOCK_RES_OWNER_UNKNOWN) { /* Disallow parallel joins. */ response = JOIN_DISALLOW; + } else if (dlm->reco.state & DLM_RECO_STATE_ACTIVE) { + mlog(ML_NOTICE, "node %u trying to join, but recovery " + "is ongoing.\n", bit); + response = JOIN_DISALLOW; + } else if (test_bit(bit, dlm->recovery_map)) { + mlog(ML_NOTICE, "node %u trying to join, but it " + "still needs recovery.\n", bit); + response = JOIN_DISALLOW; + } else if (test_bit(bit, dlm->domain_map)) { + mlog(ML_NOTICE, "node %u trying to join, but it " + "is still in the domain! needs recovery?\n", + bit); + response = JOIN_DISALLOW; } else { /* Alright we're fully a part of this domain * so we keep some state as to who's joining @@ -1175,18 +1191,17 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, goto leave; } - dlm->resources = (struct list_head *) __get_free_page(GFP_KERNEL); - if (!dlm->resources) { + dlm->lockres_hash = (struct hlist_head *) __get_free_page(GFP_KERNEL); + if (!dlm->lockres_hash) { mlog_errno(-ENOMEM); kfree(dlm->name); kfree(dlm); dlm = NULL; goto leave; } - memset(dlm->resources, 0, PAGE_SIZE); - for (i=0; i<DLM_HASH_SIZE; i++) - INIT_LIST_HEAD(&dlm->resources[i]); + for (i=0; i<DLM_HASH_BUCKETS; i++) + INIT_HLIST_HEAD(&dlm->lockres_hash[i]); strcpy(dlm->name, domain); dlm->key = key; diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c index d1a0038557a..671d4ff222c 100644 --- a/fs/ocfs2/dlm/dlmlock.c +++ b/fs/ocfs2/dlm/dlmlock.c @@ -220,6 +220,17 @@ static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm, dlm_error(status); dlm_revert_pending_lock(res, lock); dlm_lock_put(lock); + } else if (dlm_is_recovery_lock(res->lockname.name, + res->lockname.len)) { + /* special case for the $RECOVERY lock. + * there will never be an AST delivered to put + * this lock on the proper secondary queue + * (granted), so do it manually. */ + mlog(0, "%s: $RECOVERY lock for this node (%u) is " + "mastered by %u; got lock, manually granting (no ast)\n", + dlm->name, dlm->node_num, res->owner); + list_del_init(&lock->list); + list_add_tail(&lock->list, &res->granted); } spin_unlock(&res->spinlock); @@ -646,7 +657,19 @@ retry_lock: mlog(0, "retrying lock with migration/" "recovery/in progress\n"); msleep(100); - dlm_wait_for_recovery(dlm); + /* no waiting for dlm_reco_thread */ + if (recovery) { + if (status == DLM_RECOVERING) { + mlog(0, "%s: got RECOVERING " + "for $REOCVERY lock, master " + "was %u\n", dlm->name, + res->owner); + dlm_wait_for_node_death(dlm, res->owner, + DLM_NODE_DEATH_WAIT_MAX); + } + } else { + dlm_wait_for_recovery(dlm); + } goto retry_lock; } diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 27e984f7e4c..847dd3cc4cf 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -564,7 +564,7 @@ static void dlm_lockres_release(struct kref *kref) /* By the time we're ready to blow this guy away, we shouldn't * be on any lists. */ - BUG_ON(!list_empty(&res->list)); + BUG_ON(!hlist_unhashed(&res->hash_node)); BUG_ON(!list_empty(&res->granted)); BUG_ON(!list_empty(&res->converting)); BUG_ON(!list_empty(&res->blocked)); @@ -605,7 +605,7 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm, init_waitqueue_head(&res->wq); spin_lock_init(&res->spinlock); - INIT_LIST_HEAD(&res->list); + INIT_HLIST_NODE(&res->hash_node); INIT_LIST_HEAD(&res->granted); INIT_LIST_HEAD(&res->converting); INIT_LIST_HEAD(&res->blocked); @@ -1050,17 +1050,10 @@ static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm, node = dlm_bitmap_diff_iter_next(&bdi, &sc); while (node >= 0) { if (sc == NODE_UP) { - /* a node came up. easy. might not even need - * to talk to it if its node number is higher - * or if we are already blocked. */ - mlog(0, "node up! %d\n", node); - if (blocked) - goto next; - - if (node > dlm->node_num) { - mlog(0, "node > this node. skipping.\n"); - goto next; - } + /* a node came up. clear any old vote from + * the response map and set it in the vote map + * then restart the mastery. */ + mlog(ML_NOTICE, "node %d up while restarting\n", node); /* redo the master request, but only for the new node */ mlog(0, "sending request to new node\n"); @@ -2005,6 +1998,15 @@ fail: break; mlog(0, "timed out during migration\n"); + /* avoid hang during shutdown when migrating lockres + * to a node which also goes down */ + if (dlm_is_node_dead(dlm, target)) { + mlog(0, "%s:%.*s: expected migration target %u " + "is no longer up. restarting.\n", + dlm->name, res->lockname.len, + res->lockname.name, target); + ret = -ERESTARTSYS; + } } if (ret == -ERESTARTSYS) { /* migration failed, detach and clean up mle */ @@ -2480,7 +2482,9 @@ top: atomic_set(&mle->woken, 1); spin_unlock(&mle->spinlock); wake_up(&mle->wq); - /* final put will take care of list removal */ + /* do not need events any longer, so detach + * from heartbeat */ + __dlm_mle_detach_hb_events(dlm, mle); __dlm_put_mle(mle); } continue; @@ -2535,6 +2539,9 @@ top: spin_unlock(&res->spinlock); dlm_lockres_put(res); + /* about to get rid of mle, detach from heartbeat */ + __dlm_mle_detach_hb_events(dlm, mle); + /* dump the mle */ spin_lock(&dlm->master_lock); __dlm_put_mle(mle); diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 0c8eb1093f0..1e232000f3f 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -39,6 +39,7 @@ #include <linux/inet.h> #include <linux/timer.h> #include <linux/kthread.h> +#include <linux/delay.h> #include "cluster/heartbeat.h" @@ -256,6 +257,45 @@ static int dlm_recovery_thread(void *data) return 0; } +/* returns true when the recovery master has contacted us */ +static int dlm_reco_master_ready(struct dlm_ctxt *dlm) +{ + int ready; + spin_lock(&dlm->spinlock); + ready = (dlm->reco.new_master != O2NM_INVALID_NODE_NUM); + spin_unlock(&dlm->spinlock); + return ready; +} + +/* returns true if node is no longer in the domain + * could be dead or just not joined */ +int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node) +{ + int dead; + spin_lock(&dlm->spinlock); + dead = test_bit(node, dlm->domain_map); + spin_unlock(&dlm->spinlock); + return dead; +} + +int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout) +{ + if (timeout) { + mlog(ML_NOTICE, "%s: waiting %dms for notification of " + "death of node %u\n", dlm->name, timeout, node); + wait_event_timeout(dlm->dlm_reco_thread_wq, + dlm_is_node_dead(dlm, node), + msecs_to_jiffies(timeout)); + } else { + mlog(ML_NOTICE, "%s: waiting indefinitely for notification " + "of death of node %u\n", dlm->name, node); + wait_event(dlm->dlm_reco_thread_wq, + dlm_is_node_dead(dlm, node)); + } + /* for now, return 0 */ + return 0; +} + /* callers of the top-level api calls (dlmlock/dlmunlock) should * block on the dlm->reco.event when recovery is in progress. * the dlm recovery thread will set this state when it begins @@ -297,6 +337,7 @@ static void dlm_end_recovery(struct dlm_ctxt *dlm) static int dlm_do_recovery(struct dlm_ctxt *dlm) { int status = 0; + int ret; spin_lock(&dlm->spinlock); @@ -343,10 +384,13 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm) goto master_here; if (dlm->reco.new_master == O2NM_INVALID_NODE_NUM) { - /* choose a new master */ - if (!dlm_pick_recovery_master(dlm)) { + /* choose a new master, returns 0 if this node + * is the master, -EEXIST if it's another node. + * this does not return until a new master is chosen + * or recovery completes entirely. */ + ret = dlm_pick_recovery_master(dlm); + if (!ret) { /* already notified everyone. go. */ - dlm->reco.new_master = dlm->node_num; goto master_here; } mlog(0, "another node will master this recovery session.\n"); @@ -371,8 +415,13 @@ master_here: if (status < 0) { mlog(ML_ERROR, "error %d remastering locks for node %u, " "retrying.\n", status, dlm->reco.dead_node); + /* yield a bit to allow any final network messages + * to get handled on remaining nodes */ + msleep(100); } else { /* success! see if any other nodes need recovery */ + mlog(0, "DONE mastering recovery of %s:%u here(this=%u)!\n", + dlm->name, dlm->reco.dead_node, dlm->node_num); dlm_reset_recovery(dlm); } dlm_end_recovery(dlm); @@ -477,7 +526,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) BUG(); break; case DLM_RECO_NODE_DATA_DEAD: - mlog(0, "node %u died after " + mlog(ML_NOTICE, "node %u died after " "requesting recovery info for " "node %u\n", ndata->node_num, dead_node); @@ -485,6 +534,19 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) // start all over destroy = 1; status = -EAGAIN; + /* instead of spinning like crazy here, + * wait for the domain map to catch up + * with the network state. otherwise this + * can be hit hundreds of times before + * the node is really seen as dead. */ + wait_event_timeout(dlm->dlm_reco_thread_wq, + dlm_is_node_dead(dlm, + ndata->node_num), + msecs_to_jiffies(1000)); + mlog(0, "waited 1 sec for %u, " + "dead? %s\n", ndata->node_num, + dlm_is_node_dead(dlm, ndata->node_num) ? + "yes" : "no"); goto leave; case DLM_RECO_NODE_DATA_RECEIVING: case DLM_RECO_NODE_DATA_REQUESTED: @@ -678,11 +740,27 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data) dlm = item->dlm; dead_node = item->u.ral.dead_node; reco_master = item->u.ral.reco_master; + mres = (struct dlm_migratable_lockres *)data; + + if (dead_node != dlm->reco.dead_node || + reco_master != dlm->reco.new_master) { + /* show extra debug info if the recovery state is messed */ + mlog(ML_ERROR, "%s: bad reco state: reco(dead=%u, master=%u), " + "request(dead=%u, master=%u)\n", + dlm->name, dlm->reco.dead_node, dlm->reco.new_master, + dead_node, reco_master); + mlog(ML_ERROR, "%s: name=%.*s master=%u locks=%u/%u flags=%u " + "entry[0]={c=%"MLFu64",l=%u,f=%u,t=%d,ct=%d,hb=%d,n=%u}\n", + dlm->name, mres->lockname_len, mres->lockname, mres->master, + mres->num_locks, mres->total_locks, mres->flags, + mres->ml[0].cookie, mres->ml[0].list, mres->ml[0].flags, + mres->ml[0].type, mres->ml[0].convert_type, + mres->ml[0].highest_blocked, mres->ml[0].node); + BUG(); + } BUG_ON(dead_node != dlm->reco.dead_node); BUG_ON(reco_master != dlm->reco.new_master); - mres = (struct dlm_migratable_lockres *)data; - /* lock resources should have already been moved to the * dlm->reco.resources list. now move items from that list * to a temp list if the dead owner matches. note that the @@ -757,15 +835,18 @@ int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data) continue; switch (ndata->state) { + /* should have moved beyond INIT but not to FINALIZE yet */ case DLM_RECO_NODE_DATA_INIT: case DLM_RECO_NODE_DATA_DEAD: - case DLM_RECO_NODE_DATA_DONE: case DLM_RECO_NODE_DATA_FINALIZE_SENT: mlog(ML_ERROR, "bad ndata state for node %u:" " state=%d\n", ndata->node_num, ndata->state); BUG(); break; + /* these states are possible at this point, anywhere along + * the line of recovery */ + case DLM_RECO_NODE_DATA_DONE: case DLM_RECO_NODE_DATA_RECEIVING: case DLM_RECO_NODE_DATA_REQUESTED: case DLM_RECO_NODE_DATA_REQUESTING: @@ -799,13 +880,31 @@ static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm, { struct dlm_lock_resource *res; struct list_head *iter, *iter2; + struct dlm_lock *lock; spin_lock(&dlm->spinlock); list_for_each_safe(iter, iter2, &dlm->reco.resources) { res = list_entry (iter, struct dlm_lock_resource, recovering); + /* always prune any $RECOVERY entries for dead nodes, + * otherwise hangs can occur during later recovery */ if (dlm_is_recovery_lock(res->lockname.name, - res->lockname.len)) + res->lockname.len)) { + spin_lock(&res->spinlock); + list_for_each_entry(lock, &res->granted, list) { + if (lock->ml.node == dead_node) { + mlog(0, "AHA! there was " + "a $RECOVERY lock for dead " + "node %u (%s)!\n", + dead_node, dlm->name); + list_del_init(&lock->list); + dlm_lock_put(lock); + break; + } + } + spin_unlock(&res->spinlock); continue; + } + if (res->owner == dead_node) { mlog(0, "found lockres owned by dead node while " "doing recovery for node %u. sending it.\n", @@ -1179,7 +1278,7 @@ static void dlm_mig_lockres_worker(struct dlm_work_item *item, void *data) again: ret = dlm_lockres_master_requery(dlm, res, &real_master); if (ret < 0) { - mlog(0, "dlm_lockres_master_requery failure: %d\n", + mlog(0, "dlm_lockres_master_requery ret=%d\n", ret); goto again; } @@ -1594,7 +1693,10 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm, u8 dead_node, u8 new_master) { int i; - struct list_head *iter, *iter2, *bucket; + struct list_head *iter, *iter2; + struct hlist_node *hash_iter; + struct hlist_head *bucket; + struct dlm_lock_resource *res; mlog_entry_void(); @@ -1618,10 +1720,9 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm, * for now we need to run the whole hash, clear * the RECOVERING state and set the owner * if necessary */ - for (i=0; i<DLM_HASH_SIZE; i++) { - bucket = &(dlm->resources[i]); - list_for_each(iter, bucket) { - res = list_entry (iter, struct dlm_lock_resource, list); + for (i = 0; i < DLM_HASH_BUCKETS; i++) { + bucket = &(dlm->lockres_hash[i]); + hlist_for_each_entry(res, hash_iter, bucket, hash_node) { if (res->state & DLM_LOCK_RES_RECOVERING) { if (res->owner == dead_node) { mlog(0, "(this=%u) res %.*s owner=%u " @@ -1753,10 +1854,11 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm, static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node) { - struct list_head *iter; + struct hlist_node *iter; struct dlm_lock_resource *res; int i; - struct list_head *bucket; + struct hlist_head *bucket; + struct dlm_lock *lock; /* purge any stale mles */ @@ -1776,14 +1878,28 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node) * can be kicked again to see if any ASTs or BASTs * need to be fired as a result. */ - for (i=0; i<DLM_HASH_SIZE; i++) { - bucket = &(dlm->resources[i]); - list_for_each(iter, bucket) { - res = list_entry (iter, struct dlm_lock_resource, list); + for (i = 0; i < DLM_HASH_BUCKETS; i++) { + bucket = &(dlm->lockres_hash[i]); + hlist_for_each_entry(res, iter, bucket, hash_node) { + /* always prune any $RECOVERY entries for dead nodes, + * otherwise hangs can occur during later recovery */ if (dlm_is_recovery_lock(res->lockname.name, - res->lockname.len)) + res->lockname.len)) { + spin_lock(&res->spinlock); + list_for_each_entry(lock, &res->granted, list) { + if (lock->ml.node == dead_node) { + mlog(0, "AHA! there was " + "a $RECOVERY lock for dead " + "node %u (%s)!\n", + dead_node, dlm->name); + list_del_init(&lock->list); + dlm_lock_put(lock); + break; + } + } + spin_unlock(&res->spinlock); continue; - + } spin_lock(&res->spinlock); /* zero the lvb if necessary */ dlm_revalidate_lvb(dlm, res, dead_node); @@ -1869,12 +1985,9 @@ void dlm_hb_node_up_cb(struct o2nm_node *node, int idx, void *data) return; spin_lock(&dlm->spinlock); - set_bit(idx, dlm->live_nodes_map); - - /* notify any mles attached to the heartbeat events */ - dlm_hb_event_notify_attached(dlm, idx, 1); - + /* do NOT notify mle attached to the heartbeat events. + * new nodes are not interesting in mastery until joined. */ spin_unlock(&dlm->spinlock); dlm_put(dlm); @@ -1897,7 +2010,18 @@ static void dlm_reco_unlock_ast(void *astdata, enum dlm_status st) mlog(0, "unlockast for recovery lock fired!\n"); } - +/* + * dlm_pick_recovery_master will continually attempt to use + * dlmlock() on the special "$RECOVERY" lockres with the + * LKM_NOQUEUE flag to get an EX. every thread that enters + * this function on each node racing to become the recovery + * master will not stop attempting this until either: + * a) this node gets the EX (and becomes the recovery master), + * or b) dlm->reco.new_master gets set to some nodenum + * != O2NM_INVALID_NODE_NUM (another node will do the reco). + * so each time a recovery master is needed, the entire cluster + * will sync at this point. if the new master dies, that will + * be detected in dlm_do_recovery */ static int dlm_pick_recovery_master(struct dlm_ctxt *dlm) { enum dlm_status ret; @@ -1906,23 +2030,69 @@ static int dlm_pick_recovery_master(struct dlm_ctxt *dlm) mlog(0, "starting recovery of %s at %lu, dead=%u, this=%u\n", dlm->name, jiffies, dlm->reco.dead_node, dlm->node_num); -retry: +again: memset(&lksb, 0, sizeof(lksb)); ret = dlmlock(dlm, LKM_EXMODE, &lksb, LKM_NOQUEUE|LKM_RECOVERY, DLM_RECOVERY_LOCK_NAME, dlm_reco_ast, dlm, dlm_reco_bast); + mlog(0, "%s: dlmlock($RECOVERY) returned %d, lksb=%d\n", + dlm->name, ret, lksb.status); + if (ret == DLM_NORMAL) { mlog(0, "dlm=%s dlmlock says I got it (this=%u)\n", dlm->name, dlm->node_num); - /* I am master, send message to all nodes saying - * that I am beginning a recovery session */ - status = dlm_send_begin_reco_message(dlm, - dlm->reco.dead_node); + + /* got the EX lock. check to see if another node + * just became the reco master */ + if (dlm_reco_master_ready(dlm)) { + mlog(0, "%s: got reco EX lock, but %u will " + "do the recovery\n", dlm->name, + dlm->reco.new_master); + status = -EEXIST; + } else { + status = 0; + + /* see if recovery was already finished elsewhere */ + spin_lock(&dlm->spinlock); + if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) { + status = -EINVAL; + mlog(0, "%s: got reco EX lock, but " + "node got recovered already\n", dlm->name); + if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM) { + mlog(ML_ERROR, "%s: new master is %u " + "but no dead node!\n", + dlm->name, dlm->reco.new_master); + BUG(); + } + } + spin_unlock(&dlm->spinlock); + } + + /* if this node has actually become the recovery master, + * set the master and send the messages to begin recovery */ + if (!status) { + mlog(0, "%s: dead=%u, this=%u, sending " + "begin_reco now\n", dlm->name, + dlm->reco.dead_node, dlm->node_num); + status = dlm_send_begin_reco_message(dlm, + dlm->reco.dead_node); + /* this always succeeds */ + BUG_ON(status); + + /* set the new_master to this node */ + spin_lock(&dlm->spinlock); + dlm->reco.new_master = dlm->node_num; + spin_unlock(&dlm->spinlock); + } /* recovery lock is a special case. ast will not get fired, * so just go ahead and unlock it. */ ret = dlmunlock(dlm, &lksb, 0, dlm_reco_unlock_ast, dlm); + if (ret == DLM_DENIED) { + mlog(0, "got DLM_DENIED, trying LKM_CANCEL\n"); + ret = dlmunlock(dlm, &lksb, LKM_CANCEL, dlm_reco_unlock_ast, dlm); + } if (ret != DLM_NORMAL) { /* this would really suck. this could only happen * if there was a network error during the unlock @@ -1930,20 +2100,42 @@ retry: * is actually "done" and the lock structure is * even freed. we can continue, but only * because this specific lock name is special. */ - mlog(0, "dlmunlock returned %d\n", ret); - } - - if (status < 0) { - mlog(0, "failed to send recovery message. " - "must retry with new node map.\n"); - goto retry; + mlog(ML_ERROR, "dlmunlock returned %d\n", ret); } } else if (ret == DLM_NOTQUEUED) { mlog(0, "dlm=%s dlmlock says another node got it (this=%u)\n", dlm->name, dlm->node_num); /* another node is master. wait on - * reco.new_master != O2NM_INVALID_NODE_NUM */ + * reco.new_master != O2NM_INVALID_NODE_NUM + * for at most one second */ + wait_event_timeout(dlm->dlm_reco_thread_wq, + dlm_reco_master_ready(dlm), + msecs_to_jiffies(1000)); + if (!dlm_reco_master_ready(dlm)) { + mlog(0, "%s: reco master taking awhile\n", + dlm->name); + goto again; + } + /* another node has informed this one that it is reco master */ + mlog(0, "%s: reco master %u is ready to recover %u\n", + dlm->name, dlm->reco.new_master, dlm->reco.dead_node); status = -EEXIST; + } else { + struct dlm_lock_resource *res; + + /* dlmlock returned something other than NOTQUEUED or NORMAL */ + mlog(ML_ERROR, "%s: got %s from dlmlock($RECOVERY), " + "lksb.status=%s\n", dlm->name, dlm_errname(ret), + dlm_errname(lksb.status)); + res = dlm_lookup_lockres(dlm, DLM_RECOVERY_LOCK_NAME, + DLM_RECOVERY_LOCK_NAME_LEN); + if (res) { + dlm_print_one_lock_resource(res); + dlm_lockres_put(res); + } else { + mlog(ML_ERROR, "recovery lock not found\n"); + } + BUG(); } return status; @@ -1982,7 +2174,7 @@ static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node) mlog(0, "not sending begin reco to self\n"); continue; } - +retry: ret = -EINVAL; mlog(0, "attempting to send begin reco msg to %d\n", nodenum); @@ -1991,8 +2183,17 @@ static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node) /* negative status is handled ok by caller here */ if (ret >= 0) ret = status; + if (dlm_is_host_down(ret)) { + /* node is down. not involved in recovery + * so just keep going */ + mlog(0, "%s: node %u was down when sending " + "begin reco msg (%d)\n", dlm->name, nodenum, ret); + ret = 0; + } if (ret < 0) { struct dlm_lock_resource *res; + /* this is now a serious problem, possibly ENOMEM + * in the network stack. must retry */ mlog_errno(ret); mlog(ML_ERROR, "begin reco of dlm %s to node %u " " returned %d\n", dlm->name, nodenum, ret); @@ -2004,7 +2205,10 @@ static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node) } else { mlog(ML_ERROR, "recovery lock not found\n"); } - break; + /* sleep for a bit in hopes that we can avoid + * another ENOMEM */ + msleep(100); + goto retry; } } @@ -2027,19 +2231,34 @@ int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data) spin_lock(&dlm->spinlock); if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM) { - mlog(0, "new_master already set to %u!\n", - dlm->reco.new_master); + if (test_bit(dlm->reco.new_master, dlm->recovery_map)) { + mlog(0, "%s: new_master %u died, changing " + "to %u\n", dlm->name, dlm->reco.new_master, + br->node_idx); + } else { + mlog(0, "%s: new_master %u NOT DEAD, changing " + "to %u\n", dlm->name, dlm->reco.new_master, + br->node_idx); + /* may not have seen the new master as dead yet */ + } } if (dlm->reco.dead_node != O2NM_INVALID_NODE_NUM) { - mlog(0, "dead_node already set to %u!\n", - dlm->reco.dead_node); + mlog(ML_NOTICE, "%s: dead_node previously set to %u, " + "node %u changing it to %u\n", dlm->name, + dlm->reco.dead_node, br->node_idx, br->dead_node); } dlm->reco.new_master = br->node_idx; dlm->reco.dead_node = br->dead_node; if (!test_bit(br->dead_node, dlm->recovery_map)) { - mlog(ML_ERROR, "recovery master %u sees %u as dead, but this " + mlog(0, "recovery master %u sees %u as dead, but this " "node has not yet. marking %u as dead\n", br->node_idx, br->dead_node, br->dead_node); + if (!test_bit(br->dead_node, dlm->domain_map) || + !test_bit(br->dead_node, dlm->live_nodes_map)) + mlog(0, "%u not in domain/live_nodes map " + "so setting it in reco map manually\n", + br->dead_node); + set_bit(br->dead_node, dlm->recovery_map); __dlm_hb_node_down(dlm, br->dead_node); } spin_unlock(&dlm->spinlock); diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c index cec2ce1cd31..c95f08d2e92 100644 --- a/fs/ocfs2/dlm/dlmunlock.c +++ b/fs/ocfs2/dlm/dlmunlock.c @@ -188,6 +188,19 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm, actions &= ~(DLM_UNLOCK_REMOVE_LOCK| DLM_UNLOCK_REGRANT_LOCK| DLM_UNLOCK_CLEAR_CONVERT_TYPE); + } else if (status == DLM_RECOVERING || + status == DLM_MIGRATING || + status == DLM_FORWARD) { + /* must clear the actions because this unlock + * is about to be retried. cannot free or do + * any list manipulation. */ + mlog(0, "%s:%.*s: clearing actions, %s\n", + dlm->name, res->lockname.len, + res->lockname.name, + status==DLM_RECOVERING?"recovering": + (status==DLM_MIGRATING?"migrating": + "forward")); + actions = 0; } if (flags & LKM_CANCEL) lock->cancel_pending = 0; diff --git a/fs/ocfs2/dlm/userdlm.c b/fs/ocfs2/dlm/userdlm.c index e1fdd288796..c3764f4744e 100644 --- a/fs/ocfs2/dlm/userdlm.c +++ b/fs/ocfs2/dlm/userdlm.c @@ -27,7 +27,7 @@ * Boston, MA 021110-1307, USA. */ -#include <asm/signal.h> +#include <linux/signal.h> #include <linux/module.h> #include <linux/fs.h> diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index f2fb40cd296..e6f207eebab 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c @@ -181,6 +181,12 @@ static int ocfs2_extent_map_find_leaf(struct inode *inode, ret = -EBADR; if (rec_end > OCFS2_I(inode)->ip_clusters) { mlog_errno(ret); + ocfs2_error(inode->i_sb, + "Extent %d at e_blkno %"MLFu64" of inode %"MLFu64" goes past ip_clusters of %u\n", + i, + le64_to_cpu(rec->e_blkno), + OCFS2_I(inode)->ip_blkno, + OCFS2_I(inode)->ip_clusters); goto out_free; } @@ -226,6 +232,12 @@ static int ocfs2_extent_map_find_leaf(struct inode *inode, ret = -EBADR; if (blkno) { mlog_errno(ret); + ocfs2_error(inode->i_sb, + "Multiple extents for (cpos = %u, clusters = %u) on inode %"MLFu64"; e_blkno %"MLFu64" and rec %d at e_blkno %"MLFu64"\n", + cpos, clusters, + OCFS2_I(inode)->ip_blkno, + blkno, i, + le64_to_cpu(rec->e_blkno)); goto out_free; } @@ -238,6 +250,10 @@ static int ocfs2_extent_map_find_leaf(struct inode *inode, */ ret = -EBADR; if (!blkno) { + ocfs2_error(inode->i_sb, + "No record found for (cpos = %u, clusters = %u) on inode %"MLFu64"\n", + cpos, clusters, + OCFS2_I(inode)->ip_blkno); mlog_errno(ret); goto out_free; } @@ -262,11 +278,24 @@ static int ocfs2_extent_map_find_leaf(struct inode *inode, el = &eb->h_list; } - if (el->l_tree_depth) - BUG(); + BUG_ON(el->l_tree_depth); for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { rec = &el->l_recs[i]; + + if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) > + OCFS2_I(inode)->ip_clusters) { + ret = -EBADR; + mlog_errno(ret); + ocfs2_error(inode->i_sb, + "Extent %d at e_blkno %"MLFu64" of inode %"MLFu64" goes past ip_clusters of %u\n", + i, + le64_to_cpu(rec->e_blkno), + OCFS2_I(inode)->ip_blkno, + OCFS2_I(inode)->ip_clusters); + return ret; + } + ret = ocfs2_extent_map_insert(inode, rec, le16_to_cpu(el->l_tree_depth)); if (ret) { @@ -364,8 +393,8 @@ static int ocfs2_extent_map_lookup_read(struct inode *inode, return ret; } - if (ent->e_tree_depth) - BUG(); /* FIXME: Make sure this isn't a corruption */ + /* FIXME: Make sure this isn't a corruption */ + BUG_ON(ent->e_tree_depth); *ret_ent = ent; @@ -423,8 +452,7 @@ static int ocfs2_extent_map_try_insert(struct inode *inode, le32_to_cpu(rec->e_clusters), NULL, NULL); - if (!old_ent) - BUG(); + BUG_ON(!old_ent); ret = -EEXIST; if (old_ent->e_tree_depth < tree_depth) @@ -528,6 +556,10 @@ static int ocfs2_extent_map_insert(struct inode *inode, OCFS2_I(inode)->ip_map.em_clusters) { ret = -EBADR; mlog_errno(ret); + ocfs2_error(inode->i_sb, + "Zero e_clusters on non-tail extent record at e_blkno %"MLFu64" on inode %"MLFu64"\n", + le64_to_cpu(rec->e_blkno), + OCFS2_I(inode)->ip_blkno); return ret; } @@ -590,12 +622,12 @@ static int ocfs2_extent_map_insert(struct inode *inode, * Existing record in the extent map: * * cpos = 10, len = 10 - * |---------| + * |---------| * * New Record: * * cpos = 10, len = 20 - * |------------------| + * |------------------| * * The passed record is the new on-disk record. The new_clusters value * is how many clusters were added to the file. If the append is a @@ -988,7 +1020,7 @@ int __init init_ocfs2_extent_maps(void) return 0; } -void __exit exit_ocfs2_extent_maps(void) +void exit_ocfs2_extent_maps(void) { kmem_cache_destroy(ocfs2_em_ent_cachep); } diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index eaf33caa0a1..8a4048b55fd 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -933,9 +933,6 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, struct file *filp = iocb->ki_filp; struct inode *inode = filp->f_dentry->d_inode; loff_t newsize, saved_pos; -#ifdef OCFS2_ORACORE_WORKAROUNDS - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); -#endif mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf, (unsigned int)count, @@ -951,14 +948,6 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, return -EIO; } -#ifdef OCFS2_ORACORE_WORKAROUNDS - /* ugh, work around some applications which open everything O_DIRECT + - * O_APPEND and really don't mean to use O_DIRECT. */ - if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS && - (filp->f_flags & O_APPEND) && (filp->f_flags & O_DIRECT)) - filp->f_flags &= ~O_DIRECT; -#endif - mutex_lock(&inode->i_mutex); /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */ if (filp->f_flags & O_DIRECT) { @@ -1022,8 +1011,9 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, } newsize = count + saved_pos; - mlog(0, "pos=%lld newsize=%"MLFu64" cursize=%lld\n", - saved_pos, newsize, i_size_read(inode)); + mlog(0, "pos=%lld newsize=%lld cursize=%lld\n", + (long long) saved_pos, (long long) newsize, + (long long) i_size_read(inode)); /* No need for a higher level metadata lock if we're * never going past i_size. */ @@ -1042,8 +1032,9 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, spin_unlock(&OCFS2_I(inode)->ip_lock); mlog(0, "Writing at EOF, may need more allocation: " - "i_size = %lld, newsize = %"MLFu64", need %u clusters\n", - i_size_read(inode), newsize, clusters); + "i_size = %lld, newsize = %lld, need %u clusters\n", + (long long) i_size_read(inode), (long long) newsize, + clusters); /* We only want to continue the rest of this loop if * our extend will actually require more @@ -1077,27 +1068,7 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, /* communicate with ocfs2_dio_end_io */ ocfs2_iocb_set_rw_locked(iocb); -#ifdef OCFS2_ORACORE_WORKAROUNDS - if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS && - filp->f_flags & O_DIRECT) { - unsigned int saved_flags = filp->f_flags; - int sector_size = 1 << osb->s_sectsize_bits; - - if ((saved_pos & (sector_size - 1)) || - (count & (sector_size - 1)) || - ((unsigned long)buf & (sector_size - 1))) { - filp->f_flags |= O_SYNC; - filp->f_flags &= ~O_DIRECT; - } - - ret = generic_file_aio_write_nolock(iocb, &local_iov, 1, - &iocb->ki_pos); - - filp->f_flags = saved_flags; - } else -#endif - ret = generic_file_aio_write_nolock(iocb, &local_iov, 1, - &iocb->ki_pos); + ret = generic_file_aio_write_nolock(iocb, &local_iov, 1, &iocb->ki_pos); /* buffered aio wouldn't have proper lock coverage today */ BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT)); @@ -1138,9 +1109,6 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, int ret = 0, rw_level = -1, have_alloc_sem = 0; struct file *filp = iocb->ki_filp; struct inode *inode = filp->f_dentry->d_inode; -#ifdef OCFS2_ORACORE_WORKAROUNDS - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); -#endif mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf, (unsigned int)count, @@ -1153,21 +1121,6 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, goto bail; } -#ifdef OCFS2_ORACORE_WORKAROUNDS - if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS) { - if (filp->f_flags & O_DIRECT) { - int sector_size = 1 << osb->s_sectsize_bits; - - if ((pos & (sector_size - 1)) || - (count & (sector_size - 1)) || - ((unsigned long)buf & (sector_size - 1)) || - (i_size_read(inode) & (sector_size -1))) { - filp->f_flags &= ~O_DIRECT; - } - } - } -#endif - /* * buffered reads protect themselves in ->readpage(). O_DIRECT reads * need locks to protect pending reads from racing with truncate. diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c index 0bbd22f46c8..cbfd45a97a6 100644 --- a/fs/ocfs2/heartbeat.c +++ b/fs/ocfs2/heartbeat.c @@ -67,6 +67,7 @@ void ocfs2_init_node_maps(struct ocfs2_super *osb) ocfs2_node_map_init(&osb->mounted_map); ocfs2_node_map_init(&osb->recovery_map); ocfs2_node_map_init(&osb->umount_map); + ocfs2_node_map_init(&osb->osb_recovering_orphan_dirs); } static void ocfs2_do_node_down(int node_num, diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index d4ecc062771..315472a5c19 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -41,6 +41,7 @@ #include "dlmglue.h" #include "extent_map.h" #include "file.h" +#include "heartbeat.h" #include "inode.h" #include "journal.h" #include "namei.h" @@ -544,6 +545,42 @@ bail: return status; } +/* + * Serialize with orphan dir recovery. If the process doing + * recovery on this orphan dir does an iget() with the dir + * i_mutex held, we'll deadlock here. Instead we detect this + * and exit early - recovery will wipe this inode for us. + */ +static int ocfs2_check_orphan_recovery_state(struct ocfs2_super *osb, + int slot) +{ + int ret = 0; + + spin_lock(&osb->osb_lock); + if (ocfs2_node_map_test_bit(osb, &osb->osb_recovering_orphan_dirs, slot)) { + mlog(0, "Recovery is happening on orphan dir %d, will skip " + "this inode\n", slot); + ret = -EDEADLK; + goto out; + } + /* This signals to the orphan recovery process that it should + * wait for us to handle the wipe. */ + osb->osb_orphan_wipes[slot]++; +out: + spin_unlock(&osb->osb_lock); + return ret; +} + +static void ocfs2_signal_wipe_completion(struct ocfs2_super *osb, + int slot) +{ + spin_lock(&osb->osb_lock); + osb->osb_orphan_wipes[slot]--; + spin_unlock(&osb->osb_lock); + + wake_up(&osb->osb_wipe_event); +} + static int ocfs2_wipe_inode(struct inode *inode, struct buffer_head *di_bh) { @@ -555,6 +592,11 @@ static int ocfs2_wipe_inode(struct inode *inode, /* We've already voted on this so it should be readonly - no * spinlock needed. */ orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot; + + status = ocfs2_check_orphan_recovery_state(osb, orphaned_slot); + if (status) + return status; + orphan_dir_inode = ocfs2_get_system_file_inode(osb, ORPHAN_DIR_SYSTEM_INODE, orphaned_slot); @@ -597,6 +639,7 @@ bail_unlock_dir: brelse(orphan_dir_bh); bail: iput(orphan_dir_inode); + ocfs2_signal_wipe_completion(osb, orphaned_slot); return status; } @@ -822,7 +865,8 @@ void ocfs2_delete_inode(struct inode *inode) status = ocfs2_wipe_inode(inode, di_bh); if (status < 0) { - mlog_errno(status); + if (status != -EDEADLK) + mlog_errno(status); goto bail_unlock_inode; } @@ -903,10 +947,10 @@ void ocfs2_clear_inode(struct inode *inode) "Clear inode of %"MLFu64", inode is locked\n", oi->ip_blkno); - mlog_bug_on_msg(down_trylock(&oi->ip_io_sem), - "Clear inode of %"MLFu64", io_sem is locked\n", + mlog_bug_on_msg(!mutex_trylock(&oi->ip_io_mutex), + "Clear inode of %"MLFu64", io_mutex is locked\n", oi->ip_blkno); - up(&oi->ip_io_sem); + mutex_unlock(&oi->ip_io_mutex); /* * down_trylock() returns 0, down_write_trylock() returns 1 diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index 9b017743365..84c50796128 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h @@ -46,10 +46,10 @@ struct ocfs2_inode_info struct list_head ip_io_markers; int ip_orphaned_slot; - struct semaphore ip_io_sem; + struct mutex ip_io_mutex; /* Used by the journalling code to attach an inode to a - * handle. These are protected by ip_io_sem in order to lock + * handle. These are protected by ip_io_mutex in order to lock * out other I/O to the inode until we either commit or * abort. */ struct list_head ip_handle_list; diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 303c8d96457..4be801f4559 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -147,8 +147,7 @@ struct ocfs2_journal_handle *ocfs2_start_trans(struct ocfs2_super *osb, mlog_entry("(max_buffs = %d)\n", max_buffs); - if (!osb || !osb->journal->j_journal) - BUG(); + BUG_ON(!osb || !osb->journal->j_journal); if (ocfs2_is_hard_readonly(osb)) { ret = -EROFS; @@ -401,7 +400,7 @@ int ocfs2_journal_access(struct ocfs2_journal_handle *handle, * j_trans_barrier for us. */ ocfs2_set_inode_lock_trans(OCFS2_SB(inode->i_sb)->journal, inode); - down(&OCFS2_I(inode)->ip_io_sem); + mutex_lock(&OCFS2_I(inode)->ip_io_mutex); switch (type) { case OCFS2_JOURNAL_ACCESS_CREATE: case OCFS2_JOURNAL_ACCESS_WRITE: @@ -416,7 +415,7 @@ int ocfs2_journal_access(struct ocfs2_journal_handle *handle, status = -EINVAL; mlog(ML_ERROR, "Uknown access type!\n"); } - up(&OCFS2_I(inode)->ip_io_sem); + mutex_unlock(&OCFS2_I(inode)->ip_io_mutex); if (status < 0) mlog(ML_ERROR, "Error %d getting %d access to buffer!\n", @@ -561,7 +560,11 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty) SET_INODE_JOURNAL(inode); OCFS2_I(inode)->ip_open_count++; - status = ocfs2_meta_lock(inode, NULL, &bh, 1); + /* Skip recovery waits here - journal inode metadata never + * changes in a live cluster so it can be considered an + * exception to the rule. */ + status = ocfs2_meta_lock_full(inode, NULL, &bh, 1, + OCFS2_META_LOCK_RECOVERY); if (status < 0) { if (status != -ERESTARTSYS) mlog(ML_ERROR, "Could not get lock on journal!\n"); @@ -672,8 +675,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb) mlog_entry_void(); - if (!osb) - BUG(); + BUG_ON(!osb); journal = osb->journal; if (!journal) @@ -805,8 +807,7 @@ int ocfs2_journal_wipe(struct ocfs2_journal *journal, int full) mlog_entry_void(); - if (!journal) - BUG(); + BUG_ON(!journal); status = journal_wipe(journal->j_journal, full); if (status < 0) { @@ -1072,10 +1073,10 @@ restart: NULL); bail: - down(&osb->recovery_lock); + mutex_lock(&osb->recovery_lock); if (!status && !ocfs2_node_map_is_empty(osb, &osb->recovery_map)) { - up(&osb->recovery_lock); + mutex_unlock(&osb->recovery_lock); goto restart; } @@ -1083,7 +1084,7 @@ bail: mb(); /* sync with ocfs2_recovery_thread_running */ wake_up(&osb->recovery_event); - up(&osb->recovery_lock); + mutex_unlock(&osb->recovery_lock); mlog_exit(status); /* no one is callint kthread_stop() for us so the kthread() api @@ -1098,7 +1099,7 @@ void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num) mlog_entry("(node_num=%d, osb->node_num = %d)\n", node_num, osb->node_num); - down(&osb->recovery_lock); + mutex_lock(&osb->recovery_lock); if (osb->disable_recovery) goto out; @@ -1120,7 +1121,7 @@ void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num) } out: - up(&osb->recovery_lock); + mutex_unlock(&osb->recovery_lock); wake_up(&osb->recovery_event); mlog_exit_void(); @@ -1271,8 +1272,7 @@ static int ocfs2_recover_node(struct ocfs2_super *osb, /* Should not ever be called to recover ourselves -- in that * case we should've called ocfs2_journal_load instead. */ - if (osb->node_num == node_num) - BUG(); + BUG_ON(osb->node_num == node_num); slot_num = ocfs2_node_num_to_slot(si, node_num); if (slot_num == OCFS2_INVALID_SLOT) { @@ -1408,21 +1408,17 @@ bail: return status; } -static int ocfs2_recover_orphans(struct ocfs2_super *osb, - int slot) +static int ocfs2_queue_orphans(struct ocfs2_super *osb, + int slot, + struct inode **head) { - int status = 0; - int have_disk_lock = 0; - struct inode *inode = NULL; - struct inode *iter; + int status; struct inode *orphan_dir_inode = NULL; + struct inode *iter; unsigned long offset, blk, local; struct buffer_head *bh = NULL; struct ocfs2_dir_entry *de; struct super_block *sb = osb->sb; - struct ocfs2_inode_info *oi; - - mlog(0, "Recover inodes from orphan dir in slot %d\n", slot); orphan_dir_inode = ocfs2_get_system_file_inode(osb, ORPHAN_DIR_SYSTEM_INODE, @@ -1430,17 +1426,15 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, if (!orphan_dir_inode) { status = -ENOENT; mlog_errno(status); - goto out; - } + return status; + } mutex_lock(&orphan_dir_inode->i_mutex); status = ocfs2_meta_lock(orphan_dir_inode, NULL, NULL, 0); if (status < 0) { - mutex_unlock(&orphan_dir_inode->i_mutex); mlog_errno(status); goto out; } - have_disk_lock = 1; offset = 0; iter = NULL; @@ -1451,11 +1445,10 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, if (!bh) status = -EINVAL; if (status < 0) { - mutex_unlock(&orphan_dir_inode->i_mutex); if (bh) brelse(bh); mlog_errno(status); - goto out; + goto out_unlock; } local = 0; @@ -1465,11 +1458,10 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, if (!ocfs2_check_dir_entry(orphan_dir_inode, de, bh, local)) { - mutex_unlock(&orphan_dir_inode->i_mutex); status = -EINVAL; mlog_errno(status); brelse(bh); - goto out; + goto out_unlock; } local += le16_to_cpu(de->rec_len); @@ -1504,18 +1496,95 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, mlog(0, "queue orphan %"MLFu64"\n", OCFS2_I(iter)->ip_blkno); - OCFS2_I(iter)->ip_next_orphan = inode; - inode = iter; + /* No locking is required for the next_orphan + * queue as there is only ever a single + * process doing orphan recovery. */ + OCFS2_I(iter)->ip_next_orphan = *head; + *head = iter; } brelse(bh); } - mutex_unlock(&orphan_dir_inode->i_mutex); +out_unlock: ocfs2_meta_unlock(orphan_dir_inode, 0); - have_disk_lock = 0; - +out: + mutex_unlock(&orphan_dir_inode->i_mutex); iput(orphan_dir_inode); - orphan_dir_inode = NULL; + return status; +} + +static int ocfs2_orphan_recovery_can_continue(struct ocfs2_super *osb, + int slot) +{ + int ret; + + spin_lock(&osb->osb_lock); + ret = !osb->osb_orphan_wipes[slot]; + spin_unlock(&osb->osb_lock); + return ret; +} + +static void ocfs2_mark_recovering_orphan_dir(struct ocfs2_super *osb, + int slot) +{ + spin_lock(&osb->osb_lock); + /* Mark ourselves such that new processes in delete_inode() + * know to quit early. */ + ocfs2_node_map_set_bit(osb, &osb->osb_recovering_orphan_dirs, slot); + while (osb->osb_orphan_wipes[slot]) { + /* If any processes are already in the middle of an + * orphan wipe on this dir, then we need to wait for + * them. */ + spin_unlock(&osb->osb_lock); + wait_event_interruptible(osb->osb_wipe_event, + ocfs2_orphan_recovery_can_continue(osb, slot)); + spin_lock(&osb->osb_lock); + } + spin_unlock(&osb->osb_lock); +} + +static void ocfs2_clear_recovering_orphan_dir(struct ocfs2_super *osb, + int slot) +{ + ocfs2_node_map_clear_bit(osb, &osb->osb_recovering_orphan_dirs, slot); +} + +/* + * Orphan recovery. Each mounted node has it's own orphan dir which we + * must run during recovery. Our strategy here is to build a list of + * the inodes in the orphan dir and iget/iput them. The VFS does + * (most) of the rest of the work. + * + * Orphan recovery can happen at any time, not just mount so we have a + * couple of extra considerations. + * + * - We grab as many inodes as we can under the orphan dir lock - + * doing iget() outside the orphan dir risks getting a reference on + * an invalid inode. + * - We must be sure not to deadlock with other processes on the + * system wanting to run delete_inode(). This can happen when they go + * to lock the orphan dir and the orphan recovery process attempts to + * iget() inside the orphan dir lock. This can be avoided by + * advertising our state to ocfs2_delete_inode(). + */ +static int ocfs2_recover_orphans(struct ocfs2_super *osb, + int slot) +{ + int ret = 0; + struct inode *inode = NULL; + struct inode *iter; + struct ocfs2_inode_info *oi; + + mlog(0, "Recover inodes from orphan dir in slot %d\n", slot); + + ocfs2_mark_recovering_orphan_dir(osb, slot); + ret = ocfs2_queue_orphans(osb, slot, &inode); + ocfs2_clear_recovering_orphan_dir(osb, slot); + + /* Error here should be noted, but we want to continue with as + * many queued inodes as we've got. */ + if (ret) + mlog_errno(ret); while (inode) { oi = OCFS2_I(inode); @@ -1541,14 +1610,7 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, inode = iter; } -out: - if (have_disk_lock) - ocfs2_meta_unlock(orphan_dir_inode, 0); - - if (orphan_dir_inode) - iput(orphan_dir_inode); - - return status; + return ret; } static int ocfs2_wait_on_mount(struct ocfs2_super *osb) @@ -1584,10 +1646,9 @@ static int ocfs2_commit_thread(void *arg) while (!(kthread_should_stop() && atomic_read(&journal->j_num_trans) == 0)) { - wait_event_interruptible_timeout(osb->checkpoint_event, - atomic_read(&journal->j_num_trans) - || kthread_should_stop(), - OCFS2_CHECKPOINT_INTERVAL); + wait_event_interruptible(osb->checkpoint_event, + atomic_read(&journal->j_num_trans) + || kthread_should_stop()); status = ocfs2_commit_cache(osb); if (status < 0) diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 7d0a816184f..2f3a6acdac4 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -29,8 +29,6 @@ #include <linux/fs.h> #include <linux/jbd.h> -#define OCFS2_CHECKPOINT_INTERVAL (8 * HZ) - enum ocfs2_journal_state { OCFS2_JOURNAL_FREE = 0, OCFS2_JOURNAL_LOADED, diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index f468c600cf9..e89de9b6e49 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -33,6 +33,7 @@ #include <linux/rbtree.h> #include <linux/workqueue.h> #include <linux/kref.h> +#include <linux/mutex.h> #include "cluster/nodemanager.h" #include "cluster/heartbeat.h" @@ -173,9 +174,6 @@ enum ocfs2_mount_options OCFS2_MOUNT_NOINTR = 1 << 2, /* Don't catch signals */ OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */ OCFS2_MOUNT_DATA_WRITEBACK = 1 << 4, /* No data ordering */ -#ifdef OCFS2_ORACORE_WORKAROUNDS - OCFS2_MOUNT_COMPAT_OCFS = 1 << 30, /* ocfs1 compatibility mode */ -#endif }; #define OCFS2_OSB_SOFT_RO 0x0001 @@ -233,7 +231,7 @@ struct ocfs2_super struct proc_dir_entry *proc_sub_dir; /* points to /proc/fs/ocfs2/<maj_min> */ atomic_t vol_state; - struct semaphore recovery_lock; + struct mutex recovery_lock; struct task_struct *recovery_thread_task; int disable_recovery; wait_queue_head_t checkpoint_event; @@ -289,6 +287,10 @@ struct ocfs2_super struct inode *osb_tl_inode; struct buffer_head *osb_tl_bh; struct work_struct osb_truncate_log_wq; + + struct ocfs2_node_map osb_recovering_orphan_dirs; + unsigned int *osb_orphan_wipes; + wait_queue_head_t osb_wipe_event; }; #define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info) diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index dfb8a5bedfc..c5b1ac547c1 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -138,7 +138,6 @@ /* Journal limits (in bytes) */ #define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024) -#define OCFS2_MAX_JOURNAL_SIZE (500 * 1024 * 1024) struct ocfs2_system_inode_info { char *si_name; diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 364d64bd5f1..8dd3aafec49 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -932,7 +932,7 @@ static void ocfs2_inode_init_once(void *data, oi->ip_dir_start_lookup = 0; init_rwsem(&oi->ip_alloc_sem); - init_MUTEX(&(oi->ip_io_sem)); + mutex_init(&oi->ip_io_mutex); oi->ip_blkno = 0ULL; oi->ip_clusters = 0; @@ -1137,9 +1137,9 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) /* disable any new recovery threads and wait for any currently * running ones to exit. Do this before setting the vol_state. */ - down(&osb->recovery_lock); + mutex_lock(&osb->recovery_lock); osb->disable_recovery = 1; - up(&osb->recovery_lock); + mutex_unlock(&osb->recovery_lock); wait_event(osb->recovery_event, !ocfs2_recovery_thread_running(osb)); /* At this point, we know that no more recovery threads can be @@ -1254,8 +1254,7 @@ static int ocfs2_initialize_super(struct super_block *sb, osb->sb = sb; /* Save off for ocfs2_rw_direct */ osb->s_sectsize_bits = blksize_bits(sector_size); - if (!osb->s_sectsize_bits) - BUG(); + BUG_ON(!osb->s_sectsize_bits); osb->net_response_ids = 0; spin_lock_init(&osb->net_response_lock); @@ -1283,7 +1282,7 @@ static int ocfs2_initialize_super(struct super_block *sb, snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u", MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); - init_MUTEX(&osb->recovery_lock); + mutex_init(&osb->recovery_lock); osb->disable_recovery = 0; osb->recovery_thread_task = NULL; @@ -1326,6 +1325,16 @@ static int ocfs2_initialize_super(struct super_block *sb, } mlog(ML_NOTICE, "max_slots for this device: %u\n", osb->max_slots); + init_waitqueue_head(&osb->osb_wipe_event); + osb->osb_orphan_wipes = kcalloc(osb->max_slots, + sizeof(*osb->osb_orphan_wipes), + GFP_KERNEL); + if (!osb->osb_orphan_wipes) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + osb->s_feature_compat = le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_compat); osb->s_feature_ro_compat = @@ -1639,6 +1648,7 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb) if (osb->slot_info) ocfs2_free_slot_info(osb->slot_info); + kfree(osb->osb_orphan_wipes); /* FIXME * This belongs in journal shutdown, but because we have to * allocate osb->journal at the start of ocfs2_initalize_osb(), diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c index 600a8bc5b54..fc29cb7a437 100644 --- a/fs/ocfs2/sysfile.c +++ b/fs/ocfs2/sysfile.c @@ -77,8 +77,7 @@ struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb, if (arr && ((inode = *arr) != NULL)) { /* get a ref in addition to the array ref */ inode = igrab(inode); - if (!inode) - BUG(); + BUG_ON(!inode); return inode; } @@ -89,8 +88,7 @@ struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb, /* add one more if putting into array for first time */ if (arr && inode) { *arr = igrab(inode); - if (!*arr) - BUG(); + BUG_ON(!*arr); } return inode; } diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c index 3a0458fd3e1..300b5bedfb2 100644 --- a/fs/ocfs2/uptodate.c +++ b/fs/ocfs2/uptodate.c @@ -388,7 +388,7 @@ out_free: } } -/* Item insertion is guarded by ip_io_sem, so the insertion path takes +/* Item insertion is guarded by ip_io_mutex, so the insertion path takes * advantage of this by not rechecking for a duplicate insert during * the slow case. Additionally, if the cache needs to be bumped up to * a tree, the code will not recheck after acquiring the lock -- @@ -418,7 +418,7 @@ void ocfs2_set_buffer_uptodate(struct inode *inode, (unsigned long long) bh->b_blocknr); /* No need to recheck under spinlock - insertion is guarded by - * ip_io_sem */ + * ip_io_mutex */ spin_lock(&oi->ip_lock); if (ocfs2_insert_can_use_array(oi, ci)) { /* Fast case - it's an array and there's a free @@ -440,7 +440,7 @@ void ocfs2_set_buffer_uptodate(struct inode *inode, /* Called against a newly allocated buffer. Most likely nobody should * be able to read this sort of metadata while it's still being - * allocated, but this is careful to take ip_io_sem anyway. */ + * allocated, but this is careful to take ip_io_mutex anyway. */ void ocfs2_set_new_buffer_uptodate(struct inode *inode, struct buffer_head *bh) { @@ -451,9 +451,9 @@ void ocfs2_set_new_buffer_uptodate(struct inode *inode, set_buffer_uptodate(bh); - down(&oi->ip_io_sem); + mutex_lock(&oi->ip_io_mutex); ocfs2_set_buffer_uptodate(inode, bh); - up(&oi->ip_io_sem); + mutex_unlock(&oi->ip_io_mutex); } /* Requires ip_lock. */ @@ -537,7 +537,7 @@ int __init init_ocfs2_uptodate_cache(void) return 0; } -void __exit exit_ocfs2_uptodate_cache(void) +void exit_ocfs2_uptodate_cache(void) { if (ocfs2_uptodate_cachep) kmem_cache_destroy(ocfs2_uptodate_cachep); diff --git a/fs/ocfs2/uptodate.h b/fs/ocfs2/uptodate.h index e5aacdf4eab..01cd32d26b0 100644 --- a/fs/ocfs2/uptodate.h +++ b/fs/ocfs2/uptodate.h @@ -27,7 +27,7 @@ #define OCFS2_UPTODATE_H int __init init_ocfs2_uptodate_cache(void); -void __exit exit_ocfs2_uptodate_cache(void); +void exit_ocfs2_uptodate_cache(void); void ocfs2_metadata_cache_init(struct inode *inode); void ocfs2_metadata_cache_purge(struct inode *inode); diff --git a/fs/partitions/ibm.c b/fs/partitions/ibm.c index 78010ad60e4..1e4a93835fe 100644 --- a/fs/partitions/ibm.c +++ b/fs/partitions/ibm.c @@ -52,6 +52,7 @@ int ibm_partition(struct parsed_partitions *state, struct block_device *bdev) { int blocksize, offset, size; + loff_t i_size; dasd_information_t *info; struct hd_geometry *geo; char type[5] = {0,}; @@ -63,6 +64,13 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev) unsigned char *data; Sector sect; + blocksize = bdev_hardsect_size(bdev); + if (blocksize <= 0) + return 0; + i_size = i_size_read(bdev->bd_inode); + if (i_size == 0) + return 0; + if ((info = kmalloc(sizeof(dasd_information_t), GFP_KERNEL)) == NULL) goto out_noinfo; if ((geo = kmalloc(sizeof(struct hd_geometry), GFP_KERNEL)) == NULL) @@ -73,9 +81,6 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev) if (ioctl_by_bdev(bdev, BIODASDINFO, (unsigned long)info) != 0 || ioctl_by_bdev(bdev, HDIO_GETGEO, (unsigned long)geo) != 0) goto out_noioctl; - - if ((blocksize = bdev_hardsect_size(bdev)) <= 0) - goto out_badsect; /* * Get volume label, extract name and type. @@ -111,7 +116,7 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev) } else { printk("CMS1/%8s:", name); offset = (info->label_block + 1); - size = bdev->bd_inode->i_size >> 9; + size = i_size >> 9; } put_partition(state, 1, offset*(blocksize >> 9), size-offset*(blocksize >> 9)); @@ -168,7 +173,7 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev) else printk("(nonl)/%8s:", name); offset = (info->label_block + 1); - size = (bdev->bd_inode->i_size >> 9); + size = i_size >> 9; put_partition(state, 1, offset*(blocksize >> 9), size-offset*(blocksize >> 9)); } @@ -180,7 +185,6 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev) return 1; out_readerr: -out_badsect: out_noioctl: kfree(label); out_nolab: diff --git a/fs/pipe.c b/fs/pipe.c index d722579df79..8aada8e426f 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -605,7 +605,7 @@ struct file_operations rdwr_fifo_fops = { .fasync = pipe_rdwr_fasync, }; -struct file_operations read_pipe_fops = { +static struct file_operations read_pipe_fops = { .llseek = no_llseek, .read = pipe_read, .readv = pipe_readv, @@ -617,7 +617,7 @@ struct file_operations read_pipe_fops = { .fasync = pipe_read_fasync, }; -struct file_operations write_pipe_fops = { +static struct file_operations write_pipe_fops = { .llseek = no_llseek, .read = bad_pipe_r, .write = pipe_write, @@ -629,7 +629,7 @@ struct file_operations write_pipe_fops = { .fasync = pipe_write_fasync, }; -struct file_operations rdwr_pipe_fops = { +static struct file_operations rdwr_pipe_fops = { .llseek = no_llseek, .read = pipe_read, .readv = pipe_readv, diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 6573f31f1fd..075d3e94560 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -204,10 +204,6 @@ int proc_fill_super(struct super_block *s, void *data, int silent) root_inode = proc_get_inode(s, PROC_ROOT_INO, &proc_root); if (!root_inode) goto out_no_root; - /* - * Fixup the root inode's nlink value - */ - root_inode->i_nlink += nr_processes(); root_inode->i_uid = 0; root_inode->i_gid = 0; s->s_root = d_alloc_root(root_inode); diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index 8f8014285a3..1d24fead51a 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -548,7 +548,7 @@ static int show_stat(struct seq_file *p, void *v) } seq_printf(p, "intr %llu", (unsigned long long)sum); -#if !defined(CONFIG_PPC64) && !defined(CONFIG_ALPHA) +#if !defined(CONFIG_PPC64) && !defined(CONFIG_ALPHA) && !defined(CONFIG_IA64) for (i = 0; i < NR_IRQS; i++) seq_printf(p, " %u", kstat_irqs(i)); #endif diff --git a/fs/proc/root.c b/fs/proc/root.c index 68896283c8a..c3fd3611112 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -80,16 +80,16 @@ void __init proc_root_init(void) proc_bus = proc_mkdir("bus", NULL); } -static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentry, struct nameidata *nd) +static int proc_root_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat +) { - /* - * nr_threads is actually protected by the tasklist_lock; - * however, it's conventional to do reads, especially for - * reporting, without any locking whatsoever. - */ - if (dir->i_ino == PROC_ROOT_INO) /* check for safety... */ - dir->i_nlink = proc_root.nlink + nr_threads; + generic_fillattr(dentry->d_inode, stat); + stat->nlink = proc_root.nlink + nr_processes(); + return 0; +} +static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentry, struct nameidata *nd) +{ if (!proc_lookup(dir, dentry, nd)) { return NULL; } @@ -134,6 +134,7 @@ static struct file_operations proc_root_operations = { */ static struct inode_operations proc_root_inode_operations = { .lookup = proc_root_lookup, + .getattr = proc_root_getattr, }; /* diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 0eaad41f465..91b7c15ab37 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -204,7 +204,6 @@ static void smaps_pte_range(struct vm_area_struct *vma, pmd_t *pmd, { pte_t *pte, ptent; spinlock_t *ptl; - unsigned long pfn; struct page *page; pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); @@ -214,12 +213,12 @@ static void smaps_pte_range(struct vm_area_struct *vma, pmd_t *pmd, continue; mss->resident += PAGE_SIZE; - pfn = pte_pfn(ptent); - if (!pfn_valid(pfn)) + + page = vm_normal_page(vma, addr, ptent); + if (!page) continue; - page = pfn_to_page(pfn); - if (page_count(page) >= 2) { + if (page_mapcount(page) >= 2) { if (pte_dirty(ptent)) mss->shared_dirty += PAGE_SIZE; else @@ -289,7 +288,7 @@ static int show_smap(struct seq_file *m, void *v) struct mem_size_stats mss; memset(&mss, 0, sizeof mss); - if (vma->vm_mm) + if (vma->vm_mm && !is_vm_hugetlb_page(vma)) smaps_pgd_range(vma, vma->vm_start, vma->vm_end, &mss); return show_map_internal(m, v, &mss); } diff --git a/fs/quota_v2.c b/fs/quota_v2.c index a4ef91bb4f3..b4199ec3ece 100644 --- a/fs/quota_v2.c +++ b/fs/quota_v2.c @@ -35,7 +35,7 @@ static int v2_check_quota_file(struct super_block *sb, int type) size = sb->s_op->quota_read(sb, type, (char *)&dqhead, sizeof(struct v2_disk_dqheader), 0); if (size != sizeof(struct v2_disk_dqheader)) { - printk("quota_v2: failed read expected=%d got=%d\n", + printk("quota_v2: failed read expected=%zd got=%zd\n", sizeof(struct v2_disk_dqheader), size); return 0; } diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index c66bd5e4c05..14bd2246fb6 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -27,6 +27,7 @@ #include <linux/fs.h> #include <linux/pagemap.h> #include <linux/highmem.h> +#include <linux/time.h> #include <linux/init.h> #include <linux/string.h> #include <linux/smp_lock.h> @@ -104,6 +105,7 @@ ramfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) d_instantiate(dentry, inode); dget(dentry); /* Extra count - pin the dentry in core */ error = 0; + dir->i_mtime = dir->i_ctime = CURRENT_TIME; } return error; } @@ -135,6 +137,7 @@ static int ramfs_symlink(struct inode * dir, struct dentry *dentry, const char * inode->i_gid = dir->i_gid; d_instantiate(dentry, inode); dget(dentry); + dir->i_mtime = dir->i_ctime = CURRENT_TIME; } else iput(inode); } diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c index 9dd71e80703..d71ac657928 100644 --- a/fs/reiserfs/dir.c +++ b/fs/reiserfs/dir.c @@ -150,18 +150,15 @@ static int reiserfs_readdir(struct file *filp, void *dirent, filldir_t filldir) if (d_reclen <= 32) { local_buf = small_buf; } else { - local_buf = - reiserfs_kmalloc(d_reclen, GFP_NOFS, - inode->i_sb); + local_buf = kmalloc(d_reclen, + GFP_NOFS); if (!local_buf) { pathrelse(&path_to_entry); ret = -ENOMEM; goto out; } if (item_moved(&tmp_ih, &path_to_entry)) { - reiserfs_kfree(local_buf, - d_reclen, - inode->i_sb); + kfree(local_buf); goto research; } } @@ -174,15 +171,12 @@ static int reiserfs_readdir(struct file *filp, void *dirent, filldir_t filldir) (dirent, local_buf, d_reclen, d_off, d_ino, DT_UNKNOWN) < 0) { if (local_buf != small_buf) { - reiserfs_kfree(local_buf, - d_reclen, - inode->i_sb); + kfree(local_buf); } goto end; } if (local_buf != small_buf) { - reiserfs_kfree(local_buf, d_reclen, - inode->i_sb); + kfree(local_buf); } // next entry should be looked for with such offset next_pos = deh_offset(deh) + 1; diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index ad6fa964b0e..be12879bb17 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -192,6 +192,8 @@ static int reiserfs_allocate_blocks_for_region(struct reiserfs_transaction_handl allocated_blocks = kmalloc((blocks_to_allocate + will_prealloc) * sizeof(b_blocknr_t), GFP_NOFS); + if (!allocated_blocks) + return -ENOMEM; /* First we compose a key to point at the writing position, we want to do that outside of any locking region. */ @@ -1285,6 +1287,23 @@ static ssize_t reiserfs_file_write(struct file *file, /* the file we are going t struct reiserfs_transaction_handle th; th.t_trans_id = 0; + /* If a filesystem is converted from 3.5 to 3.6, we'll have v3.5 items + * lying around (most of the disk, in fact). Despite the filesystem + * now being a v3.6 format, the old items still can't support large + * file sizes. Catch this case here, as the rest of the VFS layer is + * oblivious to the different limitations between old and new items. + * reiserfs_setattr catches this for truncates. This chunk is lifted + * from generic_write_checks. */ + if (get_inode_item_key_version (inode) == KEY_FORMAT_3_5 && + *ppos + count > MAX_NON_LFS) { + if (*ppos >= MAX_NON_LFS) { + send_sig(SIGXFSZ, current, 0); + return -EFBIG; + } + if (count > MAX_NON_LFS - (unsigned long)*ppos) + count = MAX_NON_LFS - (unsigned long)*ppos; + } + if (file->f_flags & O_DIRECT) { // Direct IO needs treatment ssize_t result, after_file_end = 0; if ((*ppos + count >= inode->i_size) @@ -1445,13 +1464,11 @@ static ssize_t reiserfs_file_write(struct file *file, /* the file we are going t partially overwritten pages, if needed. And lock the pages, so that nobody else can access these until we are done. We get number of actual blocks needed as a result. */ - blocks_to_allocate = - reiserfs_prepare_file_region_for_write(inode, pos, - num_pages, - write_bytes, - prepared_pages); - if (blocks_to_allocate < 0) { - res = blocks_to_allocate; + res = reiserfs_prepare_file_region_for_write(inode, pos, + num_pages, + write_bytes, + prepared_pages); + if (res < 0) { reiserfs_release_claimed_blocks(inode->i_sb, num_pages << (PAGE_CACHE_SHIFT - @@ -1459,6 +1476,8 @@ static ssize_t reiserfs_file_write(struct file *file, /* the file we are going t break; } + blocks_to_allocate = res; + /* First we correct our estimate of how many blocks we need */ reiserfs_release_claimed_blocks(inode->i_sb, (num_pages << diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c index 45829889dcd..aa22588019e 100644 --- a/fs/reiserfs/fix_node.c +++ b/fs/reiserfs/fix_node.c @@ -2021,38 +2021,6 @@ static int get_neighbors(struct tree_balance *p_s_tb, int n_h) return CARRY_ON; } -#ifdef CONFIG_REISERFS_CHECK -void *reiserfs_kmalloc(size_t size, gfp_t flags, struct super_block *s) -{ - void *vp; - static size_t malloced; - - vp = kmalloc(size, flags); - if (vp) { - REISERFS_SB(s)->s_kmallocs += size; - if (REISERFS_SB(s)->s_kmallocs > malloced + 200000) { - reiserfs_warning(s, - "vs-8301: reiserfs_kmalloc: allocated memory %d", - REISERFS_SB(s)->s_kmallocs); - malloced = REISERFS_SB(s)->s_kmallocs; - } - } - return vp; -} - -void reiserfs_kfree(const void *vp, size_t size, struct super_block *s) -{ - kfree(vp); - - REISERFS_SB(s)->s_kmallocs -= size; - if (REISERFS_SB(s)->s_kmallocs < 0) - reiserfs_warning(s, - "vs-8302: reiserfs_kfree: allocated memory %d", - REISERFS_SB(s)->s_kmallocs); - -} -#endif - static int get_virtual_node_size(struct super_block *sb, struct buffer_head *bh) { int max_num_of_items; @@ -2086,7 +2054,7 @@ static int get_mem_for_virtual_node(struct tree_balance *tb) /* we have to allocate more memory for virtual node */ if (tb->vn_buf) { /* free memory allocated before */ - reiserfs_kfree(tb->vn_buf, tb->vn_buf_size, tb->tb_sb); + kfree(tb->vn_buf); /* this is not needed if kfree is atomic */ check_fs = 1; } @@ -2095,24 +2063,15 @@ static int get_mem_for_virtual_node(struct tree_balance *tb) tb->vn_buf_size = size; /* get memory for virtual item */ - buf = - reiserfs_kmalloc(size, GFP_ATOMIC | __GFP_NOWARN, - tb->tb_sb); + buf = kmalloc(size, GFP_ATOMIC | __GFP_NOWARN); if (!buf) { /* getting memory with GFP_KERNEL priority may involve balancing now (due to indirect_to_direct conversion on dcache shrinking). So, release path and collected resources here */ free_buffers_in_tb(tb); - buf = reiserfs_kmalloc(size, GFP_NOFS, tb->tb_sb); + buf = kmalloc(size, GFP_NOFS); if (!buf) { -#ifdef CONFIG_REISERFS_CHECK - reiserfs_warning(tb->tb_sb, - "vs-8345: get_mem_for_virtual_node: " - "kmalloc failed. reiserfs kmalloced %d bytes", - REISERFS_SB(tb->tb_sb)-> - s_kmallocs); -#endif tb->vn_buf_size = 0; } tb->vn_buf = buf; @@ -2619,7 +2578,6 @@ void unfix_nodes(struct tree_balance *tb) } } - if (tb->vn_buf) - reiserfs_kfree(tb->vn_buf, tb->vn_buf_size, tb->tb_sb); + kfree(tb->vn_buf); } diff --git a/fs/reiserfs/hashes.c b/fs/reiserfs/hashes.c index a3ec238fd9e..e664ac16fad 100644 --- a/fs/reiserfs/hashes.c +++ b/fs/reiserfs/hashes.c @@ -21,7 +21,6 @@ #include <linux/kernel.h> #include <linux/reiserfs_fs.h> #include <asm/types.h> -#include <asm/bug.h> #define DELTA 0x9E3779B9 #define FULLROUNDS 10 /* 32 is overkill, 16 is strong crypto */ diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index ffa34b861bd..d60f6238c66 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -627,11 +627,6 @@ int reiserfs_get_block(struct inode *inode, sector_t block, reiserfs_write_lock(inode->i_sb); version = get_inode_item_key_version(inode); - if (block < 0) { - reiserfs_write_unlock(inode->i_sb); - return -EIO; - } - if (!file_capable(inode, block)) { reiserfs_write_unlock(inode->i_sb); return -EFBIG; @@ -934,12 +929,13 @@ int reiserfs_get_block(struct inode *inode, sector_t block, //pos_in_item * inode->i_sb->s_blocksize, TYPE_INDIRECT, 3); // key type is unimportant + RFALSE(cpu_key_k_offset(&tmp_key) > cpu_key_k_offset(&key), + "green-805: invalid offset"); blocks_needed = 1 + ((cpu_key_k_offset(&key) - cpu_key_k_offset(&tmp_key)) >> inode->i_sb-> s_blocksize_bits); - RFALSE(blocks_needed < 0, "green-805: invalid offset"); if (blocks_needed == 1) { un = &unf_single; @@ -2363,6 +2359,13 @@ static int reiserfs_write_full_page(struct page *page, int bh_per_page = PAGE_CACHE_SIZE / s->s_blocksize; th.t_trans_id = 0; + /* no logging allowed when nonblocking or from PF_MEMALLOC */ + if (checked && (current->flags & PF_MEMALLOC)) { + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return 0; + } + /* The page dirty bit is cleared before writepage is called, which * means we have to tell create_empty_buffers to make dirty buffers * The page really should be up to date at this point, so tossing @@ -2743,6 +2746,7 @@ static int invalidatepage_can_drop(struct inode *inode, struct buffer_head *bh) int ret = 1; struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb); + lock_buffer(bh); spin_lock(&j->j_dirty_buffers_lock); if (!buffer_mapped(bh)) { goto free_jh; @@ -2758,7 +2762,7 @@ static int invalidatepage_can_drop(struct inode *inode, struct buffer_head *bh) if (buffer_journaled(bh) || buffer_journal_dirty(bh)) { ret = 0; } - } else if (buffer_dirty(bh) || buffer_locked(bh)) { + } else if (buffer_dirty(bh)) { struct reiserfs_journal_list *jl; struct reiserfs_jh *jh = bh->b_private; @@ -2784,6 +2788,7 @@ static int invalidatepage_can_drop(struct inode *inode, struct buffer_head *bh) reiserfs_free_jh(bh); } spin_unlock(&j->j_dirty_buffers_lock); + unlock_buffer(bh); return ret; } diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 4491fcf2a0e..5a9d2722fa0 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -152,18 +152,16 @@ static struct reiserfs_bitmap_node *allocate_bitmap_node(struct super_block struct reiserfs_bitmap_node *bn; static int id; - bn = reiserfs_kmalloc(sizeof(struct reiserfs_bitmap_node), GFP_NOFS, - p_s_sb); + bn = kmalloc(sizeof(struct reiserfs_bitmap_node), GFP_NOFS); if (!bn) { return NULL; } - bn->data = reiserfs_kmalloc(p_s_sb->s_blocksize, GFP_NOFS, p_s_sb); + bn->data = kzalloc(p_s_sb->s_blocksize, GFP_NOFS); if (!bn->data) { - reiserfs_kfree(bn, sizeof(struct reiserfs_bitmap_node), p_s_sb); + kfree(bn); return NULL; } bn->id = id++; - memset(bn->data, 0, p_s_sb->s_blocksize); INIT_LIST_HEAD(&bn->list); return bn; } @@ -197,8 +195,8 @@ static inline void free_bitmap_node(struct super_block *p_s_sb, struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); journal->j_used_bitmap_nodes--; if (journal->j_free_bitmap_nodes > REISERFS_MAX_BITMAP_NODES) { - reiserfs_kfree(bn->data, p_s_sb->s_blocksize, p_s_sb); - reiserfs_kfree(bn, sizeof(struct reiserfs_bitmap_node), p_s_sb); + kfree(bn->data); + kfree(bn); } else { list_add(&bn->list, &journal->j_bitmap_nodes); journal->j_free_bitmap_nodes++; @@ -276,8 +274,8 @@ static int free_bitmap_nodes(struct super_block *p_s_sb) while (next != &journal->j_bitmap_nodes) { bn = list_entry(next, struct reiserfs_bitmap_node, list); list_del(next); - reiserfs_kfree(bn->data, p_s_sb->s_blocksize, p_s_sb); - reiserfs_kfree(bn, sizeof(struct reiserfs_bitmap_node), p_s_sb); + kfree(bn->data); + kfree(bn); next = journal->j_bitmap_nodes.next; journal->j_free_bitmap_nodes--; } @@ -581,7 +579,7 @@ static inline void put_journal_list(struct super_block *s, jl->j_trans_id, jl->j_refcount); } if (--jl->j_refcount == 0) - reiserfs_kfree(jl, sizeof(struct reiserfs_journal_list), s); + kfree(jl); } /* @@ -848,6 +846,14 @@ static int write_ordered_buffers(spinlock_t * lock, spin_lock(lock); goto loop_next; } + /* in theory, dirty non-uptodate buffers should never get here, + * but the upper layer io error paths still have a few quirks. + * Handle them here as gracefully as we can + */ + if (!buffer_uptodate(bh) && buffer_dirty(bh)) { + clear_buffer_dirty(bh); + ret = -EIO; + } if (buffer_dirty(bh)) { list_del_init(&jh->list); list_add(&jh->list, &tmp); @@ -879,6 +885,19 @@ static int write_ordered_buffers(spinlock_t * lock, if (!buffer_uptodate(bh)) { ret = -EIO; } + /* ugly interaction with invalidatepage here. + * reiserfs_invalidate_page will pin any buffer that has a valid + * journal head from an older transaction. If someone else sets + * our buffer dirty after we write it in the first loop, and + * then someone truncates the page away, nobody will ever write + * the buffer. We're safe if we write the page one last time + * after freeing the journal header. + */ + if (buffer_dirty(bh) && unlikely(bh->b_page->mapping == NULL)) { + spin_unlock(lock); + ll_rw_block(WRITE, 1, &bh); + spin_lock(lock); + } put_bh(bh); cond_resched_lock(lock); } @@ -977,6 +996,7 @@ static int flush_commit_list(struct super_block *s, struct reiserfs_journal *journal = SB_JOURNAL(s); int barrier = 0; int retval = 0; + int write_len; reiserfs_check_lock_depth(s, "flush_commit_list"); @@ -1018,24 +1038,35 @@ static int flush_commit_list(struct super_block *s, } if (!list_empty(&jl->j_bh_list)) { + int ret; unlock_kernel(); - write_ordered_buffers(&journal->j_dirty_buffers_lock, - journal, jl, &jl->j_bh_list); + ret = write_ordered_buffers(&journal->j_dirty_buffers_lock, + journal, jl, &jl->j_bh_list); + if (ret < 0 && retval == 0) + retval = ret; lock_kernel(); } BUG_ON(!list_empty(&jl->j_bh_list)); /* * for the description block and all the log blocks, submit any buffers - * that haven't already reached the disk + * that haven't already reached the disk. Try to write at least 256 + * log blocks. later on, we will only wait on blocks that correspond + * to this transaction, but while we're unplugging we might as well + * get a chunk of data on there. */ atomic_inc(&journal->j_async_throttle); - for (i = 0; i < (jl->j_len + 1); i++) { + write_len = jl->j_len + 1; + if (write_len < 256) + write_len = 256; + for (i = 0 ; i < write_len ; i++) { bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) + (jl->j_start + i) % SB_ONDISK_JOURNAL_SIZE(s); tbh = journal_find_get_block(s, bn); - if (buffer_dirty(tbh)) /* redundant, ll_rw_block() checks */ - ll_rw_block(SWRITE, 1, &tbh); - put_bh(tbh); + if (tbh) { + if (buffer_dirty(tbh)) + ll_rw_block(WRITE, 1, &tbh) ; + put_bh(tbh) ; + } } atomic_dec(&journal->j_async_throttle); @@ -1818,8 +1849,7 @@ void remove_journal_hash(struct super_block *sb, static void free_journal_ram(struct super_block *p_s_sb) { struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); - reiserfs_kfree(journal->j_current_jl, - sizeof(struct reiserfs_journal_list), p_s_sb); + kfree(journal->j_current_jl); journal->j_num_lists--; vfree(journal->j_cnode_free_orig); @@ -2093,21 +2123,15 @@ static int journal_read_transaction(struct super_block *p_s_sb, } trans_id = get_desc_trans_id(desc); /* now we know we've got a good transaction, and it was inside the valid time ranges */ - log_blocks = - reiserfs_kmalloc(get_desc_trans_len(desc) * - sizeof(struct buffer_head *), GFP_NOFS, p_s_sb); - real_blocks = - reiserfs_kmalloc(get_desc_trans_len(desc) * - sizeof(struct buffer_head *), GFP_NOFS, p_s_sb); + log_blocks = kmalloc(get_desc_trans_len(desc) * + sizeof(struct buffer_head *), GFP_NOFS); + real_blocks = kmalloc(get_desc_trans_len(desc) * + sizeof(struct buffer_head *), GFP_NOFS); if (!log_blocks || !real_blocks) { brelse(c_bh); brelse(d_bh); - reiserfs_kfree(log_blocks, - get_desc_trans_len(desc) * - sizeof(struct buffer_head *), p_s_sb); - reiserfs_kfree(real_blocks, - get_desc_trans_len(desc) * - sizeof(struct buffer_head *), p_s_sb); + kfree(log_blocks); + kfree(real_blocks); reiserfs_warning(p_s_sb, "journal-1169: kmalloc failed, unable to mount FS"); return -1; @@ -2145,12 +2169,8 @@ static int journal_read_transaction(struct super_block *p_s_sb, brelse_array(real_blocks, i); brelse(c_bh); brelse(d_bh); - reiserfs_kfree(log_blocks, - get_desc_trans_len(desc) * - sizeof(struct buffer_head *), p_s_sb); - reiserfs_kfree(real_blocks, - get_desc_trans_len(desc) * - sizeof(struct buffer_head *), p_s_sb); + kfree(log_blocks); + kfree(real_blocks); return -1; } } @@ -2166,12 +2186,8 @@ static int journal_read_transaction(struct super_block *p_s_sb, brelse_array(real_blocks, get_desc_trans_len(desc)); brelse(c_bh); brelse(d_bh); - reiserfs_kfree(log_blocks, - get_desc_trans_len(desc) * - sizeof(struct buffer_head *), p_s_sb); - reiserfs_kfree(real_blocks, - get_desc_trans_len(desc) * - sizeof(struct buffer_head *), p_s_sb); + kfree(log_blocks); + kfree(real_blocks); return -1; } memcpy(real_blocks[i]->b_data, log_blocks[i]->b_data, @@ -2193,12 +2209,8 @@ static int journal_read_transaction(struct super_block *p_s_sb, get_desc_trans_len(desc) - i); brelse(c_bh); brelse(d_bh); - reiserfs_kfree(log_blocks, - get_desc_trans_len(desc) * - sizeof(struct buffer_head *), p_s_sb); - reiserfs_kfree(real_blocks, - get_desc_trans_len(desc) * - sizeof(struct buffer_head *), p_s_sb); + kfree(log_blocks); + kfree(real_blocks); return -1; } brelse(real_blocks[i]); @@ -2217,12 +2229,8 @@ static int journal_read_transaction(struct super_block *p_s_sb, journal->j_trans_id = trans_id + 1; brelse(c_bh); brelse(d_bh); - reiserfs_kfree(log_blocks, - le32_to_cpu(desc->j_len) * sizeof(struct buffer_head *), - p_s_sb); - reiserfs_kfree(real_blocks, - le32_to_cpu(desc->j_len) * sizeof(struct buffer_head *), - p_s_sb); + kfree(log_blocks); + kfree(real_blocks); return 0; } @@ -2311,8 +2319,7 @@ static int journal_read(struct super_block *p_s_sb) return 1; } jh = (struct reiserfs_journal_header *)(journal->j_header_bh->b_data); - if (le32_to_cpu(jh->j_first_unflushed_offset) >= 0 && - le32_to_cpu(jh->j_first_unflushed_offset) < + if (le32_to_cpu(jh->j_first_unflushed_offset) < SB_ONDISK_JOURNAL_SIZE(p_s_sb) && le32_to_cpu(jh->j_last_flush_trans_id) > 0) { oldest_start = @@ -2471,14 +2478,8 @@ static int journal_read(struct super_block *p_s_sb) static struct reiserfs_journal_list *alloc_journal_list(struct super_block *s) { struct reiserfs_journal_list *jl; - retry: - jl = reiserfs_kmalloc(sizeof(struct reiserfs_journal_list), GFP_NOFS, - s); - if (!jl) { - yield(); - goto retry; - } - memset(jl, 0, sizeof(*jl)); + jl = kzalloc(sizeof(struct reiserfs_journal_list), + GFP_NOFS | __GFP_NOFAIL); INIT_LIST_HEAD(&jl->j_list); INIT_LIST_HEAD(&jl->j_working_list); INIT_LIST_HEAD(&jl->j_tail_bh_list); @@ -2821,6 +2822,9 @@ int journal_transaction_should_end(struct reiserfs_transaction_handle *th, journal->j_cnode_free < (journal->j_trans_max * 3)) { return 1; } + /* protected by the BKL here */ + journal->j_len_alloc += new_alloc; + th->t_blocks_allocated += new_alloc ; return 0; } @@ -3042,14 +3046,12 @@ struct reiserfs_transaction_handle *reiserfs_persistent_transaction(struct } return th; } - th = reiserfs_kmalloc(sizeof(struct reiserfs_transaction_handle), - GFP_NOFS, s); + th = kmalloc(sizeof(struct reiserfs_transaction_handle), GFP_NOFS); if (!th) return NULL; ret = journal_begin(th, s, nblocks); if (ret) { - reiserfs_kfree(th, sizeof(struct reiserfs_transaction_handle), - s); + kfree(th); return NULL; } @@ -3067,8 +3069,7 @@ int reiserfs_end_persistent_transaction(struct reiserfs_transaction_handle *th) ret = -EIO; if (th->t_refcount == 0) { SB_JOURNAL(s)->j_persistent_trans--; - reiserfs_kfree(th, sizeof(struct reiserfs_transaction_handle), - s); + kfree(th); } return ret; } diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index 8f8d8d01107..284f7852de8 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -247,7 +247,7 @@ static int linear_search_in_dir_item(struct cpu_key *key, /* mark, that this generation number is used */ if (de->de_gen_number_bit_string) set_bit(GET_GENERATION_NUMBER(deh_offset(deh)), - (unsigned long *)de->de_gen_number_bit_string); + de->de_gen_number_bit_string); // calculate pointer to name and namelen de->de_entry_num = i; @@ -431,7 +431,7 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th, struct reiserfs_de_head *deh; INITIALIZE_PATH(path); struct reiserfs_dir_entry de; - int bit_string[MAX_GENERATION_NUMBER / (sizeof(int) * 8) + 1]; + DECLARE_BITMAP(bit_string, MAX_GENERATION_NUMBER + 1); int gen_number; char small_buf[32 + DEH_SIZE]; /* 48 bytes now and we avoid kmalloc if we create file with short name */ @@ -456,7 +456,7 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th, /* get memory for composing the entry */ buflen = DEH_SIZE + ROUND_UP(namelen); if (buflen > sizeof(small_buf)) { - buffer = reiserfs_kmalloc(buflen, GFP_NOFS, dir->i_sb); + buffer = kmalloc(buflen, GFP_NOFS); if (buffer == 0) return -ENOMEM; } else @@ -486,11 +486,11 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th, /* find the proper place for the new entry */ memset(bit_string, 0, sizeof(bit_string)); - de.de_gen_number_bit_string = (char *)bit_string; + de.de_gen_number_bit_string = bit_string; retval = reiserfs_find_entry(dir, name, namelen, &path, &de); if (retval != NAME_NOT_FOUND) { if (buffer != small_buf) - reiserfs_kfree(buffer, buflen, dir->i_sb); + kfree(buffer); pathrelse(&path); if (retval == IO_ERROR) { @@ -508,14 +508,14 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th, } gen_number = - find_first_zero_bit((unsigned long *)bit_string, + find_first_zero_bit(bit_string, MAX_GENERATION_NUMBER + 1); if (gen_number > MAX_GENERATION_NUMBER) { /* there is no free generation number */ reiserfs_warning(dir->i_sb, "reiserfs_add_entry: Congratulations! we have got hash function screwed up"); if (buffer != small_buf) - reiserfs_kfree(buffer, buflen, dir->i_sb); + kfree(buffer); pathrelse(&path); return -EBUSY; } @@ -535,7 +535,7 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th, &entry_key); if (buffer != small_buf) - reiserfs_kfree(buffer, buflen, dir->i_sb); + kfree(buffer); pathrelse(&path); return -EBUSY; } @@ -546,7 +546,7 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th, reiserfs_paste_into_item(th, &path, &entry_key, dir, buffer, paste_size); if (buffer != small_buf) - reiserfs_kfree(buffer, buflen, dir->i_sb); + kfree(buffer); if (retval) { reiserfs_check_path(&path); return retval; @@ -1065,7 +1065,7 @@ static int reiserfs_symlink(struct inode *parent_dir, goto out_failed; } - name = reiserfs_kmalloc(item_len, GFP_NOFS, parent_dir->i_sb); + name = kmalloc(item_len, GFP_NOFS); if (!name) { drop_new_inode(inode); retval = -ENOMEM; @@ -1079,14 +1079,14 @@ static int reiserfs_symlink(struct inode *parent_dir, retval = journal_begin(&th, parent_dir->i_sb, jbegin_count); if (retval) { drop_new_inode(inode); - reiserfs_kfree(name, item_len, parent_dir->i_sb); + kfree(name); goto out_failed; } retval = reiserfs_new_inode(&th, parent_dir, mode, name, strlen(symname), dentry, inode); - reiserfs_kfree(name, item_len, parent_dir->i_sb); + kfree(name); if (retval) { /* reiserfs_new_inode iputs for us */ goto out_failed; } diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c index fc2f43c75df..ef6caed9336 100644 --- a/fs/reiserfs/procfs.c +++ b/fs/reiserfs/procfs.c @@ -88,7 +88,6 @@ static int show_super(struct seq_file *m, struct super_block *sb) seq_printf(m, "state: \t%s\n" "mount options: \t%s%s%s%s%s%s%s%s%s%s%s\n" "gen. counter: \t%i\n" - "s_kmallocs: \t%i\n" "s_disk_reads: \t%i\n" "s_disk_writes: \t%i\n" "s_fix_nodes: \t%i\n" @@ -128,7 +127,7 @@ static int show_super(struct seq_file *m, struct super_block *sb) "SMALL_TAILS " : "NO_TAILS ", replay_only(sb) ? "REPLAY_ONLY " : "", convert_reiserfs(sb) ? "CONV " : "", - atomic_read(&r->s_generation_counter), SF(s_kmallocs), + atomic_read(&r->s_generation_counter), SF(s_disk_reads), SF(s_disk_writes), SF(s_fix_nodes), SF(s_do_balance), SF(s_unneeded_left_neighbor), SF(s_good_search_by_key_reada), SF(s_bmaps), diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 397d9590c8f..d63da756eb4 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -472,12 +472,6 @@ static void reiserfs_put_super(struct super_block *s) print_statistics(s); - if (REISERFS_SB(s)->s_kmallocs != 0) { - reiserfs_warning(s, - "vs-2004: reiserfs_put_super: allocated memory left %d", - REISERFS_SB(s)->s_kmallocs); - } - if (REISERFS_SB(s)->reserved_blocks != 0) { reiserfs_warning(s, "green-2005: reiserfs_put_super: reserved blocks left %d", @@ -1130,8 +1124,6 @@ static void handle_attrs(struct super_block *s) "reiserfs: cannot support attributes until flag is set in super-block"); REISERFS_SB(s)->s_mount_opt &= ~(1 << REISERFS_ATTRS); } - } else if (le32_to_cpu(rs->s_flags) & reiserfs_attrs_cleared) { - REISERFS_SB(s)->s_mount_opt |= REISERFS_ATTRS; } } diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index cc061bfd437..ffb79c48c5b 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -368,15 +368,13 @@ static int __xattr_readdir(struct file *filp, void *dirent, filldir_t filldir) if (d_reclen <= 32) { local_buf = small_buf; } else { - local_buf = - reiserfs_kmalloc(d_reclen, GFP_NOFS, inode->i_sb); + local_buf = kmalloc(d_reclen, GFP_NOFS); if (!local_buf) { pathrelse(&path_to_entry); return -ENOMEM; } if (item_moved(&tmp_ih, &path_to_entry)) { - reiserfs_kfree(local_buf, d_reclen, - inode->i_sb); + kfree(local_buf); /* sigh, must retry. Do this same offset again */ next_pos = d_off; @@ -399,13 +397,12 @@ static int __xattr_readdir(struct file *filp, void *dirent, filldir_t filldir) if (filldir(dirent, local_buf, d_reclen, d_off, d_ino, DT_UNKNOWN) < 0) { if (local_buf != small_buf) { - reiserfs_kfree(local_buf, d_reclen, - inode->i_sb); + kfree(local_buf); } goto end; } if (local_buf != small_buf) { - reiserfs_kfree(local_buf, d_reclen, inode->i_sb); + kfree(local_buf); } } /* while */ @@ -1322,109 +1319,44 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags) return err; } -static int -__reiserfs_permission(struct inode *inode, int mask, struct nameidata *nd, - int need_lock) +static int reiserfs_check_acl(struct inode *inode, int mask) { - umode_t mode = inode->i_mode; - - if (mask & MAY_WRITE) { - /* - * Nobody gets write access to a read-only fs. - */ - if (IS_RDONLY(inode) && - (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) - return -EROFS; + struct posix_acl *acl; + int error = -EAGAIN; /* do regular unix permission checks by default */ - /* - * Nobody gets write access to an immutable file. - */ - if (IS_IMMUTABLE(inode)) - return -EACCES; - } + reiserfs_read_lock_xattr_i(inode); + reiserfs_read_lock_xattrs(inode->i_sb); - /* We don't do permission checks on the internal objects. - * Permissions are determined by the "owning" object. */ - if (is_reiserfs_priv_object(inode)) - return 0; + acl = reiserfs_get_acl(inode, ACL_TYPE_ACCESS); - if (current->fsuid == inode->i_uid) { - mode >>= 6; -#ifdef CONFIG_REISERFS_FS_POSIX_ACL - } else if (reiserfs_posixacl(inode->i_sb) && - get_inode_sd_version(inode) != STAT_DATA_V1) { - struct posix_acl *acl; - - /* ACL can't contain additional permissions if - the ACL_MASK entry is 0 */ - if (!(mode & S_IRWXG)) - goto check_groups; - - if (need_lock) { - reiserfs_read_lock_xattr_i(inode); - reiserfs_read_lock_xattrs(inode->i_sb); - } - acl = reiserfs_get_acl(inode, ACL_TYPE_ACCESS); - if (need_lock) { - reiserfs_read_unlock_xattrs(inode->i_sb); - reiserfs_read_unlock_xattr_i(inode); - } - if (IS_ERR(acl)) { - if (PTR_ERR(acl) == -ENODATA) - goto check_groups; - return PTR_ERR(acl); - } + reiserfs_read_unlock_xattrs(inode->i_sb); + reiserfs_read_unlock_xattr_i(inode); - if (acl) { - int err = posix_acl_permission(inode, acl, mask); + if (acl) { + if (!IS_ERR(acl)) { + error = posix_acl_permission(inode, acl, mask); posix_acl_release(acl); - if (err == -EACCES) { - goto check_capabilities; - } - return err; - } else { - goto check_groups; - } -#endif - } else { - check_groups: - if (in_group_p(inode->i_gid)) - mode >>= 3; + } else if (PTR_ERR(acl) != -ENODATA) + error = PTR_ERR(acl); } - /* - * If the DACs are ok we don't need any capability check. - */ - if (((mode & mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == mask)) - return 0; + return error; +} - check_capabilities: +int reiserfs_permission(struct inode *inode, int mask, struct nameidata *nd) +{ /* - * Read/write DACs are always overridable. - * Executable DACs are overridable if at least one exec bit is set. + * We don't do permission checks on the internal objects. + * Permissions are determined by the "owning" object. */ - if (!(mask & MAY_EXEC) || - (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode)) - if (capable(CAP_DAC_OVERRIDE)) - return 0; + if (is_reiserfs_priv_object(inode)) + return 0; /* - * Searching includes executable on directories, else just read. + * Stat data v1 doesn't support ACLs. */ - if (mask == MAY_READ || (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE))) - if (capable(CAP_DAC_READ_SEARCH)) - return 0; - - return -EACCES; -} - -int reiserfs_permission(struct inode *inode, int mask, struct nameidata *nd) -{ - return __reiserfs_permission(inode, mask, nd, 1); -} - -int -reiserfs_permission_locked(struct inode *inode, int mask, struct nameidata *nd) -{ - return __reiserfs_permission(inode, mask, nd, 0); + if (get_inode_sd_version(inode) == STAT_DATA_V1) + return generic_permission(inode, mask, NULL); + else + return generic_permission(inode, mask, reiserfs_check_acl); } diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index 43de3ba8333..ab8894c3b9e 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -228,7 +228,8 @@ struct posix_acl *reiserfs_get_acl(struct inode *inode, int type) acl = ERR_PTR(retval); } else { acl = posix_acl_from_disk(value, retval); - *p_acl = posix_acl_dup(acl); + if (!IS_ERR(acl)) + *p_acl = posix_acl_dup(acl); } kfree(value); diff --git a/fs/select.c b/fs/select.c index c0f02d36c60..1815a57d225 100644 --- a/fs/select.c +++ b/fs/select.c @@ -398,11 +398,15 @@ asmlinkage long sys_select(int n, fd_set __user *inp, fd_set __user *outp, ret = core_sys_select(n, inp, outp, exp, &timeout); if (tvp) { + struct timeval rtv; + if (current->personality & STICKY_TIMEOUTS) goto sticky; - tv.tv_usec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)); - tv.tv_sec = timeout; - if (copy_to_user(tvp, &tv, sizeof(tv))) { + rtv.tv_usec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)); + rtv.tv_sec = timeout; + if (timeval_compare(&rtv, &tv) >= 0) + rtv = tv; + if (copy_to_user(tvp, &rtv, sizeof(rtv))) { sticky: /* * If an application puts its timeval in read-only @@ -460,11 +464,16 @@ asmlinkage long sys_pselect7(int n, fd_set __user *inp, fd_set __user *outp, ret = core_sys_select(n, inp, outp, exp, &timeout); if (tsp) { + struct timespec rts; + if (current->personality & STICKY_TIMEOUTS) goto sticky; - ts.tv_nsec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)) * 1000; - ts.tv_sec = timeout; - if (copy_to_user(tsp, &ts, sizeof(ts))) { + rts.tv_nsec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)) * + 1000; + rts.tv_sec = timeout; + if (timespec_compare(&rts, &ts) >= 0) + rts = ts; + if (copy_to_user(tsp, &rts, sizeof(rts))) { sticky: /* * If an application puts its timeval in read-only @@ -510,9 +519,9 @@ asmlinkage long sys_pselect6(int n, fd_set __user *inp, fd_set __user *outp, if (sig) { if (!access_ok(VERIFY_READ, sig, sizeof(void *)+sizeof(size_t)) - || __get_user(up, (sigset_t * __user *)sig) + || __get_user(up, (sigset_t __user * __user *)sig) || __get_user(sigsetsize, - (size_t * __user)(sig+sizeof(void *)))) + (size_t __user *)(sig+sizeof(void *)))) return -EFAULT; } @@ -758,12 +767,17 @@ asmlinkage long sys_ppoll(struct pollfd __user *ufds, unsigned int nfds, sigprocmask(SIG_SETMASK, &sigsaved, NULL); if (tsp && timeout >= 0) { + struct timespec rts; + if (current->personality & STICKY_TIMEOUTS) goto sticky; /* Yes, we know it's actually an s64, but it's also positive. */ - ts.tv_nsec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)) * 1000; - ts.tv_sec = timeout; - if (copy_to_user(tsp, &ts, sizeof(ts))) { + rts.tv_nsec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ)) * + 1000; + rts.tv_sec = timeout; + if (timespec_compare(&rts, &ts) >= 0) + rts = ts; + if (copy_to_user(tsp, &rts, sizeof(rts))) { sticky: /* * If an application puts its timeval in read-only diff --git a/fs/smbfs/dir.c b/fs/smbfs/dir.c index c6c33e15143..0424d06b147 100644 --- a/fs/smbfs/dir.c +++ b/fs/smbfs/dir.c @@ -209,6 +209,8 @@ init_cache: ctl.valid = 1; read_really: result = server->ops->readdir(filp, dirent, filldir, &ctl); + if (result == -ERESTARTSYS && page) + ClearPageUptodate(page); if (ctl.idx == -1) goto invalid_cache; /* retry */ ctl.head.end = ctl.fpos - 1; @@ -217,7 +219,8 @@ finished: if (page) { cache->head = ctl.head; kunmap(page); - SetPageUptodate(page); + if (result != -ERESTARTSYS) + SetPageUptodate(page); unlock_page(page); page_cache_release(page); } diff --git a/fs/stat.c b/fs/stat.c index 24211b030f3..9948cc1685a 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -261,6 +261,7 @@ asmlinkage long sys_newlstat(char __user *filename, struct stat __user *statbuf) return error; } +#ifndef __ARCH_WANT_STAT64 asmlinkage long sys_newfstatat(int dfd, char __user *filename, struct stat __user *statbuf, int flag) { @@ -281,6 +282,7 @@ asmlinkage long sys_newfstatat(int dfd, char __user *filename, out: return error; } +#endif asmlinkage long sys_newfstat(unsigned int fd, struct stat __user *statbuf) { @@ -395,6 +397,26 @@ asmlinkage long sys_fstat64(unsigned long fd, struct stat64 __user * statbuf) return error; } +asmlinkage long sys_fstatat64(int dfd, char __user *filename, + struct stat64 __user *statbuf, int flag) +{ + struct kstat stat; + int error = -EINVAL; + + if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0) + goto out; + + if (flag & AT_SYMLINK_NOFOLLOW) + error = vfs_lstat_fd(dfd, filename, &stat); + else + error = vfs_stat_fd(dfd, filename, &stat); + + if (!error) + error = cp_new_stat64(&stat, statbuf); + +out: + return error; +} #endif /* __ARCH_WANT_STAT64 */ void inode_add_bytes(struct inode *inode, loff_t bytes) diff --git a/fs/super.c b/fs/super.c index c177b92419c..e20b5580afd 100644 --- a/fs/super.c +++ b/fs/super.c @@ -247,8 +247,9 @@ void generic_shutdown_super(struct super_block *sb) /* Forget any remaining inodes */ if (invalidate_inodes(sb)) { - printk("VFS: Busy inodes after unmount. " - "Self-destruct in 5 seconds. Have a nice day...\n"); + printk("VFS: Busy inodes after unmount of %s. " + "Self-destruct in 5 seconds. Have a nice day...\n", + sb->s_id); } unlock_kernel(); @@ -665,6 +666,16 @@ static int test_bdev_super(struct super_block *s, void *data) return (void *)s->s_bdev == data; } +static void bdev_uevent(struct block_device *bdev, enum kobject_action action) +{ + if (bdev->bd_disk) { + if (bdev->bd_part) + kobject_uevent(&bdev->bd_part->kobj, action); + else + kobject_uevent(&bdev->bd_disk->kobj, action); + } +} + struct super_block *get_sb_bdev(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, int (*fill_super)(struct super_block *, void *, int)) @@ -706,8 +717,10 @@ struct super_block *get_sb_bdev(struct file_system_type *fs_type, up_write(&s->s_umount); deactivate_super(s); s = ERR_PTR(error); - } else + } else { s->s_flags |= MS_ACTIVE; + bdev_uevent(bdev, KOBJ_MOUNT); + } } return s; @@ -723,6 +736,7 @@ void kill_block_super(struct super_block *sb) { struct block_device *bdev = sb->s_bdev; + bdev_uevent(bdev, KOBJ_UMOUNT); generic_shutdown_super(sb); sync_blockdev(bdev); close_bdev_excl(bdev); diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c index 4fae57d9d11..201049ac8a9 100644 --- a/fs/udf/balloc.c +++ b/fs/udf/balloc.c @@ -579,10 +579,9 @@ static void udf_table_free_blocks(struct super_block * sb, { loffset = nextoffset; aed->lengthAllocDescs = cpu_to_le32(adsize); - if (obh) - sptr = UDF_I_DATA(inode) + nextoffset - udf_file_entry_alloc_offset(inode) + UDF_I_LENEATTR(inode) - adsize; - else - sptr = obh->b_data + nextoffset - adsize; + sptr = UDF_I_DATA(inode) + nextoffset - + udf_file_entry_alloc_offset(inode) + + UDF_I_LENEATTR(inode) - adsize; dptr = nbh->b_data + sizeof(struct allocExtDesc); memcpy(dptr, sptr, adsize); nextoffset = sizeof(struct allocExtDesc) + adsize; diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 395e582ee54..d04cff2273b 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -1045,10 +1045,14 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh) } inode->i_uid = le32_to_cpu(fe->uid); - if ( inode->i_uid == -1 ) inode->i_uid = UDF_SB(inode->i_sb)->s_uid; + if (inode->i_uid == -1 || UDF_QUERY_FLAG(inode->i_sb, + UDF_FLAG_UID_IGNORE)) + inode->i_uid = UDF_SB(inode->i_sb)->s_uid; inode->i_gid = le32_to_cpu(fe->gid); - if ( inode->i_gid == -1 ) inode->i_gid = UDF_SB(inode->i_sb)->s_gid; + if (inode->i_gid == -1 || UDF_QUERY_FLAG(inode->i_sb, + UDF_FLAG_GID_IGNORE)) + inode->i_gid = UDF_SB(inode->i_sb)->s_gid; inode->i_nlink = le16_to_cpu(fe->fileLinkCount); if (!inode->i_nlink) @@ -1335,10 +1339,14 @@ udf_update_inode(struct inode *inode, int do_sync) return err; } - if (inode->i_uid != UDF_SB(inode->i_sb)->s_uid) + if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_UID_FORGET)) + fe->uid = cpu_to_le32(-1); + else if (inode->i_uid != UDF_SB(inode->i_sb)->s_uid) fe->uid = cpu_to_le32(inode->i_uid); - if (inode->i_gid != UDF_SB(inode->i_sb)->s_gid) + if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_GID_FORGET)) + fe->gid = cpu_to_le32(-1); + else if (inode->i_gid != UDF_SB(inode->i_sb)->s_gid) fe->gid = cpu_to_le32(inode->i_gid); udfperms = ((inode->i_mode & S_IRWXO) ) | diff --git a/fs/udf/namei.c b/fs/udf/namei.c index ca732e79c48..ab9a7629d23 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -296,7 +296,7 @@ static struct dentry * udf_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) { struct inode *inode = NULL; - struct fileIdentDesc cfi, *fi; + struct fileIdentDesc cfi; struct udf_fileident_bh fibh; if (dentry->d_name.len > UDF_NAME_LEN-2) @@ -318,7 +318,7 @@ udf_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) else #endif /* UDF_RECOVERY */ - if ((fi = udf_find_entry(dir, dentry, &fibh, &cfi))) + if (udf_find_entry(dir, dentry, &fibh, &cfi)) { if (fibh.sbh != fibh.ebh) udf_release_data(fibh.ebh); diff --git a/fs/udf/super.c b/fs/udf/super.c index 4a6f49adc60..368d8f81fe5 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -269,7 +269,7 @@ enum { Opt_gid, Opt_uid, Opt_umask, Opt_session, Opt_lastblock, Opt_anchor, Opt_volume, Opt_partition, Opt_fileset, Opt_rootdir, Opt_utf8, Opt_iocharset, - Opt_err + Opt_err, Opt_uforget, Opt_uignore, Opt_gforget, Opt_gignore }; static match_table_t tokens = { @@ -282,6 +282,10 @@ static match_table_t tokens = { {Opt_adinicb, "adinicb"}, {Opt_shortad, "shortad"}, {Opt_longad, "longad"}, + {Opt_uforget, "uid=forget"}, + {Opt_uignore, "uid=ignore"}, + {Opt_gforget, "gid=forget"}, + {Opt_gignore, "gid=ignore"}, {Opt_gid, "gid=%u"}, {Opt_uid, "uid=%u"}, {Opt_umask, "umask=%o"}, @@ -414,6 +418,18 @@ udf_parse_options(char *options, struct udf_options *uopt) uopt->flags |= (1 << UDF_FLAG_NLS_MAP); break; #endif + case Opt_uignore: + uopt->flags |= (1 << UDF_FLAG_UID_IGNORE); + break; + case Opt_uforget: + uopt->flags |= (1 << UDF_FLAG_UID_FORGET); + break; + case Opt_gignore: + uopt->flags |= (1 << UDF_FLAG_GID_IGNORE); + break; + case Opt_gforget: + uopt->flags |= (1 << UDF_FLAG_GID_FORGET); + break; default: printk(KERN_ERR "udf: bad mount option \"%s\" " "or missing value\n", p); diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h index 663669810be..110f8d62616 100644 --- a/fs/udf/udf_sb.h +++ b/fs/udf/udf_sb.h @@ -20,6 +20,10 @@ #define UDF_FLAG_VARCONV 8 #define UDF_FLAG_NLS_MAP 9 #define UDF_FLAG_UTF8 10 +#define UDF_FLAG_UID_FORGET 11 /* save -1 for uid to disk */ +#define UDF_FLAG_UID_IGNORE 12 /* use sb uid instead of on disk uid */ +#define UDF_FLAG_GID_FORGET 13 +#define UDF_FLAG_GID_IGNORE 14 #define UDF_PART_FLAG_UNALLOC_BITMAP 0x0001 #define UDF_PART_FLAG_UNALLOC_TABLE 0x0002 diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index e0c04e36a05..3c3f62ce2ad 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -376,7 +376,7 @@ out: * This function gets the block which contains the fragment. */ -static int ufs_getfrag_block (struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create) +int ufs_getfrag_block (struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create) { struct super_block * sb = inode->i_sb; struct ufs_sb_private_info * uspi = UFS_SB(sb)->s_uspi; diff --git a/fs/ufs/super.c b/fs/ufs/super.c index d4aacee593f..e9055ef7f5a 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -388,7 +388,8 @@ static int ufs_parse_options (char * options, unsigned * mount_options) /* * Read on-disk structures associated with cylinder groups */ -static int ufs_read_cylinder_structures (struct super_block *sb) { +static int ufs_read_cylinder_structures (struct super_block *sb) +{ struct ufs_sb_info * sbi = UFS_SB(sb); struct ufs_sb_private_info * uspi; struct ufs_super_block *usb; @@ -415,6 +416,7 @@ static int ufs_read_cylinder_structures (struct super_block *sb) { base = space = kmalloc(size, GFP_KERNEL); if (!base) goto failed; + sbi->s_csp = (struct ufs_csum *)space; for (i = 0; i < blks; i += uspi->s_fpb) { size = uspi->s_bsize; if (i + uspi->s_fpb > blks) @@ -430,7 +432,6 @@ static int ufs_read_cylinder_structures (struct super_block *sb) { goto failed; ubh_ubhcpymem (space, ubh, size); - sbi->s_csp[ufs_fragstoblks(i)]=(struct ufs_csum *)space; space += size; ubh_brelse (ubh); @@ -486,7 +487,8 @@ failed: * Put on-disk structures associated with cylinder groups and * write them back to disk */ -static void ufs_put_cylinder_structures (struct super_block *sb) { +static void ufs_put_cylinder_structures (struct super_block *sb) +{ struct ufs_sb_info * sbi = UFS_SB(sb); struct ufs_sb_private_info * uspi; struct ufs_buffer_head * ubh; @@ -499,7 +501,7 @@ static void ufs_put_cylinder_structures (struct super_block *sb) { size = uspi->s_cssize; blks = (size + uspi->s_fsize - 1) >> uspi->s_fshift; - base = space = (char*) sbi->s_csp[0]; + base = space = (char*) sbi->s_csp; for (i = 0; i < blks; i += uspi->s_fpb) { size = uspi->s_bsize; if (i + uspi->s_fpb > blks) diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c index 61d2e35012a..02e86291ef8 100644 --- a/fs/ufs/truncate.c +++ b/fs/ufs/truncate.c @@ -29,6 +29,11 @@ * Idea from Pierre del Perugia <delperug@gla.ecoledoc.ibp.fr> */ +/* + * Modified to avoid infinite loop on 2006 by + * Evgeniy Dushistov <dushistov@mail.ru> + */ + #include <linux/errno.h> #include <linux/fs.h> #include <linux/ufs_fs.h> @@ -65,19 +70,16 @@ #define DIRECT_BLOCK ((inode->i_size + uspi->s_bsize - 1) >> uspi->s_bshift) #define DIRECT_FRAGMENT ((inode->i_size + uspi->s_fsize - 1) >> uspi->s_fshift) -#define DATA_BUFFER_USED(bh) \ - (atomic_read(&bh->b_count)>1 || buffer_locked(bh)) static int ufs_trunc_direct (struct inode * inode) { struct ufs_inode_info *ufsi = UFS_I(inode); struct super_block * sb; struct ufs_sb_private_info * uspi; - struct buffer_head * bh; __fs32 * p; unsigned frag1, frag2, frag3, frag4, block1, block2; unsigned frag_to_free, free_count; - unsigned i, j, tmp; + unsigned i, tmp; int retry; UFSD(("ENTER\n")) @@ -117,15 +119,7 @@ static int ufs_trunc_direct (struct inode * inode) ufs_panic (sb, "ufs_trunc_direct", "internal error"); frag1 = ufs_fragnum (frag1); frag2 = ufs_fragnum (frag2); - for (j = frag1; j < frag2; j++) { - bh = sb_find_get_block (sb, tmp + j); - if ((bh && DATA_BUFFER_USED(bh)) || tmp != fs32_to_cpu(sb, *p)) { - retry = 1; - brelse (bh); - goto next1; - } - bforget (bh); - } + inode->i_blocks -= (frag2-frag1) << uspi->s_nspfshift; mark_inode_dirty(inode); ufs_free_fragments (inode, tmp + frag1, frag2 - frag1); @@ -140,15 +134,7 @@ next1: tmp = fs32_to_cpu(sb, *p); if (!tmp) continue; - for (j = 0; j < uspi->s_fpb; j++) { - bh = sb_find_get_block(sb, tmp + j); - if ((bh && DATA_BUFFER_USED(bh)) || tmp != fs32_to_cpu(sb, *p)) { - retry = 1; - brelse (bh); - goto next2; - } - bforget (bh); - } + *p = 0; inode->i_blocks -= uspi->s_nspb; mark_inode_dirty(inode); @@ -162,7 +148,6 @@ next1: frag_to_free = tmp; free_count = uspi->s_fpb; } -next2:; } if (free_count > 0) @@ -179,15 +164,7 @@ next2:; if (!tmp ) ufs_panic(sb, "ufs_truncate_direct", "internal error"); frag4 = ufs_fragnum (frag4); - for (j = 0; j < frag4; j++) { - bh = sb_find_get_block (sb, tmp + j); - if ((bh && DATA_BUFFER_USED(bh)) || tmp != fs32_to_cpu(sb, *p)) { - retry = 1; - brelse (bh); - goto next1; - } - bforget (bh); - } + *p = 0; inode->i_blocks -= frag4 << uspi->s_nspfshift; mark_inode_dirty(inode); @@ -204,9 +181,8 @@ static int ufs_trunc_indirect (struct inode * inode, unsigned offset, __fs32 *p) struct super_block * sb; struct ufs_sb_private_info * uspi; struct ufs_buffer_head * ind_ubh; - struct buffer_head * bh; __fs32 * ind; - unsigned indirect_block, i, j, tmp; + unsigned indirect_block, i, tmp; unsigned frag_to_free, free_count; int retry; @@ -238,15 +214,7 @@ static int ufs_trunc_indirect (struct inode * inode, unsigned offset, __fs32 *p) tmp = fs32_to_cpu(sb, *ind); if (!tmp) continue; - for (j = 0; j < uspi->s_fpb; j++) { - bh = sb_find_get_block(sb, tmp + j); - if ((bh && DATA_BUFFER_USED(bh)) || tmp != fs32_to_cpu(sb, *ind)) { - retry = 1; - brelse (bh); - goto next; - } - bforget (bh); - } + *ind = 0; ubh_mark_buffer_dirty(ind_ubh); if (free_count == 0) { @@ -261,7 +229,6 @@ static int ufs_trunc_indirect (struct inode * inode, unsigned offset, __fs32 *p) } inode->i_blocks -= uspi->s_nspb; mark_inode_dirty(inode); -next:; } if (free_count > 0) { @@ -430,9 +397,7 @@ void ufs_truncate (struct inode * inode) struct ufs_inode_info *ufsi = UFS_I(inode); struct super_block * sb; struct ufs_sb_private_info * uspi; - struct buffer_head * bh; - unsigned offset; - int err, retry; + int retry; UFSD(("ENTER\n")) sb = inode->i_sb; @@ -442,6 +407,9 @@ void ufs_truncate (struct inode * inode) return; if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) return; + + block_truncate_page(inode->i_mapping, inode->i_size, ufs_getfrag_block); + lock_kernel(); while (1) { retry = ufs_trunc_direct(inode); @@ -457,15 +425,7 @@ void ufs_truncate (struct inode * inode) blk_run_address_space(inode->i_mapping); yield(); } - offset = inode->i_size & uspi->s_fshift; - if (offset) { - bh = ufs_bread (inode, inode->i_size >> uspi->s_fshift, 0, &err); - if (bh) { - memset (bh->b_data + offset, 0, uspi->s_fsize - offset); - mark_buffer_dirty (bh); - brelse (bh); - } - } + inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; ufsi->i_lastfrag = DIRECT_FRAGMENT; unlock_kernel(); diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index 12062678940..74d8be87f98 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -540,7 +540,7 @@ xfs_probe_cluster( /* First sum forwards in this page */ do { - if (mapped != buffer_mapped(bh)) + if (!buffer_uptodate(bh) || (mapped != buffer_mapped(bh))) return total; total += bh->b_size; } while ((bh = bh->b_this_page) != head); @@ -747,10 +747,11 @@ xfs_convert_page( struct backing_dev_info *bdi; bdi = inode->i_mapping->backing_dev_info; + wbc->nr_to_write--; if (bdi_write_congested(bdi)) { wbc->encountered_congestion = 1; done = 1; - } else if (--wbc->nr_to_write <= 0) { + } else if (wbc->nr_to_write <= 0) { done = 1; } } @@ -1462,4 +1463,5 @@ struct address_space_operations linvfs_aops = { .commit_write = generic_commit_write, .bmap = linvfs_bmap, .direct_IO = linvfs_direct_IO, + .migratepage = buffer_migrate_page, }; diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index e44b7c1a3a3..bfb4f2917bb 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -822,6 +822,13 @@ xfs_buf_rele( XB_TRACE(bp, "rele", bp->b_relse); + if (unlikely(!hash)) { + ASSERT(!bp->b_relse); + if (atomic_dec_and_test(&bp->b_hold)) + xfs_buf_free(bp); + return; + } + if (atomic_dec_and_lock(&bp->b_hold, &hash->bh_lock)) { if (bp->b_relse) { atomic_inc(&bp->b_hold); @@ -1514,6 +1521,7 @@ xfs_mapping_buftarg( struct address_space *mapping; static struct address_space_operations mapping_aops = { .sync_page = block_sync_page, + .migratepage = fail_migrate_page, }; inode = new_inode(bdev->bd_inode->i_sb); diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index 76c6df34d0d..d7f6f2d8ac8 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -262,6 +262,31 @@ has_fs_struct(struct task_struct *task) return (task->fs != init_task.fs); } +STATIC inline void +cleanup_inode( + vnode_t *dvp, + vnode_t *vp, + struct dentry *dentry, + int mode) +{ + struct dentry teardown = {}; + int err2; + + /* Oh, the horror. + * If we can't add the ACL or we fail in + * linvfs_init_security we must back out. + * ENOSPC can hit here, among other things. + */ + teardown.d_inode = LINVFS_GET_IP(vp); + teardown.d_name = dentry->d_name; + + if (S_ISDIR(mode)) + VOP_RMDIR(dvp, &teardown, NULL, err2); + else + VOP_REMOVE(dvp, &teardown, NULL, err2); + VN_RELE(vp); +} + STATIC int linvfs_mknod( struct inode *dir, @@ -316,30 +341,19 @@ linvfs_mknod( } if (!error) + { error = linvfs_init_security(vp, dir); + if (error) + cleanup_inode(dvp, vp, dentry, mode); + } if (default_acl) { if (!error) { error = _ACL_INHERIT(vp, &va, default_acl); - if (!error) { + if (!error) VMODIFY(vp); - } else { - struct dentry teardown = {}; - int err2; - - /* Oh, the horror. - * If we can't add the ACL we must back out. - * ENOSPC can hit here, among other things. - */ - teardown.d_inode = ip = LINVFS_GET_IP(vp); - teardown.d_name = dentry->d_name; - - if (S_ISDIR(mode)) - VOP_RMDIR(dvp, &teardown, NULL, err2); - else - VOP_REMOVE(dvp, &teardown, NULL, err2); - VN_RELE(vp); - } + else + cleanup_inode(dvp, vp, dentry, mode); } _ACL_FREE(default_acl); } @@ -659,6 +673,8 @@ linvfs_setattr( if (ia_valid & ATTR_ATIME) { vattr.va_mask |= XFS_AT_ATIME; vattr.va_atime = attr->ia_atime; + if (ia_valid & ATTR_ATIME_SET) + inode->i_atime = attr->ia_atime; } if (ia_valid & ATTR_MTIME) { vattr.va_mask |= XFS_AT_MTIME; diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c index 53a00fb217f..7c0e39dc618 100644 --- a/fs/xfs/quota/xfs_qm.c +++ b/fs/xfs/quota/xfs_qm.c @@ -68,6 +68,9 @@ kmem_zone_t *qm_dqzone; kmem_zone_t *qm_dqtrxzone; STATIC kmem_shaker_t xfs_qm_shaker; +STATIC cred_t xfs_zerocr; +STATIC xfs_inode_t xfs_zeroino; + STATIC void xfs_qm_list_init(xfs_dqlist_t *, char *, int); STATIC void xfs_qm_list_destroy(xfs_dqlist_t *); @@ -1393,8 +1396,6 @@ xfs_qm_qino_alloc( xfs_trans_t *tp; int error; unsigned long s; - cred_t zerocr; - xfs_inode_t zeroino; int committed; tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QINOCREATE); @@ -1406,11 +1407,9 @@ xfs_qm_qino_alloc( xfs_trans_cancel(tp, 0); return error; } - memset(&zerocr, 0, sizeof(zerocr)); - memset(&zeroino, 0, sizeof(zeroino)); - if ((error = xfs_dir_ialloc(&tp, &zeroino, S_IFREG, 1, 0, - &zerocr, 0, 1, ip, &committed))) { + if ((error = xfs_dir_ialloc(&tp, &xfs_zeroino, S_IFREG, 1, 0, + &xfs_zerocr, 0, 1, ip, &committed))) { xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); return error; diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 06fc061c50f..5b413946b1c 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -130,7 +130,8 @@ xfs_growfs_rt_alloc( /* * Lock the inode. */ - if ((error = xfs_trans_iget(mp, tp, ino, 0, XFS_ILOCK_EXCL, &ip))) + if ((error = xfs_trans_iget(mp, tp, ino, 0, + XFS_ILOCK_EXCL, &ip))) goto error_exit; XFS_BMAP_INIT(&flist, &firstblock); /* @@ -170,8 +171,8 @@ xfs_growfs_rt_alloc( /* * Lock the bitmap inode. */ - if ((error = xfs_trans_iget(mp, tp, ino, 0, XFS_ILOCK_EXCL, - &ip))) + if ((error = xfs_trans_iget(mp, tp, ino, 0, + XFS_ILOCK_EXCL, &ip))) goto error_exit; /* * Get a buffer for the block. @@ -2023,8 +2024,8 @@ xfs_growfs_rt( /* * Lock out other callers by grabbing the bitmap inode lock. */ - if ((error = xfs_trans_iget(mp, tp, 0, mp->m_sb.sb_rbmino, - XFS_ILOCK_EXCL, &ip))) + if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0, + XFS_ILOCK_EXCL, &ip))) goto error_exit; ASSERT(ip == mp->m_rbmip); /* @@ -2037,8 +2038,8 @@ xfs_growfs_rt( /* * Get the summary inode into the transaction. */ - if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rsumino, - 0, XFS_ILOCK_EXCL, &ip))) + if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rsumino, 0, + XFS_ILOCK_EXCL, &ip))) goto error_exit; ASSERT(ip == mp->m_rsumip); /* @@ -2158,10 +2159,9 @@ xfs_rtallocate_extent( /* * Lock out other callers by grabbing the bitmap inode lock. */ - error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0, XFS_ILOCK_EXCL, &ip); - if (error) { + if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0, + XFS_ILOCK_EXCL, &ip))) return error; - } sumbp = NULL; /* * Allocate by size, or near another block, or exactly at some block. @@ -2221,10 +2221,9 @@ xfs_rtfree_extent( /* * Synchronize by locking the bitmap inode. */ - error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0, XFS_ILOCK_EXCL, &ip); - if (error) { + if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0, + XFS_ILOCK_EXCL, &ip))) return error; - } #if defined(__KERNEL__) && defined(DEBUG) /* * Check to see that this whole range is currently allocated. @@ -2365,8 +2364,8 @@ xfs_rtpick_extent( __uint64_t seq; /* sequence number of file creation */ __uint64_t *seqp; /* pointer to seqno in inode */ - error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0, XFS_ILOCK_EXCL, &ip); - if (error) + if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0, + XFS_ILOCK_EXCL, &ip))) return error; ASSERT(ip == mp->m_rbmip); seqp = (__uint64_t *)&ip->i_d.di_atime; |