From ea455f8ab68338ba69f5d3362b342c115bea8e13 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 12 Jan 2009 23:20:31 +0100 Subject: ocfs2: Push out dropping of dentry lock to ocfs2_wq Dropping of last reference to dentry lock is a complicated operation involving dropping of reference to inode. This can get complicated and quota code in particular needs to obtain some quota locks which leads to potential deadlock. Thus we defer dropping of inode reference to ocfs2_wq. Signed-off-by: Jan Kara Signed-off-by: Mark Fasheh --- fs/ocfs2/dcache.c | 42 +++++++++++++++++++++++++++++++++++++++--- fs/ocfs2/dcache.h | 9 ++++++++- fs/ocfs2/ocfs2.h | 6 ++++++ fs/ocfs2/super.c | 3 +++ 4 files changed, 56 insertions(+), 4 deletions(-) diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c index b1cc7c381e8..e9d7c2038c0 100644 --- a/fs/ocfs2/dcache.c +++ b/fs/ocfs2/dcache.c @@ -38,6 +38,7 @@ #include "dlmglue.h" #include "file.h" #include "inode.h" +#include "super.h" static int ocfs2_dentry_revalidate(struct dentry *dentry, @@ -294,6 +295,34 @@ out_attach: return ret; } +static DEFINE_SPINLOCK(dentry_list_lock); + +/* We limit the number of dentry locks to drop in one go. We have + * this limit so that we don't starve other users of ocfs2_wq. */ +#define DL_INODE_DROP_COUNT 64 + +/* Drop inode references from dentry locks */ +void ocfs2_drop_dl_inodes(struct work_struct *work) +{ + struct ocfs2_super *osb = container_of(work, struct ocfs2_super, + dentry_lock_work); + struct ocfs2_dentry_lock *dl; + int drop_count = DL_INODE_DROP_COUNT; + + spin_lock(&dentry_list_lock); + while (osb->dentry_lock_list && drop_count--) { + dl = osb->dentry_lock_list; + osb->dentry_lock_list = dl->dl_next; + spin_unlock(&dentry_list_lock); + iput(dl->dl_inode); + kfree(dl); + spin_lock(&dentry_list_lock); + } + if (osb->dentry_lock_list) + queue_work(ocfs2_wq, &osb->dentry_lock_work); + spin_unlock(&dentry_list_lock); +} + /* * ocfs2_dentry_iput() and friends. * @@ -318,16 +347,23 @@ out_attach: static void ocfs2_drop_dentry_lock(struct ocfs2_super *osb, struct ocfs2_dentry_lock *dl) { - iput(dl->dl_inode); ocfs2_simple_drop_lockres(osb, &dl->dl_lockres); ocfs2_lock_res_free(&dl->dl_lockres); - kfree(dl); + + /* We leave dropping of inode reference to ocfs2_wq as that can + * possibly lead to inode deletion which gets tricky */ + spin_lock(&dentry_list_lock); + if (!osb->dentry_lock_list) + queue_work(ocfs2_wq, &osb->dentry_lock_work); + dl->dl_next = osb->dentry_lock_list; + osb->dentry_lock_list = dl; + spin_unlock(&dentry_list_lock); } void ocfs2_dentry_lock_put(struct ocfs2_super *osb, struct ocfs2_dentry_lock *dl) { - int unlock = 0; + int unlock; BUG_ON(dl->dl_count == 0); diff --git a/fs/ocfs2/dcache.h b/fs/ocfs2/dcache.h index c091c34d988..d06e16c0664 100644 --- a/fs/ocfs2/dcache.h +++ b/fs/ocfs2/dcache.h @@ -29,8 +29,13 @@ extern struct dentry_operations ocfs2_dentry_ops; struct ocfs2_dentry_lock { + /* Use count of dentry lock */ unsigned int dl_count; - u64 dl_parent_blkno; + union { + /* Linked list of dentry locks to release */ + struct ocfs2_dentry_lock *dl_next; + u64 dl_parent_blkno; + }; /* * The ocfs2_dentry_lock keeps an inode reference until @@ -47,6 +52,8 @@ int ocfs2_dentry_attach_lock(struct dentry *dentry, struct inode *inode, void ocfs2_dentry_lock_put(struct ocfs2_super *osb, struct ocfs2_dentry_lock *dl); +void ocfs2_drop_dl_inodes(struct work_struct *work); + struct dentry *ocfs2_find_local_alias(struct inode *inode, u64 parent_blkno, int skip_unhashed); diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index ad5c24a29ed..077384135f4 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -210,6 +210,7 @@ struct ocfs2_journal; struct ocfs2_slot_info; struct ocfs2_recovery_map; struct ocfs2_quota_recovery; +struct ocfs2_dentry_lock; struct ocfs2_super { struct task_struct *commit_task; @@ -325,6 +326,11 @@ struct ocfs2_super struct list_head blocked_lock_list; unsigned long blocked_lock_count; + /* List of dentry locks to release. Anyone can add locks to + * the list, ocfs2_wq processes the list */ + struct ocfs2_dentry_lock *dentry_lock_list; + struct work_struct dentry_lock_work; + wait_queue_head_t osb_mount_event; /* Truncate log info */ diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 43ed11345b5..b1cb38fbe80 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1887,6 +1887,9 @@ static int ocfs2_initialize_super(struct super_block *sb, INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery); journal->j_state = OCFS2_JOURNAL_FREE; + INIT_WORK(&osb->dentry_lock_work, ocfs2_drop_dl_inodes); + osb->dentry_lock_list = NULL; + /* get some pseudo constants for clustersize bits */ osb->s_clustersize_bits = le32_to_cpu(di->id2.i_super.s_clustersize_bits); -- cgit v1.2.3 From f8afead7169f0f28a4b421bcbdb510e52a2d094d Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 12 Jan 2009 23:20:32 +0100 Subject: ocfs2: Fix possible deadlock in ocfs2_write_dquot() It could happen that some limit has been set via quotactl() and in parallel ->mark_dirty() is called from another thread doing e.g. dquot_alloc_space(). In such case ocfs2_write_dquot() must not try to sync the dquot because that needs global quota lock but that ranks above transaction start. Signed-off-by: Jan Kara Signed-off-by: Mark Fasheh --- fs/ocfs2/quota_global.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index f4efa89baee..1ed0f7c8686 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -754,7 +754,9 @@ static int ocfs2_mark_dquot_dirty(struct dquot *dquot) if (dquot->dq_flags & mask) sync = 1; spin_unlock(&dq_data_lock); - if (!sync) { + /* This is a slight hack but we can't afford getting global quota + * lock if we already have a transaction started. */ + if (!sync || journal_current_handle()) { status = ocfs2_write_dquot(dquot); goto out; } -- cgit v1.2.3 From 0e0333429a6280e6eb3c98845e4eed90d5f8078a Mon Sep 17 00:00:00 2001 From: Joel Becker Date: Wed, 17 Dec 2008 14:23:52 -0800 Subject: configfs: Silence lockdep on mkdir(), rmdir() and configfs_depend_item() When attaching default groups (subdirs) of a new group (in mkdir() or in configfs_register()), configfs recursively takes inode's mutexes along the path from the parent of the new group to the default subdirs. This is needed to ensure that the VFS will not race with operations on these sub-dirs. This is safe for the following reasons: - the VFS allows one to lock first an inode and second one of its children (The lock subclasses for this pattern are respectively I_MUTEX_PARENT and I_MUTEX_CHILD); - from this rule any inode path can be recursively locked in descending order as long as it stays under a single mountpoint and does not follow symlinks. Unfortunately lockdep does not know (yet?) how to handle such recursion. I've tried to use Peter Zijlstra's lock_set_subclass() helper to upgrade i_mutexes from I_MUTEX_CHILD to I_MUTEX_PARENT when we know that we might recursively lock some of their descendant, but this usage does not seem to fit the purpose of lock_set_subclass() because it leads to several i_mutex locked with subclass I_MUTEX_PARENT by the same task. >From inside configfs it is not possible to serialize those recursive locking with a top-level one, because mkdir() and rmdir() are already called with inodes locked by the VFS. So using some mutex_lock_nest_lock() is not an option. I am proposing two solutions: 1) one that wraps recursive mutex_lock()s with lockdep_off()/lockdep_on(). 2) (as suggested earlier by Peter Zijlstra) one that puts the i_mutexes recursively locked in different classes based on their depth from the top-level config_group created. This induces an arbitrary limit (MAX_LOCK_DEPTH - 2 == 46) on the nesting of configfs default groups whenever lockdep is activated but this limit looks reasonably high. Unfortunately, this alos isolates VFS operations on configfs default groups from the others and thus lowers the chances to detect locking issues. This patch implements solution 1). Solution 2) looks better from lockdep's point of view, but fails with configfs_depend_item(). This needs to rework the locking scheme of configfs_depend_item() by removing the variable lock recursion depth, and I think that it's doable thanks to the configfs_dirent_lock. For now, let's stick to solution 1). Signed-off-by: Louis Rilling Acked-by: Joel Becker Signed-off-by: Mark Fasheh --- fs/configfs/dir.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index 8e93341f3e8..9c235839114 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -553,12 +553,24 @@ static void detach_groups(struct config_group *group) child = sd->s_dentry; + /* + * Note: we hide this from lockdep since we have no way + * to teach lockdep about recursive + * I_MUTEX_PARENT -> I_MUTEX_CHILD patterns along a path + * in an inode tree, which are valid as soon as + * I_MUTEX_PARENT -> I_MUTEX_CHILD is valid from a + * parent inode to one of its children. + */ + lockdep_off(); mutex_lock(&child->d_inode->i_mutex); + lockdep_on(); configfs_detach_group(sd->s_element); child->d_inode->i_flags |= S_DEAD; + lockdep_off(); mutex_unlock(&child->d_inode->i_mutex); + lockdep_on(); d_delete(child); dput(child); @@ -748,11 +760,22 @@ static int configfs_attach_item(struct config_item *parent_item, * We are going to remove an inode and its dentry but * the VFS may already have hit and used them. Thus, * we must lock them as rmdir() would. + * + * Note: we hide this from lockdep since we have no way + * to teach lockdep about recursive + * I_MUTEX_PARENT -> I_MUTEX_CHILD patterns along a path + * in an inode tree, which are valid as soon as + * I_MUTEX_PARENT -> I_MUTEX_CHILD is valid from a + * parent inode to one of its children. */ + lockdep_off(); mutex_lock(&dentry->d_inode->i_mutex); + lockdep_on(); configfs_remove_dir(item); dentry->d_inode->i_flags |= S_DEAD; + lockdep_off(); mutex_unlock(&dentry->d_inode->i_mutex); + lockdep_on(); d_delete(dentry); } } @@ -787,14 +810,25 @@ static int configfs_attach_group(struct config_item *parent_item, * * We must also lock the inode to remove it safely in case of * error, as rmdir() would. + * + * Note: we hide this from lockdep since we have no way + * to teach lockdep about recursive + * I_MUTEX_PARENT -> I_MUTEX_CHILD patterns along a path + * in an inode tree, which are valid as soon as + * I_MUTEX_PARENT -> I_MUTEX_CHILD is valid from a + * parent inode to one of its children. */ + lockdep_off(); mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD); + lockdep_on(); ret = populate_groups(to_config_group(item)); if (ret) { configfs_detach_item(item); dentry->d_inode->i_flags |= S_DEAD; } + lockdep_off(); mutex_unlock(&dentry->d_inode->i_mutex); + lockdep_on(); if (ret) d_delete(dentry); } @@ -956,7 +990,17 @@ static int configfs_depend_prep(struct dentry *origin, BUG_ON(!origin || !sd); /* Lock this guy on the way down */ + /* + * Note: we hide this from lockdep since we have no way + * to teach lockdep about recursive + * I_MUTEX_PARENT -> I_MUTEX_CHILD patterns along a path + * in an inode tree, which are valid as soon as + * I_MUTEX_PARENT -> I_MUTEX_CHILD is valid from a + * parent inode to one of its children. + */ + lockdep_off(); mutex_lock(&sd->s_dentry->d_inode->i_mutex); + lockdep_on(); if (sd->s_element == target) /* Boo-yah */ goto out; @@ -970,7 +1014,9 @@ static int configfs_depend_prep(struct dentry *origin, } /* We looped all our children and didn't find target */ + lockdep_off(); mutex_unlock(&sd->s_dentry->d_inode->i_mutex); + lockdep_on(); ret = -ENOENT; out: @@ -990,11 +1036,16 @@ static void configfs_depend_rollback(struct dentry *origin, struct dentry *dentry = item->ci_dentry; while (dentry != origin) { + /* See comments in configfs_depend_prep() */ + lockdep_off(); mutex_unlock(&dentry->d_inode->i_mutex); + lockdep_on(); dentry = dentry->d_parent; } + lockdep_off(); mutex_unlock(&origin->d_inode->i_mutex); + lockdep_on(); } int configfs_depend_item(struct configfs_subsystem *subsys, @@ -1329,8 +1380,16 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry) } /* Wait until the racing operation terminates */ + /* + * Note: we hide this from lockdep since we are locked + * with subclass I_MUTEX_NORMAL from vfs_rmdir() (why + * not I_MUTEX_CHILD?), and I_MUTEX_XATTR or + * I_MUTEX_QUOTA are not relevant for the locked inode. + */ + lockdep_off(); mutex_lock(wait_mutex); mutex_unlock(wait_mutex); + lockdep_on(); } } while (ret == -EAGAIN); -- cgit v1.2.3 From 554e7f9e043e29da79c044f7a55efe4fad40701e Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Thu, 8 Jan 2009 08:21:43 +0800 Subject: ocfs2: Access the xattr bucket only before modifying it. In ocfs2_xattr_value_truncate, we may call b-tree codes which will extend the journal transaction. It has a potential problem that it may let the already-accessed-but-not-dirtied buffers gone. So we'd better access the bucket after we call ocfs2_xattr_value_truncate. And as for the root buffer for the xattr value, b-tree code will acess and dirty it, so we don't need to worry about it. Signed-off-by: Tao Ma Signed-off-by: Mark Fasheh --- fs/ocfs2/xattr.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index e1d638af6ac..915039fffe6 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -4729,13 +4729,6 @@ static int ocfs2_xattr_bucket_value_truncate(struct inode *inode, vb.vb_xv = (struct ocfs2_xattr_value_root *) (vb.vb_bh->b_data + offset % blocksize); - ret = ocfs2_xattr_bucket_journal_access(ctxt->handle, bucket, - OCFS2_JOURNAL_ACCESS_WRITE); - if (ret) { - mlog_errno(ret); - goto out; - } - /* * From here on out we have to dirty the bucket. The generic * value calls only modify one of the bucket's bhs, but we need @@ -4748,12 +4741,18 @@ static int ocfs2_xattr_bucket_value_truncate(struct inode *inode, ret = ocfs2_xattr_value_truncate(inode, &vb, len, ctxt); if (ret) { mlog_errno(ret); - goto out_dirty; + goto out; + } + + ret = ocfs2_xattr_bucket_journal_access(ctxt->handle, bucket, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; } xe->xe_value_size = cpu_to_le64(len); -out_dirty: ocfs2_xattr_bucket_journal_dirty(ctxt->handle, bucket); out: -- cgit v1.2.3 From a4b91965d39d5d53b470d6aa62cba155a6f3ffe1 Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Thu, 29 Jan 2009 17:12:31 -0800 Subject: ocfs2: Wakeup the downconvert thread after a successful cancel convert When two nodes holding PR locks on a resource concurrently attempt to upconvert the locks to EX, the master sends a BAST to one of the nodes. This message tells that node to first cancel convert the upconvert request, followed by downconvert to a NL. Only when this lock is downconverted to NL, can the master upconvert the first node's lock to EX. While the fs was doing the cancel convert, it was forgetting to wake up the dc thread after a successful cancel, leading to a deadlock. Reported-and-Tested-by: David Teigland Signed-off-by: Sunil Mushran Signed-off-by: Mark Fasheh --- fs/ocfs2/dlmglue.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index b0c4cadd4c4..206a2370876 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -2860,6 +2860,10 @@ static void ocfs2_unlock_ast(void *opaque, int error) case OCFS2_UNLOCK_CANCEL_CONVERT: mlog(0, "Cancel convert success for %s\n", lockres->l_name); lockres->l_action = OCFS2_AST_INVALID; + /* Downconvert thread may have requeued this lock, we + * need to wake it. */ + if (lockres->l_flags & OCFS2_LOCK_BLOCKED) + ocfs2_wake_downconvert_thread(ocfs2_get_lockres_osb(lockres)); break; case OCFS2_UNLOCK_DROP_LOCK: lockres->l_level = DLM_LOCK_IV; -- cgit v1.2.3 From fd4ef231962ab44fd1004e87f9d7c6809f00cd64 Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Thu, 29 Jan 2009 15:06:21 -0800 Subject: ocfs2: add quota call to ocfs2_remove_btree_range() We weren't reclaiming the clusters which get free'd from this function, so any user punching holes in a file would still have those bytes accounted against him/her. Add the call to vfs_dq_free_space_nodirty() to fix this. Interestingly enough, the journal credits calculation already took this into account. Signed-off-by: Mark Fasheh Acked-by: Jan Kara --- fs/ocfs2/alloc.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index d861096c9d8..60fe74035db 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -5390,6 +5390,9 @@ int ocfs2_remove_btree_range(struct inode *inode, goto out; } + vfs_dq_free_space_nodirty(inode, + ocfs2_clusters_to_bytes(inode->i_sb, len)); + ret = ocfs2_remove_extent(inode, et, cpos, len, handle, meta_ac, dealloc); if (ret) { -- cgit v1.2.3