From 80e755fedebc8de0599a79efad2c656503df2e62 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 31 Mar 2010 21:52:10 -0700 Subject: ceph: allow writeback of snapped pages older than 'oldest' snapc On snap deletion, we don't regenerate ceph_cap_snaps for inodes with dirty pages because deletion does not affect metadata writeback. However, we did run into problems when we went to write back the pages because the 'oldest' snapc is determined by the oldest cap_snap, and that may be the newer snapc that reflects the deletion. This caused confusion and an infinite loop in ceph_update_writeable_page(). Change the snapc checks to allow writeback of any snapc that is equal to OR older than the 'oldest' snapc. When there are no cap_snaps, we were also using the realm's latest snapc for writeback, which complicates ceph_put_wrbufffer_cap_refs(). Instead, use i_head_snapc, the most snapc used for the most recent ('head') data. This makes the writeback snapc (ceph_osd_request.r_snapc) _always_ match a capsnap or i_head_snapc. Also, in writepags_finish(), drop the snapc referenced by the _page_ and do not assume it matches the request snapc (it may not anymore). Signed-off-by: Sage Weil --- fs/ceph/addr.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) (limited to 'fs/ceph/addr.c') diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index ce8ef610772..a313e9baeed 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -356,8 +356,8 @@ static struct ceph_snap_context *__get_oldest_context(struct inode *inode, break; } } - if (!snapc && ci->i_snap_realm) { - snapc = ceph_get_snap_context(ci->i_snap_realm->cached_context); + if (!snapc && ci->i_head_snapc) { + snapc = ceph_get_snap_context(ci->i_head_snapc); dout(" head snapc %p has %d dirty pages\n", snapc, ci->i_wrbuffer_ref_head); } @@ -412,7 +412,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) dout("writepage %p page %p not dirty?\n", inode, page); goto out; } - if (snapc != get_oldest_context(inode, &snap_size)) { + if (snapc->seq > get_oldest_context(inode, &snap_size)->seq) { dout("writepage %p page %p snapc %p not writeable - noop\n", inode, page, (void *)page->private); /* we should only noop if called by kswapd */ @@ -557,9 +557,9 @@ static void writepages_finish(struct ceph_osd_request *req, dout("inode %p skipping page %p\n", inode, page); wbc->pages_skipped++; } + ceph_put_snap_context((void *)page->private); page->private = 0; ClearPagePrivate(page); - ceph_put_snap_context(snapc); dout("unlocking %d %p\n", i, page); end_page_writeback(page); @@ -617,7 +617,7 @@ static int ceph_writepages_start(struct address_space *mapping, int range_whole = 0; int should_loop = 1; pgoff_t max_pages = 0, max_pages_ever = 0; - struct ceph_snap_context *snapc = NULL, *last_snapc = NULL; + struct ceph_snap_context *snapc = NULL, *last_snapc = NULL, *pgsnapc; struct pagevec pvec; int done = 0; int rc = 0; @@ -769,9 +769,10 @@ get_more_pages: } /* only if matching snap context */ - if (snapc != (void *)page->private) { - dout("page snapc %p != oldest %p\n", - (void *)page->private, snapc); + pgsnapc = (void *)page->private; + if (pgsnapc->seq > snapc->seq) { + dout("page snapc %p %lld > oldest %p %lld\n", + pgsnapc, pgsnapc->seq, snapc, snapc->seq); unlock_page(page); if (!locked_pages) continue; /* keep looking for snap */ @@ -935,8 +936,8 @@ static int ceph_update_writeable_page(struct file *file, int pos_in_page = pos & ~PAGE_CACHE_MASK; int end_in_page = pos_in_page + len; loff_t i_size; - struct ceph_snap_context *snapc; int r; + struct ceph_snap_context *snapc, *oldest; retry_locked: /* writepages currently holds page lock, but if we change that later, */ @@ -946,16 +947,16 @@ retry_locked: BUG_ON(!ci->i_snap_realm); down_read(&mdsc->snap_rwsem); BUG_ON(!ci->i_snap_realm->cached_context); - if (page->private && - (void *)page->private != ci->i_snap_realm->cached_context) { + snapc = (void *)page->private; + if (snapc && snapc != ci->i_head_snapc) { /* * this page is already dirty in another (older) snap * context! is it writeable now? */ - snapc = get_oldest_context(inode, NULL); + oldest = get_oldest_context(inode, NULL); up_read(&mdsc->snap_rwsem); - if (snapc != (void *)page->private) { + if (snapc->seq > oldest->seq) { dout(" page %p snapc %p not current or oldest\n", page, (void *)page->private); /* -- cgit v1.2.3 From 6298a33757ba7361bb8f506c106daad77e5ac8cf Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 31 Mar 2010 22:01:38 -0700 Subject: ceph: fix snap context reference leaks The get_oldest_context() helper takes a reference to the returned snap context, but most callers weren't dropping that reference. Fix them. Also drop the unused locked __get_oldest_context() variant. Signed-off-by: Sage Weil --- fs/ceph/addr.c | 37 +++++++++++++++++-------------------- 1 file changed, 17 insertions(+), 20 deletions(-) (limited to 'fs/ceph/addr.c') diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index a313e9baeed..41f1f713b7b 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -336,16 +336,15 @@ out: /* * Get ref for the oldest snapc for an inode with dirty data... that is, the * only snap context we are allowed to write back. - * - * Caller holds i_lock. */ -static struct ceph_snap_context *__get_oldest_context(struct inode *inode, - u64 *snap_size) +static struct ceph_snap_context *get_oldest_context(struct inode *inode, + u64 *snap_size) { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_snap_context *snapc = NULL; struct ceph_cap_snap *capsnap = NULL; + spin_lock(&inode->i_lock); list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { dout(" cap_snap %p snapc %p has %d dirty pages\n", capsnap, capsnap->context, capsnap->dirty_pages); @@ -361,16 +360,6 @@ static struct ceph_snap_context *__get_oldest_context(struct inode *inode, dout(" head snapc %p has %d dirty pages\n", snapc, ci->i_wrbuffer_ref_head); } - return snapc; -} - -static struct ceph_snap_context *get_oldest_context(struct inode *inode, - u64 *snap_size) -{ - struct ceph_snap_context *snapc = NULL; - - spin_lock(&inode->i_lock); - snapc = __get_oldest_context(inode, snap_size); spin_unlock(&inode->i_lock); return snapc; } @@ -391,7 +380,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) int len = PAGE_CACHE_SIZE; loff_t i_size; int err = 0; - struct ceph_snap_context *snapc; + struct ceph_snap_context *snapc, *oldest; u64 snap_size = 0; long writeback_stat; @@ -412,13 +401,16 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) dout("writepage %p page %p not dirty?\n", inode, page); goto out; } - if (snapc->seq > get_oldest_context(inode, &snap_size)->seq) { + oldest = get_oldest_context(inode, &snap_size); + if (snapc->seq > oldest->seq) { dout("writepage %p page %p snapc %p not writeable - noop\n", inode, page, (void *)page->private); /* we should only noop if called by kswapd */ WARN_ON((current->flags & PF_MEMALLOC) == 0); + ceph_put_snap_context(oldest); goto out; } + ceph_put_snap_context(oldest); /* is this a partial page at end of file? */ if (snap_size) @@ -457,7 +449,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) ClearPagePrivate(page); end_page_writeback(page); ceph_put_wrbuffer_cap_refs(ci, 1, snapc); - ceph_put_snap_context(snapc); + ceph_put_snap_context(snapc); /* page's reference */ out: return err; } @@ -914,7 +906,10 @@ static int context_is_writeable_or_written(struct inode *inode, struct ceph_snap_context *snapc) { struct ceph_snap_context *oldest = get_oldest_context(inode, NULL); - return !oldest || snapc->seq <= oldest->seq; + int ret = !oldest || snapc->seq <= oldest->seq; + + ceph_put_snap_context(oldest); + return ret; } /* @@ -957,13 +952,14 @@ retry_locked: up_read(&mdsc->snap_rwsem); if (snapc->seq > oldest->seq) { + ceph_put_snap_context(oldest); dout(" page %p snapc %p not current or oldest\n", - page, (void *)page->private); + page, snapc); /* * queue for writeback, and wait for snapc to * be writeable or written */ - snapc = ceph_get_snap_context((void *)page->private); + snapc = ceph_get_snap_context(snapc); unlock_page(page); ceph_queue_writeback(inode); r = wait_event_interruptible(ci->i_cap_wq, @@ -973,6 +969,7 @@ retry_locked: return r; return -EAGAIN; } + ceph_put_snap_context(oldest); /* yay, writeable, do it now (without dropping page lock) */ dout(" page %p snapc %p not current, but oldest\n", -- cgit v1.2.3