From c10f5e12bafde7f7a2f9b75d76f7a68d62154e91 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 16 Apr 2010 12:56:11 -0700 Subject: ceph: clear dir complete on d_move d_move() reorders the d_subdirs list, breaking the readdir result caching. Unless/until d_move preserves that ordering, clear CEPH_I_COMPLETE on rename. Signed-off-by: Sage Weil --- fs/ceph/inode.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 26f883c275e..261f3e6c0bc 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -997,6 +997,10 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, dn, dn->d_name.len, dn->d_name.name); dout("fill_trace doing d_move %p -> %p\n", req->r_old_dentry, dn); + + /* d_move screws up d_subdirs order */ + ceph_i_clear(dir, CEPH_I_COMPLETE); + d_move(req->r_old_dentry, dn); dout(" src %p '%.*s' dst %p '%.*s'\n", req->r_old_dentry, -- cgit v1.2.3 From 91dee39eebcfb47085c4d457a584b0e9723b6ca0 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 19 Apr 2010 10:15:44 -0700 Subject: ceph: fix snap realm splits The snap realm split was checking i_snap_realm, not the list_head, to determine if an inode belonged in the new realm. The check always failed, which meant we always moved the inode, corrupting the old realm's list and causing various crashes. Also wait to release old realm reference to avoid possibility of use after free. Signed-off-by: Sage Weil --- fs/ceph/snap.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 2b881262ef6..d5114db7045 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -869,16 +869,20 @@ skip_inode: continue; ci = ceph_inode(inode); spin_lock(&inode->i_lock); - if (!ci->i_snap_realm) - goto split_skip_inode; - ceph_put_snap_realm(mdsc, ci->i_snap_realm); - spin_lock(&realm->inodes_with_caps_lock); - list_add(&ci->i_snap_realm_item, - &realm->inodes_with_caps); - ci->i_snap_realm = realm; - spin_unlock(&realm->inodes_with_caps_lock); - ceph_get_snap_realm(mdsc, realm); -split_skip_inode: + if (list_empty(&ci->i_snap_realm_item)) { + struct ceph_snap_realm *oldrealm = + ci->i_snap_realm; + + dout(" moving %p to split realm %llx %p\n", + inode, realm->ino, realm); + spin_lock(&realm->inodes_with_caps_lock); + list_add(&ci->i_snap_realm_item, + &realm->inodes_with_caps); + ci->i_snap_realm = realm; + spin_unlock(&realm->inodes_with_caps_lock); + ceph_get_snap_realm(mdsc, realm); + ceph_put_snap_realm(mdsc, oldrealm); + } spin_unlock(&inode->i_lock); iput(inode); } -- cgit v1.2.3 From c8f16584ac85444d51d8753c5df502350cfc7bb7 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 19 Apr 2010 13:50:26 -0700 Subject: ceph: print more useful version info on module load Decouple the client version from the server side. Print relevant protocol and map version info instead. Signed-off-by: Sage Weil --- fs/ceph/super.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 75d02eaa127..f888cf487b7 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -996,9 +996,10 @@ static int __init init_ceph(void) if (ret) goto out_icache; - pr_info("loaded %d.%d.%d (mon/mds/osd proto %d/%d/%d)\n", - CEPH_VERSION_MAJOR, CEPH_VERSION_MINOR, CEPH_VERSION_PATCH, - CEPH_MONC_PROTOCOL, CEPH_MDSC_PROTOCOL, CEPH_OSDC_PROTOCOL); + pr_info("loaded (mon/mds/osd proto %d/%d/%d, osdmap %d/%d %d/%d)\n", + CEPH_MONC_PROTOCOL, CEPH_MDSC_PROTOCOL, CEPH_OSDC_PROTOCOL, + CEPH_OSDMAP_VERSION, CEPH_OSDMAP_VERSION_EXT, + CEPH_OSDMAP_INC_VERSION, CEPH_OSDMAP_INC_VERSION_EXT); return 0; out_icache: -- cgit v1.2.3 From 0b0c06d1476290cea248923c0ee7be9fd61cacea Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 20 Apr 2010 10:27:13 -0700 Subject: ceph: fix leaked spinlock during mds reconnect Signed-off-by: Sage Weil --- fs/ceph/caps.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index aa2239fa9a3..0c168180686 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1861,8 +1861,8 @@ static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc, } else { pr_err("%p auth cap %p not mds%d ???\n", inode, cap, session->s_mds); - spin_unlock(&inode->i_lock); } + spin_unlock(&inode->i_lock); } } -- cgit v1.2.3 From d45d0d970f495e04a4e4f46acd74e90f4a4564f9 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 20 Apr 2010 15:20:33 -0700 Subject: ceph: add missing #includes Signed-off-by: Sage Weil --- fs/ceph/auth.c | 1 + fs/ceph/auth_none.h | 2 ++ fs/ceph/super.h | 1 + 3 files changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/ceph/auth.c b/fs/ceph/auth.c index f6394b94b86..818afe72e6c 100644 --- a/fs/ceph/auth.c +++ b/fs/ceph/auth.c @@ -3,6 +3,7 @@ #include #include #include +#include #include "types.h" #include "auth_none.h" diff --git a/fs/ceph/auth_none.h b/fs/ceph/auth_none.h index 56c05533a31..8164df1a08b 100644 --- a/fs/ceph/auth_none.h +++ b/fs/ceph/auth_none.h @@ -1,6 +1,8 @@ #ifndef _FS_CEPH_AUTH_NONE_H #define _FS_CEPH_AUTH_NONE_H +#include + #include "auth.h" /* diff --git a/fs/ceph/super.h b/fs/ceph/super.h index e30dfbb056c..13513b80d87 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.3 From 684be25c52a1e43638ced160be0b0b46596e7f2b Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 21 Apr 2010 20:45:59 -0700 Subject: ceph: fix seq counting for skipped messages Increment in_seq even when the message is skipped for some reason. Signed-off-by: Sage Weil --- fs/ceph/messenger.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c index cdaaa131add..e7b91e093f5 100644 --- a/fs/ceph/messenger.c +++ b/fs/ceph/messenger.c @@ -1379,6 +1379,7 @@ static int read_partial_message(struct ceph_connection *con) con->in_base_pos = -front_len - middle_len - data_len - sizeof(m->footer); con->in_tag = CEPH_MSGR_TAG_READY; + con->in_seq++; return 0; } if (IS_ERR(con->in_msg)) { @@ -2030,6 +2031,7 @@ void ceph_con_revoke_message(struct ceph_connection *con, struct ceph_msg *msg) ceph_msg_put(con->in_msg); con->in_msg = NULL; con->in_tag = CEPH_MSGR_TAG_READY; + con->in_seq++; } else { dout("con_revoke_pages %p msg %p pages %p no-op\n", con, con->in_msg, msg); -- cgit v1.2.3 From ae18756b9fa7bb93132cff06cd8575e3d46633f9 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 22 Apr 2010 07:47:01 -0700 Subject: ceph: discard incoming messages with bad seq # We can get old message seq #'s after a tcp reconnect for stateful sessions (i.e., the MDS). If we get a higher seq #, that is an error, and we shouldn't see any bad seq #'s for stateless (mon, osd) connections. Signed-off-by: Sage Weil --- fs/ceph/messenger.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'fs') diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c index e7b91e093f5..509f57d9ccb 100644 --- a/fs/ceph/messenger.c +++ b/fs/ceph/messenger.c @@ -1334,6 +1334,7 @@ static int read_partial_message(struct ceph_connection *con) unsigned front_len, middle_len, data_len, data_off; int datacrc = con->msgr->nocrc; int skip; + u64 seq; dout("read_partial_message con %p msg %p\n", con, m); @@ -1368,6 +1369,25 @@ static int read_partial_message(struct ceph_connection *con) return -EIO; data_off = le16_to_cpu(con->in_hdr.data_off); + /* verify seq# */ + seq = le64_to_cpu(con->in_hdr.seq); + if ((s64)seq - (s64)con->in_seq < 1) { + pr_info("skipping %s%lld %s seq %lld, expected %lld\n", + ENTITY_NAME(con->peer_name), + pr_addr(&con->peer_addr.in_addr), + seq, con->in_seq + 1); + con->in_base_pos = -front_len - middle_len - data_len - + sizeof(m->footer); + con->in_tag = CEPH_MSGR_TAG_READY; + con->in_seq++; + return 0; + } else if ((s64)seq - (s64)con->in_seq > 1) { + pr_err("read_partial_message bad seq %lld expected %lld\n", + seq, con->in_seq + 1); + con->error_msg = "bad message sequence # for incoming message"; + return -EBADMSG; + } + /* allocate message? */ if (!con->in_msg) { dout("got hdr type %d front %d data %d\n", con->in_hdr.type, -- cgit v1.2.3 From 5c6a2cdb4fe8aaf6b54f022c14f13d2a12b45914 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 22 Apr 2010 13:48:59 -0700 Subject: ceph: fix direct io truncate offset truncate_inode_pages_range wants the end offset to align with the last byte in a page. Signed-off-by: Sage Weil --- fs/ceph/file.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 4add3d5da2c..ed6f19721d6 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -665,7 +665,8 @@ more: * throw out any page cache pages in this range. this * may block. */ - truncate_inode_pages_range(inode->i_mapping, pos, pos+len); + truncate_inode_pages_range(inode->i_mapping, pos, + (pos+len) | (PAGE_CACHE_SIZE-1)); } else { pages = alloc_page_vector(num_pages); if (IS_ERR(pages)) { -- cgit v1.2.3 From ea1409f96197c1bffe5d7d5bc967b3445edcc1fa Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 28 Apr 2010 16:12:06 -0700 Subject: ceph: clear dir complete, invalidate dentry on replayed rename If a rename operation is resent to the MDS following an MDS restart, the client does not get a full reply (containing the resulting metadata) back. In that case, a ceph_rename() needs to compensate by doing anything useful that fill_inode() would have, like d_move(). It also needs to invalidate the dentry (to workaround the vfs_rename_dir() bug) and clear the dir complete flag, just like fill_trace(). Signed-off-by: Sage Weil --- fs/ceph/dir.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'fs') diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index ea8ee2e526a..650d2db5ed2 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -880,7 +880,16 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, * do_request, above). If there is no trace, we need * to do it here. */ + + /* d_move screws up d_subdirs order */ + ceph_i_clear(new_dir, CEPH_I_COMPLETE); + d_move(old_dentry, new_dentry); + + /* ensure target dentry is invalidated, despite + rehashing bug in vfs_rename_dir */ + new_dentry->d_time = jiffies; + ceph_dentry(new_dentry)->lease_shared_gen = 0; } ceph_mdsc_put_request(req); return err; -- cgit v1.2.3 From 7ff899da02cb674211858fcd919f8b4511a4423f Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 23 Apr 2010 10:25:33 -0700 Subject: ceph: fix lockless caps check The __ variant requires caller to hold i_lock. Signed-off-by: Sage Weil --- fs/ceph/addr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 412593703d1..4b42c2bb603 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -509,7 +509,7 @@ static void writepages_finish(struct ceph_osd_request *req, u64 bytes = 0; struct ceph_client *client = ceph_inode_to_client(inode); long writeback_stat; - unsigned issued = __ceph_caps_issued(ci, NULL); + unsigned issued = ceph_caps_issued(ci); /* parse reply */ replyhead = msg->front.iov_base; -- cgit v1.2.3 From b0930f8d38c6ab76dc8222a5a910a21392d38208 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 29 Apr 2010 13:26:53 -0700 Subject: ceph: remove bad auth_x kmem_cache It's useless, since our allocations are already a power of 2. And it was allocated per-instance (not globally), which caused a name collision when we tried to mount a second file system with auth_x enabled. Signed-off-by: Sage Weil --- fs/ceph/auth_x.c | 32 ++++++++++---------------------- 1 file changed, 10 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c index d9001a4dc8c..fee5a08da88 100644 --- a/fs/ceph/auth_x.c +++ b/fs/ceph/auth_x.c @@ -12,8 +12,6 @@ #include "auth.h" #include "decode.h" -struct kmem_cache *ceph_x_ticketbuf_cachep; - #define TEMP_TICKET_BUF_LEN 256 static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed); @@ -131,13 +129,12 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac, char *ticket_buf; u8 struct_v; - dbuf = kmem_cache_alloc(ceph_x_ticketbuf_cachep, GFP_NOFS | GFP_ATOMIC); + dbuf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS); if (!dbuf) return -ENOMEM; ret = -ENOMEM; - ticket_buf = kmem_cache_alloc(ceph_x_ticketbuf_cachep, - GFP_NOFS | GFP_ATOMIC); + ticket_buf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS); if (!ticket_buf) goto out_dbuf; @@ -251,9 +248,9 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac, ret = 0; out: - kmem_cache_free(ceph_x_ticketbuf_cachep, ticket_buf); + kfree(ticket_buf); out_dbuf: - kmem_cache_free(ceph_x_ticketbuf_cachep, dbuf); + kfree(dbuf); return ret; bad: @@ -605,8 +602,6 @@ static void ceph_x_destroy(struct ceph_auth_client *ac) remove_ticket_handler(ac, th); } - kmem_cache_destroy(ceph_x_ticketbuf_cachep); - kfree(ac->private); ac->private = NULL; } @@ -641,26 +636,20 @@ int ceph_x_init(struct ceph_auth_client *ac) int ret; dout("ceph_x_init %p\n", ac); + ret = -ENOMEM; xi = kzalloc(sizeof(*xi), GFP_NOFS); if (!xi) - return -ENOMEM; + goto out; - ret = -ENOMEM; - ceph_x_ticketbuf_cachep = kmem_cache_create("ceph_x_ticketbuf", - TEMP_TICKET_BUF_LEN, 8, - (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD), - NULL); - if (!ceph_x_ticketbuf_cachep) - goto done_nomem; ret = -EINVAL; if (!ac->secret) { pr_err("no secret set (for auth_x protocol)\n"); - goto done_nomem; + goto out_nomem; } ret = ceph_crypto_key_unarmor(&xi->secret, ac->secret); if (ret) - goto done_nomem; + goto out_nomem; xi->starting = true; xi->ticket_handlers = RB_ROOT; @@ -670,10 +659,9 @@ int ceph_x_init(struct ceph_auth_client *ac) ac->ops = &ceph_x_ops; return 0; -done_nomem: +out_nomem: kfree(xi); - if (ceph_x_ticketbuf_cachep) - kmem_cache_destroy(ceph_x_ticketbuf_cachep); +out: return ret; } -- cgit v1.2.3