From 9c1ee184a30394e54165fa4c15923cabd952c106 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 13 May 2009 18:36:58 -0400 Subject: ext4: Fix sub-block zeroing for writes into preallocated extents We need to mark the buffer_head mapping preallocated space as new during write_begin. Otherwise we don't zero out the page cache content properly for a partial write. This will cause file corruption with preallocation. Now that we mark the buffer_head new we also need to have a valid buffer_head blocknr so that unmap_underlying_metadata() unmaps the correct block. Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 2 ++ fs/ext4/inode.c | 7 +++++++ 2 files changed, 9 insertions(+) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index e4033215834..172656c2a3b 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2875,6 +2875,8 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, if (allocated > max_blocks) allocated = max_blocks; set_buffer_unwritten(bh_result); + bh_result->b_bdev = inode->i_sb->s_bdev; + bh_result->b_blocknr = newblock; goto out2; } diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index e91f978c7f1..d4b634ae06b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2323,6 +2323,13 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, set_buffer_delay(bh_result); } else if (ret > 0) { bh_result->b_size = (ret << inode->i_blkbits); + /* + * With sub-block writes into unwritten extents + * we also need to mark the buffer as new so that + * the unwritten parts of the buffer gets correctly zeroed. + */ + if (buffer_unwritten(bh_result)) + set_buffer_new(bh_result); ret = 0; } -- cgit v1.2.3 From 33b9817e2ae097c7b8d256e3510ac6c54fc6d9d0 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 12 May 2009 14:40:37 -0400 Subject: ext4: Use a fake block number for delayed new buffer_head Use a very large unsigned number (~0xffff) as as the fake block number for the delayed new buffer. The VFS should never try to write out this number, but if it does, this will make it obvious. Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index d4b634ae06b..0ac31a06422 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2297,6 +2297,10 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { int ret = 0; + sector_t invalid_block = ~((sector_t) 0xffff); + + if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es)) + invalid_block = ~0; BUG_ON(create == 0); BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize); @@ -2318,7 +2322,7 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, /* not enough space to reserve */ return ret; - map_bh(bh_result, inode->i_sb, 0); + map_bh(bh_result, inode->i_sb, invalid_block); set_buffer_new(bh_result); set_buffer_delay(bh_result); } else if (ret > 0) { -- cgit v1.2.3 From 2a8964d63d50dd2d65d71d342bc7fb6ef4117614 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 14 May 2009 17:05:39 -0400 Subject: ext4: Clear the unwritten buffer_head flag after the extent is initialized The BH_Unwritten flag indicates that the buffer is allocated on disk but has not been written; that is, the disk was part of a persistent preallocation area. That flag should only be set when a get_blocks() function is looking up a inode's logical to physical block mapping. When ext4_get_blocks_wrap() is called with create=1, the uninitialized extent is converted into an initialized one, so the BH_Unwritten flag is no longer appropriate. Hence, we need to make sure the BH_Unwritten is not left set, since the combination of BH_Mapped and BH_Unwritten is not allowed; among other things, it will result ext4's get_block() to be called over and over again during the write_begin phase of write(2). Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 0ac31a06422..2a9ffd528dd 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1149,6 +1149,7 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, int retval; clear_buffer_mapped(bh); + clear_buffer_unwritten(bh); /* * Try to see if we can get the block without requesting @@ -1178,6 +1179,18 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, if (retval > 0 && buffer_mapped(bh)) return retval; + /* + * When we call get_blocks without the create flag, the + * BH_Unwritten flag could have gotten set if the blocks + * requested were part of a uninitialized extent. We need to + * clear this flag now that we are committed to convert all or + * part of the uninitialized extent to be an initialized + * extent. This is because we need to avoid the combination + * of BH_Unwritten and BH_Mapped flags being simultaneously + * set on the buffer_head. + */ + clear_buffer_unwritten(bh); + /* * New blocks allocate and/or writing to uninitialized extent * will possibly result in updating i_data, so we take -- cgit v1.2.3 From 2ec0ae3acec47f628179ee95fe2c4da01b5e9fc4 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 15 May 2009 09:07:28 -0400 Subject: ext4: Fix race in ext4_inode_info.i_cached_extent If two CPU's simultaneously call ext4_ext_get_blocks() at the same time, there is nothing protecting the i_cached_extent structure from being used and updated at the same time. This could potentially cause the wrong location on disk to be read or written to, including potentially causing the corruption of the block group descriptors and/or inode table. This bug has been in the ext4 code since almost the very beginning of ext4's development. Fortunately once the data is stored in the page cache cache, ext4_get_blocks() doesn't need to be called, so trying to replicate this problem to the point where we could identify its root cause was *extremely* difficult. Many thanks to Kevin Shanahan for working over several months to be able to reproduce this easily so we could finally nail down the cause of the corruption. Signed-off-by: "Theodore Ts'o" Reviewed-by: "Aneesh Kumar K.V" --- fs/ext4/extents.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 172656c2a3b..e3a55eb8b26 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1841,11 +1841,13 @@ ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block, { struct ext4_ext_cache *cex; BUG_ON(len == 0); + spin_lock(&EXT4_I(inode)->i_block_reservation_lock); cex = &EXT4_I(inode)->i_cached_extent; cex->ec_type = type; cex->ec_block = block; cex->ec_len = len; cex->ec_start = start; + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); } /* @@ -1902,12 +1904,17 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block, struct ext4_extent *ex) { struct ext4_ext_cache *cex; + int ret = EXT4_EXT_CACHE_NO; + /* + * We borrow i_block_reservation_lock to protect i_cached_extent + */ + spin_lock(&EXT4_I(inode)->i_block_reservation_lock); cex = &EXT4_I(inode)->i_cached_extent; /* has cache valid data? */ if (cex->ec_type == EXT4_EXT_CACHE_NO) - return EXT4_EXT_CACHE_NO; + goto errout; BUG_ON(cex->ec_type != EXT4_EXT_CACHE_GAP && cex->ec_type != EXT4_EXT_CACHE_EXTENT); @@ -1918,11 +1925,11 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block, ext_debug("%u cached by %u:%u:%llu\n", block, cex->ec_block, cex->ec_len, cex->ec_start); - return cex->ec_type; + ret = cex->ec_type; } - - /* not in cache */ - return EXT4_EXT_CACHE_NO; +errout: + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); + return ret; } /* -- cgit v1.2.3