From 918941a3f3d46c2a69971b4718aaf13b1be2f1a7 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 17 Aug 2009 18:50:08 +0200 Subject: ocfs2: Use __generic_file_aio_write instead of generic_file_aio_write_nolock Use the new helper. We have to submit data pages ourselves in case of O_SYNC write because __generic_file_aio_write does not do it for us. OCFS2 developpers might think about moving the sync out of i_mutex which seems to be easily possible but that's out of scope of this patch. CC: ocfs2-devel@oss.oracle.com Acked-by: Joel Becker Signed-off-by: Jan Kara --- fs/ocfs2/file.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index aa501d3f93f..60022738938 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1871,8 +1871,7 @@ relock: goto out_dio; } } else { - written = generic_file_aio_write_nolock(iocb, iov, nr_segs, - *ppos); + written = __generic_file_aio_write(iocb, iov, nr_segs, ppos); } out_dio: @@ -1880,18 +1879,21 @@ out_dio: BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT)); if ((file->f_flags & O_SYNC && !direct_io) || IS_SYNC(inode)) { - /* - * The generic write paths have handled getting data - * to disk, but since we don't make use of the dirty - * inode list, a manual journal commit is necessary - * here. - */ - if (old_size != i_size_read(inode) || - old_clusters != OCFS2_I(inode)->ip_clusters) { + ret = filemap_fdatawrite_range(file->f_mapping, pos, + pos + count - 1); + if (ret < 0) + written = ret; + + if (!ret && (old_size != i_size_read(inode) || + old_clusters != OCFS2_I(inode)->ip_clusters)) { ret = jbd2_journal_force_commit(osb->journal->j_journal); if (ret < 0) written = ret; } + + if (!ret) + ret = filemap_fdatawait_range(file->f_mapping, pos, + pos + count - 1); } /* -- cgit v1.2.3 From eef99380679e20e7edc096aa4d8a98b875404d79 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 20 Aug 2009 17:43:41 +0200 Subject: vfs: Rename generic_file_aio_write_nolock generic_file_aio_write_nolock() is now used only by block devices and raw character device. Filesystems should use __generic_file_aio_write() in case generic_file_aio_write() doesn't suit them. So rename the function to blkdev_aio_write() and move it to fs/blockdev.c. Signed-off-by: Christoph Hellwig Signed-off-by: Jan Kara --- fs/block_dev.c | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/block_dev.c b/fs/block_dev.c index 94dfda24c06..3581a4e5394 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1404,6 +1404,33 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg) return blkdev_ioctl(bdev, mode, cmd, arg); } +/* + * Write data to the block device. Only intended for the block device itself + * and the raw driver which basically is a fake block device. + * + * Does not take i_mutex for the write and thus is not for general purpose + * use. + */ +ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct file *file = iocb->ki_filp; + ssize_t ret; + + BUG_ON(iocb->ki_pos != pos); + + ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); + if (ret > 0 || ret == -EIOCBQUEUED) { + ssize_t err; + + err = generic_write_sync(file, pos, ret); + if (err < 0 && ret > 0) + ret = err; + } + return ret; +} +EXPORT_SYMBOL_GPL(blkdev_aio_write); + /* * Try to release a page associated with block device when the system * is under memory pressure. @@ -1436,7 +1463,7 @@ const struct file_operations def_blk_fops = { .read = do_sync_read, .write = do_sync_write, .aio_read = generic_file_aio_read, - .aio_write = generic_file_aio_write_nolock, + .aio_write = blkdev_aio_write, .mmap = generic_file_mmap, .fsync = block_fsync, .unlocked_ioctl = block_ioctl, -- cgit v1.2.3 From 148f948ba877f4d3cdef036b1ff6d9f68986706a Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 17 Aug 2009 19:52:36 +0200 Subject: vfs: Introduce new helpers for syncing after writing to O_SYNC file or IS_SYNC inode Introduce new function for generic inode syncing (vfs_fsync_range) and use it from fsync() path. Introduce also new helper for syncing after a sync write (generic_write_sync) using the generic function. Use these new helpers for syncing from generic VFS functions. This makes O_SYNC writes to block devices acquire i_mutex for syncing. If we really care about this, we can make block_fsync() drop the i_mutex and reacquire it before it returns. CC: Evgeniy Polyakov CC: ocfs2-devel@oss.oracle.com CC: Joel Becker CC: Felix Blyakher CC: xfs@oss.sgi.com CC: Anton Altaparmakov CC: linux-ntfs-dev@lists.sourceforge.net CC: OGAWA Hirofumi CC: linux-ext4@vger.kernel.org CC: tytso@mit.edu Acked-by: Christoph Hellwig Signed-off-by: Jan Kara --- fs/splice.c | 22 ++++++---------------- fs/sync.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 54 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index 73766d24f97..819023733f8 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -976,25 +976,15 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, if (ret > 0) { unsigned long nr_pages; + int err; - *ppos += ret; nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - /* - * If file or inode is SYNC and we actually wrote some data, - * sync it. - */ - if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) { - int err; - - mutex_lock(&inode->i_mutex); - err = generic_osync_inode(inode, mapping, - OSYNC_METADATA|OSYNC_DATA); - mutex_unlock(&inode->i_mutex); - - if (err) - ret = err; - } + err = generic_write_sync(out, *ppos, ret); + if (err) + ret = err; + else + *ppos += ret; balance_dirty_pages_ratelimited_nr(mapping, nr_pages); } diff --git a/fs/sync.c b/fs/sync.c index 103cc7fdd3d..4e15da01923 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -178,19 +178,23 @@ int file_fsync(struct file *filp, struct dentry *dentry, int datasync) } /** - * vfs_fsync - perform a fsync or fdatasync on a file + * vfs_fsync_range - helper to sync a range of data & metadata to disk * @file: file to sync * @dentry: dentry of @file - * @data: only perform a fdatasync operation + * @start: offset in bytes of the beginning of data range to sync + * @end: offset in bytes of the end of data range (inclusive) + * @datasync: perform only datasync * - * Write back data and metadata for @file to disk. If @datasync is - * set only metadata needed to access modified file data is written. + * Write back data in range @start..@end and metadata for @file to disk. If + * @datasync is set only metadata needed to access modified file data is + * written. * * In case this function is called from nfsd @file may be %NULL and * only @dentry is set. This can only happen when the filesystem * implements the export_operations API. */ -int vfs_fsync(struct file *file, struct dentry *dentry, int datasync) +int vfs_fsync_range(struct file *file, struct dentry *dentry, loff_t start, + loff_t end, int datasync) { const struct file_operations *fop; struct address_space *mapping; @@ -214,7 +218,7 @@ int vfs_fsync(struct file *file, struct dentry *dentry, int datasync) goto out; } - ret = filemap_fdatawrite(mapping); + ret = filemap_fdatawrite_range(mapping, start, end); /* * We need to protect against concurrent writers, which could cause @@ -225,12 +229,32 @@ int vfs_fsync(struct file *file, struct dentry *dentry, int datasync) if (!ret) ret = err; mutex_unlock(&mapping->host->i_mutex); - err = filemap_fdatawait(mapping); + + err = filemap_fdatawait_range(mapping, start, end); if (!ret) ret = err; out: return ret; } +EXPORT_SYMBOL(vfs_fsync_range); + +/** + * vfs_fsync - perform a fsync or fdatasync on a file + * @file: file to sync + * @dentry: dentry of @file + * @datasync: only perform a fdatasync operation + * + * Write back data and metadata for @file to disk. If @datasync is + * set only metadata needed to access modified file data is written. + * + * In case this function is called from nfsd @file may be %NULL and + * only @dentry is set. This can only happen when the filesystem + * implements the export_operations API. + */ +int vfs_fsync(struct file *file, struct dentry *dentry, int datasync) +{ + return vfs_fsync_range(file, dentry, 0, LLONG_MAX, datasync); +} EXPORT_SYMBOL(vfs_fsync); static int do_fsync(unsigned int fd, int datasync) @@ -256,6 +280,23 @@ SYSCALL_DEFINE1(fdatasync, unsigned int, fd) return do_fsync(fd, 1); } +/** + * generic_write_sync - perform syncing after a write if file / inode is sync + * @file: file to which the write happened + * @pos: offset where the write started + * @count: length of the write + * + * This is just a simple wrapper about our general syncing function. + */ +int generic_write_sync(struct file *file, loff_t pos, loff_t count) +{ + if (!(file->f_flags & O_SYNC) && !IS_SYNC(file->f_mapping->host)) + return 0; + return vfs_fsync_range(file, file->f_path.dentry, pos, + pos + count - 1, 1); +} +EXPORT_SYMBOL(generic_write_sync); + /* * sys_sync_file_range() permits finely controlled syncing over a segment of * a file in the range offset .. (offset+nbytes-1) inclusive. If nbytes is -- cgit v1.2.3 From a2a735ad666a04306a708b5a0109cc1fe113f569 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 18 Aug 2009 17:54:11 +0200 Subject: ext2: Update comment about generic_osync_inode We rely on generic_write_sync() now. CC: linux-ext4@vger.kernel.org Signed-off-by: Jan Kara --- fs/ext2/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index e27130341d4..1c1638f873a 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -482,7 +482,7 @@ static int ext2_alloc_branch(struct inode *inode, unlock_buffer(bh); mark_buffer_dirty_inode(bh, inode); /* We used to sync bh here if IS_SYNC(inode). - * But we now rely upon generic_osync_inode() + * But we now rely upon generic_write_sync() * and b_inode_buffers. But not for directories. */ if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode)) -- cgit v1.2.3 From e367626b6164aeecb97fb7c20509ed8696babc26 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 18 Aug 2009 17:51:23 +0200 Subject: ext3: Remove syncing logic from ext3_file_write Syncing is now properly done by generic_file_aio_write() so no special logic is needed in ext3. CC: linux-ext4@vger.kernel.org Signed-off-by: Jan Kara --- fs/ext3/file.c | 61 +--------------------------------------------------------- 1 file changed, 1 insertion(+), 60 deletions(-) (limited to 'fs') diff --git a/fs/ext3/file.c b/fs/ext3/file.c index 29925321478..388bbdfa0b4 100644 --- a/fs/ext3/file.c +++ b/fs/ext3/file.c @@ -51,71 +51,12 @@ static int ext3_release_file (struct inode * inode, struct file * filp) return 0; } -static ssize_t -ext3_file_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) -{ - struct file *file = iocb->ki_filp; - struct inode *inode = file->f_path.dentry->d_inode; - ssize_t ret; - int err; - - ret = generic_file_aio_write(iocb, iov, nr_segs, pos); - - /* - * Skip flushing if there was an error, or if nothing was written. - */ - if (ret <= 0) - return ret; - - /* - * If the inode is IS_SYNC, or is O_SYNC and we are doing data - * journalling then we need to make sure that we force the transaction - * to disk to keep all metadata uptodate synchronously. - */ - if (file->f_flags & O_SYNC) { - /* - * If we are non-data-journaled, then the dirty data has - * already been flushed to backing store by generic_osync_inode, - * and the inode has been flushed too if there have been any - * modifications other than mere timestamp updates. - * - * Open question --- do we care about flushing timestamps too - * if the inode is IS_SYNC? - */ - if (!ext3_should_journal_data(inode)) - return ret; - - goto force_commit; - } - - /* - * So we know that there has been no forced data flush. If the inode - * is marked IS_SYNC, we need to force one ourselves. - */ - if (!IS_SYNC(inode)) - return ret; - - /* - * Open question #2 --- should we force data to disk here too? If we - * don't, the only impact is that data=writeback filesystems won't - * flush data to disk automatically on IS_SYNC, only metadata (but - * historically, that is what ext2 has done.) - */ - -force_commit: - err = ext3_force_commit(inode->i_sb); - if (err) - return err; - return ret; -} - const struct file_operations ext3_file_operations = { .llseek = generic_file_llseek, .read = do_sync_read, .write = do_sync_write, .aio_read = generic_file_aio_read, - .aio_write = ext3_file_write, + .aio_write = generic_file_aio_write, .unlocked_ioctl = ext3_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ext3_compat_ioctl, -- cgit v1.2.3 From 0d34ec62e18984ac9476208660372306ef54e70d Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 18 Aug 2009 17:48:27 +0200 Subject: ext4: Remove syncing logic from ext4_file_write The syncing is now properly handled by generic_file_aio_write() so no special ext4 code is needed. CC: linux-ext4@vger.kernel.org CC: tytso@mit.edu Signed-off-by: Jan Kara --- fs/ext4/file.c | 53 ++--------------------------------------------------- 1 file changed, 2 insertions(+), 51 deletions(-) (limited to 'fs') diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 27f3c5354c0..5ca3eca70a1 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -58,10 +58,7 @@ static ssize_t ext4_file_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { - struct file *file = iocb->ki_filp; - struct inode *inode = file->f_path.dentry->d_inode; - ssize_t ret; - int err; + struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; /* * If we have encountered a bitmap-format file, the size limit @@ -81,53 +78,7 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov, } } - ret = generic_file_aio_write(iocb, iov, nr_segs, pos); - /* - * Skip flushing if there was an error, or if nothing was written. - */ - if (ret <= 0) - return ret; - - /* - * If the inode is IS_SYNC, or is O_SYNC and we are doing data - * journalling then we need to make sure that we force the transaction - * to disk to keep all metadata uptodate synchronously. - */ - if (file->f_flags & O_SYNC) { - /* - * If we are non-data-journaled, then the dirty data has - * already been flushed to backing store by generic_osync_inode, - * and the inode has been flushed too if there have been any - * modifications other than mere timestamp updates. - * - * Open question --- do we care about flushing timestamps too - * if the inode is IS_SYNC? - */ - if (!ext4_should_journal_data(inode)) - return ret; - - goto force_commit; - } - - /* - * So we know that there has been no forced data flush. If the inode - * is marked IS_SYNC, we need to force one ourselves. - */ - if (!IS_SYNC(inode)) - return ret; - - /* - * Open question #2 --- should we force data to disk here too? If we - * don't, the only impact is that data=writeback filesystems won't - * flush data to disk automatically on IS_SYNC, only metadata (but - * historically, that is what ext2 has done.) - */ - -force_commit: - err = ext4_force_commit(inode->i_sb); - if (err) - return err; - return ret; + return generic_file_aio_write(iocb, iov, nr_segs, pos); } static struct vm_operations_struct ext4_file_vm_ops = { -- cgit v1.2.3 From ebbbf757c6b8577ac2fb6181c08c2059153bb0e2 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 18 Aug 2009 18:13:58 +0200 Subject: ntfs: Use new syncing helpers and update comments Use new syncing helpers in .write and .aio_write functions. Also remove superfluous syncing in ntfs_file_buffered_write() and update comments about generic_osync_inode(). CC: Anton Altaparmakov CC: linux-ntfs-dev@lists.sourceforge.net Signed-off-by: Jan Kara --- fs/ntfs/file.c | 16 ++++------------ fs/ntfs/mft.c | 13 ++++++------- 2 files changed, 10 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 3140a4429af..4350d4993b1 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -2076,14 +2076,6 @@ err_out: *ppos = pos; if (cached_page) page_cache_release(cached_page); - /* For now, when the user asks for O_SYNC, we actually give O_DSYNC. */ - if (likely(!status)) { - if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(vi))) { - if (!mapping->a_ops->writepage || !is_sync_kiocb(iocb)) - status = generic_osync_inode(vi, mapping, - OSYNC_METADATA|OSYNC_DATA); - } - } pagevec_lru_add_file(&lru_pvec); ntfs_debug("Done. Returning %s (written 0x%lx, status %li).", written ? "written" : "status", (unsigned long)written, @@ -2145,8 +2137,8 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov, mutex_lock(&inode->i_mutex); ret = ntfs_file_aio_write_nolock(iocb, iov, nr_segs, &iocb->ki_pos); mutex_unlock(&inode->i_mutex); - if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { - int err = sync_page_range(inode, mapping, pos, ret); + if (ret > 0) { + int err = generic_write_sync(file, pos, ret); if (err < 0) ret = err; } @@ -2173,8 +2165,8 @@ static ssize_t ntfs_file_writev(struct file *file, const struct iovec *iov, if (ret == -EIOCBQUEUED) ret = wait_on_sync_kiocb(&kiocb); mutex_unlock(&inode->i_mutex); - if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { - int err = sync_page_range(inode, mapping, *ppos - ret, ret); + if (ret > 0) { + int err = generic_write_sync(file, *ppos - ret, ret); if (err < 0) ret = err; } diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c index 23bf68453d7..1caa0ef0b2b 100644 --- a/fs/ntfs/mft.c +++ b/fs/ntfs/mft.c @@ -384,13 +384,12 @@ unm_err_out: * it is dirty in the inode meta data rather than the data page cache of the * inode, and thus there are no data pages that need writing out. Therefore, a * full mark_inode_dirty() is overkill. A mark_inode_dirty_sync(), on the - * other hand, is not sufficient, because I_DIRTY_DATASYNC needs to be set to - * ensure ->write_inode is called from generic_osync_inode() and this needs to - * happen or the file data would not necessarily hit the device synchronously, - * even though the vfs inode has the O_SYNC flag set. Also, I_DIRTY_DATASYNC - * simply "feels" better than just I_DIRTY_SYNC, since the file data has not - * actually hit the block device yet, which is not what I_DIRTY_SYNC on its own - * would suggest. + * other hand, is not sufficient, because ->write_inode needs to be called even + * in case of fdatasync. This needs to happen or the file data would not + * necessarily hit the device synchronously, even though the vfs inode has the + * O_SYNC flag set. Also, I_DIRTY_DATASYNC simply "feels" better than just + * I_DIRTY_SYNC, since the file data has not actually hit the block device yet, + * which is not what I_DIRTY_SYNC on its own would suggest. */ void __mark_mft_record_dirty(ntfs_inode *ni) { -- cgit v1.2.3 From d23c937b0f740888765676f6f82f509dbbb2bbad Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 18 Aug 2009 18:24:31 +0200 Subject: ocfs2: Update syncing after splicing to match generic version Update ocfs2 specific splicing code to use generic syncing helper. The sync now does not happen under rw_lock because generic_write_sync() acquires i_mutex which ranks above rw_lock. That should not matter because standard fsync path does not hold it either. Acked-by: Joel Becker Acked-by: Mark Fasheh CC: ocfs2-devel@oss.oracle.com Signed-off-by: Jan Kara --- fs/ocfs2/file.c | 27 ++++++--------------------- 1 file changed, 6 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 60022738938..221c5e98957 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1993,31 +1993,16 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe, if (ret > 0) { unsigned long nr_pages; + int err; - *ppos += ret; nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - /* - * If file or inode is SYNC and we actually wrote some data, - * sync it. - */ - if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) { - int err; - - mutex_lock(&inode->i_mutex); - err = ocfs2_rw_lock(inode, 1); - if (err < 0) { - mlog_errno(err); - } else { - err = generic_osync_inode(inode, mapping, - OSYNC_METADATA|OSYNC_DATA); - ocfs2_rw_unlock(inode, 1); - } - mutex_unlock(&inode->i_mutex); + err = generic_write_sync(out, *ppos, ret); + if (err) + ret = err; + else + *ppos += ret; - if (err) - ret = err; - } balance_dirty_pages_ratelimited_nr(mapping, nr_pages); } -- cgit v1.2.3 From af0f4414f343429971d33b0dd8dccc85c1f3dcd2 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 18 Aug 2009 18:32:55 +0200 Subject: xfs: Convert sync_page_range() to simple filemap_write_and_wait_range() Christoph Hellwig says that it is enough for XFS to call filemap_write_and_wait_range() instead of sync_page_range() because we do all the metadata syncing when forcing the log. CC: Felix Blyakher CC: xfs@oss.sgi.com CC: Christoph Hellwig Signed-off-by: Jan Kara --- fs/xfs/linux-2.6/xfs_lrw.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c index 7078974a6ee..fde63a3c4ec 100644 --- a/fs/xfs/linux-2.6/xfs_lrw.c +++ b/fs/xfs/linux-2.6/xfs_lrw.c @@ -817,7 +817,8 @@ write_retry: xfs_iunlock(xip, iolock); if (need_i_mutex) mutex_unlock(&inode->i_mutex); - error2 = sync_page_range(inode, mapping, pos, ret); + error2 = filemap_write_and_wait_range(mapping, pos, + pos + ret - 1); if (!error) error = error2; if (need_i_mutex) -- cgit v1.2.3 From 2f3d675bcd4a84251d6e8eea8096ec8fc795e5d6 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 17 Aug 2009 17:00:02 +0200 Subject: fat: Opencode sync_page_range_nolock() fat_cont_expand() is the only user of sync_page_range_nolock(). It's also the only user of generic_osync_inode() which does not have a file open. So opencode needed actions for FAT so that we can convert generic_osync_inode() to a standard syncing path. Update a comment about generic_osync_inode(). CC: OGAWA Hirofumi Signed-off-by: Jan Kara --- fs/fat/file.c | 22 ++++++++++++++++++++-- fs/fat/misc.c | 4 ++-- 2 files changed, 22 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/fat/file.c b/fs/fat/file.c index f042b965c95..e8c159de236 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -176,8 +176,26 @@ static int fat_cont_expand(struct inode *inode, loff_t size) inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC; mark_inode_dirty(inode); - if (IS_SYNC(inode)) - err = sync_page_range_nolock(inode, mapping, start, count); + if (IS_SYNC(inode)) { + int err2; + + /* + * Opencode syncing since we don't have a file open to use + * standard fsync path. + */ + err = filemap_fdatawrite_range(mapping, start, + start + count - 1); + err2 = sync_mapping_buffers(mapping); + if (!err) + err = err2; + err2 = write_inode_now(inode, 1); + if (!err) + err = err2; + if (!err) { + err = filemap_fdatawait_range(mapping, start, + start + count - 1); + } + } out: return err; } diff --git a/fs/fat/misc.c b/fs/fat/misc.c index a6c20473dfd..4e35be873e0 100644 --- a/fs/fat/misc.c +++ b/fs/fat/misc.c @@ -119,8 +119,8 @@ int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster) MSDOS_I(inode)->i_start = new_dclus; MSDOS_I(inode)->i_logstart = new_dclus; /* - * Since generic_osync_inode() synchronize later if - * this is not directory, we don't here. + * Since generic_write_sync() synchronizes regular files later, + * we sync here only directories. */ if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode)) { ret = fat_sync_inode(inode); -- cgit v1.2.3 From 18f2ee705d98034b0f229a3202d827468d4bffd9 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 18 Aug 2009 18:43:15 +0200 Subject: vfs: Remove generic_osync_inode() and sync_page_range{_nolock}() Remove these three functions since nobody uses them anymore. Signed-off-by: Jan Kara --- fs/fs-writeback.c | 54 ------------------------------------------------------ 1 file changed, 54 deletions(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index da86ef58e42..628235cf44b 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1242,57 +1242,3 @@ int sync_inode(struct inode *inode, struct writeback_control *wbc) return ret; } EXPORT_SYMBOL(sync_inode); - -/** - * generic_osync_inode - flush all dirty data for a given inode to disk - * @inode: inode to write - * @mapping: the address_space that should be flushed - * @what: what to write and wait upon - * - * This can be called by file_write functions for files which have the - * O_SYNC flag set, to flush dirty writes to disk. - * - * @what is a bitmask, specifying which part of the inode's data should be - * written and waited upon. - * - * OSYNC_DATA: i_mapping's dirty data - * OSYNC_METADATA: the buffers at i_mapping->private_list - * OSYNC_INODE: the inode itself - */ - -int generic_osync_inode(struct inode *inode, struct address_space *mapping, int what) -{ - int err = 0; - int need_write_inode_now = 0; - int err2; - - if (what & OSYNC_DATA) - err = filemap_fdatawrite(mapping); - if (what & (OSYNC_METADATA|OSYNC_DATA)) { - err2 = sync_mapping_buffers(mapping); - if (!err) - err = err2; - } - if (what & OSYNC_DATA) { - err2 = filemap_fdatawait(mapping); - if (!err) - err = err2; - } - - spin_lock(&inode_lock); - if ((inode->i_state & I_DIRTY) && - ((what & OSYNC_INODE) || (inode->i_state & I_DIRTY_DATASYNC))) - need_write_inode_now = 1; - spin_unlock(&inode_lock); - - if (need_write_inode_now) { - err2 = write_inode_now(inode, 1); - if (!err) - err = err2; - } - else - inode_sync_wait(inode); - - return err; -} -EXPORT_SYMBOL(generic_osync_inode); -- cgit v1.2.3 From 2daea67e966dc0c42067ebea015ddac6834cef88 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 3 Sep 2009 12:39:39 +0200 Subject: fsync: wait for data writeout completion before calling ->fsync Currenly vfs_fsync(_range) first calls filemap_fdatawrite to write out the data, the calls into ->fsync to write out the metadata and then finally calls filemap_fdatawait to wait for the data I/O to complete. What sounds like a clever micro-optimization actually is nast trap for many filesystems. For many modern filesystems i_size or other inode information is only updated on I/O completion and we need to wait for I/O to finish before we can write out the metadata. For old fashionen filesystems that instanciate blocks during the actual write and also update the metadata at that point it opens up a large window were we could expose uninitialized blocks after a crash. While a few filesystems that need it already wait for the I/O to finish inside their ->fsync methods it is rather suboptimal as it is done under the i_mutex and also always for the whole file instead of just a part as we could do for O_SYNC handling. Here is a small audit of all fsync instances in the tree: - spufs_mfc_fsync: - ps3flash_fsync: - vol_cdev_fsync: - printer_fsync: - fb_deferred_io_fsync: - bad_file_fsync: - simple_sync_file: don't care - filesystems/drivers do't use the page cache or are purely in-memory. - simple_fsync: - file_fsync: - affs_file_fsync: - fat_file_fsync: - jfs_fsync: - ubifs_fsync: - reiserfs_dir_fsync: - reiserfs_sync_file: never touch pagecache themselves. We need to wait before if we do not want to expose stale data after an allocation. - afs_fsync: - fuse_fsync_common: do the waiting writeback itself in awkward ways, would benefit from proper semantics - block_fsync: Does a filemap_write_and_wait on the block device inode. Because we now have f_mapping that is the same inode we call it on in vfs_fsync. So just removing it and letting the VFS do the work in one go would be an improvement. - btrfs_sync_file: - cifs_fsync: - xfs_file_fsync: need the wait first and currently do it themselves. would benefit from doing it outside i_mutex. - coda_fsync: - ecryptfs_fsync: - exofs_file_fsync: - shm_fsync: only passes the fsync through to the lower layer - ext3_sync_file: doesn't seem to care, comments are confusing. - ext4_sync_file: would need the wait to work correctly for delalloc mode with late i_size updates. Otherwise the ext3 comment applies. currently implemens it's own writeback and wait in an odd way, could benefit from doing it properly. - gfs2_fsync: not needed for journaled data mode, but probably harmless there. Currently writes back data asynchronously itself. Needs some major audit. - hostfs_fsync: just calls fsync/datasync on the host FD. Without the wait before data might not even be inflight yet if we're unlucky. - hpfs_file_fsync: - ncp_fsync: no-ops. Dangerous before and after. - jffs2_fsync: just calls jffs2_flush_wbuf_gc, not sure how this relates to data. - nfs_fsync_dir: just increments stats, claims all directory operations are synchronous - nfs_file_fsync: only writes out data??? Looks very odd. - nilfs_sync_file: looks like it expects all data done, but not sure from the code - ntfs_dir_fsync: - ntfs_file_fsync: appear to do their own data writeback. Very convoluted code. - ocfs2_sync_file: does it's own data writeback, but no wait. probably needs the wait. - smb_fsync: according to a comment expects all pages written already, probably needs the wait before. This patch only changes vfs_fsync_range, removal of the wait in the methods that have it is left to the filesystem maintainers. Note that most filesystems really do need an audit for their fsync methods given the gems found in this very brief audit. Signed-off-by: Christoph Hellwig Signed-off-by: Jan Kara --- fs/sync.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/sync.c b/fs/sync.c index 4e15da01923..192340930bb 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -218,7 +218,7 @@ int vfs_fsync_range(struct file *file, struct dentry *dentry, loff_t start, goto out; } - ret = filemap_fdatawrite_range(mapping, start, end); + ret = filemap_write_and_wait_range(mapping, start, end); /* * We need to protect against concurrent writers, which could cause @@ -230,9 +230,6 @@ int vfs_fsync_range(struct file *file, struct dentry *dentry, loff_t start, ret = err; mutex_unlock(&mapping->host->i_mutex); - err = filemap_fdatawait_range(mapping, start, end); - if (!ret) - ret = err; out: return ret; } -- cgit v1.2.3