From d3bccb6f4b886060aa0f58976b92b77d951f5434 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 17 Aug 2009 19:30:27 +0200 Subject: vfs: Introduce filemap_fdatawait_range This simple helper saves some filesystems conversion from byte offset to page numbers and also makes the fdata* interface more complete. Reviewed-by: Christoph Hellwig Signed-off-by: Jan Kara --- include/linux/fs.h | 2 ++ mm/filemap.c | 20 ++++++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/include/linux/fs.h b/include/linux/fs.h index a79f48373e7..beb0e2774b2 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2086,6 +2086,8 @@ extern int write_inode_now(struct inode *, int); extern int filemap_fdatawrite(struct address_space *); extern int filemap_flush(struct address_space *); extern int filemap_fdatawait(struct address_space *); +extern int filemap_fdatawait_range(struct address_space *, loff_t lstart, + loff_t lend); extern int filemap_write_and_wait(struct address_space *mapping); extern int filemap_write_and_wait_range(struct address_space *mapping, loff_t lstart, loff_t lend); diff --git a/mm/filemap.c b/mm/filemap.c index ccea3b665c1..65b2e50efcd 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -306,6 +306,26 @@ int wait_on_page_writeback_range(struct address_space *mapping, return ret; } +/** + * filemap_fdatawait_range - wait for all under-writeback pages to complete in a given range + * @mapping: address space structure to wait for + * @start: offset in bytes where the range starts + * @end: offset in bytes where the range ends (inclusive) + * + * Walk the list of under-writeback pages of the given address space + * in the given range and wait for all of them. + * + * This is just a simple wrapper so that callers don't have to convert offsets + * to page indexes themselves + */ +int filemap_fdatawait_range(struct address_space *mapping, loff_t start, + loff_t end) +{ + return wait_on_page_writeback_range(mapping, start >> PAGE_CACHE_SHIFT, + end >> PAGE_CACHE_SHIFT); +} +EXPORT_SYMBOL(filemap_fdatawait_range); + /** * sync_page_range - write and wait on all pages in the passed range * @inode: target inode -- cgit v1.2.3 From e4dd9de3c66bc7e26c5c7f149a060c5a67cf06a0 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 17 Aug 2009 18:10:06 +0200 Subject: vfs: Export __generic_file_aio_write() and add some comments Rename __generic_file_aio_write_nolock() to __generic_file_aio_write(), add comments to write helpers explaining how they should be used and export __generic_file_aio_write() since it will be used by some filesystems. CC: ocfs2-devel@oss.oracle.com CC: Joel Becker Acked-by: Evgeniy Polyakov Reviewed-by: Christoph Hellwig Signed-off-by: Jan Kara --- include/linux/fs.h | 2 ++ mm/filemap.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 52 insertions(+), 7 deletions(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index beb0e2774b2..ea099d3a18d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2204,6 +2204,8 @@ extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *); extern int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size); int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk); extern ssize_t generic_file_aio_read(struct kiocb *, const struct iovec *, unsigned long, loff_t); +extern ssize_t __generic_file_aio_write(struct kiocb *, const struct iovec *, unsigned long, + loff_t *); extern ssize_t generic_file_aio_write(struct kiocb *, const struct iovec *, unsigned long, loff_t); extern ssize_t generic_file_aio_write_nolock(struct kiocb *, const struct iovec *, unsigned long, loff_t); diff --git a/mm/filemap.c b/mm/filemap.c index 65b2e50efcd..554a396d85e 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2368,9 +2368,27 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, } EXPORT_SYMBOL(generic_file_buffered_write); -static ssize_t -__generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t *ppos) +/** + * __generic_file_aio_write - write data to a file + * @iocb: IO state structure (file, offset, etc.) + * @iov: vector with data to write + * @nr_segs: number of segments in the vector + * @ppos: position where to write + * + * This function does all the work needed for actually writing data to a + * file. It does all basic checks, removes SUID from the file, updates + * modification times and calls proper subroutines depending on whether we + * do direct IO or a standard buffered write. + * + * It expects i_mutex to be grabbed unless we work on a block device or similar + * object which does not need locking at all. + * + * This function does *not* take care of syncing data in case of O_SYNC write. + * A caller has to handle it. This is mainly due to the fact that we want to + * avoid syncing under i_mutex. + */ +ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t *ppos) { struct file *file = iocb->ki_filp; struct address_space * mapping = file->f_mapping; @@ -2467,7 +2485,23 @@ out: current->backing_dev_info = NULL; return written ? written : err; } +EXPORT_SYMBOL(__generic_file_aio_write); + +/** + * generic_file_aio_write_nolock - write data, usually to a device + * @iocb: IO state structure + * @iov: vector with data to write + * @nr_segs: number of segments in the vector + * @pos: position in file where to write + * + * This is a wrapper around __generic_file_aio_write() which takes care of + * syncing the file in case of O_SYNC file. It does not take i_mutex for the + * write itself but may do so during syncing. It is meant for users like block + * devices which do not need i_mutex during write. If your filesystem needs to + * do a write but already holds i_mutex, use __generic_file_aio_write() + * directly and then sync the file like generic_file_aio_write(). + */ ssize_t generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { @@ -2478,8 +2512,7 @@ ssize_t generic_file_aio_write_nolock(struct kiocb *iocb, BUG_ON(iocb->ki_pos != pos); - ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs, - &iocb->ki_pos); + ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { ssize_t err; @@ -2492,6 +2525,17 @@ ssize_t generic_file_aio_write_nolock(struct kiocb *iocb, } EXPORT_SYMBOL(generic_file_aio_write_nolock); +/** + * generic_file_aio_write - write data to a file + * @iocb: IO state structure + * @iov: vector with data to write + * @nr_segs: number of segments in the vector + * @pos: position in file where to write + * + * This is a wrapper around __generic_file_aio_write() to be used by most + * filesystems. It takes care of syncing the file in case of O_SYNC file + * and acquires i_mutex as needed. + */ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { @@ -2503,8 +2547,7 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, BUG_ON(iocb->ki_pos != pos); mutex_lock(&inode->i_mutex); - ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs, - &iocb->ki_pos); + ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); mutex_unlock(&inode->i_mutex); if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { -- cgit v1.2.3 From c7b50db21fe8c295092518e224d60b95e69da3b0 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 18 Aug 2009 16:18:20 +0200 Subject: vfs: Remove syncing from generic_file_direct_write() and generic_file_buffered_write() generic_file_direct_write() and generic_file_buffered_write() called generic_osync_inode() if it was called on O_SYNC file or IS_SYNC inode. But this is superfluous since generic_file_aio_write() does the syncing as well. Also XFS and OCFS2 which call these functions directly handle syncing themselves. So let's have a single place where syncing happens: generic_file_aio_write(). We slightly change the behavior by syncing only the range of file to which the write happened for buffered writes but that should be all that is required. CC: ocfs2-devel@oss.oracle.com CC: Joel Becker CC: Felix Blyakher CC: xfs@oss.sgi.com Signed-off-by: Jan Kara --- mm/filemap.c | 35 ++++++----------------------------- 1 file changed, 6 insertions(+), 29 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 554a396d85e..f863e1d7e22 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2187,20 +2187,7 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, } *ppos = end; } - - /* - * Sync the fs metadata but not the minor inode changes and - * of course not the data as we did direct DMA for the IO. - * i_mutex is held, which protects generic_osync_inode() from - * livelocking. AIO O_DIRECT ops attempt to sync metadata here. - */ out: - if ((written >= 0 || written == -EIOCBQUEUED) && - ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { - int err = generic_osync_inode(inode, mapping, OSYNC_METADATA); - if (err < 0) - written = err; - } return written; } EXPORT_SYMBOL(generic_file_direct_write); @@ -2332,8 +2319,6 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; - const struct address_space_operations *a_ops = mapping->a_ops; - struct inode *inode = mapping->host; ssize_t status; struct iov_iter i; @@ -2343,16 +2328,6 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, if (likely(status >= 0)) { written += status; *ppos = pos + status; - - /* - * For now, when the user asks for O_SYNC, we'll actually give - * O_DSYNC - */ - if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) { - if (!a_ops->writepage || !is_sync_kiocb(iocb)) - status = generic_osync_inode(inode, mapping, - OSYNC_METADATA|OSYNC_DATA); - } } /* @@ -2514,11 +2489,12 @@ ssize_t generic_file_aio_write_nolock(struct kiocb *iocb, ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); - if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { + if ((ret > 0 || ret == -EIOCBQUEUED) && + ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { ssize_t err; err = sync_page_range_nolock(inode, mapping, pos, ret); - if (err < 0) + if (err < 0 && ret > 0) ret = err; } return ret; @@ -2550,11 +2526,12 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); mutex_unlock(&inode->i_mutex); - if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { + if ((ret > 0 || ret == -EIOCBQUEUED) && + ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { ssize_t err; err = sync_page_range(inode, mapping, pos, ret); - if (err < 0) + if (err < 0 && ret > 0) ret = err; } return ret; -- cgit v1.2.3 From b04f932171e8a99bb8ca2f863b2b9eadb85741e8 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 17 Aug 2009 18:14:33 +0200 Subject: pohmelfs: Use __generic_file_aio_write instead of generic_file_aio_write_nolock Use new helper __generic_file_aio_write(). Since the fs takes care of syncing by itself afterwards, there are no more changes needed. CC: Evgeniy Polyakov Signed-off-by: Jan Kara --- drivers/staging/pohmelfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/staging/pohmelfs/inode.c b/drivers/staging/pohmelfs/inode.c index e63c9bea6c5..e9d41e439aa 100644 --- a/drivers/staging/pohmelfs/inode.c +++ b/drivers/staging/pohmelfs/inode.c @@ -921,7 +921,7 @@ ssize_t pohmelfs_write(struct file *file, const char __user *buf, if (ret) goto err_out_unlock; - ret = generic_file_aio_write_nolock(&kiocb, &iov, 1, pos); + ret = __generic_file_aio_write(&kiocb, &iov, 1, &kiocb.ki_pos); *ppos = kiocb.ki_pos; mutex_unlock(&inode->i_mutex); -- cgit v1.2.3 From 918941a3f3d46c2a69971b4718aaf13b1be2f1a7 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 17 Aug 2009 18:50:08 +0200 Subject: ocfs2: Use __generic_file_aio_write instead of generic_file_aio_write_nolock Use the new helper. We have to submit data pages ourselves in case of O_SYNC write because __generic_file_aio_write does not do it for us. OCFS2 developpers might think about moving the sync out of i_mutex which seems to be easily possible but that's out of scope of this patch. CC: ocfs2-devel@oss.oracle.com Acked-by: Joel Becker Signed-off-by: Jan Kara --- fs/ocfs2/file.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index aa501d3f93f..60022738938 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1871,8 +1871,7 @@ relock: goto out_dio; } } else { - written = generic_file_aio_write_nolock(iocb, iov, nr_segs, - *ppos); + written = __generic_file_aio_write(iocb, iov, nr_segs, ppos); } out_dio: @@ -1880,18 +1879,21 @@ out_dio: BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT)); if ((file->f_flags & O_SYNC && !direct_io) || IS_SYNC(inode)) { - /* - * The generic write paths have handled getting data - * to disk, but since we don't make use of the dirty - * inode list, a manual journal commit is necessary - * here. - */ - if (old_size != i_size_read(inode) || - old_clusters != OCFS2_I(inode)->ip_clusters) { + ret = filemap_fdatawrite_range(file->f_mapping, pos, + pos + count - 1); + if (ret < 0) + written = ret; + + if (!ret && (old_size != i_size_read(inode) || + old_clusters != OCFS2_I(inode)->ip_clusters)) { ret = jbd2_journal_force_commit(osb->journal->j_journal); if (ret < 0) written = ret; } + + if (!ret) + ret = filemap_fdatawait_range(file->f_mapping, pos, + pos + count - 1); } /* -- cgit v1.2.3 From eef99380679e20e7edc096aa4d8a98b875404d79 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 20 Aug 2009 17:43:41 +0200 Subject: vfs: Rename generic_file_aio_write_nolock generic_file_aio_write_nolock() is now used only by block devices and raw character device. Filesystems should use __generic_file_aio_write() in case generic_file_aio_write() doesn't suit them. So rename the function to blkdev_aio_write() and move it to fs/blockdev.c. Signed-off-by: Christoph Hellwig Signed-off-by: Jan Kara --- drivers/char/raw.c | 2 +- fs/block_dev.c | 29 ++++++++++++++++++++++++++++- include/linux/fs.h | 6 ++++-- mm/filemap.c | 39 --------------------------------------- 4 files changed, 33 insertions(+), 43 deletions(-) diff --git a/drivers/char/raw.c b/drivers/char/raw.c index 05f9d18b936..40268db02e2 100644 --- a/drivers/char/raw.c +++ b/drivers/char/raw.c @@ -246,7 +246,7 @@ static const struct file_operations raw_fops = { .read = do_sync_read, .aio_read = generic_file_aio_read, .write = do_sync_write, - .aio_write = generic_file_aio_write_nolock, + .aio_write = blkdev_aio_write, .open = raw_open, .release= raw_release, .ioctl = raw_ioctl, diff --git a/fs/block_dev.c b/fs/block_dev.c index 94dfda24c06..3581a4e5394 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1404,6 +1404,33 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg) return blkdev_ioctl(bdev, mode, cmd, arg); } +/* + * Write data to the block device. Only intended for the block device itself + * and the raw driver which basically is a fake block device. + * + * Does not take i_mutex for the write and thus is not for general purpose + * use. + */ +ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct file *file = iocb->ki_filp; + ssize_t ret; + + BUG_ON(iocb->ki_pos != pos); + + ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); + if (ret > 0 || ret == -EIOCBQUEUED) { + ssize_t err; + + err = generic_write_sync(file, pos, ret); + if (err < 0 && ret > 0) + ret = err; + } + return ret; +} +EXPORT_SYMBOL_GPL(blkdev_aio_write); + /* * Try to release a page associated with block device when the system * is under memory pressure. @@ -1436,7 +1463,7 @@ const struct file_operations def_blk_fops = { .read = do_sync_read, .write = do_sync_write, .aio_read = generic_file_aio_read, - .aio_write = generic_file_aio_write_nolock, + .aio_write = blkdev_aio_write, .mmap = generic_file_mmap, .fsync = block_fsync, .unlocked_ioctl = block_ioctl, diff --git a/include/linux/fs.h b/include/linux/fs.h index ea099d3a18d..6c1be3a4ede 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2207,8 +2207,6 @@ extern ssize_t generic_file_aio_read(struct kiocb *, const struct iovec *, unsig extern ssize_t __generic_file_aio_write(struct kiocb *, const struct iovec *, unsigned long, loff_t *); extern ssize_t generic_file_aio_write(struct kiocb *, const struct iovec *, unsigned long, loff_t); -extern ssize_t generic_file_aio_write_nolock(struct kiocb *, const struct iovec *, - unsigned long, loff_t); extern ssize_t generic_file_direct_write(struct kiocb *, const struct iovec *, unsigned long *, loff_t, loff_t *, size_t, size_t); extern ssize_t generic_file_buffered_write(struct kiocb *, const struct iovec *, @@ -2218,6 +2216,10 @@ extern ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t l extern int generic_segment_checks(const struct iovec *iov, unsigned long *nr_segs, size_t *count, int access_flags); +/* fs/block_dev.c */ +extern ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos); + /* fs/splice.c */ extern ssize_t generic_file_splice_read(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int); diff --git a/mm/filemap.c b/mm/filemap.c index f863e1d7e22..3587554f45e 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2462,45 +2462,6 @@ out: } EXPORT_SYMBOL(__generic_file_aio_write); - -/** - * generic_file_aio_write_nolock - write data, usually to a device - * @iocb: IO state structure - * @iov: vector with data to write - * @nr_segs: number of segments in the vector - * @pos: position in file where to write - * - * This is a wrapper around __generic_file_aio_write() which takes care of - * syncing the file in case of O_SYNC file. It does not take i_mutex for the - * write itself but may do so during syncing. It is meant for users like block - * devices which do not need i_mutex during write. If your filesystem needs to - * do a write but already holds i_mutex, use __generic_file_aio_write() - * directly and then sync the file like generic_file_aio_write(). - */ -ssize_t generic_file_aio_write_nolock(struct kiocb *iocb, - const struct iovec *iov, unsigned long nr_segs, loff_t pos) -{ - struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; - ssize_t ret; - - BUG_ON(iocb->ki_pos != pos); - - ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); - - if ((ret > 0 || ret == -EIOCBQUEUED) && - ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { - ssize_t err; - - err = sync_page_range_nolock(inode, mapping, pos, ret); - if (err < 0 && ret > 0) - ret = err; - } - return ret; -} -EXPORT_SYMBOL(generic_file_aio_write_nolock); - /** * generic_file_aio_write - write data to a file * @iocb: IO state structure -- cgit v1.2.3 From 148f948ba877f4d3cdef036b1ff6d9f68986706a Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 17 Aug 2009 19:52:36 +0200 Subject: vfs: Introduce new helpers for syncing after writing to O_SYNC file or IS_SYNC inode Introduce new function for generic inode syncing (vfs_fsync_range) and use it from fsync() path. Introduce also new helper for syncing after a sync write (generic_write_sync) using the generic function. Use these new helpers for syncing from generic VFS functions. This makes O_SYNC writes to block devices acquire i_mutex for syncing. If we really care about this, we can make block_fsync() drop the i_mutex and reacquire it before it returns. CC: Evgeniy Polyakov CC: ocfs2-devel@oss.oracle.com CC: Joel Becker CC: Felix Blyakher CC: xfs@oss.sgi.com CC: Anton Altaparmakov CC: linux-ntfs-dev@lists.sourceforge.net CC: OGAWA Hirofumi CC: linux-ext4@vger.kernel.org CC: tytso@mit.edu Acked-by: Christoph Hellwig Signed-off-by: Jan Kara --- fs/splice.c | 22 ++++++---------------- fs/sync.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++------- include/linux/fs.h | 3 +++ mm/filemap.c | 11 ++++------- 4 files changed, 61 insertions(+), 30 deletions(-) diff --git a/fs/splice.c b/fs/splice.c index 73766d24f97..819023733f8 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -976,25 +976,15 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, if (ret > 0) { unsigned long nr_pages; + int err; - *ppos += ret; nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - /* - * If file or inode is SYNC and we actually wrote some data, - * sync it. - */ - if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) { - int err; - - mutex_lock(&inode->i_mutex); - err = generic_osync_inode(inode, mapping, - OSYNC_METADATA|OSYNC_DATA); - mutex_unlock(&inode->i_mutex); - - if (err) - ret = err; - } + err = generic_write_sync(out, *ppos, ret); + if (err) + ret = err; + else + *ppos += ret; balance_dirty_pages_ratelimited_nr(mapping, nr_pages); } diff --git a/fs/sync.c b/fs/sync.c index 103cc7fdd3d..4e15da01923 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -178,19 +178,23 @@ int file_fsync(struct file *filp, struct dentry *dentry, int datasync) } /** - * vfs_fsync - perform a fsync or fdatasync on a file + * vfs_fsync_range - helper to sync a range of data & metadata to disk * @file: file to sync * @dentry: dentry of @file - * @data: only perform a fdatasync operation + * @start: offset in bytes of the beginning of data range to sync + * @end: offset in bytes of the end of data range (inclusive) + * @datasync: perform only datasync * - * Write back data and metadata for @file to disk. If @datasync is - * set only metadata needed to access modified file data is written. + * Write back data in range @start..@end and metadata for @file to disk. If + * @datasync is set only metadata needed to access modified file data is + * written. * * In case this function is called from nfsd @file may be %NULL and * only @dentry is set. This can only happen when the filesystem * implements the export_operations API. */ -int vfs_fsync(struct file *file, struct dentry *dentry, int datasync) +int vfs_fsync_range(struct file *file, struct dentry *dentry, loff_t start, + loff_t end, int datasync) { const struct file_operations *fop; struct address_space *mapping; @@ -214,7 +218,7 @@ int vfs_fsync(struct file *file, struct dentry *dentry, int datasync) goto out; } - ret = filemap_fdatawrite(mapping); + ret = filemap_fdatawrite_range(mapping, start, end); /* * We need to protect against concurrent writers, which could cause @@ -225,12 +229,32 @@ int vfs_fsync(struct file *file, struct dentry *dentry, int datasync) if (!ret) ret = err; mutex_unlock(&mapping->host->i_mutex); - err = filemap_fdatawait(mapping); + + err = filemap_fdatawait_range(mapping, start, end); if (!ret) ret = err; out: return ret; } +EXPORT_SYMBOL(vfs_fsync_range); + +/** + * vfs_fsync - perform a fsync or fdatasync on a file + * @file: file to sync + * @dentry: dentry of @file + * @datasync: only perform a fdatasync operation + * + * Write back data and metadata for @file to disk. If @datasync is + * set only metadata needed to access modified file data is written. + * + * In case this function is called from nfsd @file may be %NULL and + * only @dentry is set. This can only happen when the filesystem + * implements the export_operations API. + */ +int vfs_fsync(struct file *file, struct dentry *dentry, int datasync) +{ + return vfs_fsync_range(file, dentry, 0, LLONG_MAX, datasync); +} EXPORT_SYMBOL(vfs_fsync); static int do_fsync(unsigned int fd, int datasync) @@ -256,6 +280,23 @@ SYSCALL_DEFINE1(fdatasync, unsigned int, fd) return do_fsync(fd, 1); } +/** + * generic_write_sync - perform syncing after a write if file / inode is sync + * @file: file to which the write happened + * @pos: offset where the write started + * @count: length of the write + * + * This is just a simple wrapper about our general syncing function. + */ +int generic_write_sync(struct file *file, loff_t pos, loff_t count) +{ + if (!(file->f_flags & O_SYNC) && !IS_SYNC(file->f_mapping->host)) + return 0; + return vfs_fsync_range(file, file->f_path.dentry, pos, + pos + count - 1, 1); +} +EXPORT_SYMBOL(generic_write_sync); + /* * sys_sync_file_range() permits finely controlled syncing over a segment of * a file in the range offset .. (offset+nbytes-1) inclusive. If nbytes is diff --git a/include/linux/fs.h b/include/linux/fs.h index 6c1be3a4ede..e2c7f516766 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2098,7 +2098,10 @@ extern int __filemap_fdatawrite_range(struct address_space *mapping, extern int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, loff_t end); +extern int vfs_fsync_range(struct file *file, struct dentry *dentry, + loff_t start, loff_t end, int datasync); extern int vfs_fsync(struct file *file, struct dentry *dentry, int datasync); +extern int generic_write_sync(struct file *file, loff_t pos, loff_t count); extern void sync_supers(void); extern void emergency_sync(void); extern void emergency_remount(void); diff --git a/mm/filemap.c b/mm/filemap.c index 3587554f45e..849293c4f41 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -39,11 +39,10 @@ /* * FIXME: remove all knowledge of the buffer layer from the core VM */ -#include /* for generic_osync_inode */ +#include /* for try_to_free_buffers */ #include - /* * Shared mappings implemented 30.11.1994. It's not fully working yet, * though. @@ -2477,8 +2476,7 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; + struct inode *inode = file->f_mapping->host; ssize_t ret; BUG_ON(iocb->ki_pos != pos); @@ -2487,11 +2485,10 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); mutex_unlock(&inode->i_mutex); - if ((ret > 0 || ret == -EIOCBQUEUED) && - ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { + if (ret > 0 || ret == -EIOCBQUEUED) { ssize_t err; - err = sync_page_range(inode, mapping, pos, ret); + err = generic_write_sync(file, pos, ret); if (err < 0 && ret > 0) ret = err; } -- cgit v1.2.3 From a2a735ad666a04306a708b5a0109cc1fe113f569 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 18 Aug 2009 17:54:11 +0200 Subject: ext2: Update comment about generic_osync_inode We rely on generic_write_sync() now. CC: linux-ext4@vger.kernel.org Signed-off-by: Jan Kara --- fs/ext2/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index e27130341d4..1c1638f873a 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -482,7 +482,7 @@ static int ext2_alloc_branch(struct inode *inode, unlock_buffer(bh); mark_buffer_dirty_inode(bh, inode); /* We used to sync bh here if IS_SYNC(inode). - * But we now rely upon generic_osync_inode() + * But we now rely upon generic_write_sync() * and b_inode_buffers. But not for directories. */ if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode)) -- cgit v1.2.3 From e367626b6164aeecb97fb7c20509ed8696babc26 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 18 Aug 2009 17:51:23 +0200 Subject: ext3: Remove syncing logic from ext3_file_write Syncing is now properly done by generic_file_aio_write() so no special logic is needed in ext3. CC: linux-ext4@vger.kernel.org Signed-off-by: Jan Kara --- fs/ext3/file.c | 61 +--------------------------------------------------------- 1 file changed, 1 insertion(+), 60 deletions(-) diff --git a/fs/ext3/file.c b/fs/ext3/file.c index 29925321478..388bbdfa0b4 100644 --- a/fs/ext3/file.c +++ b/fs/ext3/file.c @@ -51,71 +51,12 @@ static int ext3_release_file (struct inode * inode, struct file * filp) return 0; } -static ssize_t -ext3_file_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) -{ - struct file *file = iocb->ki_filp; - struct inode *inode = file->f_path.dentry->d_inode; - ssize_t ret; - int err; - - ret = generic_file_aio_write(iocb, iov, nr_segs, pos); - - /* - * Skip flushing if there was an error, or if nothing was written. - */ - if (ret <= 0) - return ret; - - /* - * If the inode is IS_SYNC, or is O_SYNC and we are doing data - * journalling then we need to make sure that we force the transaction - * to disk to keep all metadata uptodate synchronously. - */ - if (file->f_flags & O_SYNC) { - /* - * If we are non-data-journaled, then the dirty data has - * already been flushed to backing store by generic_osync_inode, - * and the inode has been flushed too if there have been any - * modifications other than mere timestamp updates. - * - * Open question --- do we care about flushing timestamps too - * if the inode is IS_SYNC? - */ - if (!ext3_should_journal_data(inode)) - return ret; - - goto force_commit; - } - - /* - * So we know that there has been no forced data flush. If the inode - * is marked IS_SYNC, we need to force one ourselves. - */ - if (!IS_SYNC(inode)) - return ret; - - /* - * Open question #2 --- should we force data to disk here too? If we - * don't, the only impact is that data=writeback filesystems won't - * flush data to disk automatically on IS_SYNC, only metadata (but - * historically, that is what ext2 has done.) - */ - -force_commit: - err = ext3_force_commit(inode->i_sb); - if (err) - return err; - return ret; -} - const struct file_operations ext3_file_operations = { .llseek = generic_file_llseek, .read = do_sync_read, .write = do_sync_write, .aio_read = generic_file_aio_read, - .aio_write = ext3_file_write, + .aio_write = generic_file_aio_write, .unlocked_ioctl = ext3_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ext3_compat_ioctl, -- cgit v1.2.3 From 0d34ec62e18984ac9476208660372306ef54e70d Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 18 Aug 2009 17:48:27 +0200 Subject: ext4: Remove syncing logic from ext4_file_write The syncing is now properly handled by generic_file_aio_write() so no special ext4 code is needed. CC: linux-ext4@vger.kernel.org CC: tytso@mit.edu Signed-off-by: Jan Kara --- fs/ext4/file.c | 53 ++--------------------------------------------------- 1 file changed, 2 insertions(+), 51 deletions(-) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 27f3c5354c0..5ca3eca70a1 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -58,10 +58,7 @@ static ssize_t ext4_file_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { - struct file *file = iocb->ki_filp; - struct inode *inode = file->f_path.dentry->d_inode; - ssize_t ret; - int err; + struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; /* * If we have encountered a bitmap-format file, the size limit @@ -81,53 +78,7 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov, } } - ret = generic_file_aio_write(iocb, iov, nr_segs, pos); - /* - * Skip flushing if there was an error, or if nothing was written. - */ - if (ret <= 0) - return ret; - - /* - * If the inode is IS_SYNC, or is O_SYNC and we are doing data - * journalling then we need to make sure that we force the transaction - * to disk to keep all metadata uptodate synchronously. - */ - if (file->f_flags & O_SYNC) { - /* - * If we are non-data-journaled, then the dirty data has - * already been flushed to backing store by generic_osync_inode, - * and the inode has been flushed too if there have been any - * modifications other than mere timestamp updates. - * - * Open question --- do we care about flushing timestamps too - * if the inode is IS_SYNC? - */ - if (!ext4_should_journal_data(inode)) - return ret; - - goto force_commit; - } - - /* - * So we know that there has been no forced data flush. If the inode - * is marked IS_SYNC, we need to force one ourselves. - */ - if (!IS_SYNC(inode)) - return ret; - - /* - * Open question #2 --- should we force data to disk here too? If we - * don't, the only impact is that data=writeback filesystems won't - * flush data to disk automatically on IS_SYNC, only metadata (but - * historically, that is what ext2 has done.) - */ - -force_commit: - err = ext4_force_commit(inode->i_sb); - if (err) - return err; - return ret; + return generic_file_aio_write(iocb, iov, nr_segs, pos); } static struct vm_operations_struct ext4_file_vm_ops = { -- cgit v1.2.3 From ebbbf757c6b8577ac2fb6181c08c2059153bb0e2 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 18 Aug 2009 18:13:58 +0200 Subject: ntfs: Use new syncing helpers and update comments Use new syncing helpers in .write and .aio_write functions. Also remove superfluous syncing in ntfs_file_buffered_write() and update comments about generic_osync_inode(). CC: Anton Altaparmakov CC: linux-ntfs-dev@lists.sourceforge.net Signed-off-by: Jan Kara --- fs/ntfs/file.c | 16 ++++------------ fs/ntfs/mft.c | 13 ++++++------- 2 files changed, 10 insertions(+), 19 deletions(-) diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 3140a4429af..4350d4993b1 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -2076,14 +2076,6 @@ err_out: *ppos = pos; if (cached_page) page_cache_release(cached_page); - /* For now, when the user asks for O_SYNC, we actually give O_DSYNC. */ - if (likely(!status)) { - if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(vi))) { - if (!mapping->a_ops->writepage || !is_sync_kiocb(iocb)) - status = generic_osync_inode(vi, mapping, - OSYNC_METADATA|OSYNC_DATA); - } - } pagevec_lru_add_file(&lru_pvec); ntfs_debug("Done. Returning %s (written 0x%lx, status %li).", written ? "written" : "status", (unsigned long)written, @@ -2145,8 +2137,8 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov, mutex_lock(&inode->i_mutex); ret = ntfs_file_aio_write_nolock(iocb, iov, nr_segs, &iocb->ki_pos); mutex_unlock(&inode->i_mutex); - if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { - int err = sync_page_range(inode, mapping, pos, ret); + if (ret > 0) { + int err = generic_write_sync(file, pos, ret); if (err < 0) ret = err; } @@ -2173,8 +2165,8 @@ static ssize_t ntfs_file_writev(struct file *file, const struct iovec *iov, if (ret == -EIOCBQUEUED) ret = wait_on_sync_kiocb(&kiocb); mutex_unlock(&inode->i_mutex); - if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { - int err = sync_page_range(inode, mapping, *ppos - ret, ret); + if (ret > 0) { + int err = generic_write_sync(file, *ppos - ret, ret); if (err < 0) ret = err; } diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c index 23bf68453d7..1caa0ef0b2b 100644 --- a/fs/ntfs/mft.c +++ b/fs/ntfs/mft.c @@ -384,13 +384,12 @@ unm_err_out: * it is dirty in the inode meta data rather than the data page cache of the * inode, and thus there are no data pages that need writing out. Therefore, a * full mark_inode_dirty() is overkill. A mark_inode_dirty_sync(), on the - * other hand, is not sufficient, because I_DIRTY_DATASYNC needs to be set to - * ensure ->write_inode is called from generic_osync_inode() and this needs to - * happen or the file data would not necessarily hit the device synchronously, - * even though the vfs inode has the O_SYNC flag set. Also, I_DIRTY_DATASYNC - * simply "feels" better than just I_DIRTY_SYNC, since the file data has not - * actually hit the block device yet, which is not what I_DIRTY_SYNC on its own - * would suggest. + * other hand, is not sufficient, because ->write_inode needs to be called even + * in case of fdatasync. This needs to happen or the file data would not + * necessarily hit the device synchronously, even though the vfs inode has the + * O_SYNC flag set. Also, I_DIRTY_DATASYNC simply "feels" better than just + * I_DIRTY_SYNC, since the file data has not actually hit the block device yet, + * which is not what I_DIRTY_SYNC on its own would suggest. */ void __mark_mft_record_dirty(ntfs_inode *ni) { -- cgit v1.2.3 From d23c937b0f740888765676f6f82f509dbbb2bbad Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 18 Aug 2009 18:24:31 +0200 Subject: ocfs2: Update syncing after splicing to match generic version Update ocfs2 specific splicing code to use generic syncing helper. The sync now does not happen under rw_lock because generic_write_sync() acquires i_mutex which ranks above rw_lock. That should not matter because standard fsync path does not hold it either. Acked-by: Joel Becker Acked-by: Mark Fasheh CC: ocfs2-devel@oss.oracle.com Signed-off-by: Jan Kara --- fs/ocfs2/file.c | 27 ++++++--------------------- 1 file changed, 6 insertions(+), 21 deletions(-) diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 60022738938..221c5e98957 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1993,31 +1993,16 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe, if (ret > 0) { unsigned long nr_pages; + int err; - *ppos += ret; nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - /* - * If file or inode is SYNC and we actually wrote some data, - * sync it. - */ - if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) { - int err; - - mutex_lock(&inode->i_mutex); - err = ocfs2_rw_lock(inode, 1); - if (err < 0) { - mlog_errno(err); - } else { - err = generic_osync_inode(inode, mapping, - OSYNC_METADATA|OSYNC_DATA); - ocfs2_rw_unlock(inode, 1); - } - mutex_unlock(&inode->i_mutex); + err = generic_write_sync(out, *ppos, ret); + if (err) + ret = err; + else + *ppos += ret; - if (err) - ret = err; - } balance_dirty_pages_ratelimited_nr(mapping, nr_pages); } -- cgit v1.2.3 From af0f4414f343429971d33b0dd8dccc85c1f3dcd2 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 18 Aug 2009 18:32:55 +0200 Subject: xfs: Convert sync_page_range() to simple filemap_write_and_wait_range() Christoph Hellwig says that it is enough for XFS to call filemap_write_and_wait_range() instead of sync_page_range() because we do all the metadata syncing when forcing the log. CC: Felix Blyakher CC: xfs@oss.sgi.com CC: Christoph Hellwig Signed-off-by: Jan Kara --- fs/xfs/linux-2.6/xfs_lrw.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c index 7078974a6ee..fde63a3c4ec 100644 --- a/fs/xfs/linux-2.6/xfs_lrw.c +++ b/fs/xfs/linux-2.6/xfs_lrw.c @@ -817,7 +817,8 @@ write_retry: xfs_iunlock(xip, iolock); if (need_i_mutex) mutex_unlock(&inode->i_mutex); - error2 = sync_page_range(inode, mapping, pos, ret); + error2 = filemap_write_and_wait_range(mapping, pos, + pos + ret - 1); if (!error) error = error2; if (need_i_mutex) -- cgit v1.2.3 From aa3caafe53cab7ef60605e481cd5d7943e1c3022 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 18 Aug 2009 18:38:45 +0200 Subject: pohmelfs: Use new syncing helper Use new generic_write_sync() helper instead of sync_page_range(). Acked-by: Evgeniy Polyakov Signed-off-by: Jan Kara --- drivers/staging/pohmelfs/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/staging/pohmelfs/inode.c b/drivers/staging/pohmelfs/inode.c index e9d41e439aa..d004a9ddddb 100644 --- a/drivers/staging/pohmelfs/inode.c +++ b/drivers/staging/pohmelfs/inode.c @@ -927,10 +927,10 @@ ssize_t pohmelfs_write(struct file *file, const char __user *buf, mutex_unlock(&inode->i_mutex); WARN_ON(ret < 0); - if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { + if (ret > 0) { ssize_t err; - err = sync_page_range(inode, mapping, pos, ret); + err = generic_write_sync(file, pos, ret); if (err < 0) ret = err; WARN_ON(ret < 0); -- cgit v1.2.3 From 2f3d675bcd4a84251d6e8eea8096ec8fc795e5d6 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 17 Aug 2009 17:00:02 +0200 Subject: fat: Opencode sync_page_range_nolock() fat_cont_expand() is the only user of sync_page_range_nolock(). It's also the only user of generic_osync_inode() which does not have a file open. So opencode needed actions for FAT so that we can convert generic_osync_inode() to a standard syncing path. Update a comment about generic_osync_inode(). CC: OGAWA Hirofumi Signed-off-by: Jan Kara --- fs/fat/file.c | 22 ++++++++++++++++++++-- fs/fat/misc.c | 4 ++-- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/fs/fat/file.c b/fs/fat/file.c index f042b965c95..e8c159de236 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -176,8 +176,26 @@ static int fat_cont_expand(struct inode *inode, loff_t size) inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC; mark_inode_dirty(inode); - if (IS_SYNC(inode)) - err = sync_page_range_nolock(inode, mapping, start, count); + if (IS_SYNC(inode)) { + int err2; + + /* + * Opencode syncing since we don't have a file open to use + * standard fsync path. + */ + err = filemap_fdatawrite_range(mapping, start, + start + count - 1); + err2 = sync_mapping_buffers(mapping); + if (!err) + err = err2; + err2 = write_inode_now(inode, 1); + if (!err) + err = err2; + if (!err) { + err = filemap_fdatawait_range(mapping, start, + start + count - 1); + } + } out: return err; } diff --git a/fs/fat/misc.c b/fs/fat/misc.c index a6c20473dfd..4e35be873e0 100644 --- a/fs/fat/misc.c +++ b/fs/fat/misc.c @@ -119,8 +119,8 @@ int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster) MSDOS_I(inode)->i_start = new_dclus; MSDOS_I(inode)->i_logstart = new_dclus; /* - * Since generic_osync_inode() synchronize later if - * this is not directory, we don't here. + * Since generic_write_sync() synchronizes regular files later, + * we sync here only directories. */ if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode)) { ret = fat_sync_inode(inode); -- cgit v1.2.3 From 18f2ee705d98034b0f229a3202d827468d4bffd9 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 18 Aug 2009 18:43:15 +0200 Subject: vfs: Remove generic_osync_inode() and sync_page_range{_nolock}() Remove these three functions since nobody uses them anymore. Signed-off-by: Jan Kara --- fs/fs-writeback.c | 54 --------------------------------------- include/linux/fs.h | 5 ---- include/linux/writeback.h | 4 --- mm/filemap.c | 64 ----------------------------------------------- 4 files changed, 127 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index da86ef58e42..628235cf44b 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1242,57 +1242,3 @@ int sync_inode(struct inode *inode, struct writeback_control *wbc) return ret; } EXPORT_SYMBOL(sync_inode); - -/** - * generic_osync_inode - flush all dirty data for a given inode to disk - * @inode: inode to write - * @mapping: the address_space that should be flushed - * @what: what to write and wait upon - * - * This can be called by file_write functions for files which have the - * O_SYNC flag set, to flush dirty writes to disk. - * - * @what is a bitmask, specifying which part of the inode's data should be - * written and waited upon. - * - * OSYNC_DATA: i_mapping's dirty data - * OSYNC_METADATA: the buffers at i_mapping->private_list - * OSYNC_INODE: the inode itself - */ - -int generic_osync_inode(struct inode *inode, struct address_space *mapping, int what) -{ - int err = 0; - int need_write_inode_now = 0; - int err2; - - if (what & OSYNC_DATA) - err = filemap_fdatawrite(mapping); - if (what & (OSYNC_METADATA|OSYNC_DATA)) { - err2 = sync_mapping_buffers(mapping); - if (!err) - err = err2; - } - if (what & OSYNC_DATA) { - err2 = filemap_fdatawait(mapping); - if (!err) - err = err2; - } - - spin_lock(&inode_lock); - if ((inode->i_state & I_DIRTY) && - ((what & OSYNC_INODE) || (inode->i_state & I_DIRTY_DATASYNC))) - need_write_inode_now = 1; - spin_unlock(&inode_lock); - - if (need_write_inode_now) { - err2 = write_inode_now(inode, 1); - if (!err) - err = err2; - } - else - inode_sync_wait(inode); - - return err; -} -EXPORT_SYMBOL(generic_osync_inode); diff --git a/include/linux/fs.h b/include/linux/fs.h index e2c7f516766..37f53216998 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1455,11 +1455,6 @@ int fiemap_check_flags(struct fiemap_extent_info *fieinfo, u32 fs_flags); #define DT_SOCK 12 #define DT_WHT 14 -#define OSYNC_METADATA (1<<0) -#define OSYNC_DATA (1<<1) -#define OSYNC_INODE (1<<2) -int generic_osync_inode(struct inode *, struct address_space *, int); - /* * This is the "filldir" function type, used by readdir() to let * the kernel specify what kind of dirent layout it wants to have. diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 78b1e4684cc..d347632f186 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -150,10 +150,6 @@ int write_cache_pages(struct address_space *mapping, struct writeback_control *wbc, writepage_t writepage, void *data); int do_writepages(struct address_space *mapping, struct writeback_control *wbc); -int sync_page_range(struct inode *inode, struct address_space *mapping, - loff_t pos, loff_t count); -int sync_page_range_nolock(struct inode *inode, struct address_space *mapping, - loff_t pos, loff_t count); void set_page_dirty_balance(struct page *page, int page_mkwrite); void writeback_set_ratelimit(void); diff --git a/mm/filemap.c b/mm/filemap.c index 849293c4f41..dd51c68e2b8 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -325,70 +325,6 @@ int filemap_fdatawait_range(struct address_space *mapping, loff_t start, } EXPORT_SYMBOL(filemap_fdatawait_range); -/** - * sync_page_range - write and wait on all pages in the passed range - * @inode: target inode - * @mapping: target address_space - * @pos: beginning offset in pages to write - * @count: number of bytes to write - * - * Write and wait upon all the pages in the passed range. This is a "data - * integrity" operation. It waits upon in-flight writeout before starting and - * waiting upon new writeout. If there was an IO error, return it. - * - * We need to re-take i_mutex during the generic_osync_inode list walk because - * it is otherwise livelockable. - */ -int sync_page_range(struct inode *inode, struct address_space *mapping, - loff_t pos, loff_t count) -{ - pgoff_t start = pos >> PAGE_CACHE_SHIFT; - pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT; - int ret; - - if (!mapping_cap_writeback_dirty(mapping) || !count) - return 0; - ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1); - if (ret == 0) { - mutex_lock(&inode->i_mutex); - ret = generic_osync_inode(inode, mapping, OSYNC_METADATA); - mutex_unlock(&inode->i_mutex); - } - if (ret == 0) - ret = wait_on_page_writeback_range(mapping, start, end); - return ret; -} -EXPORT_SYMBOL(sync_page_range); - -/** - * sync_page_range_nolock - write & wait on all pages in the passed range without locking - * @inode: target inode - * @mapping: target address_space - * @pos: beginning offset in pages to write - * @count: number of bytes to write - * - * Note: Holding i_mutex across sync_page_range_nolock() is not a good idea - * as it forces O_SYNC writers to different parts of the same file - * to be serialised right until io completion. - */ -int sync_page_range_nolock(struct inode *inode, struct address_space *mapping, - loff_t pos, loff_t count) -{ - pgoff_t start = pos >> PAGE_CACHE_SHIFT; - pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT; - int ret; - - if (!mapping_cap_writeback_dirty(mapping) || !count) - return 0; - ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1); - if (ret == 0) - ret = generic_osync_inode(inode, mapping, OSYNC_METADATA); - if (ret == 0) - ret = wait_on_page_writeback_range(mapping, start, end); - return ret; -} -EXPORT_SYMBOL(sync_page_range_nolock); - /** * filemap_fdatawait - wait for all under-writeback pages to complete * @mapping: address space structure to wait for -- cgit v1.2.3 From 2daea67e966dc0c42067ebea015ddac6834cef88 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 3 Sep 2009 12:39:39 +0200 Subject: fsync: wait for data writeout completion before calling ->fsync Currenly vfs_fsync(_range) first calls filemap_fdatawrite to write out the data, the calls into ->fsync to write out the metadata and then finally calls filemap_fdatawait to wait for the data I/O to complete. What sounds like a clever micro-optimization actually is nast trap for many filesystems. For many modern filesystems i_size or other inode information is only updated on I/O completion and we need to wait for I/O to finish before we can write out the metadata. For old fashionen filesystems that instanciate blocks during the actual write and also update the metadata at that point it opens up a large window were we could expose uninitialized blocks after a crash. While a few filesystems that need it already wait for the I/O to finish inside their ->fsync methods it is rather suboptimal as it is done under the i_mutex and also always for the whole file instead of just a part as we could do for O_SYNC handling. Here is a small audit of all fsync instances in the tree: - spufs_mfc_fsync: - ps3flash_fsync: - vol_cdev_fsync: - printer_fsync: - fb_deferred_io_fsync: - bad_file_fsync: - simple_sync_file: don't care - filesystems/drivers do't use the page cache or are purely in-memory. - simple_fsync: - file_fsync: - affs_file_fsync: - fat_file_fsync: - jfs_fsync: - ubifs_fsync: - reiserfs_dir_fsync: - reiserfs_sync_file: never touch pagecache themselves. We need to wait before if we do not want to expose stale data after an allocation. - afs_fsync: - fuse_fsync_common: do the waiting writeback itself in awkward ways, would benefit from proper semantics - block_fsync: Does a filemap_write_and_wait on the block device inode. Because we now have f_mapping that is the same inode we call it on in vfs_fsync. So just removing it and letting the VFS do the work in one go would be an improvement. - btrfs_sync_file: - cifs_fsync: - xfs_file_fsync: need the wait first and currently do it themselves. would benefit from doing it outside i_mutex. - coda_fsync: - ecryptfs_fsync: - exofs_file_fsync: - shm_fsync: only passes the fsync through to the lower layer - ext3_sync_file: doesn't seem to care, comments are confusing. - ext4_sync_file: would need the wait to work correctly for delalloc mode with late i_size updates. Otherwise the ext3 comment applies. currently implemens it's own writeback and wait in an odd way, could benefit from doing it properly. - gfs2_fsync: not needed for journaled data mode, but probably harmless there. Currently writes back data asynchronously itself. Needs some major audit. - hostfs_fsync: just calls fsync/datasync on the host FD. Without the wait before data might not even be inflight yet if we're unlucky. - hpfs_file_fsync: - ncp_fsync: no-ops. Dangerous before and after. - jffs2_fsync: just calls jffs2_flush_wbuf_gc, not sure how this relates to data. - nfs_fsync_dir: just increments stats, claims all directory operations are synchronous - nfs_file_fsync: only writes out data??? Looks very odd. - nilfs_sync_file: looks like it expects all data done, but not sure from the code - ntfs_dir_fsync: - ntfs_file_fsync: appear to do their own data writeback. Very convoluted code. - ocfs2_sync_file: does it's own data writeback, but no wait. probably needs the wait. - smb_fsync: according to a comment expects all pages written already, probably needs the wait before. This patch only changes vfs_fsync_range, removal of the wait in the methods that have it is left to the filesystem maintainers. Note that most filesystems really do need an audit for their fsync methods given the gems found in this very brief audit. Signed-off-by: Christoph Hellwig Signed-off-by: Jan Kara --- fs/sync.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/fs/sync.c b/fs/sync.c index 4e15da01923..192340930bb 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -218,7 +218,7 @@ int vfs_fsync_range(struct file *file, struct dentry *dentry, loff_t start, goto out; } - ret = filemap_fdatawrite_range(mapping, start, end); + ret = filemap_write_and_wait_range(mapping, start, end); /* * We need to protect against concurrent writers, which could cause @@ -230,9 +230,6 @@ int vfs_fsync_range(struct file *file, struct dentry *dentry, loff_t start, ret = err; mutex_unlock(&mapping->host->i_mutex); - err = filemap_fdatawait_range(mapping, start, end); - if (!ret) - ret = err; out: return ret; } -- cgit v1.2.3