From bcd78e49613c41b5bed96fa288e983876f286a59 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 23 Jul 2008 21:27:35 -0700 Subject: tmpfs: support aio We have a request for tmpfs to support the AIO interface: easily done, no more than replacing the old shmem_file_read by shmem_file_aio_read, cribbed from generic_file_aio_read. (In 2.6.25 its write side was already changed to use generic_file_aio_write.) Incorporate cleanups from Andrew Morton and Harvey Harrison. Tests out fine with LTP's ltp-aiodio.sh, given hacks (not included) to support O_DIRECT. tmpfs cannot honestly support O_DIRECT: its cache-avoiding-IO nature is at odds with direct IO-avoiding-cache. Signed-off-by: Hugh Dickins Tested-by: Lawrence Greenfield Cc: Christoph Rohland Cc: Badari Pulavarty Cc: Zach Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 55 ++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 34 insertions(+), 21 deletions(-) (limited to 'mm/shmem.c') diff --git a/mm/shmem.c b/mm/shmem.c index e2a6ae1a44e..9ffbea9b79e 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1690,26 +1690,38 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_ file_accessed(filp); } -static ssize_t shmem_file_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos) -{ - read_descriptor_t desc; - - if ((ssize_t) count < 0) - return -EINVAL; - if (!access_ok(VERIFY_WRITE, buf, count)) - return -EFAULT; - if (!count) - return 0; - - desc.written = 0; - desc.count = count; - desc.arg.buf = buf; - desc.error = 0; - - do_shmem_file_read(filp, ppos, &desc, file_read_actor); - if (desc.written) - return desc.written; - return desc.error; +static ssize_t shmem_file_aio_read(struct kiocb *iocb, + const struct iovec *iov, unsigned long nr_segs, loff_t pos) +{ + struct file *filp = iocb->ki_filp; + ssize_t retval; + unsigned long seg; + size_t count; + loff_t *ppos = &iocb->ki_pos; + + retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE); + if (retval) + return retval; + + for (seg = 0; seg < nr_segs; seg++) { + read_descriptor_t desc; + + desc.written = 0; + desc.arg.buf = iov[seg].iov_base; + desc.count = iov[seg].iov_len; + if (desc.count == 0) + continue; + desc.error = 0; + do_shmem_file_read(filp, ppos, &desc, file_read_actor); + retval += desc.written; + if (desc.error) { + retval = retval ?: desc.error; + break; + } + if (desc.count > 0) + break; + } + return retval; } static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) @@ -2369,8 +2381,9 @@ static const struct file_operations shmem_file_operations = { .mmap = shmem_mmap, #ifdef CONFIG_TMPFS .llseek = generic_file_llseek, - .read = shmem_file_read, + .read = do_sync_read, .write = do_sync_write, + .aio_read = shmem_file_aio_read, .aio_write = generic_file_aio_write, .fsync = simple_sync_file, .splice_read = generic_file_splice_read, -- cgit v1.2.3 From 69029cd550284e32de13d6dd2f77b723c8a0e444 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Fri, 25 Jul 2008 01:47:14 -0700 Subject: memcg: remove refcnt from page_cgroup memcg: performance improvements Patch Description 1/5 ... remove refcnt fron page_cgroup patch (shmem handling is fixed) 2/5 ... swapcache handling patch 3/5 ... add helper function for shmem's memory reclaim patch 4/5 ... optimize by likely/unlikely ppatch 5/5 ... remove redundunt check patch (shmem handling is fixed.) Unix bench result. == 2.6.26-rc2-mm1 + memory resource controller Execl Throughput 2915.4 lps (29.6 secs, 3 samples) C Compiler Throughput 1019.3 lpm (60.0 secs, 3 samples) Shell Scripts (1 concurrent) 5796.0 lpm (60.0 secs, 3 samples) Shell Scripts (8 concurrent) 1097.7 lpm (60.0 secs, 3 samples) Shell Scripts (16 concurrent) 565.3 lpm (60.0 secs, 3 samples) File Read 1024 bufsize 2000 maxblocks 1022128.0 KBps (30.0 secs, 3 samples) File Write 1024 bufsize 2000 maxblocks 544057.0 KBps (30.0 secs, 3 samples) File Copy 1024 bufsize 2000 maxblocks 346481.0 KBps (30.0 secs, 3 samples) File Read 256 bufsize 500 maxblocks 319325.0 KBps (30.0 secs, 3 samples) File Write 256 bufsize 500 maxblocks 148788.0 KBps (30.0 secs, 3 samples) File Copy 256 bufsize 500 maxblocks 99051.0 KBps (30.0 secs, 3 samples) File Read 4096 bufsize 8000 maxblocks 2058917.0 KBps (30.0 secs, 3 samples) File Write 4096 bufsize 8000 maxblocks 1606109.0 KBps (30.0 secs, 3 samples) File Copy 4096 bufsize 8000 maxblocks 854789.0 KBps (30.0 secs, 3 samples) Dc: sqrt(2) to 99 decimal places 126145.2 lpm (30.0 secs, 3 samples) INDEX VALUES TEST BASELINE RESULT INDEX Execl Throughput 43.0 2915.4 678.0 File Copy 1024 bufsize 2000 maxblocks 3960.0 346481.0 875.0 File Copy 256 bufsize 500 maxblocks 1655.0 99051.0 598.5 File Copy 4096 bufsize 8000 maxblocks 5800.0 854789.0 1473.8 Shell Scripts (8 concurrent) 6.0 1097.7 1829.5 ========= FINAL SCORE 991.3 == 2.6.26-rc2-mm1 + this set == Execl Throughput 3012.9 lps (29.9 secs, 3 samples) C Compiler Throughput 981.0 lpm (60.0 secs, 3 samples) Shell Scripts (1 concurrent) 5872.0 lpm (60.0 secs, 3 samples) Shell Scripts (8 concurrent) 1120.3 lpm (60.0 secs, 3 samples) Shell Scripts (16 concurrent) 578.0 lpm (60.0 secs, 3 samples) File Read 1024 bufsize 2000 maxblocks 1003993.0 KBps (30.0 secs, 3 samples) File Write 1024 bufsize 2000 maxblocks 550452.0 KBps (30.0 secs, 3 samples) File Copy 1024 bufsize 2000 maxblocks 347159.0 KBps (30.0 secs, 3 samples) File Read 256 bufsize 500 maxblocks 314644.0 KBps (30.0 secs, 3 samples) File Write 256 bufsize 500 maxblocks 151852.0 KBps (30.0 secs, 3 samples) File Copy 256 bufsize 500 maxblocks 101000.0 KBps (30.0 secs, 3 samples) File Read 4096 bufsize 8000 maxblocks 2033256.0 KBps (30.0 secs, 3 samples) File Write 4096 bufsize 8000 maxblocks 1611814.0 KBps (30.0 secs, 3 samples) File Copy 4096 bufsize 8000 maxblocks 847979.0 KBps (30.0 secs, 3 samples) Dc: sqrt(2) to 99 decimal places 128148.7 lpm (30.0 secs, 3 samples) INDEX VALUES TEST BASELINE RESULT INDEX Execl Throughput 43.0 3012.9 700.7 File Copy 1024 bufsize 2000 maxblocks 3960.0 347159.0 876.7 File Copy 256 bufsize 500 maxblocks 1655.0 101000.0 610.3 File Copy 4096 bufsize 8000 maxblocks 5800.0 847979.0 1462.0 Shell Scripts (8 concurrent) 6.0 1120.3 1867.2 ========= FINAL SCORE 1004.6 This patch: Remove refcnt from page_cgroup(). After this, * A page is charged only when !page_mapped() && no page_cgroup is assigned. * Anon page is newly mapped. * File page is added to mapping->tree. * A page is uncharged only when * Anon page is fully unmapped. * File page is removed from LRU. There is no change in behavior from user's view. This patch also removes unnecessary calls in rmap.c which was used only for refcnt mangement. [akpm@linux-foundation.org: fix warning] [hugh@veritas.com: fix shmem_unuse_inode charging] Signed-off-by: KAMEZAWA Hiroyuki Cc: Balbir Singh Cc: "Eric W. Biederman" Cc: Pavel Emelyanov Cc: Li Zefan Cc: Hugh Dickins Cc: YAMAMOTO Takashi Cc: Paul Menage Cc: David Rientjes Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 35 ++++++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 11 deletions(-) (limited to 'mm/shmem.c') diff --git a/mm/shmem.c b/mm/shmem.c index 9ffbea9b79e..d58305e8a48 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -922,20 +922,26 @@ found: error = 1; if (!inode) goto out; - /* Precharge page while we can wait, compensate afterwards */ + /* Precharge page using GFP_KERNEL while we can wait */ error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL); if (error) goto out; error = radix_tree_preload(GFP_KERNEL); - if (error) - goto uncharge; + if (error) { + mem_cgroup_uncharge_cache_page(page); + goto out; + } error = 1; spin_lock(&info->lock); ptr = shmem_swp_entry(info, idx, NULL); - if (ptr && ptr->val == entry.val) + if (ptr && ptr->val == entry.val) { error = add_to_page_cache(page, inode->i_mapping, idx, GFP_NOWAIT); + /* does mem_cgroup_uncharge_cache_page on error */ + } else /* we must compensate for our precharge above */ + mem_cgroup_uncharge_cache_page(page); + if (error == -EEXIST) { struct page *filepage = find_get_page(inode->i_mapping, idx); error = 1; @@ -961,8 +967,6 @@ found: shmem_swp_unmap(ptr); spin_unlock(&info->lock); radix_tree_preload_end(); -uncharge: - mem_cgroup_uncharge_page(page); out: unlock_page(page); page_cache_release(page); @@ -1319,7 +1323,7 @@ repeat: page_cache_release(swappage); goto failed; } - mem_cgroup_uncharge_page(swappage); + mem_cgroup_uncharge_cache_page(swappage); } page_cache_release(swappage); goto repeat; @@ -1358,6 +1362,8 @@ repeat: } if (!filepage) { + int ret; + spin_unlock(&info->lock); filepage = shmem_alloc_page(gfp, info, idx); if (!filepage) { @@ -1386,10 +1392,18 @@ repeat: swap = *entry; shmem_swp_unmap(entry); } - if (error || swap.val || 0 != add_to_page_cache_lru( - filepage, mapping, idx, GFP_NOWAIT)) { + ret = error || swap.val; + if (ret) + mem_cgroup_uncharge_cache_page(filepage); + else + ret = add_to_page_cache_lru(filepage, mapping, + idx, GFP_NOWAIT); + /* + * At add_to_page_cache_lru() failure, uncharge will + * be done automatically. + */ + if (ret) { spin_unlock(&info->lock); - mem_cgroup_uncharge_page(filepage); page_cache_release(filepage); shmem_unacct_blocks(info->flags, 1); shmem_free_blocks(inode, 1); @@ -1398,7 +1412,6 @@ repeat: goto failed; goto repeat; } - mem_cgroup_uncharge_page(filepage); info->flags |= SHMEM_PAGEIN; } -- cgit v1.2.3 From c9b0ed51483cc2fc42bb801b6675c4231b0e4634 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Fri, 25 Jul 2008 01:47:15 -0700 Subject: memcg: helper function for relcaim from shmem. A new call, mem_cgroup_shrink_usage() is added for shmem handling and relacing non-standard usage of mem_cgroup_charge/uncharge. Now, shmem calls mem_cgroup_charge() just for reclaim some pages from mem_cgroup. In general, shmem is used by some process group and not for global resource (like file caches). So, it's reasonable to reclaim pages from mem_cgroup where shmem is mainly used. [hugh@veritas.com: shmem_getpage release page sooner] [hugh@veritas.com: mem_cgroup_shrink_usage css_put] Signed-off-by: KAMEZAWA Hiroyuki Cc: Balbir Singh Cc: "Eric W. Biederman" Cc: Pavel Emelyanov Cc: Li Zefan Cc: YAMAMOTO Takashi Cc: Paul Menage Cc: David Rientjes Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'mm/shmem.c') diff --git a/mm/shmem.c b/mm/shmem.c index d58305e8a48..f92fea94d03 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1315,17 +1315,14 @@ repeat: shmem_swp_unmap(entry); spin_unlock(&info->lock); unlock_page(swappage); + page_cache_release(swappage); if (error == -ENOMEM) { /* allow reclaim from this memory cgroup */ - error = mem_cgroup_cache_charge(swappage, - current->mm, gfp & ~__GFP_HIGHMEM); - if (error) { - page_cache_release(swappage); + error = mem_cgroup_shrink_usage(current->mm, + gfp); + if (error) goto failed; - } - mem_cgroup_uncharge_cache_page(swappage); } - page_cache_release(swappage); goto repeat; } } else if (sgp == SGP_READ && !filepage) { -- cgit v1.2.3