From 0ccf831cbee94df9c5006dd46248c0f07847dd7c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 4 Feb 2008 22:27:20 -0800 Subject: lockdep: annotate epoll On Sat, 2008-01-05 at 13:35 -0800, Davide Libenzi wrote: > I remember I talked with Arjan about this some time ago. Basically, since 1) > you can drop an epoll fd inside another epoll fd 2) callback-based wakeups > are used, you can see a wake_up() from inside another wake_up(), but they > will never refer to the same lock instance. > Think about: > > dfd = socket(...); > efd1 = epoll_create(); > efd2 = epoll_create(); > epoll_ctl(efd1, EPOLL_CTL_ADD, dfd, ...); > epoll_ctl(efd2, EPOLL_CTL_ADD, efd1, ...); > > When a packet arrives at the device underneath "dfd", the net code will > issue a wake_up() on its poll wake list. Epoll (efd1) has installed a > callback wakeup entry on that queue, and the wake_up() performed by the > "dfd" net code will end up in ep_poll_callback(). At this point epoll > (efd1) notices that it may have some event ready, so it needs to wake up > the waiters on its poll wait list (efd2). So it calls ep_poll_safewake() > that ends up in another wake_up(), after having checked the > recursion constraints. Those are: no more than EP_MAX_POLLWAKE_NESTS nesting > levels, to avoid stack blasting, and never hitting the same queue, to avoid loops like: > > epoll_ctl(efd2, EPOLL_CTL_ADD, efd1, ...); > epoll_ctl(efd3, EPOLL_CTL_ADD, efd2, ...); > epoll_ctl(efd4, EPOLL_CTL_ADD, efd3, ...); > epoll_ctl(efd1, EPOLL_CTL_ADD, efd4, ...); > > The code "if (tncur->wq == wq || ..." prevents re-entering the same > queue/lock. Since the epoll code is very careful not to nest same-instance locks, allow the recursion. Signed-off-by: Peter Zijlstra Tested-by: Stefan Richter Acked-by: Davide Libenzi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 81c04abfb1a..a415f42d32c 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -353,7 +353,7 @@ static void ep_poll_safewake(struct poll_safewake *psw, wait_queue_head_t *wq) spin_unlock_irqrestore(&psw->lock, flags); /* Do really wake up now */ - wake_up(wq); + wake_up_nested(wq, 1 + wake_nests); /* Remove the current task from the list */ spin_lock_irqsave(&psw->lock, flags); -- cgit v1.2.3 From 59714d65dfbc86d5cb93adc5bac57a921cc2fa84 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 4 Feb 2008 22:27:21 -0800 Subject: get_task_comm(): return the result It was dumb to make get_task_comm() return void. Change it to return a pointer to the resulting output for caller convenience.
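A minimal usage sketch of what the returned pointer buys (a hypothetical caller; task_pid_nr() and TASK_COMM_LEN are the stock kernel helpers):

	char comm[TASK_COMM_LEN];

	/* Before: the copy and the use had to be separate statements. */
	get_task_comm(comm, tsk);
	printk(KERN_INFO "comm of %d is %s\n", task_pid_nr(tsk), comm);

	/* After: the call can sit directly inside an expression. */
	printk(KERN_INFO "comm of %d is %s\n",
	       task_pid_nr(tsk), get_task_comm(comm, tsk));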
Cc: Ulrich Drepper Cc: Ingo Molnar Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 282240afe99..966c5c5b674 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -947,12 +947,13 @@ static void flush_old_files(struct files_struct * files) spin_unlock(&files->file_lock); } -void get_task_comm(char *buf, struct task_struct *tsk) +char *get_task_comm(char *buf, struct task_struct *tsk) { /* buf must be at least sizeof(tsk->comm) in size */ task_lock(tsk); strncpy(buf, tsk->comm, sizeof(tsk->comm)); task_unlock(tsk); + return buf; } void set_task_comm(struct task_struct *tsk, char *buf) -- cgit v1.2.3 From ed5d2cac114202fe2978a9cbcab8f5032796d538 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 4 Feb 2008 22:27:24 -0800 Subject: exec: rework the group exit and fix the race with kill As Roland pointed out, we have a very old problem with exec. de_thread() sets SIGNAL_GROUP_EXIT, kills other threads, changes ->group_leader and then clears signal->flags. All signals (even fatal ones) sent in this window (which is not too small) will be lost. With this patch exec doesn't abuse SIGNAL_GROUP_EXIT. signal_group_exit(), the new helper, should be used to detect exit_group() or exec() in progress. It can have more users, but this patch makes only the strictly necessary changes. Signed-off-by: Oleg Nesterov Cc: Davide Libenzi Cc: Ingo Molnar Cc: Robin Holt Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 966c5c5b674..be923e4bc38 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -760,7 +760,7 @@ static int de_thread(struct task_struct *tsk) */ read_lock(&tasklist_lock); spin_lock_irq(lock); - if (sig->flags & SIGNAL_GROUP_EXIT) { + if (signal_group_exit(sig)) { /* * Another group action in progress, just * return so that the signal is processed. @@ -778,6 +778,7 @@ static int de_thread(struct task_struct *tsk) if (unlikely(tsk->group_leader == task_child_reaper(tsk))) task_active_pid_ns(tsk)->child_reaper = tsk; + sig->group_exit_task = tsk; zap_other_threads(tsk); read_unlock(&tasklist_lock); @@ -802,7 +803,6 @@ } sig->notify_count = count; - sig->group_exit_task = tsk; while (atomic_read(&sig->count) > count) { __set_current_state(TASK_UNINTERRUPTIBLE); spin_unlock_irq(lock); @@ -871,15 +871,10 @@ static int de_thread(struct task_struct *tsk) leader->exit_state = EXIT_DEAD; write_unlock_irq(&tasklist_lock); - } + } sig->group_exit_task = NULL; sig->notify_count = 0; - /* - * There may be one thread left which is just exiting, - * but it's safe to stop telling the group to kill themselves.
- */ - sig->flags = 0; no_thread_group: exit_itimers(sig); @@ -1549,7 +1544,7 @@ static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm, int err = -EAGAIN; spin_lock_irq(&tsk->sighand->siglock); - if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT)) { + if (!signal_group_exit(tsk->signal)) { tsk->signal->group_exit_code = exit_code; zap_process(tsk); err = 0; -- cgit v1.2.3 From 4d672e7ac79b5ec5cdc90e450823441e20464691 Mon Sep 17 00:00:00 2001 From: Davide Libenzi Date: Mon, 4 Feb 2008 22:27:26 -0800 Subject: timerfd: new timerfd API This is the new timerfd API as it is implemented by the following patch: int timerfd_create(int clockid, int flags); int timerfd_settime(int ufd, int flags, const struct itimerspec *utmr, struct itimerspec *otmr); int timerfd_gettime(int ufd, struct itimerspec *otmr); The timerfd_create() API creates an un-programmed timerfd fd. The "clockid" parameter can be either CLOCK_MONOTONIC or CLOCK_REALTIME. The timerfd_settime() API gives new settings to the timerfd, optionally retrieving the previous expiration time (in case the "otmr" parameter is not NULL). The time value specified in "utmr" is absolute if the TFD_TIMER_ABSTIME bit is set in the "flags" parameter; otherwise it's a relative time. The timerfd_gettime() API returns the next expiration time of the timer, or {0, 0} if the timerfd has not been set yet. Like the previous timerfd API implementation, read(2) and poll(2) are supported (with the same interface). Here's a simple test program I used to exercise the new timerfd APIs: http://www.xmailserver.org/timerfd-test2.c [akpm@linux-foundation.org: coding-style cleanups] [akpm@linux-foundation.org: fix ia64 build] [akpm@linux-foundation.org: fix m68k build] [akpm@linux-foundation.org: fix mips build] [akpm@linux-foundation.org: fix alpha, arm, blackfin, cris, m68k, s390, sparc and sparc64 builds] [heiko.carstens@de.ibm.com: fix s390] [akpm@linux-foundation.org: fix powerpc build] [akpm@linux-foundation.org: fix sparc64 more] Signed-off-by: Davide Libenzi Cc: Michael Kerrisk Cc: Thomas Gleixner Cc: Martin Schwidefsky Signed-off-by: Heiko Carstens Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/compat.c | 32 +++++++-- fs/timerfd.c | 207 +++++++++++++++++++++++++++++++++++++++-------------------- 2 files changed, 165 insertions(+), 74 deletions(-) (limited to 'fs') diff --git a/fs/compat.c b/fs/compat.c index 5216c3fd751..69baca5ad60 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -2206,19 +2206,41 @@ asmlinkage long compat_sys_signalfd(int ufd, #ifdef CONFIG_TIMERFD -asmlinkage long compat_sys_timerfd(int ufd, int clockid, int flags, - const struct compat_itimerspec __user *utmr) +asmlinkage long compat_sys_timerfd_settime(int ufd, int flags, + const struct compat_itimerspec __user *utmr, + struct compat_itimerspec __user *otmr) { + int error; struct itimerspec t; struct itimerspec __user *ut; if (get_compat_itimerspec(&t, utmr)) return -EFAULT; - ut = compat_alloc_user_space(sizeof(*ut)); - if (copy_to_user(ut, &t, sizeof(t))) + ut = compat_alloc_user_space(2 * sizeof(struct itimerspec)); + if (copy_to_user(&ut[0], &t, sizeof(t))) return -EFAULT; + error = sys_timerfd_settime(ufd, flags, &ut[0], &ut[1]); + if (!error && otmr) + error = (copy_from_user(&t, &ut[1], sizeof(struct itimerspec)) || + put_compat_itimerspec(otmr, &t)) ?
-EFAULT: 0; + + return error; +} + +asmlinkage long compat_sys_timerfd_gettime(int ufd, + struct compat_itimerspec __user *otmr) +{ + int error; + struct itimerspec t; + struct itimerspec __user *ut; - return sys_timerfd(ufd, clockid, flags, ut); + ut = compat_alloc_user_space(sizeof(struct itimerspec)); + error = sys_timerfd_gettime(ufd, ut); + if (!error) + error = (copy_from_user(&t, ut, sizeof(struct itimerspec)) || + put_compat_itimerspec(otmr, &t)) ? -EFAULT: 0; + + return error; } #endif /* CONFIG_TIMERFD */ diff --git a/fs/timerfd.c b/fs/timerfd.c index 61983f3b107..10c80b59ec4 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -25,13 +25,15 @@ struct timerfd_ctx { struct hrtimer tmr; ktime_t tintv; wait_queue_head_t wqh; + u64 ticks; int expired; + int clockid; }; /* * This gets called when the timer event triggers. We set the "expired" * flag, but we do not re-arm the timer (in case it's necessary, - * tintv.tv64 != 0) until the timer is read. + * tintv.tv64 != 0) until the timer is accessed. */ static enum hrtimer_restart timerfd_tmrproc(struct hrtimer *htmr) { @@ -40,13 +42,24 @@ static enum hrtimer_restart timerfd_tmrproc(struct hrtimer *htmr) spin_lock_irqsave(&ctx->wqh.lock, flags); ctx->expired = 1; + ctx->ticks++; wake_up_locked(&ctx->wqh); spin_unlock_irqrestore(&ctx->wqh.lock, flags); return HRTIMER_NORESTART; } -static void timerfd_setup(struct timerfd_ctx *ctx, int clockid, int flags, +static ktime_t timerfd_get_remaining(struct timerfd_ctx *ctx) +{ + ktime_t now, remaining; + + now = ctx->tmr.base->get_time(); + remaining = ktime_sub(ctx->tmr.expires, now); + + return remaining.tv64 < 0 ? ktime_set(0, 0): remaining; +} + +static void timerfd_setup(struct timerfd_ctx *ctx, int flags, const struct itimerspec *ktmr) { enum hrtimer_mode htmode; @@ -57,8 +70,9 @@ static void timerfd_setup(struct timerfd_ctx *ctx, int clockid, int flags, texp = timespec_to_ktime(ktmr->it_value); ctx->expired = 0; + ctx->ticks = 0; ctx->tintv = timespec_to_ktime(ktmr->it_interval); - hrtimer_init(&ctx->tmr, clockid, htmode); + hrtimer_init(&ctx->tmr, ctx->clockid, htmode); ctx->tmr.expires = texp; ctx->tmr.function = timerfd_tmrproc; if (texp.tv64 != 0) @@ -83,7 +97,7 @@ static unsigned int timerfd_poll(struct file *file, poll_table *wait) poll_wait(file, &ctx->wqh, wait); spin_lock_irqsave(&ctx->wqh.lock, flags); - if (ctx->expired) + if (ctx->ticks) events |= POLLIN; spin_unlock_irqrestore(&ctx->wqh.lock, flags); @@ -102,11 +116,11 @@ static ssize_t timerfd_read(struct file *file, char __user *buf, size_t count, return -EINVAL; spin_lock_irq(&ctx->wqh.lock); res = -EAGAIN; - if (!ctx->expired && !(file->f_flags & O_NONBLOCK)) { + if (!ctx->ticks && !(file->f_flags & O_NONBLOCK)) { __add_wait_queue(&ctx->wqh, &wait); for (res = 0;;) { set_current_state(TASK_INTERRUPTIBLE); - if (ctx->expired) { + if (ctx->ticks) { res = 0; break; } @@ -121,22 +135,21 @@ static ssize_t timerfd_read(struct file *file, char __user *buf, size_t count, __remove_wait_queue(&ctx->wqh, &wait); __set_current_state(TASK_RUNNING); } - if (ctx->expired) { - ctx->expired = 0; - if (ctx->tintv.tv64 != 0) { + if (ctx->ticks) { + ticks = ctx->ticks; + if (ctx->expired && ctx->tintv.tv64) { /* * If tintv.tv64 != 0, this is a periodic timer that * needs to be re-armed. We avoid doing it in the timer * callback to avoid DoS attacks specifying a very * short timer period. 
*/ - ticks = (u64) - hrtimer_forward(&ctx->tmr, - hrtimer_cb_get_time(&ctx->tmr), - ctx->tintv); + ticks += hrtimer_forward_now(&ctx->tmr, + ctx->tintv) - 1; hrtimer_restart(&ctx->tmr); - } else - ticks = 1; + } + ctx->expired = 0; + ctx->ticks = 0; } spin_unlock_irq(&ctx->wqh.lock); if (ticks) @@ -150,76 +163,132 @@ static const struct file_operations timerfd_fops = { .read = timerfd_read, }; -asmlinkage long sys_timerfd(int ufd, int clockid, int flags, - const struct itimerspec __user *utmr) +static struct file *timerfd_fget(int fd) +{ + struct file *file; + + file = fget(fd); + if (!file) + return ERR_PTR(-EBADF); + if (file->f_op != &timerfd_fops) { + fput(file); + return ERR_PTR(-EINVAL); + } + + return file; +} + +asmlinkage long sys_timerfd_create(int clockid, int flags) { - int error; + int error, ufd; struct timerfd_ctx *ctx; struct file *file; struct inode *inode; - struct itimerspec ktmr; - - if (copy_from_user(&ktmr, utmr, sizeof(ktmr))) - return -EFAULT; + if (flags) + return -EINVAL; if (clockid != CLOCK_MONOTONIC && clockid != CLOCK_REALTIME) return -EINVAL; + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + init_waitqueue_head(&ctx->wqh); + ctx->clockid = clockid; + hrtimer_init(&ctx->tmr, clockid, HRTIMER_MODE_ABS); + + error = anon_inode_getfd(&ufd, &inode, &file, "[timerfd]", + &timerfd_fops, ctx); + if (error) { + kfree(ctx); + return error; + } + + return ufd; +} + +asmlinkage long sys_timerfd_settime(int ufd, int flags, + const struct itimerspec __user *utmr, + struct itimerspec __user *otmr) +{ + struct file *file; + struct timerfd_ctx *ctx; + struct itimerspec ktmr, kotmr; + + if (copy_from_user(&ktmr, utmr, sizeof(ktmr))) + return -EFAULT; + if (!timespec_valid(&ktmr.it_value) || !timespec_valid(&ktmr.it_interval)) return -EINVAL; - if (ufd == -1) { - ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); - if (!ctx) - return -ENOMEM; - - init_waitqueue_head(&ctx->wqh); - - timerfd_setup(ctx, clockid, flags, &ktmr); - - /* - * When we call this, the initialization must be complete, since - * anon_inode_getfd() will install the fd. - */ - error = anon_inode_getfd(&ufd, &inode, &file, "[timerfd]", - &timerfd_fops, ctx); - if (error) - goto err_tmrcancel; - } else { - file = fget(ufd); - if (!file) - return -EBADF; - ctx = file->private_data; - if (file->f_op != &timerfd_fops) { - fput(file); - return -EINVAL; - } - /* - * We need to stop the existing timer before reprogramming - * it to the new values. - */ - for (;;) { - spin_lock_irq(&ctx->wqh.lock); - if (hrtimer_try_to_cancel(&ctx->tmr) >= 0) - break; - spin_unlock_irq(&ctx->wqh.lock); - cpu_relax(); - } - /* - * Re-program the timer to the new value ... - */ - timerfd_setup(ctx, clockid, flags, &ktmr); + file = timerfd_fget(ufd); + if (IS_ERR(file)) + return PTR_ERR(file); + ctx = file->private_data; + /* + * We need to stop the existing timer before reprogramming + * it to the new values. + */ + for (;;) { + spin_lock_irq(&ctx->wqh.lock); + if (hrtimer_try_to_cancel(&ctx->tmr) >= 0) + break; spin_unlock_irq(&ctx->wqh.lock); - fput(file); + cpu_relax(); } - return ufd; + /* + * If the timer is expired and it's periodic, we need to advance it + * because the caller may want to know the previous expiration time. + * We do not update "ticks" and "expired" since the timer will be + * re-programmed again in the following timerfd_setup() call. 
+ */ + if (ctx->expired && ctx->tintv.tv64) + hrtimer_forward_now(&ctx->tmr, ctx->tintv); -err_tmrcancel: - hrtimer_cancel(&ctx->tmr); - kfree(ctx); - return error; + kotmr.it_value = ktime_to_timespec(timerfd_get_remaining(ctx)); + kotmr.it_interval = ktime_to_timespec(ctx->tintv); + + /* + * Re-program the timer to the new value ... + */ + timerfd_setup(ctx, flags, &ktmr); + + spin_unlock_irq(&ctx->wqh.lock); + fput(file); + if (otmr && copy_to_user(otmr, &kotmr, sizeof(kotmr))) + return -EFAULT; + + return 0; +} + +asmlinkage long sys_timerfd_gettime(int ufd, struct itimerspec __user *otmr) +{ + struct file *file; + struct timerfd_ctx *ctx; + struct itimerspec kotmr; + + file = timerfd_fget(ufd); + if (IS_ERR(file)) + return PTR_ERR(file); + ctx = file->private_data; + + spin_lock_irq(&ctx->wqh.lock); + if (ctx->expired && ctx->tintv.tv64) { + ctx->expired = 0; + ctx->ticks += + hrtimer_forward_now(&ctx->tmr, ctx->tintv) - 1; + hrtimer_restart(&ctx->tmr); + } + kotmr.it_value = ktime_to_timespec(timerfd_get_remaining(ctx)); + kotmr.it_interval = ktime_to_timespec(ctx->tintv); + spin_unlock_irq(&ctx->wqh.lock); + fput(file); + + return copy_to_user(otmr, &kotmr, sizeof(kotmr)) ? -EFAULT: 0; } -- cgit v1.2.3 From eebd2aa355692afaf9906f62118620f1a1c19dbb Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 4 Feb 2008 22:28:29 -0800 Subject: Pagecache zeroing: zero_user_segment, zero_user_segments and zero_user Simplify page cache zeroing of segments of pages through 3 functions: zero_user_segments(page, start1, end1, start2, end2) Zeros two segments of the page. It takes the positions where zeroing starts and ends, which avoids length calculations and makes the code clearer. zero_user_segment(page, start, end) Same for a single segment. zero_user(page, start, length) Length variant for the case where we know the length. We remove the zero_user_page macro. Issues: 1. It's a macro. Inline functions are preferable. 2. The KM_USER0 macro is only defined for HIGHMEM. Having to treat this special case everywhere makes the code needlessly complex. The parameter for zeroing is always KM_USER0 except in one single case that we open code. Avoiding KM_USER0 means a lot of code no longer has to deal with the special casing for HIGHMEM. Dealing with kmap is only necessary for HIGHMEM configurations. In those configurations we use KM_USER0 like we do for a series of other functions defined in highmem.h. Since KM_USER0 depends on HIGHMEM, the existing zero_user_page could not be an inline function and had to be a macro. The zero_user_* functions introduced here can be inline because that constant is not used when these functions are called. Also extract the flushing of the caches to be outside of the kmap. [akpm@linux-foundation.org: fix nfs and ntfs build] [akpm@linux-foundation.org: fix ntfs build some more] Signed-off-by: Christoph Lameter Cc: "J.
Bruce Fields" Cc: Anton Altaparmakov Cc: Mark Fasheh Cc: David Chinner Cc: Michael Halcrow Cc: Steven French Cc: Steven Whitehouse Cc: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 44 ++++++++++++++------------------------------ fs/cifs/inode.c | 2 +- fs/direct-io.c | 4 ++-- fs/ecryptfs/mmap.c | 5 ++--- fs/ext3/inode.c | 4 ++-- fs/ext4/inode.c | 4 ++-- fs/gfs2/bmap.c | 2 +- fs/gfs2/ops_address.c | 2 +- fs/libfs.c | 11 ++++------- fs/mpage.c | 7 ++----- fs/nfs/read.c | 10 +++++----- fs/nfs/write.c | 4 +--- fs/ntfs/aops.c | 20 +++++++++++--------- fs/ntfs/compress.c | 2 +- fs/ntfs/file.c | 32 +++++++++++++++----------------- fs/ocfs2/alloc.c | 2 +- fs/ocfs2/aops.c | 6 +++--- fs/reiserfs/inode.c | 4 ++-- fs/xfs/linux-2.6/xfs_lrw.c | 2 +- 19 files changed, 71 insertions(+), 96 deletions(-) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index 456c9ab7705..1de921484ea 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1798,7 +1798,7 @@ void page_zero_new_buffers(struct page *page, unsigned from, unsigned to) start = max(from, block_start); size = min(to, block_end) - start; - zero_user_page(page, start, size, KM_USER0); + zero_user(page, start, size); set_buffer_uptodate(bh); } @@ -1861,19 +1861,10 @@ static int __block_prepare_write(struct inode *inode, struct page *page, mark_buffer_dirty(bh); continue; } - if (block_end > to || block_start < from) { - void *kaddr; - - kaddr = kmap_atomic(page, KM_USER0); - if (block_end > to) - memset(kaddr+to, 0, - block_end-to); - if (block_start < from) - memset(kaddr+block_start, - 0, from-block_start); - flush_dcache_page(page); - kunmap_atomic(kaddr, KM_USER0); - } + if (block_end > to || block_start < from) + zero_user_segments(page, + to, block_end, + block_start, from); continue; } } @@ -2104,8 +2095,7 @@ int block_read_full_page(struct page *page, get_block_t *get_block) SetPageError(page); } if (!buffer_mapped(bh)) { - zero_user_page(page, i * blocksize, blocksize, - KM_USER0); + zero_user(page, i * blocksize, blocksize); if (!err) set_buffer_uptodate(bh); continue; @@ -2218,7 +2208,7 @@ int cont_expand_zero(struct file *file, struct address_space *mapping, &page, &fsdata); if (err) goto out; - zero_user_page(page, zerofrom, len, KM_USER0); + zero_user(page, zerofrom, len); err = pagecache_write_end(file, mapping, curpos, len, len, page, fsdata); if (err < 0) @@ -2245,7 +2235,7 @@ int cont_expand_zero(struct file *file, struct address_space *mapping, &page, &fsdata); if (err) goto out; - zero_user_page(page, zerofrom, len, KM_USER0); + zero_user(page, zerofrom, len); err = pagecache_write_end(file, mapping, curpos, len, len, page, fsdata); if (err < 0) @@ -2422,7 +2412,6 @@ int nobh_write_begin(struct file *file, struct address_space *mapping, unsigned block_in_page; unsigned block_start, block_end; sector_t block_in_file; - char *kaddr; int nr_reads = 0; int ret = 0; int is_mapped_to_disk = 1; @@ -2493,13 +2482,8 @@ int nobh_write_begin(struct file *file, struct address_space *mapping, continue; } if (buffer_new(bh) || !buffer_mapped(bh)) { - kaddr = kmap_atomic(page, KM_USER0); - if (block_start < from) - memset(kaddr+block_start, 0, from-block_start); - if (block_end > to) - memset(kaddr + to, 0, block_end - to); - flush_dcache_page(page); - kunmap_atomic(kaddr, KM_USER0); + zero_user_segments(page, block_start, from, + to, block_end); continue; } if (buffer_uptodate(bh)) @@ -2636,7 +2620,7 @@ int nobh_writepage(struct page *page, get_block_t *get_block, * the page size, the remaining memory 
is zeroed when mapped, and * writes to that region are not written out to the file." */ - zero_user_page(page, offset, PAGE_CACHE_SIZE - offset, KM_USER0); + zero_user_segment(page, offset, PAGE_CACHE_SIZE); out: ret = mpage_writepage(page, get_block, wbc); if (ret == -EAGAIN) @@ -2709,7 +2693,7 @@ has_buffers: if (page_has_buffers(page)) goto has_buffers; } - zero_user_page(page, offset, length, KM_USER0); + zero_user(page, offset, length); set_page_dirty(page); err = 0; @@ -2785,7 +2769,7 @@ int block_truncate_page(struct address_space *mapping, goto unlock; } - zero_user_page(page, offset, length, KM_USER0); + zero_user(page, offset, length); mark_buffer_dirty(bh); err = 0; @@ -2831,7 +2815,7 @@ int block_write_full_page(struct page *page, get_block_t *get_block, * the page size, the remaining memory is zeroed when mapped, and * writes to that region are not written out to the file." */ - zero_user_page(page, offset, PAGE_CACHE_SIZE - offset, KM_USER0); + zero_user_segment(page, offset, PAGE_CACHE_SIZE); return __block_write_full_page(inode, page, get_block, wbc); } diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index d9567ba2960..47f2621001e 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1386,7 +1386,7 @@ static int cifs_truncate_page(struct address_space *mapping, loff_t from) if (!page) return -ENOMEM; - zero_user_page(page, offset, PAGE_CACHE_SIZE - offset, KM_USER0); + zero_user_segment(page, offset, PAGE_CACHE_SIZE); unlock_page(page); page_cache_release(page); return rc; diff --git a/fs/direct-io.c b/fs/direct-io.c index acf0da1bd25..9e81addbd6e 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -878,8 +878,8 @@ do_holes: page_cache_release(page); goto out; } - zero_user_page(page, block_in_page << blkbits, - 1 << blkbits, KM_USER0); + zero_user(page, block_in_page << blkbits, + 1 << blkbits); dio->block_in_file++; block_in_page++; goto next_block; diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index 32c5711d79a..0535412d8c6 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -257,8 +257,7 @@ static int fill_zeros_to_end_of_page(struct page *page, unsigned int to) end_byte_in_page = i_size_read(inode) % PAGE_CACHE_SIZE; if (to > end_byte_in_page) end_byte_in_page = to; - zero_user_page(page, end_byte_in_page, - PAGE_CACHE_SIZE - end_byte_in_page, KM_USER0); + zero_user_segment(page, end_byte_in_page, PAGE_CACHE_SIZE); out: return 0; } @@ -307,7 +306,7 @@ static int ecryptfs_prepare_write(struct file *file, struct page *page, */ if ((i_size_read(page->mapping->host) == prev_page_end_size) && (from != 0)) { - zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0); + zero_user(page, 0, PAGE_CACHE_SIZE); } out: return rc; diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 9b162cd6c16..07753543928 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -1845,7 +1845,7 @@ static int ext3_block_truncate_page(handle_t *handle, struct page *page, */ if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH) && ext3_should_writeback_data(inode) && PageUptodate(page)) { - zero_user_page(page, offset, length, KM_USER0); + zero_user(page, offset, length); set_page_dirty(page); goto unlock; } @@ -1898,7 +1898,7 @@ static int ext3_block_truncate_page(handle_t *handle, struct page *page, goto unlock; } - zero_user_page(page, offset, length, KM_USER0); + zero_user(page, offset, length); BUFFER_TRACE(bh, "zeroed end of block"); err = 0; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index bb717cbb749..05c4145dd27 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1840,7 
+1840,7 @@ int ext4_block_truncate_page(handle_t *handle, struct page *page, */ if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode) && PageUptodate(page)) { - zero_user_page(page, offset, length, KM_USER0); + zero_user(page, offset, length); set_page_dirty(page); goto unlock; } @@ -1893,7 +1893,7 @@ int ext4_block_truncate_page(handle_t *handle, struct page *page, goto unlock; } - zero_user_page(page, offset, length, KM_USER0); + zero_user(page, offset, length); BUFFER_TRACE(bh, "zeroed end of block"); diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index e4effc47abf..e9456ebd3bb 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -932,7 +932,7 @@ static int gfs2_block_truncate_page(struct address_space *mapping) if (!gfs2_is_writeback(ip)) gfs2_trans_add_bh(ip->i_gl, bh, 0); - zero_user_page(page, offset, length, KM_USER0); + zero_user(page, offset, length); unlock: unlock_page(page); diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c index 38dbe99a30e..ac772b6d9db 100644 --- a/fs/gfs2/ops_address.c +++ b/fs/gfs2/ops_address.c @@ -446,7 +446,7 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page) * so we need to supply one here. It doesn't happen often. */ if (unlikely(page->index)) { - zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0); + zero_user(page, 0, PAGE_CACHE_SIZE); return 0; } diff --git a/fs/libfs.c b/fs/libfs.c index 6e68b700958..5523bde9638 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -341,13 +341,10 @@ int simple_prepare_write(struct file *file, struct page *page, unsigned from, unsigned to) { if (!PageUptodate(page)) { - if (to - from != PAGE_CACHE_SIZE) { - void *kaddr = kmap_atomic(page, KM_USER0); - memset(kaddr, 0, from); - memset(kaddr + to, 0, PAGE_CACHE_SIZE - to); - flush_dcache_page(page); - kunmap_atomic(kaddr, KM_USER0); - } + if (to - from != PAGE_CACHE_SIZE) + zero_user_segments(page, + 0, from, + to, PAGE_CACHE_SIZE); } return 0; } diff --git a/fs/mpage.c b/fs/mpage.c index d54f8f89722..5df564366f3 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -276,9 +276,7 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages, } if (first_hole != blocks_per_page) { - zero_user_page(page, first_hole << blkbits, - PAGE_CACHE_SIZE - (first_hole << blkbits), - KM_USER0); + zero_user_segment(page, first_hole << blkbits, PAGE_CACHE_SIZE); if (first_hole == 0) { SetPageUptodate(page); unlock_page(page); @@ -571,8 +569,7 @@ page_is_mapped: if (page->index > end_index || !offset) goto confused; - zero_user_page(page, offset, PAGE_CACHE_SIZE - offset, - KM_USER0); + zero_user_segment(page, offset, PAGE_CACHE_SIZE); } /* diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 8fd6dfbe1bc..3d7d9631e12 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -79,7 +79,7 @@ void nfs_readdata_release(void *data) static int nfs_return_empty_page(struct page *page) { - zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0); + zero_user(page, 0, PAGE_CACHE_SIZE); SetPageUptodate(page); unlock_page(page); return 0; @@ -103,10 +103,10 @@ static void nfs_readpage_truncate_uninitialised_page(struct nfs_read_data *data) pglen = PAGE_CACHE_SIZE - base; for (;;) { if (remainder <= pglen) { - zero_user_page(*pages, base, remainder, KM_USER0); + zero_user(*pages, base, remainder); break; } - zero_user_page(*pages, base, pglen, KM_USER0); + zero_user(*pages, base, pglen); pages++; remainder -= pglen; pglen = PAGE_CACHE_SIZE; @@ -130,7 +130,7 @@ static int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, return 
PTR_ERR(new); } if (len < PAGE_CACHE_SIZE) - zero_user_page(page, len, PAGE_CACHE_SIZE - len, KM_USER0); + zero_user_segment(page, len, PAGE_CACHE_SIZE); nfs_list_add_request(new, &one_request); if (NFS_SERVER(inode)->rsize < PAGE_CACHE_SIZE) @@ -532,7 +532,7 @@ readpage_async_filler(void *data, struct page *page) goto out_error; if (len < PAGE_CACHE_SIZE) - zero_user_page(page, len, PAGE_CACHE_SIZE - len, KM_USER0); + zero_user_segment(page, len, PAGE_CACHE_SIZE); nfs_pageio_add_request(desc->pgio, new); return 0; out_error: diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 522efff3e2c..b144b1957dd 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -665,9 +665,7 @@ zero_page: * then we need to zero any uninitalised data. */ if (req->wb_pgbase == 0 && req->wb_bytes != PAGE_CACHE_SIZE && !PageUptodate(req->wb_page)) - zero_user_page(req->wb_page, req->wb_bytes, - PAGE_CACHE_SIZE - req->wb_bytes, - KM_USER0); + zero_user_segment(req->wb_page, req->wb_bytes, PAGE_CACHE_SIZE); return req; } diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c index ad87cb01299..00e9ccde8e4 100644 --- a/fs/ntfs/aops.c +++ b/fs/ntfs/aops.c @@ -87,13 +87,17 @@ static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate) /* Check for the current buffer head overflowing. */ if (unlikely(file_ofs + bh->b_size > init_size)) { int ofs; + void *kaddr; ofs = 0; if (file_ofs < init_size) ofs = init_size - file_ofs; local_irq_save(flags); - zero_user_page(page, bh_offset(bh) + ofs, - bh->b_size - ofs, KM_BIO_SRC_IRQ); + kaddr = kmap_atomic(page, KM_BIO_SRC_IRQ); + memset(kaddr + bh_offset(bh) + ofs, 0, + bh->b_size - ofs); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_BIO_SRC_IRQ); local_irq_restore(flags); } } else { @@ -334,7 +338,7 @@ handle_hole: bh->b_blocknr = -1UL; clear_buffer_mapped(bh); handle_zblock: - zero_user_page(page, i * blocksize, blocksize, KM_USER0); + zero_user(page, i * blocksize, blocksize); if (likely(!err)) set_buffer_uptodate(bh); } while (i++, iblock++, (bh = bh->b_this_page) != head); @@ -410,7 +414,7 @@ retry_readpage: /* Is the page fully outside i_size? (truncate in progress) */ if (unlikely(page->index >= (i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT)) { - zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0); + zero_user(page, 0, PAGE_CACHE_SIZE); ntfs_debug("Read outside i_size - truncated?"); goto done; } @@ -459,7 +463,7 @@ retry_readpage: * ok to ignore the compressed flag here. */ if (unlikely(page->index > 0)) { - zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0); + zero_user(page, 0, PAGE_CACHE_SIZE); goto done; } if (!NInoAttr(ni)) @@ -788,8 +792,7 @@ lock_retry_remap: if (err == -ENOENT || lcn == LCN_ENOENT) { bh->b_blocknr = -1; clear_buffer_dirty(bh); - zero_user_page(page, bh_offset(bh), blocksize, - KM_USER0); + zero_user(page, bh_offset(bh), blocksize); set_buffer_uptodate(bh); err = 0; continue; @@ -1414,8 +1417,7 @@ retry_writepage: if (page->index >= (i_size >> PAGE_CACHE_SHIFT)) { /* The page straddles i_size. */ unsigned int ofs = i_size & ~PAGE_CACHE_MASK; - zero_user_page(page, ofs, PAGE_CACHE_SIZE - ofs, - KM_USER0); + zero_user_segment(page, ofs, PAGE_CACHE_SIZE); } /* Handle mst protected attributes. 
*/ if (NInoMstProtected(ni)) diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c index d1619d05eb2..33ff314cc50 100644 --- a/fs/ntfs/compress.c +++ b/fs/ntfs/compress.c @@ -565,7 +565,7 @@ int ntfs_read_compressed_block(struct page *page) if (xpage >= max_page) { kfree(bhs); kfree(pages); - zero_user_page(page, 0, PAGE_CACHE_SIZE, KM_USER0); + zero_user(page, 0, PAGE_CACHE_SIZE); ntfs_debug("Compressed read outside i_size - truncated?"); SetPageUptodate(page); unlock_page(page); diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 6cd08dfdc2e..3c5550cd11d 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -607,8 +607,8 @@ do_next_page: ntfs_submit_bh_for_read(bh); *wait_bh++ = bh; } else { - zero_user_page(page, bh_offset(bh), - blocksize, KM_USER0); + zero_user(page, bh_offset(bh), + blocksize); set_buffer_uptodate(bh); } } @@ -683,9 +683,8 @@ map_buffer_cached: ntfs_submit_bh_for_read(bh); *wait_bh++ = bh; } else { - zero_user_page(page, - bh_offset(bh), - blocksize, KM_USER0); + zero_user(page, bh_offset(bh), + blocksize); set_buffer_uptodate(bh); } } @@ -703,8 +702,8 @@ map_buffer_cached: */ if (bh_end <= pos || bh_pos >= end) { if (!buffer_uptodate(bh)) { - zero_user_page(page, bh_offset(bh), - blocksize, KM_USER0); + zero_user(page, bh_offset(bh), + blocksize); set_buffer_uptodate(bh); } mark_buffer_dirty(bh); @@ -743,8 +742,7 @@ map_buffer_cached: if (!buffer_uptodate(bh)) set_buffer_uptodate(bh); } else if (!buffer_uptodate(bh)) { - zero_user_page(page, bh_offset(bh), blocksize, - KM_USER0); + zero_user(page, bh_offset(bh), blocksize); set_buffer_uptodate(bh); } continue; @@ -868,8 +866,8 @@ rl_not_mapped_enoent: if (!buffer_uptodate(bh)) set_buffer_uptodate(bh); } else if (!buffer_uptodate(bh)) { - zero_user_page(page, bh_offset(bh), - blocksize, KM_USER0); + zero_user(page, bh_offset(bh), + blocksize); set_buffer_uptodate(bh); } continue; @@ -1128,8 +1126,8 @@ rl_not_mapped_enoent: if (likely(bh_pos < initialized_size)) ofs = initialized_size - bh_pos; - zero_user_page(page, bh_offset(bh) + ofs, - blocksize - ofs, KM_USER0); + zero_user_segment(page, bh_offset(bh) + ofs, + blocksize); } } else /* if (unlikely(!buffer_uptodate(bh))) */ err = -EIO; @@ -1269,8 +1267,8 @@ rl_not_mapped_enoent: if (PageUptodate(page)) set_buffer_uptodate(bh); else { - zero_user_page(page, bh_offset(bh), - blocksize, KM_USER0); + zero_user(page, bh_offset(bh), + blocksize); set_buffer_uptodate(bh); } } @@ -1330,7 +1328,7 @@ err_out: len = PAGE_CACHE_SIZE; if (len > bytes) len = bytes; - zero_user_page(*pages, 0, len, KM_USER0); + zero_user(*pages, 0, len); } goto out; } @@ -1451,7 +1449,7 @@ err_out: len = PAGE_CACHE_SIZE; if (len > bytes) len = bytes; - zero_user_page(*pages, 0, len, KM_USER0); + zero_user(*pages, 0, len); } goto out; } diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 64713e149e4..447206eb5c2 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -5670,7 +5670,7 @@ static void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle, mlog_errno(ret); if (zero) - zero_user_page(page, from, to - from, KM_USER0); + zero_user_segment(page, from, to); /* * Need to set the buffers we zero'd into uptodate diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index bc7b4cbbe8e..82243127eeb 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -307,7 +307,7 @@ static int ocfs2_readpage(struct file *file, struct page *page) * XXX sys_readahead() seems to get that wrong? 
*/ if (start >= i_size_read(inode)) { - zero_user_page(page, 0, PAGE_SIZE, KM_USER0); + zero_user(page, 0, PAGE_SIZE); SetPageUptodate(page); ret = 0; goto out_alloc; @@ -869,7 +869,7 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, if (block_start >= to) break; - zero_user_page(page, block_start, bh->b_size, KM_USER0); + zero_user(page, block_start, bh->b_size); set_buffer_uptodate(bh); mark_buffer_dirty(bh); @@ -1034,7 +1034,7 @@ static void ocfs2_zero_new_buffers(struct page *page, unsigned from, unsigned to start = max(from, block_start); end = min(to, block_end); - zero_user_page(page, start, end - start, KM_USER0); + zero_user_segment(page, start, end); set_buffer_uptodate(bh); } diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 231fd5ccadc..195309857e6 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -2143,7 +2143,7 @@ int reiserfs_truncate_file(struct inode *p_s_inode, int update_timestamps) /* if we are not on a block boundary */ if (length) { length = blocksize - length; - zero_user_page(page, offset, length, KM_USER0); + zero_user(page, offset, length); if (buffer_mapped(bh) && bh->b_blocknr != 0) { mark_buffer_dirty(bh); } @@ -2367,7 +2367,7 @@ static int reiserfs_write_full_page(struct page *page, unlock_page(page); return 0; } - zero_user_page(page, last_offset, PAGE_CACHE_SIZE - last_offset, KM_USER0); + zero_user_segment(page, last_offset, PAGE_CACHE_SIZE); } bh = head; block = page->index << (PAGE_CACHE_SHIFT - s->s_blocksize_bits); diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c index d6a8dddb226..6f614f35f65 100644 --- a/fs/xfs/linux-2.6/xfs_lrw.c +++ b/fs/xfs/linux-2.6/xfs_lrw.c @@ -155,7 +155,7 @@ xfs_iozero( if (status) break; - zero_user_page(page, offset, bytes, KM_USER0); + zero_user(page, offset, bytes); status = pagecache_write_end(NULL, mapping, pos, bytes, bytes, page, fsdata); -- cgit v1.2.3 From 9e2779fa281cfda13ac060753d674bbcaa23367e Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 4 Feb 2008 22:28:34 -0800 Subject: is_vmalloc_addr(): Check if an address is within the vmalloc boundaries Checking if an address is a vmalloc address is done in a couple of places. Define a common version in mm.h and replace the other checks. Again the include structures suck. The definition of VMALLOC_START and VMALLOC_END is not available in vmalloc.h since highmem.c cannot be included there. 
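For reference, a sketch of the new helper as the description above implies it (the actual definition added by this patch lives in include/linux/mm.h, outside this fs-only diff; shown here for the MMU case, matching the open-coded range checks it replaces below):

	static inline int is_vmalloc_addr(const void *x)
	{
		unsigned long addr = (unsigned long)x;

		return addr >= VMALLOC_START && addr < VMALLOC_END;
	}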
Signed-off-by: Christoph Lameter Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ntfs/malloc.h | 3 +-- fs/proc/kcore.c | 2 +- fs/xfs/linux-2.6/kmem.c | 3 +-- fs/xfs/linux-2.6/xfs_buf.c | 3 +-- 4 files changed, 4 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/ntfs/malloc.h b/fs/ntfs/malloc.h index e38e402e410..cd0be3f5c3c 100644 --- a/fs/ntfs/malloc.h +++ b/fs/ntfs/malloc.h @@ -85,8 +85,7 @@ static inline void *ntfs_malloc_nofs_nofail(unsigned long size) static inline void ntfs_free(void *addr) { - if (likely(((unsigned long)addr < VMALLOC_START) || - ((unsigned long)addr >= VMALLOC_END ))) { + if (!is_vmalloc_addr(addr)) { kfree(addr); /* free_page((unsigned long)addr); */ return; diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 1be73082edd..7dd26e18cbf 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -325,7 +325,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) if (m == NULL) { if (clear_user(buffer, tsz)) return -EFAULT; - } else if ((start >= VMALLOC_START) && (start < VMALLOC_END)) { + } else if (is_vmalloc_addr((void *)start)) { char * elf_buf; struct vm_struct *m; unsigned long curstart = start; diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/linux-2.6/kmem.c index ed2b16dff91..e040f1ce1b6 100644 --- a/fs/xfs/linux-2.6/kmem.c +++ b/fs/xfs/linux-2.6/kmem.c @@ -92,8 +92,7 @@ kmem_zalloc_greedy(size_t *size, size_t minsize, size_t maxsize, void kmem_free(void *ptr, size_t size) { - if (((unsigned long)ptr < VMALLOC_START) || - ((unsigned long)ptr >= VMALLOC_END)) { + if (!is_vmalloc_addr(ptr)) { kfree(ptr); } else { vfree(ptr); diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index a49dd8d4b06..0382c19d652 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -709,8 +709,7 @@ static inline struct page * mem_to_page( void *addr) { - if (((unsigned long)addr < VMALLOC_START) || - ((unsigned long)addr >= VMALLOC_END)) { + if ((!is_vmalloc_addr(addr))) { return virt_to_page(addr); } else { return vmalloc_to_page(addr); -- cgit v1.2.3 From b98938c373117043598002f197200d7ed08acd49 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 4 Feb 2008 22:28:36 -0800 Subject: bufferhead: revert constructor removal The constructor for buffer_head slabs was removed recently. We need the constructor back in slab defrag in order to ensure that slab objects always have a definite state even before we allocate them. I think we mistakenly merged the removal of the constructor into a cleanup patch. You (ie: akpm) had a test that showed that the removal of the constructor led to a small regression. The prior state makes things easier for slab defrag.
[akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Christoph Lameter Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index 1de921484ea..826baf4f04b 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -3153,7 +3153,7 @@ static void recalc_bh_state(void) struct buffer_head *alloc_buffer_head(gfp_t gfp_flags) { - struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, + struct buffer_head *ret = kmem_cache_alloc(bh_cachep, set_migrateflags(gfp_flags, __GFP_RECLAIMABLE)); if (ret) { INIT_LIST_HEAD(&ret->b_assoc_buffers); @@ -3241,12 +3241,24 @@ int bh_submit_read(struct buffer_head *bh) } EXPORT_SYMBOL(bh_submit_read); +static void +init_buffer_head(struct kmem_cache *cachep, void *data) +{ + struct buffer_head *bh = data; + + memset(bh, 0, sizeof(*bh)); + INIT_LIST_HEAD(&bh->b_assoc_buffers); +} + void __init buffer_init(void) { int nrpages; - bh_cachep = KMEM_CACHE(buffer_head, - SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD); + bh_cachep = kmem_cache_create("buffer_head", + sizeof(struct buffer_head), 0, + (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC| + SLAB_MEM_SPREAD), + init_buffer_head); /* * Limit the bh occupancy to 10% of ZONE_NORMAL -- cgit v1.2.3 From 75897d60a54ccee94253312107f941a83b5077cb Mon Sep 17 00:00:00 2001 From: Ken Chen Date: Mon, 4 Feb 2008 22:28:36 -0800 Subject: hugetlb: allow sticky directory mount option Allow the sticky directory mount option for hugetlbfs. This allows an admin to create a shared hugetlbfs mount point for multiple users, while preventing users from accidentally deleting or stepping on each other's files. It is similar to the default tmpfs mount option, or the typical options used on /tmp. Signed-off-by: Ken Chen Cc: Badari Pulavarty Cc: Adam Litke Cc: David Gibson Cc: William Lee Irwin III Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 09ee07f0266..3b3cc28cdef 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -768,7 +768,7 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig) case Opt_mode: if (match_octal(&args[0], &option)) goto bad_val; - pconfig->mode = option & 0777U; + pconfig->mode = option & 01777U; break; case Opt_size: { -- cgit v1.2.3 From ec4dd3eb35759f9fbeb5c1abb01403b2fde64cc9 Mon Sep 17 00:00:00 2001 From: Fengguang Wu Date: Mon, 4 Feb 2008 22:28:56 -0800 Subject: maps4: add proportional set size accounting in smaps The "proportional set size" (PSS) of a process is the count of pages it has in memory, where each page is divided by the number of processes sharing it. So if a process has 1000 pages all to itself, and 1000 shared with one other process, its PSS will be 1500. - lwn.net: "ELC: How much memory are applications really using?" The PSS proposed by Matt Mackall is a very nice metric for measuring a process's memory footprint. So collect and export it via /proc/<pid>/smaps. Matt Mackall's pagemap/kpagemap and John Berthels's exmap can also do the job. They are comprehensive tools. But for PSS, let's do it in the simple way.
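A worked sketch of the fixed-point accounting described above (assuming 4K pages and the PSS_SHIFT of 12 used in the hunks below), reproducing the 1000 + 1000 example from the text:

	u64 pss = 0;

	/* 1000 pages mapped only by this process (mapcount == 1): */
	pss += 1000ULL * (PAGE_SIZE << PSS_SHIFT);
	/* 1000 pages shared with one other process (mapcount == 2): */
	pss += 1000ULL * ((PAGE_SIZE << PSS_SHIFT) / 2);

	/* pss >> PSS_SHIFT == 1500 * PAGE_SIZE: a PSS of 1500 pages,
	 * i.e. 6000 kB with 4K pages. */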
Cc: John Berthels Cc: Bernardo Innocenti Cc: Padraig Brady Cc: Denys Vlasenko Cc: Balbir Singh Signed-off-by: Matt Mackall Signed-off-by: Fengguang Wu Cc: Hugh Dickins Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 8043a3eab52..8952ce70315 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -114,6 +114,25 @@ static void pad_len_spaces(struct seq_file *m, int len) seq_printf(m, "%*c", len, ' '); } +/* + * Proportional Set Size(PSS): my share of RSS. + * + * PSS of a process is the count of pages it has in memory, where each + * page is divided by the number of processes sharing it. So if a + * process has 1000 pages all to itself, and 1000 shared with one other + * process, its PSS will be 1500. + * + * To keep (accumulated) division errors low, we adopt a 64bit + * fixed-point pss counter to minimize division errors. So (pss >> + * PSS_SHIFT) would be the real byte count. + * + * A shift of 12 before division means (assuming 4K page size): + * - 1M 3-user-pages add up to 8KB errors; + * - supports mapcount up to 2^24, or 16M; + * - supports PSS up to 2^52 bytes, or 4PB. + */ +#define PSS_SHIFT 12 + struct mem_size_stats { unsigned long resident; @@ -122,6 +141,7 @@ struct mem_size_stats unsigned long private_clean; unsigned long private_dirty; unsigned long referenced; + u64 pss; }; struct pmd_walker { @@ -195,6 +215,7 @@ static int show_map_internal(struct seq_file *m, void *v, struct mem_size_stats seq_printf(m, "Size: %8lu kB\n" "Rss: %8lu kB\n" + "Pss: %8lu kB\n" "Shared_Clean: %8lu kB\n" "Shared_Dirty: %8lu kB\n" "Private_Clean: %8lu kB\n" @@ -202,6 +223,7 @@ static int show_map_internal(struct seq_file *m, void *v, struct mem_size_stats "Referenced: %8lu kB\n", (vma->vm_end - vma->vm_start) >> 10, mss->resident >> 10, + (unsigned long)(mss->pss >> (10 + PSS_SHIFT)), mss->shared_clean >> 10, mss->shared_dirty >> 10, mss->private_clean >> 10, @@ -226,6 +248,7 @@ static void smaps_pte_range(struct vm_area_struct *vma, pmd_t *pmd, pte_t *pte, ptent; spinlock_t *ptl; struct page *page; + int mapcount; pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); for (; addr != end; pte++, addr += PAGE_SIZE) { @@ -242,16 +265,19 @@ static void smaps_pte_range(struct vm_area_struct *vma, pmd_t *pmd, /* Accumulate the size in pages that have been accessed. 
*/ if (pte_young(ptent) || PageReferenced(page)) mss->referenced += PAGE_SIZE; - if (page_mapcount(page) >= 2) { + mapcount = page_mapcount(page); + if (mapcount >= 2) { if (pte_dirty(ptent)) mss->shared_dirty += PAGE_SIZE; else mss->shared_clean += PAGE_SIZE; + mss->pss += (PAGE_SIZE << PSS_SHIFT) / mapcount; } else { if (pte_dirty(ptent)) mss->private_dirty += PAGE_SIZE; else mss->private_clean += PAGE_SIZE; + mss->pss += (PAGE_SIZE << PSS_SHIFT); } } pte_unmap_unlock(pte - 1, ptl); -- cgit v1.2.3 From b3ae5acbbb98d95c1300c8ced56d15f97d09c506 Mon Sep 17 00:00:00 2001 From: Matt Mackall Date: Mon, 4 Feb 2008 22:29:01 -0800 Subject: maps4: use pagewalker in clear_refs and smaps Use the generic pagewalker for smaps and clear_refs Signed-off-by: Matt Mackall Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 95 ++++++++++-------------------------------------------- 1 file changed, 17 insertions(+), 78 deletions(-) (limited to 'fs') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 8952ce70315..791b2d400be 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -135,6 +135,7 @@ static void pad_len_spaces(struct seq_file *m, int len) struct mem_size_stats { + struct vm_area_struct *vma; unsigned long resident; unsigned long shared_clean; unsigned long shared_dirty; @@ -144,13 +145,6 @@ struct mem_size_stats u64 pss; }; -struct pmd_walker { - struct vm_area_struct *vma; - void *private; - void (*action)(struct vm_area_struct *, pmd_t *, unsigned long, - unsigned long, void *); -}; - static int show_map_internal(struct seq_file *m, void *v, struct mem_size_stats *mss) { struct proc_maps_private *priv = m->private; @@ -240,11 +234,11 @@ static int show_map(struct seq_file *m, void *v) return show_map_internal(m, v, NULL); } -static void smaps_pte_range(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long addr, unsigned long end, - void *private) +static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, + void *private) { struct mem_size_stats *mss = private; + struct vm_area_struct *vma = mss->vma; pte_t *pte, ptent; spinlock_t *ptl; struct page *page; @@ -282,12 +276,13 @@ static void smaps_pte_range(struct vm_area_struct *vma, pmd_t *pmd, } pte_unmap_unlock(pte - 1, ptl); cond_resched(); + return 0; } -static void clear_refs_pte_range(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long addr, unsigned long end, - void *private) +static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, + unsigned long end, void *private) { + struct vm_area_struct *vma = private; pte_t *pte, ptent; spinlock_t *ptl; struct page *page; @@ -308,71 +303,10 @@ static void clear_refs_pte_range(struct vm_area_struct *vma, pmd_t *pmd, } pte_unmap_unlock(pte - 1, ptl); cond_resched(); + return 0; } -static inline void walk_pmd_range(struct pmd_walker *walker, pud_t *pud, - unsigned long addr, unsigned long end) -{ - pmd_t *pmd; - unsigned long next; - - for (pmd = pmd_offset(pud, addr); addr != end; - pmd++, addr = next) { - next = pmd_addr_end(addr, end); - if (pmd_none_or_clear_bad(pmd)) - continue; - walker->action(walker->vma, pmd, addr, next, walker->private); - } -} - -static inline void walk_pud_range(struct pmd_walker *walker, pgd_t *pgd, - unsigned long addr, unsigned long end) -{ - pud_t *pud; - unsigned long next; - - for (pud = pud_offset(pgd, addr); addr != end; - pud++, addr = next) { - next = pud_addr_end(addr, end); - if (pud_none_or_clear_bad(pud)) - continue; - walk_pmd_range(walker, pud, addr, next); - } -} - 
-/* - * walk_page_range - walk the page tables of a VMA with a callback - * @vma - VMA to walk - * @action - callback invoked for every bottom-level (PTE) page table - * @private - private data passed to the callback function - * - * Recursively walk the page table for the memory area in a VMA, calling - * a callback for every bottom-level (PTE) page table. - */ -static inline void walk_page_range(struct vm_area_struct *vma, - void (*action)(struct vm_area_struct *, - pmd_t *, unsigned long, - unsigned long, void *), - void *private) -{ - unsigned long addr = vma->vm_start; - unsigned long end = vma->vm_end; - struct pmd_walker walker = { - .vma = vma, - .private = private, - .action = action, - }; - pgd_t *pgd; - unsigned long next; - - for (pgd = pgd_offset(vma->vm_mm, addr); addr != end; - pgd++, addr = next) { - next = pgd_addr_end(addr, end); - if (pgd_none_or_clear_bad(pgd)) - continue; - walk_pud_range(&walker, pgd, addr, next); - } -} +static struct mm_walk smaps_walk = { .pmd_entry = smaps_pte_range }; static int show_smap(struct seq_file *m, void *v) { @@ -380,11 +314,15 @@ static int show_smap(struct seq_file *m, void *v) struct mem_size_stats mss; memset(&mss, 0, sizeof mss); + mss.vma = vma; if (vma->vm_mm && !is_vm_hugetlb_page(vma)) - walk_page_range(vma, smaps_pte_range, &mss); + walk_page_range(vma->vm_mm, vma->vm_start, vma->vm_end, + &smaps_walk, &mss); return show_map_internal(m, v, &mss); } +static struct mm_walk clear_refs_walk = { .pmd_entry = clear_refs_pte_range }; + void clear_refs_smap(struct mm_struct *mm) { struct vm_area_struct *vma; @@ -392,7 +330,8 @@ void clear_refs_smap(struct mm_struct *mm) down_read(&mm->mmap_sem); for (vma = mm->mmap; vma; vma = vma->vm_next) if (vma->vm_mm && !is_vm_hugetlb_page(vma)) - walk_page_range(vma, clear_refs_pte_range, NULL); + walk_page_range(vma->vm_mm, vma->vm_start, vma->vm_end, + &clear_refs_walk, vma); flush_tlb_mm(mm); up_read(&mm->mmap_sem); } -- cgit v1.2.3 From 4752c369789250eafcd7813e11c8fb689235b0d2 Mon Sep 17 00:00:00 2001 From: Matt Mackall Date: Mon, 4 Feb 2008 22:29:02 -0800 Subject: maps4: simplify interdependence of maps and smaps This pulls the shared map display code out of show_map and puts it in show_smap where it belongs. 
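The maps4 conversions above program against the generic pagewalker introduced earlier in this series. Its declaration lives in include/linux/mm.h and is outside this fs-only diff, so here is a reference sketch of the interface as the task_mmu.c callers use it; the .pmd_entry signature is confirmed by smaps_pte_range() above, while the remaining callbacks and the exact prototype are recalled from the same series and should be treated as approximate:

	struct mm_walk {
		int (*pgd_entry)(pgd_t *, unsigned long, unsigned long, void *);
		int (*pud_entry)(pud_t *, unsigned long, unsigned long, void *);
		int (*pmd_entry)(pmd_t *, unsigned long, unsigned long, void *);
		int (*pte_entry)(pte_t *, unsigned long, unsigned long, void *);
		int (*pte_hole)(unsigned long, unsigned long, void *);
	};

	int walk_page_range(struct mm_struct *mm, unsigned long addr,
			    unsigned long end, const struct mm_walk *walk,
			    void *private);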
Signed-off-by: Matt Mackall Cc: Jeremy Fitzhardinge Acked-by: David Rientjes Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 52 ++++++++++++++++++++++++++-------------------------- 1 file changed, 26 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 791b2d400be..abc44f39f1d 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -145,7 +145,7 @@ struct mem_size_stats u64 pss; }; -static int show_map_internal(struct seq_file *m, void *v, struct mem_size_stats *mss) +static int show_map(struct seq_file *m, void *v) { struct proc_maps_private *priv = m->private; struct task_struct *task = priv->task; @@ -205,35 +205,11 @@ static int show_map_internal(struct seq_file *m, void *v, struct mem_size_stats } seq_putc(m, '\n'); - if (mss) - seq_printf(m, - "Size: %8lu kB\n" - "Rss: %8lu kB\n" - "Pss: %8lu kB\n" - "Shared_Clean: %8lu kB\n" - "Shared_Dirty: %8lu kB\n" - "Private_Clean: %8lu kB\n" - "Private_Dirty: %8lu kB\n" - "Referenced: %8lu kB\n", - (vma->vm_end - vma->vm_start) >> 10, - mss->resident >> 10, - (unsigned long)(mss->pss >> (10 + PSS_SHIFT)), - mss->shared_clean >> 10, - mss->shared_dirty >> 10, - mss->private_clean >> 10, - mss->private_dirty >> 10, - mss->referenced >> 10); - if (m->count < m->size) /* vma is copied successfully */ m->version = (vma != get_gate_vma(task))? vma->vm_start: 0; return 0; } -static int show_map(struct seq_file *m, void *v) -{ - return show_map_internal(m, v, NULL); -} - static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, void *private) { @@ -312,13 +288,37 @@ static int show_smap(struct seq_file *m, void *v) { struct vm_area_struct *vma = v; struct mem_size_stats mss; + int ret; memset(&mss, 0, sizeof mss); mss.vma = vma; if (vma->vm_mm && !is_vm_hugetlb_page(vma)) walk_page_range(vma->vm_mm, vma->vm_start, vma->vm_end, &smaps_walk, &mss); - return show_map_internal(m, v, &mss); + + ret = show_map(m, v); + if (ret) + return ret; + + seq_printf(m, + "Size: %8lu kB\n" + "Rss: %8lu kB\n" + "Pss: %8lu kB\n" + "Shared_Clean: %8lu kB\n" + "Shared_Dirty: %8lu kB\n" + "Private_Clean: %8lu kB\n" + "Private_Dirty: %8lu kB\n" + "Referenced: %8lu kB\n", + (vma->vm_end - vma->vm_start) >> 10, + mss.resident >> 10, + (unsigned long)(mss.pss >> (10 + PSS_SHIFT)), + mss.shared_clean >> 10, + mss.shared_dirty >> 10, + mss.private_clean >> 10, + mss.private_dirty >> 10, + mss.referenced >> 10); + + return ret; } static struct mm_walk clear_refs_walk = { .pmd_entry = clear_refs_pte_range }; -- cgit v1.2.3 From f248dcb34d7b7ac255db70071a20be9d9c6ad491 Mon Sep 17 00:00:00 2001 From: Matt Mackall Date: Mon, 4 Feb 2008 22:29:03 -0800 Subject: maps4: move clear_refs code to task_mmu.c This puts all the clear_refs code where it belongs and probably lets things compile on MMU-less systems as well. Signed-off-by: Matt Mackall Cc: Jeremy Fitzhardinge Cc: David Rientjes Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 40 ---------------------------------------- fs/proc/internal.h | 6 +----- fs/proc/task_mmu.c | 44 ++++++++++++++++++++++++++++++++++++-------- 3 files changed, 37 insertions(+), 53 deletions(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index 33537487f5a..1bd646d3fe9 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -88,10 +88,6 @@ * in /proc for a task before it execs a suid executable. */ - -/* Worst case buffer size needed for holding an integer. 
*/ -#define PROC_NUMBUF 13 - struct pid_entry { char *name; int len; @@ -935,42 +931,6 @@ static const struct file_operations proc_oom_adjust_operations = { .write = oom_adjust_write, }; -#ifdef CONFIG_MMU -static ssize_t clear_refs_write(struct file *file, const char __user *buf, - size_t count, loff_t *ppos) -{ - struct task_struct *task; - char buffer[PROC_NUMBUF], *end; - struct mm_struct *mm; - - memset(buffer, 0, sizeof(buffer)); - if (count > sizeof(buffer) - 1) - count = sizeof(buffer) - 1; - if (copy_from_user(buffer, buf, count)) - return -EFAULT; - if (!simple_strtol(buffer, &end, 0)) - return -EINVAL; - if (*end == '\n') - end++; - task = get_proc_task(file->f_path.dentry->d_inode); - if (!task) - return -ESRCH; - mm = get_task_mm(task); - if (mm) { - clear_refs_smap(mm); - mmput(mm); - } - put_task_struct(task); - if (end - buffer == 0) - return -EIO; - return end - buffer; -} - -static struct file_operations proc_clear_refs_operations = { - .write = clear_refs_write, -}; -#endif - #ifdef CONFIG_AUDITSYSCALL #define TMPBUFLEN 21 static ssize_t proc_loginuid_read(struct file * file, char __user * buf, diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 05b3e900626..ddfaeec3749 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -56,11 +56,7 @@ extern int proc_pid_statm(struct task_struct *, char *); extern const struct file_operations proc_maps_operations; extern const struct file_operations proc_numa_maps_operations; extern const struct file_operations proc_smaps_operations; - -extern const struct file_operations proc_maps_operations; -extern const struct file_operations proc_numa_maps_operations; -extern const struct file_operations proc_smaps_operations; - +extern const struct file_operations proc_clear_refs_operations; void free_proc_entry(struct proc_dir_entry *de); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index abc44f39f1d..fcdbd233f25 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -323,19 +323,47 @@ static int show_smap(struct seq_file *m, void *v) static struct mm_walk clear_refs_walk = { .pmd_entry = clear_refs_pte_range }; -void clear_refs_smap(struct mm_struct *mm) +static ssize_t clear_refs_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) { + struct task_struct *task; + char buffer[PROC_NUMBUF], *end; + struct mm_struct *mm; struct vm_area_struct *vma; - down_read(&mm->mmap_sem); - for (vma = mm->mmap; vma; vma = vma->vm_next) - if (vma->vm_mm && !is_vm_hugetlb_page(vma)) - walk_page_range(vma->vm_mm, vma->vm_start, vma->vm_end, - &clear_refs_walk, vma); - flush_tlb_mm(mm); - up_read(&mm->mmap_sem); + memset(buffer, 0, sizeof(buffer)); + if (count > sizeof(buffer) - 1) + count = sizeof(buffer) - 1; + if (copy_from_user(buffer, buf, count)) + return -EFAULT; + if (!simple_strtol(buffer, &end, 0)) + return -EINVAL; + if (*end == '\n') + end++; + task = get_proc_task(file->f_path.dentry->d_inode); + if (!task) + return -ESRCH; + mm = get_task_mm(task); + if (mm) { + down_read(&mm->mmap_sem); + for (vma = mm->mmap; vma; vma = vma->vm_next) + if (!is_vm_hugetlb_page(vma)) + walk_page_range(mm, vma->vm_start, vma->vm_end, + &clear_refs_walk, vma); + flush_tlb_mm(mm); + up_read(&mm->mmap_sem); + mmput(mm); + } + put_task_struct(task); + if (end - buffer == 0) + return -EIO; + return end - buffer; } +const struct file_operations proc_clear_refs_operations = { + .write = clear_refs_write, +}; + static void *m_start(struct seq_file *m, loff_t *pos) { struct proc_maps_private *priv = m->private; -- cgit 
v1.2.3 From a6198797cc3fd659b2d81cdf6bb6b9bba9cd93e9 Mon Sep 17 00:00:00 2001 From: Matt Mackall Date: Mon, 4 Feb 2008 22:29:03 -0800 Subject: maps4: regroup task_mmu by interface Reorder source so that all the code and data for each interface is together. Signed-off-by: Matt Mackall Cc: Jeremy Fitzhardinge Cc: David Rientjes Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 417 +++++++++++++++++++++++++++-------------------------- 1 file changed, 210 insertions(+), 207 deletions(-) (limited to 'fs') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index fcdbd233f25..308fc5451e4 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -114,36 +114,122 @@ static void pad_len_spaces(struct seq_file *m, int len) seq_printf(m, "%*c", len, ' '); } -/* - * Proportional Set Size(PSS): my share of RSS. - * - * PSS of a process is the count of pages it has in memory, where each - * page is divided by the number of processes sharing it. So if a - * process has 1000 pages all to itself, and 1000 shared with one other - * process, its PSS will be 1500. - * - * To keep (accumulated) division errors low, we adopt a 64bit - * fixed-point pss counter to minimize division errors. So (pss >> - * PSS_SHIFT) would be the real byte count. - * - * A shift of 12 before division means (assuming 4K page size): - * - 1M 3-user-pages add up to 8KB errors; - * - supports mapcount up to 2^24, or 16M; - * - supports PSS up to 2^52 bytes, or 4PB. - */ -#define PSS_SHIFT 12 +static void vma_stop(struct proc_maps_private *priv, struct vm_area_struct *vma) +{ + if (vma && vma != priv->tail_vma) { + struct mm_struct *mm = vma->vm_mm; + up_read(&mm->mmap_sem); + mmput(mm); + } +} -struct mem_size_stats +static void *m_start(struct seq_file *m, loff_t *pos) { - struct vm_area_struct *vma; - unsigned long resident; - unsigned long shared_clean; - unsigned long shared_dirty; - unsigned long private_clean; - unsigned long private_dirty; - unsigned long referenced; - u64 pss; -}; + struct proc_maps_private *priv = m->private; + unsigned long last_addr = m->version; + struct mm_struct *mm; + struct vm_area_struct *vma, *tail_vma = NULL; + loff_t l = *pos; + + /* Clear the per syscall fields in priv */ + priv->task = NULL; + priv->tail_vma = NULL; + + /* + * We remember last_addr rather than next_addr to hit with + * mmap_cache most of the time. We have zero last_addr at + * the beginning and also after lseek. We will have -1 last_addr + * after the end of the vmas. + */ + + if (last_addr == -1UL) + return NULL; + + priv->task = get_pid_task(priv->pid, PIDTYPE_PID); + if (!priv->task) + return NULL; + + mm = mm_for_maps(priv->task); + if (!mm) + return NULL; + + tail_vma = get_gate_vma(priv->task); + priv->tail_vma = tail_vma; + + /* Start with last addr hint */ + vma = find_vma(mm, last_addr); + if (last_addr && vma) { + vma = vma->vm_next; + goto out; + } + + /* + * Check the vma index is within the range and do + * sequential scan until m_index. + */ + vma = NULL; + if ((unsigned long)l < mm->map_count) { + vma = mm->mmap; + while (l-- && vma) + vma = vma->vm_next; + goto out; + } + + if (l != mm->map_count) + tail_vma = NULL; /* After gate vma */ + +out: + if (vma) + return vma; + + /* End of vmas has been reached */ + m->version = (tail_vma != NULL)? 
0: -1UL; + up_read(&mm->mmap_sem); + mmput(mm); + return tail_vma; +} + +static void *m_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct proc_maps_private *priv = m->private; + struct vm_area_struct *vma = v; + struct vm_area_struct *tail_vma = priv->tail_vma; + + (*pos)++; + if (vma && (vma != tail_vma) && vma->vm_next) + return vma->vm_next; + vma_stop(priv, vma); + return (vma != tail_vma)? tail_vma: NULL; +} + +static void m_stop(struct seq_file *m, void *v) +{ + struct proc_maps_private *priv = m->private; + struct vm_area_struct *vma = v; + + vma_stop(priv, vma); + if (priv->task) + put_task_struct(priv->task); +} + +static int do_maps_open(struct inode *inode, struct file *file, + struct seq_operations *ops) +{ + struct proc_maps_private *priv; + int ret = -ENOMEM; + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (priv) { + priv->pid = proc_pid(inode); + ret = seq_open(file, ops); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = priv; + } else { + kfree(priv); + } + } + return ret; +} static int show_map(struct seq_file *m, void *v) { @@ -210,6 +296,56 @@ static int show_map(struct seq_file *m, void *v) return 0; } +static struct seq_operations proc_pid_maps_op = { + .start = m_start, + .next = m_next, + .stop = m_stop, + .show = show_map +}; + +static int maps_open(struct inode *inode, struct file *file) +{ + return do_maps_open(inode, file, &proc_pid_maps_op); +} + +const struct file_operations proc_maps_operations = { + .open = maps_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +/* + * Proportional Set Size(PSS): my share of RSS. + * + * PSS of a process is the count of pages it has in memory, where each + * page is divided by the number of processes sharing it. So if a + * process has 1000 pages all to itself, and 1000 shared with one other + * process, its PSS will be 1500. + * + * To keep (accumulated) division errors low, we adopt a 64bit + * fixed-point pss counter to minimize division errors. So (pss >> + * PSS_SHIFT) would be the real byte count. + * + * A shift of 12 before division means (assuming 4K page size): + * - 1M 3-user-pages add up to 8KB errors; + * - supports mapcount up to 2^24, or 16M; + * - supports PSS up to 2^52 bytes, or 4PB. + */ +#define PSS_SHIFT 12 + +struct mem_size_stats +{ + struct vm_area_struct *vma; + unsigned long resident; + unsigned long shared_clean; + unsigned long shared_dirty; + unsigned long private_clean; + unsigned long private_dirty; + unsigned long referenced; + u64 pss; +}; + static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, void *private) { @@ -255,33 +391,6 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, return 0; } -static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, - unsigned long end, void *private) -{ - struct vm_area_struct *vma = private; - pte_t *pte, ptent; - spinlock_t *ptl; - struct page *page; - - pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); - for (; addr != end; pte++, addr += PAGE_SIZE) { - ptent = *pte; - if (!pte_present(ptent)) - continue; - - page = vm_normal_page(vma, addr, ptent); - if (!page) - continue; - - /* Clear accessed and referenced bits. 
*/ - ptep_test_and_clear_young(vma, addr, pte); - ClearPageReferenced(page); - } - pte_unmap_unlock(pte - 1, ptl); - cond_resched(); - return 0; -} - static struct mm_walk smaps_walk = { .pmd_entry = smaps_pte_range }; static int show_smap(struct seq_file *m, void *v) @@ -321,6 +430,52 @@ static int show_smap(struct seq_file *m, void *v) return ret; } +static struct seq_operations proc_pid_smaps_op = { + .start = m_start, + .next = m_next, + .stop = m_stop, + .show = show_smap +}; + +static int smaps_open(struct inode *inode, struct file *file) +{ + return do_maps_open(inode, file, &proc_pid_smaps_op); +} + +const struct file_operations proc_smaps_operations = { + .open = smaps_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, + unsigned long end, void *private) +{ + struct vm_area_struct *vma = private; + pte_t *pte, ptent; + spinlock_t *ptl; + struct page *page; + + pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + for (; addr != end; pte++, addr += PAGE_SIZE) { + ptent = *pte; + if (!pte_present(ptent)) + continue; + + page = vm_normal_page(vma, addr, ptent); + if (!page) + continue; + + /* Clear accessed and referenced bits. */ + ptep_test_and_clear_young(vma, addr, pte); + ClearPageReferenced(page); + } + pte_unmap_unlock(pte - 1, ptl); + cond_resched(); + return 0; +} + static struct mm_walk clear_refs_walk = { .pmd_entry = clear_refs_pte_range }; static ssize_t clear_refs_write(struct file *file, const char __user *buf, @@ -364,147 +519,6 @@ const struct file_operations proc_clear_refs_operations = { .write = clear_refs_write, }; -static void *m_start(struct seq_file *m, loff_t *pos) -{ - struct proc_maps_private *priv = m->private; - unsigned long last_addr = m->version; - struct mm_struct *mm; - struct vm_area_struct *vma, *tail_vma = NULL; - loff_t l = *pos; - - /* Clear the per syscall fields in priv */ - priv->task = NULL; - priv->tail_vma = NULL; - - /* - * We remember last_addr rather than next_addr to hit with - * mmap_cache most of the time. We have zero last_addr at - * the beginning and also after lseek. We will have -1 last_addr - * after the end of the vmas. - */ - - if (last_addr == -1UL) - return NULL; - - priv->task = get_pid_task(priv->pid, PIDTYPE_PID); - if (!priv->task) - return NULL; - - mm = mm_for_maps(priv->task); - if (!mm) - return NULL; - - priv->tail_vma = tail_vma = get_gate_vma(priv->task); - - /* Start with last addr hint */ - if (last_addr && (vma = find_vma(mm, last_addr))) { - vma = vma->vm_next; - goto out; - } - - /* - * Check the vma index is within the range and do - * sequential scan until m_index. - */ - vma = NULL; - if ((unsigned long)l < mm->map_count) { - vma = mm->mmap; - while (l-- && vma) - vma = vma->vm_next; - goto out; - } - - if (l != mm->map_count) - tail_vma = NULL; /* After gate vma */ - -out: - if (vma) - return vma; - - /* End of vmas has been reached */ - m->version = (tail_vma != NULL)? 
0: -1UL; - up_read(&mm->mmap_sem); - mmput(mm); - return tail_vma; -} - -static void vma_stop(struct proc_maps_private *priv, struct vm_area_struct *vma) -{ - if (vma && vma != priv->tail_vma) { - struct mm_struct *mm = vma->vm_mm; - up_read(&mm->mmap_sem); - mmput(mm); - } -} - -static void *m_next(struct seq_file *m, void *v, loff_t *pos) -{ - struct proc_maps_private *priv = m->private; - struct vm_area_struct *vma = v; - struct vm_area_struct *tail_vma = priv->tail_vma; - - (*pos)++; - if (vma && (vma != tail_vma) && vma->vm_next) - return vma->vm_next; - vma_stop(priv, vma); - return (vma != tail_vma)? tail_vma: NULL; -} - -static void m_stop(struct seq_file *m, void *v) -{ - struct proc_maps_private *priv = m->private; - struct vm_area_struct *vma = v; - - vma_stop(priv, vma); - if (priv->task) - put_task_struct(priv->task); -} - -static struct seq_operations proc_pid_maps_op = { - .start = m_start, - .next = m_next, - .stop = m_stop, - .show = show_map -}; - -static struct seq_operations proc_pid_smaps_op = { - .start = m_start, - .next = m_next, - .stop = m_stop, - .show = show_smap -}; - -static int do_maps_open(struct inode *inode, struct file *file, - struct seq_operations *ops) -{ - struct proc_maps_private *priv; - int ret = -ENOMEM; - priv = kzalloc(sizeof(*priv), GFP_KERNEL); - if (priv) { - priv->pid = proc_pid(inode); - ret = seq_open(file, ops); - if (!ret) { - struct seq_file *m = file->private_data; - m->private = priv; - } else { - kfree(priv); - } - } - return ret; -} - -static int maps_open(struct inode *inode, struct file *file) -{ - return do_maps_open(inode, file, &proc_pid_maps_op); -} - -const struct file_operations proc_maps_operations = { - .open = maps_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release_private, -}; - #ifdef CONFIG_NUMA extern int show_numa_map(struct seq_file *m, void *v); @@ -539,14 +553,3 @@ const struct file_operations proc_numa_maps_operations = { }; #endif -static int smaps_open(struct inode *inode, struct file *file) -{ - return do_maps_open(inode, file, &proc_pid_smaps_op); -} - -const struct file_operations proc_smaps_operations = { - .open = smaps_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release_private, -}; -- cgit v1.2.3 From 85863e475e59afb027b0113290e3796ee6020b7d Mon Sep 17 00:00:00 2001 From: Matt Mackall Date: Mon, 4 Feb 2008 22:29:04 -0800 Subject: maps4: add /proc/pid/pagemap interface This interface provides a mapping for each page in an address space to its physical page frame number, allowing precise determination of what pages are mapped and what pages are shared between processes. New in this version: - headers gone again (as recommended by Dave Hansen and Alan Cox) - 64-bit entries (as per discussion with Andi Kleen) - swap pte information exported (from Dave Hansen) - page walker callback for holes (from Dave Hansen) - direct put_user I/O (as suggested by Rusty Russell) This patch folds in cleanups and swap PTE support from Dave Hansen . 
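[Editor's note: a minimal userspace sketch of the interface described above, not part of the patch. It assumes the entry format this patch introduces (one u64 per virtual page, holding a raw PFN or a PM_NOT_PRESENT/PM_SWAP special value; later kernels changed the encoding), and reads the entry for a single address with pread():

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	uint64_t entry;
	unsigned long vaddr;
	long psize = sysconf(_SC_PAGESIZE);
	char path[64];
	int fd;

	if (argc != 3) {
		fprintf(stderr, "usage: %s <pid> <hex-vaddr>\n", argv[0]);
		return 1;
	}
	vaddr = strtoul(argv[2], NULL, 16);
	snprintf(path, sizeof(path), "/proc/%s/pagemap", argv[1]);
	fd = open(path, O_RDONLY);
	if (fd < 0)
		return 1;
	/* one 64-bit entry per virtual page; offsets must be 8-byte aligned */
	if (pread(fd, &entry, sizeof(entry),
		  (vaddr / psize) * sizeof(entry)) != sizeof(entry)) {
		close(fd);
		return 1;
	}
	printf("%#lx -> %#llx\n", vaddr, (unsigned long long)entry);
	close(fd);
	return 0;
}

Root privileges are typically required: the file is created S_IRUSR, and the kernel additionally checks ptrace attach permission.]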
Signed-off-by: Matt Mackall Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 4 +- fs/proc/internal.h | 2 + fs/proc/task_mmu.c | 200 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 204 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index 1bd646d3fe9..9004db04efa 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -783,7 +783,7 @@ out_no_task: } #endif -static loff_t mem_lseek(struct file * file, loff_t offset, int orig) +loff_t mem_lseek(struct file *file, loff_t offset, int orig) { switch (orig) { case 0: @@ -2252,6 +2252,7 @@ static const struct pid_entry tgid_base_stuff[] = { #ifdef CONFIG_MMU REG("clear_refs", S_IWUSR, clear_refs), REG("smaps", S_IRUGO, smaps), + REG("pagemap", S_IRUSR, pagemap), #endif #ifdef CONFIG_SECURITY DIR("attr", S_IRUGO|S_IXUGO, attr_dir), @@ -2580,6 +2581,7 @@ static const struct pid_entry tid_base_stuff[] = { #ifdef CONFIG_MMU REG("clear_refs", S_IWUSR, clear_refs), REG("smaps", S_IRUGO, smaps), + REG("pagemap", S_IRUSR, pagemap), #endif #ifdef CONFIG_SECURITY DIR("attr", S_IRUGO|S_IXUGO, attr_dir), diff --git a/fs/proc/internal.h b/fs/proc/internal.h index ddfaeec3749..7d57e806992 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -52,11 +52,13 @@ extern int proc_tid_stat(struct task_struct *, char *); extern int proc_tgid_stat(struct task_struct *, char *); extern int proc_pid_status(struct task_struct *, char *); extern int proc_pid_statm(struct task_struct *, char *); +extern loff_t mem_lseek(struct file *file, loff_t offset, int orig); extern const struct file_operations proc_maps_operations; extern const struct file_operations proc_numa_maps_operations; extern const struct file_operations proc_smaps_operations; extern const struct file_operations proc_clear_refs_operations; +extern const struct file_operations proc_pagemap_operations; void free_proc_entry(struct proc_dir_entry *de); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 308fc5451e4..bbd9b145051 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -5,7 +5,10 @@ #include #include #include +#include #include +#include +#include #include #include @@ -519,6 +522,202 @@ const struct file_operations proc_clear_refs_operations = { .write = clear_refs_write, }; +struct pagemapread { + char __user *out, *end; +}; + +#define PM_ENTRY_BYTES sizeof(u64) +#define PM_RESERVED_BITS 3 +#define PM_RESERVED_OFFSET (64 - PM_RESERVED_BITS) +#define PM_RESERVED_MASK (((1LL<<PM_RESERVED_BITS)-1) << PM_RESERVED_OFFSET) +#define PM_SPECIAL(nr) (((nr) << PM_RESERVED_OFFSET) | PM_RESERVED_MASK) +#define PM_NOT_PRESENT PM_SPECIAL(1LL) +#define PM_SWAP PM_SPECIAL(2LL) +#define PM_END_OF_BUFFER 1 + +static int add_to_pagemap(unsigned long addr, u64 pfn, + struct pagemapread *pm) +{ + if (pm->out + PM_ENTRY_BYTES >= pm->end) { + if (copy_to_user(pm->out, &pfn, pm->end - pm->out)) + return -EFAULT; + pm->out = pm->end; + return PM_END_OF_BUFFER; + } + + if (put_user(pfn, pm->out)) + return -EFAULT; + pm->out += PM_ENTRY_BYTES; + return 0; +} + +static int pagemap_pte_hole(unsigned long start, unsigned long end, + void *private) +{ + struct pagemapread *pm = private; + unsigned long addr; + int err = 0; + for (addr = start; addr < end; addr += PAGE_SIZE) { + err = add_to_pagemap(addr, PM_NOT_PRESENT, pm); + if (err) + break; + } + return err; +} + +u64 swap_pte_to_pagemap_entry(pte_t pte) +{ + swp_entry_t e = pte_to_swp_entry(pte); + return PM_SWAP | swp_type(e) | (swp_offset(e) << MAX_SWAPFILES_SHIFT); +} + +static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, + void *private) +{ + struct pagemapread *pm = private; + pte_t *pte; + int err = 0; + + for (; addr != end; addr += PAGE_SIZE) { + u64 pfn = PM_NOT_PRESENT; + pte = pte_offset_map(pmd, addr); + if
(is_swap_pte(*pte)) + pfn = swap_pte_to_pagemap_entry(*pte); + else if (pte_present(*pte)) + pfn = pte_pfn(*pte); + /* unmap so we're not in atomic when we copy to userspace */ + pte_unmap(pte); + err = add_to_pagemap(addr, pfn, pm); + if (err) + return err; + } + + cond_resched(); + + return err; +} + +static struct mm_walk pagemap_walk = { + .pmd_entry = pagemap_pte_range, + .pte_hole = pagemap_pte_hole +}; + +/* + * /proc/pid/pagemap - an array mapping virtual pages to pfns + * + * For each page in the address space, this file contains one 64-bit + * entry representing the corresponding physical page frame number + * (PFN) if the page is present. If there is a swap entry for the + * physical page, then an encoding of the swap file number and the + * page's offset into the swap file are returned. If no page is + * present at all, PM_NOT_PRESENT is returned. This allows determining + * precisely which pages are mapped (or in swap) and comparing mapped + * pages between processes. + * + * Efficient users of this interface will use /proc/pid/maps to + * determine which areas of memory are actually mapped and llseek to + * skip over unmapped regions. + */ +static ssize_t pagemap_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); + struct page **pages, *page; + unsigned long uaddr, uend; + struct mm_struct *mm; + struct pagemapread pm; + int pagecount; + int ret = -ESRCH; + + if (!task) + goto out; + + ret = -EACCES; + if (!ptrace_may_attach(task)) + goto out; + + ret = -EINVAL; + /* file position must be aligned */ + if (*ppos % PM_ENTRY_BYTES) + goto out; + + ret = 0; + mm = get_task_mm(task); + if (!mm) + goto out; + + ret = -ENOMEM; + uaddr = (unsigned long)buf & PAGE_MASK; + uend = (unsigned long)(buf + count); + pagecount = (PAGE_ALIGN(uend) - uaddr) / PAGE_SIZE; + pages = kmalloc(pagecount * sizeof(struct page *), GFP_KERNEL); + if (!pages) + goto out_task; + + down_read(&current->mm->mmap_sem); + ret = get_user_pages(current, current->mm, uaddr, pagecount, + 1, 0, pages, NULL); + up_read(&current->mm->mmap_sem); + + if (ret < 0) + goto out_free; + + pm.out = buf; + pm.end = buf + count; + + if (!ptrace_may_attach(task)) { + ret = -EIO; + } else { + unsigned long src = *ppos; + unsigned long svpfn = src / PM_ENTRY_BYTES; + unsigned long start_vaddr = svpfn << PAGE_SHIFT; + unsigned long end_vaddr = TASK_SIZE_OF(task); + + /* watch out for wraparound */ + if (svpfn > TASK_SIZE_OF(task) >> PAGE_SHIFT) + start_vaddr = end_vaddr; + + /* + * The odds are that this will stop walking way + * before end_vaddr, because the length of the + * user buffer is tracked in "pm", and the walk + * will stop when we hit the end of the buffer.
+ */ + ret = walk_page_range(mm, start_vaddr, end_vaddr, + &pagemap_walk, &pm); + if (ret == PM_END_OF_BUFFER) + ret = 0; + /* don't need mmap_sem for these, but this looks cleaner */ + *ppos += pm.out - buf; + if (!ret) + ret = pm.out - buf; + } + + for (; pagecount; pagecount--) { + page = pages[pagecount-1]; + if (!PageReserved(page)) + SetPageDirty(page); + page_cache_release(page); + } + mmput(mm); +out_free: + kfree(pages); +out_task: + put_task_struct(task); +out: + return ret; +} + +const struct file_operations proc_pagemap_operations = { + .llseek = mem_lseek, /* borrow this */ + .read = pagemap_read, +}; + #ifdef CONFIG_NUMA extern int show_numa_map(struct seq_file *m, void *v); @@ -552,4 +751,3 @@ const struct file_operations proc_numa_maps_operations = { .release = seq_release_private, }; #endif - -- cgit v1.2.3 From 161f47bf41c5ece90ac53cbb6a4cb9bf74ce0ef6 Mon Sep 17 00:00:00 2001 From: Matt Mackall Date: Mon, 4 Feb 2008 22:29:05 -0800 Subject: maps4: add /proc/kpagecount interface This makes physical page map counts available to userspace. Together with /proc/pid/pagemap and /proc/pid/clear_refs, this can be used to monitor memory usage on a per-page basis. [akpm@linux-foundation.org: remove unneeded access_ok()] [bunk@stusta.de: make struct proc_kpagemap static] Signed-off-by: Matt Mackall Cc: Jeremy Fitzhardinge Cc: David Rientjes Signed-off-by: Adrian Bunk Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/proc_misc.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) (limited to 'fs') diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index 3462bfde89f..19b69f931be 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #include #include @@ -675,6 +676,54 @@ static const struct file_operations proc_sysrq_trigger_operations = { }; #endif +#define KPMSIZE sizeof(u64) +#define KPMMASK (KPMSIZE - 1) +/* /proc/kpagecount - an array exposing page counts + * + * Each entry is a u64 representing the corresponding + * physical page count. 
+ */ +static ssize_t kpagecount_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + u64 __user *out = (u64 __user *)buf; + struct page *ppage; + unsigned long src = *ppos; + unsigned long pfn; + ssize_t ret = 0; + u64 pcount; + + pfn = src / KPMSIZE; + count = min_t(size_t, count, (max_pfn * KPMSIZE) - src); + if (src & KPMMASK || count & KPMMASK) + return -EIO; + + while (count > 0) { + ppage = pfn_to_page(pfn++); + if (!ppage) + pcount = 0; + else + pcount = atomic_read(&ppage->_count); + + if (put_user(pcount, out++)) { + ret = -EFAULT; + break; + } + + count -= KPMSIZE; + } + + *ppos += (char __user *)out - buf; + if (!ret) + ret = (char __user *)out - buf; + return ret; +} + +static struct file_operations proc_kpagecount_operations = { + .llseek = mem_lseek, + .read = kpagecount_read, +}; + struct proc_dir_entry *proc_root_kcore; void create_seq_entry(char *name, mode_t mode, const struct file_operations *f) @@ -755,6 +804,7 @@ void __init proc_misc_init(void) (size_t)high_memory - PAGE_OFFSET + PAGE_SIZE; } #endif + create_seq_entry("kpagecount", S_IRUSR, &proc_kpagecount_operations); #ifdef CONFIG_PROC_VMCORE proc_vmcore = create_proc_entry("vmcore", S_IRUSR, NULL); if (proc_vmcore) -- cgit v1.2.3 From 304daa8132a95e998b6716d4b7bd8bd76aa152b2 Mon Sep 17 00:00:00 2001 From: Matt Mackall Date: Mon, 4 Feb 2008 22:29:06 -0800 Subject: maps4: add /proc/kpageflags interface This makes a subset of physical page flags available to userspace. Together with /proc/pid/kpagemap, this allows tracking of a wide variety of VM behaviors. Exported flags are decoupled from the kernel's internal flags. This allows us to reorder flag bits, and synthesize any bits that get redefined in terms of other bits. [akpm@linux-foundation.org: remove unneeded access_ok()] [akpm@linux-foundation.org: s/0/NULL/] Signed-off-by: Matt Mackall Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/proc_misc.c | 84 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 83 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index 19b69f931be..fd751ea37fc 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -699,7 +699,10 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf, return -EIO; while (count > 0) { - ppage = pfn_to_page(pfn++); + ppage = NULL; + if (pfn_valid(pfn)) + ppage = pfn_to_page(pfn); + pfn++; if (!ppage) pcount = 0; else @@ -724,6 +727,84 @@ static struct file_operations proc_kpagecount_operations = { .read = kpagecount_read, }; +/* /proc/kpageflags - an array exposing page flags + * + * Each entry is a u64 representing the corresponding + * physical page flags. 
+ */ + +/* These macros are used to decouple internal flags from exported ones */ + +#define KPF_LOCKED 0 +#define KPF_ERROR 1 +#define KPF_REFERENCED 2 +#define KPF_UPTODATE 3 +#define KPF_DIRTY 4 +#define KPF_LRU 5 +#define KPF_ACTIVE 6 +#define KPF_SLAB 7 +#define KPF_WRITEBACK 8 +#define KPF_RECLAIM 9 +#define KPF_BUDDY 10 + +#define kpf_copy_bit(flags, dstpos, srcpos) (((flags >> srcpos) & 1) << dstpos) + +static ssize_t kpageflags_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + u64 __user *out = (u64 __user *)buf; + struct page *ppage; + unsigned long src = *ppos; + unsigned long pfn; + ssize_t ret = 0; + u64 kflags, uflags; + + pfn = src / KPMSIZE; + count = min_t(unsigned long, count, (max_pfn * KPMSIZE) - src); + if (src & KPMMASK || count & KPMMASK) + return -EIO; + + while (count > 0) { + ppage = NULL; + if (pfn_valid(pfn)) + ppage = pfn_to_page(pfn); + pfn++; + if (!ppage) + kflags = 0; + else + kflags = ppage->flags; + + uflags = kpf_copy_bit(kflags, KPF_LOCKED, PG_locked) | + kpf_copy_bit(kflags, KPF_ERROR, PG_error) | + kpf_copy_bit(kflags, KPF_REFERENCED, PG_referenced) | + kpf_copy_bit(kflags, KPF_UPTODATE, PG_uptodate) | + kpf_copy_bit(kflags, KPF_DIRTY, PG_dirty) | + kpf_copy_bit(kflags, KPF_LRU, PG_lru) | + kpf_copy_bit(kflags, KPF_ACTIVE, PG_active) | + kpf_copy_bit(kflags, KPF_SLAB, PG_slab) | + kpf_copy_bit(kflags, KPF_WRITEBACK, PG_writeback) | + kpf_copy_bit(kflags, KPF_RECLAIM, PG_reclaim) | + kpf_copy_bit(kflags, KPF_BUDDY, PG_buddy); + + if (put_user(uflags, out++)) { + ret = -EFAULT; + break; + } + + count -= KPMSIZE; + } + + *ppos += (char __user *)out - buf; + if (!ret) + ret = (char __user *)out - buf; + return ret; +} + +static struct file_operations proc_kpageflags_operations = { + .llseek = mem_lseek, + .read = kpageflags_read, +}; + struct proc_dir_entry *proc_root_kcore; void create_seq_entry(char *name, mode_t mode, const struct file_operations *f) @@ -805,6 +886,7 @@ void __init proc_misc_init(void) } #endif create_seq_entry("kpagecount", S_IRUSR, &proc_kpagecount_operations); + create_seq_entry("kpageflags", S_IRUSR, &proc_kpageflags_operations); #ifdef CONFIG_PROC_VMCORE proc_vmcore = create_proc_entry("vmcore", S_IRUSR, NULL); if (proc_vmcore) -- cgit v1.2.3 From 1e88328111aae3ea408f346763ba9f9bad71f876 Mon Sep 17 00:00:00 2001 From: Matt Mackall Date: Mon, 4 Feb 2008 22:29:07 -0800 Subject: maps4: make page monitoring /proc file optional Make /proc/ page monitoring configurable This puts the following files under an embedded config option: /proc/pid/clear_refs /proc/pid/smaps /proc/pid/pagemap /proc/kpagecount /proc/kpageflags [akpm@linux-foundation.org: Kconfig fix] Signed-off-by: Matt Mackall Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 4 ++-- fs/proc/proc_misc.c | 4 ++++ fs/proc/task_mmu.c | 2 ++ 3 files changed, 8 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index 9004db04efa..cd9f84c4bbf 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2249,7 +2249,7 @@ static const struct pid_entry tgid_base_stuff[] = { LNK("exe", exe), REG("mounts", S_IRUGO, mounts), REG("mountstats", S_IRUSR, mountstats), -#ifdef CONFIG_MMU +#ifdef CONFIG_PROC_PAGE_MONITOR REG("clear_refs", S_IWUSR, clear_refs), REG("smaps", S_IRUGO, smaps), REG("pagemap", S_IRUSR, pagemap), @@ -2578,7 +2578,7 @@ static const struct pid_entry tid_base_stuff[] = { LNK("root", root), LNK("exe", exe), REG("mounts", S_IRUGO, mounts), -#ifdef CONFIG_MMU +#ifdef
CONFIG_PROC_PAGE_MONITOR REG("clear_refs", S_IWUSR, clear_refs), REG("smaps", S_IRUGO, smaps), REG("pagemap", S_IRUSR, pagemap), diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index fd751ea37fc..51288db37a0 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -676,6 +676,7 @@ static const struct file_operations proc_sysrq_trigger_operations = { }; #endif +#ifdef CONFIG_PROC_PAGE_MONITOR #define KPMSIZE sizeof(u64) #define KPMMASK (KPMSIZE - 1) /* /proc/kpagecount - an array exposing page counts @@ -804,6 +805,7 @@ static struct file_operations proc_kpageflags_operations = { .llseek = mem_lseek, .read = kpageflags_read, }; +#endif /* CONFIG_PROC_PAGE_MONITOR */ struct proc_dir_entry *proc_root_kcore; @@ -885,8 +887,10 @@ void __init proc_misc_init(void) (size_t)high_memory - PAGE_OFFSET + PAGE_SIZE; } #endif +#ifdef CONFIG_PROC_PAGE_MONITOR create_seq_entry("kpagecount", S_IRUSR, &proc_kpagecount_operations); create_seq_entry("kpageflags", S_IRUSR, &proc_kpageflags_operations); +#endif #ifdef CONFIG_PROC_VMCORE proc_vmcore = create_proc_entry("vmcore", S_IRUSR, NULL); if (proc_vmcore) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index bbd9b145051..38338ed98cc 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -337,6 +337,7 @@ const struct file_operations proc_maps_operations = { */ #define PSS_SHIFT 12 +#ifdef CONFIG_PROC_PAGE_MONITOR struct mem_size_stats { struct vm_area_struct *vma; @@ -717,6 +718,7 @@ const struct file_operations proc_pagemap_operations = { .llseek = mem_lseek, /* borrow this */ .read = pagemap_read, }; +#endif /* CONFIG_PROC_PAGE_MONITOR */ #ifdef CONFIG_NUMA extern int show_numa_map(struct seq_file *m, void *v); -- cgit v1.2.3 From 7766755a2f249e7e0dabc5255a0a3d151ff79821 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Mon, 4 Feb 2008 22:29:21 -0800 Subject: Fix /proc dcache deadlock in do_exit This patch fixes a sles9 system hang in start_this_handle from a customer with some heavy workload where all tasks are waiting on kjournald to commit the transaction, but kjournald waits on t_updates to go down to zero (it never does). This was reported as a lowmem shortage deadlock but when checking the debug data I noticed the VM wasn't under pressure at all (well it was really under vm pressure, because lots of tasks hung in the VM prune_dcache methods trying to flush dirty inodes, but no task was hanging in GFP_NOFS mode, which the holder of the journal handle should have been if this were a vm issue in the first place). No task was apparently holding the leftover handle in the committing transaction, so I deduced t_updates was stuck at 1 because a journal_stop was never run by some path (this turned out to be correct). With a debug patch adding proper reverse links and stack trace logging in ext3 deployed in production, I found journal_stop is never run because mark_inode_dirty_sync is called inside release_task called by do_exit. (that was quite fun because I would never have thought about this subtlety; I thought a regular path in ext3 had a bug and forgot to call journal_stop) do_exit->release_task->mark_inode_dirty_sync->schedule() (will never come back to run journal_stop) The reason is that shrink_dcache_parent is racy by design (feature not a bug) and it can do blocking I/O in some cases, but the point is that calling shrink_dcache_parent at the last stage of do_exit isn't safe for self-reaping tasks. I guess the memory pressure of the unbalanced highmem system made this easier to trigger.
Now mainline doesn't have this line in iput (like sles9 has): if (inode->i_state & I_DIRTY_DELAYED) mark_inode_dirty_sync(inode); so it will probably not crash with ext3, but for example ext2 implements an I/O-blocking ext2_put_inode that will lead to similar screwups with ext2_free_blocks never coming back, and it's definitely wrong to call blocking-IO paths inside do_exit. So this should fix a subtle bug in mainline too (not verified in practice though). The equivalent fix for ext3 is also not yet verified to fix the problem in sles9, but I have no doubt it will (it usually takes days to crash, so it'll take weeks to be sure). An alternate fix would be to offload that work to a kernel thread, but I don't think a reschedule for this is worth it; the vm should be able to collect those entries for the synchronous release_task. Signed-off-by: Andrea Arcangeli Cc: Jan Kara Cc: Ingo Molnar Cc: "Eric W. Biederman" Cc: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index cd9f84c4bbf..c59852b3878 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2321,7 +2321,8 @@ static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid) name.len = snprintf(buf, sizeof(buf), "%d", pid); dentry = d_hash_and_lookup(mnt->mnt_root, &name); if (dentry) { - shrink_dcache_parent(dentry); + if (!(current->flags & PF_EXITING)) + shrink_dcache_parent(dentry); d_drop(dentry); dput(dentry); } -- cgit v1.2.3 From 2d544564f9954860235db97df2e549a66c61f557 Mon Sep 17 00:00:00 2001 From: Qi Yong Date: Mon, 4 Feb 2008 22:29:23 -0800 Subject: skip writing data pages when inode is under I_SYNC Since I_SYNC was split out from I_LOCK, the concern in commit 4b89eed93e0fa40a63e3d7b1796ec1337ea7a3aa ("Write back inode data pages even when the inode itself is locked") is no longer valid. We should revert to the original behavior: in __writeback_single_inode(), when we find an I_SYNC-ed inode and we're not doing a data-integrity sync, skip writing entirely. Otherwise, we end up calling do_writepages() twice. Signed-off-by: Qi Yong Cc: Peter Zijlstra Cc: Hugh Dickins Cc: Joern Engel Cc: WU Fengguang Cc: Michael Rubin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fs-writeback.c | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 300324bd563..3fe782d70a7 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -334,9 +334,6 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) WARN_ON(inode->i_state & I_WILL_FREE); if ((wbc->sync_mode != WB_SYNC_ALL) && (inode->i_state & I_SYNC)) { - struct address_space *mapping = inode->i_mapping; - int ret; - /* * We're skipping this inode because it's locked, and we're not * doing writeback-for-data-integrity. Move it to s_more_io so * @@ -345,15 +342,7 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) * completed a full scan of s_io. */ requeue_io(inode); - - /* - * Even if we don't actually write the inode itself here, - * we can at least start some of the data writeout..
- */ - spin_unlock(&inode_lock); - ret = do_writepages(mapping, wbc); - spin_lock(&inode_lock); - return ret; + return 0; } /* -- cgit v1.2.3 From 8bc3be2751b4f74ab90a446da1912fd8204d53f7 Mon Sep 17 00:00:00 2001 From: Fengguang Wu Date: Mon, 4 Feb 2008 22:29:36 -0800 Subject: writeback: speed up writeback of big dirty files After making dirty a 100M file, the normal behavior is to start the writeback for all data after 30s delays. But sometimes the following happens instead: - after 30s: ~4M - after 5s: ~4M - after 5s: all remaining 92M Some analysis shows that the internal io dispatch queues go like this: s_io s_more_io ------------------------- 1) 100M,1K 0 2) 1K 96M 3) 0 96M 1) initial state with a 100M file and a 1K file 2) 4M written, nr_to_write <= 0, so write more 3) 1K written, nr_to_write > 0, no more writes (BUG) nr_to_write > 0 in (3) fools the upper layer into thinking that all data have been written out. The big dirty file is actually still sitting in s_more_io. We cannot simply splice s_more_io back to s_io as soon as s_io becomes empty, and let the loop in generic_sync_sb_inodes() continue: this may starve newly expired inodes in s_dirty. It is also not an option to draw inodes from both s_more_io and s_dirty, and let the loop go on: this might lead to livelocks, and might also starve other superblocks in sync time (well, kupdate may still starve some superblocks, but that's another bug). We have to return when a full scan of s_io completes. So nr_to_write > 0 does not necessarily mean that "all data are written". This patch introduces a flag writeback_control.more_io to indicate that more io should be done. With it the big dirty file no longer has to wait for the next kupdate invocation 5s later. In sync_sb_inodes() we only set more_io on super_blocks we actually visited. This avoids the interaction between two pdflush daemons. Also in __sync_single_inode() we don't blindly keep requeuing the io if the filesystem cannot progress. Failing to do so may lead to 100% iowait. Tested-by: Mike Snitzer Signed-off-by: Fengguang Wu Cc: Michael Rubin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fs-writeback.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 3fe782d70a7..0b3064079fa 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -284,7 +284,17 @@ __sync_single_inode(struct inode *inode, struct writeback_control *wbc) * soon as the queue becomes uncongested. */ inode->i_state |= I_DIRTY_PAGES; - requeue_io(inode); + if (wbc->nr_to_write <= 0) { + /* + * slice used up: queue for next turn + */ + requeue_io(inode); + } else { + /* + * somehow blocked: retry later + */ + redirty_tail(inode); + } } else { /* * Otherwise fully redirty the inode so that @@ -468,8 +478,12 @@ sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc) iput(inode); cond_resched(); spin_lock(&inode_lock); - if (wbc->nr_to_write <= 0) + if (wbc->nr_to_write <= 0) { + wbc->more_io = 1; break; + } + if (!list_empty(&sb->s_more_io)) + wbc->more_io = 1; } return; /* Leave any unwritten inodes on s_io */ } -- cgit v1.2.3 From 42492594043d621a7910ff5877c3eb9202870b45 Mon Sep 17 00:00:00 2001 From: "David P.
Quigley" Date: Mon, 4 Feb 2008 22:29:39 -0800 Subject: VFS/Security: Rework inode_getsecurity and callers to return resulting buffer This patch modifies the interface to inode_getsecurity to have the function return a buffer containing the security blob and its length via parameters instead of relying on the calling function to give it an appropriately sized buffer. Security blobs obtained with this function should be freed using the release_secctx LSM hook. This alleviates the problem of the caller having to guess a length and preallocate a buffer for this function allowing it to be used elsewhere for Labeled NFS. The patch also removed the unused err parameter. The conversion is similar to the one performed by Al Viro for the security_getprocattr hook. Signed-off-by: David P. Quigley Cc: Stephen Smalley Cc: Chris Wright Acked-by: James Morris Acked-by: Serge Hallyn Cc: Casey Schaufler Cc: Al Viro Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/xattr.c | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xattr.c b/fs/xattr.c index 6645b7313b3..1858552a6a1 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -104,6 +104,33 @@ out: } EXPORT_SYMBOL_GPL(vfs_setxattr); +ssize_t +xattr_getsecurity(struct inode *inode, const char *name, void *value, + size_t size) +{ + void *buffer = NULL; + ssize_t len; + + if (!value || !size) { + len = security_inode_getsecurity(inode, name, &buffer, false); + goto out_noalloc; + } + + len = security_inode_getsecurity(inode, name, &buffer, true); + if (len < 0) + return len; + if (size < len) { + len = -ERANGE; + goto out; + } + memcpy(value, buffer, len); +out: + security_release_secctx(buffer, len); +out_noalloc: + return len; +} +EXPORT_SYMBOL_GPL(xattr_getsecurity); + ssize_t vfs_getxattr(struct dentry *dentry, char *name, void *value, size_t size) { @@ -126,8 +153,7 @@ vfs_getxattr(struct dentry *dentry, char *name, void *value, size_t size) if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) { const char *suffix = name + XATTR_SECURITY_PREFIX_LEN; - int ret = security_inode_getsecurity(inode, suffix, value, - size, error); + int ret = xattr_getsecurity(inode, suffix, value, size); /* * Only overwrite the return value if a security module * is actually active. -- cgit v1.2.3 From 4bea58053f206be9a89ca35850f9ad295dac2042 Mon Sep 17 00:00:00 2001 From: "David P. Quigley" Date: Mon, 4 Feb 2008 22:29:40 -0800 Subject: VFS: Reorder vfs_getxattr to avoid unnecessary calls to the LSM Originally vfs_getxattr would pull the security xattr variable using the inode getxattr handle and then proceed to clobber it with a subsequent call to the LSM. This patch reorders the two operations such that when the xattr requested is in the security namespace it first attempts to grab the value from the LSM directly. If it fails to obtain the value because there is no module present or the module does not support the operation it will fall back to using the inode getxattr operation. In the event that both are inaccessible it returns EOPNOTSUPP. Signed-off-by: David P. 
Quigley Cc: Stephen Smalley Cc: Chris Wright Acked-by: James Morris Acked-by: Serge Hallyn Cc: Casey Schaufler Cc: Al Viro Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/xattr.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/xattr.c b/fs/xattr.c index 1858552a6a1..f7c8f87bb39 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -145,11 +145,6 @@ vfs_getxattr(struct dentry *dentry, char *name, void *value, size_t size) if (error) return error; - if (inode->i_op->getxattr) - error = inode->i_op->getxattr(dentry, name, value, size); - else - error = -EOPNOTSUPP; - if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) { const char *suffix = name + XATTR_SECURITY_PREFIX_LEN; @@ -158,9 +153,15 @@ vfs_getxattr(struct dentry *dentry, char *name, void *value, size_t size) * Only overwrite the return value if a security module * is actually active. */ - if (ret != -EOPNOTSUPP) - error = ret; + if (ret == -EOPNOTSUPP) + goto nolsm; + return ret; } +nolsm: + if (inode->i_op->getxattr) + error = inode->i_op->getxattr(dentry, name, value, size); + else + error = -EOPNOTSUPP; return error; } -- cgit v1.2.3 From e338d263a76af78fe8f38a72131188b58fceb591 Mon Sep 17 00:00:00 2001 From: Andrew Morgan Date: Mon, 4 Feb 2008 22:29:42 -0800 Subject: Add 64-bit capability support to the kernel The patch supports legacy (32-bit) capability userspace, and where possible translates 32-bit capabilities to/from userspace and the VFS to 64-bit kernel space capabilities. If a capability set cannot be compressed into 32-bits for consumption by user space, the system call fails, with -ERANGE. FWIW libcap-2.00 supports this change (and earlier capability formats) http://www.kernel.org/pub/linux/libs/security/linux-privs/kernel-2.6/ [akpm@linux-foundation.org: coding-syle fixes] [akpm@linux-foundation.org: use get_task_comm()] [ezk@cs.sunysb.edu: build fix] [akpm@linux-foundation.org: do not initialise statics to 0 or NULL] [akpm@linux-foundation.org: unused var] [serue@us.ibm.com: export __cap_ symbols] Signed-off-by: Andrew G. 
Morgan Cc: Stephen Smalley Acked-by: Serge Hallyn Cc: Chris Wright Cc: James Morris Cc: Casey Schaufler Signed-off-by: Erez Zadok Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/auth.c | 10 +++++----- fs/proc/array.c | 21 +++++++++++++++------ 2 files changed, 20 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c index 21928056e35..d13403e3362 100644 --- a/fs/nfsd/auth.c +++ b/fs/nfsd/auth.c @@ -11,8 +11,6 @@ #include #include -#define CAP_NFSD_MASK (CAP_FS_MASK|CAP_TO_MASK(CAP_SYS_RESOURCE)) - int nfsexp_flags(struct svc_rqst *rqstp, struct svc_export *exp) { struct exp_flavor_info *f; @@ -69,10 +67,12 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) ret = set_current_groups(cred.cr_group_info); put_group_info(cred.cr_group_info); if ((cred.cr_uid)) { - cap_t(current->cap_effective) &= ~CAP_NFSD_MASK; + current->cap_effective = + cap_drop_nfsd_set(current->cap_effective); } else { - cap_t(current->cap_effective) |= (CAP_NFSD_MASK & - current->cap_permitted); + current->cap_effective = + cap_raise_nfsd_set(current->cap_effective, + current->cap_permitted); } return ret; } diff --git a/fs/proc/array.c b/fs/proc/array.c index b380313092b..6ba2746e451 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -281,14 +281,23 @@ static inline char *task_sig(struct task_struct *p, char *buffer) return buffer; } +static char *render_cap_t(const char *header, kernel_cap_t *a, char *buffer) +{ + unsigned __capi; + + buffer += sprintf(buffer, "%s", header); + CAP_FOR_EACH_U32(__capi) { + buffer += sprintf(buffer, "%08x", + a->cap[(_LINUX_CAPABILITY_U32S-1) - __capi]); + } + return buffer + sprintf(buffer, "\n"); +} + static inline char *task_cap(struct task_struct *p, char *buffer) { - return buffer + sprintf(buffer, "CapInh:\t%016x\n" - "CapPrm:\t%016x\n" - "CapEff:\t%016x\n", - cap_t(p->cap_inheritable), - cap_t(p->cap_permitted), - cap_t(p->cap_effective)); + buffer = render_cap_t("CapInh:\t", &p->cap_inheritable, buffer); + buffer = render_cap_t("CapPrm:\t", &p->cap_permitted, buffer); + return render_cap_t("CapEff:\t", &p->cap_effective, buffer); } static inline char *task_context_switch_counts(struct task_struct *p, -- cgit v1.2.3 From d7b88513c504e49d450b0f89f80ba9d451a3c804 Mon Sep 17 00:00:00 2001 From: Dominique Quatravaux Date: Mon, 4 Feb 2008 22:31:15 -0800 Subject: uml: fix hostfs tv_usec calculations To convert from tv_nsec to tv_usec, one needs to divide by 1000, not multiply. 
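[Editor's note: a quick sanity check of the conversion, not part of the patch. 1,500,000 ns is 1500 us, so the ns-to-us conversion must divide by 1000; multiplying, as the old code did, would yield 1,500,000,000:

#include <stdio.h>
#include <time.h>
#include <sys/time.h>

int main(void)
{
	struct timespec ts = { .tv_sec = 0, .tv_nsec = 1500000 }; /* 1.5 ms */
	struct timeval tv;

	tv.tv_sec = ts.tv_sec;
	tv.tv_usec = ts.tv_nsec / 1000; /* ns -> us: divide, don't multiply */
	printf("%ld us\n", (long)tv.tv_usec); /* prints "1500 us" */
	return 0;
}
]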
Signed-off-by: Dominique Quatravaux Signed-off-by: Jeff Dike Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hostfs/hostfs_user.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/hostfs/hostfs_user.c b/fs/hostfs/hostfs_user.c index 35c1a9f33f4..53fd0a67c11 100644 --- a/fs/hostfs/hostfs_user.c +++ b/fs/hostfs/hostfs_user.c @@ -285,17 +285,17 @@ int set_attr(const char *file, struct hostfs_iattr *attrs, int fd) return err; times[0].tv_sec = atime_ts.tv_sec; - times[0].tv_usec = atime_ts.tv_nsec * 1000; + times[0].tv_usec = atime_ts.tv_nsec / 1000; times[1].tv_sec = mtime_ts.tv_sec; - times[1].tv_usec = mtime_ts.tv_nsec * 1000; + times[1].tv_usec = mtime_ts.tv_nsec / 1000; if (attrs->ia_valid & HOSTFS_ATTR_ATIME_SET) { times[0].tv_sec = attrs->ia_atime.tv_sec; - times[0].tv_usec = attrs->ia_atime.tv_nsec * 1000; + times[0].tv_usec = attrs->ia_atime.tv_nsec / 1000; } if (attrs->ia_valid & HOSTFS_ATTR_MTIME_SET) { times[1].tv_sec = attrs->ia_mtime.tv_sec; - times[1].tv_usec = attrs->ia_mtime.tv_nsec * 1000; + times[1].tv_usec = attrs->ia_mtime.tv_nsec / 1000; } if (fd >= 0) { -- cgit v1.2.3 From c773633916c66f8362ca01983d97bd33e35b743f Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 5 Feb 2008 14:22:58 -0800 Subject: deprecate smbfs in favour of cifs smbfs is a bit buggy and has no maintainer. Change it to shout at the user on the first five mount attempts - tell them to switch to CIFS. Come December we'll mark it BROKEN and see what happens. [olecom@flower.upol.cz: documentation update] Cc: Urban Widmark Acked-by: Steven French Signed-off-by: Oleg Verych Cc: Jeff Layton Cc: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/Kconfig | 28 ++++++++++++++-------------- fs/smbfs/inode.c | 7 +++++++ 2 files changed, 21 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/Kconfig b/fs/Kconfig index 987b5d7cb21..ea5b3594762 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -1152,7 +1152,7 @@ config BEFS_DEBUG depends on BEFS_FS help If you say Y here, you can use the 'debug' mount option to enable - debugging output from the driver. + debugging output from the driver. config BFS_FS tristate "BFS file system support (EXPERIMENTAL)" @@ -1263,7 +1263,7 @@ config JFFS2_FS_XATTR Extended attributes are name:value pairs associated with inodes by the kernel or by users (see the attr(5) manual page, or visit for details). - + If unsure, say N. config JFFS2_FS_POSIX_ACL @@ -1274,10 +1274,10 @@ config JFFS2_FS_POSIX_ACL help Posix Access Control Lists (ACLs) support permissions for users and groups beyond the owner/group/world scheme. - + To learn more about Access Control Lists, visit the Posix ACLs for Linux website . - + If you don't know what Access Control Lists are, say N config JFFS2_FS_SECURITY @@ -1289,7 +1289,7 @@ config JFFS2_FS_SECURITY implemented by security modules like SELinux. This option enables an extended attribute handler for file security labels in the jffs2 filesystem. - + If you are not using a security module that requires using extended attributes for file security labels, say N. @@ -1835,7 +1835,7 @@ config RPCSEC_GSS_SPKM3 If unsure, say N. config SMB_FS - tristate "SMB file system support (to mount Windows shares etc.)" + tristate "SMB file system support (OBSOLETE, please use CIFS)" depends on INET select NLS help @@ -1858,8 +1858,8 @@ config SMB_FS General information about how to connect Linux, Windows machines and Macs is on the WWW at . 
- To compile the SMB support as a module, choose M here: the module will - be called smbfs. Most people say N, however. + To compile the SMB support as a module, choose M here: + the module will be called smbfs. Most people say N, however. config SMB_NLS_DEFAULT bool "Use a default NLS" @@ -1891,7 +1891,7 @@ config SMB_NLS_REMOTE smbmount from samba 2.2.0 or later supports this. config CIFS - tristate "CIFS support (advanced network filesystem for Samba, Window and other CIFS compliant servers)" + tristate "CIFS support (advanced network filesystem, SMBFS successor)" depends on INET select NLS help @@ -1949,16 +1949,16 @@ config CIFS_WEAK_PW_HASH LANMAN based servers such as OS/2 and Windows 95, but such mounts may be less secure than mounts using NTLM or more recent security mechanisms if you are on a public network. Unless you - have a need to access old SMB servers (and are on a private + have a need to access old SMB servers (and are on a private network) you probably want to say N. Even if this support is enabled in the kernel build, LANMAN authentication will not be used automatically. At runtime LANMAN mounts are disabled but can be set to required (or optional) either in /proc/fs/cifs (see fs/cifs/README for more detail) or via an - option on the mount command. This support is disabled by + option on the mount command. This support is disabled by default in order to reduce the possibility of a downgrade attack. - + If unsure, say N. config CIFS_XATTR @@ -1999,7 +1999,7 @@ config CIFS_DEBUG2 messages in some error paths, slowing performance. This option can be turned off unless you are debugging cifs problems. If unsure, say N. - + config CIFS_EXPERIMENTAL bool "CIFS Experimental Features (EXPERIMENTAL)" depends on CIFS && EXPERIMENTAL @@ -2090,7 +2090,7 @@ config CODA_FS_OLD_API However this new API is not backward compatible with older clients. If you really need to run the old Coda userspace cache manager then say Y. - + For most cases you probably want to say N. config AFS_FS diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c index 9416ead0c7a..4e5c22ca802 100644 --- a/fs/smbfs/inode.c +++ b/fs/smbfs/inode.c @@ -500,6 +500,13 @@ static int smb_fill_super(struct super_block *sb, void *raw_data, int silent) struct smb_fattr root; int ver; void *mem; + static int warn_count; + + if (warn_count < 5) { + warn_count++; + printk(KERN_EMERG "smbfs is deprecated and will be removed" + "from the 2.6.27 kernel. Please migrate to cifs\n"); + } if (!raw_data) goto out_no_data; -- cgit v1.2.3