diff options
296 files changed, 5303 insertions, 3695 deletions
diff --git a/Documentation/kprobes.txt b/Documentation/kprobes.txt index 2c3b1eae428..ba26201d502 100644 --- a/Documentation/kprobes.txt +++ b/Documentation/kprobes.txt @@ -151,9 +151,9 @@ So that you can load and unload Kprobes-based instrumentation modules, make sure "Loadable module support" (CONFIG_MODULES) and "Module unloading" (CONFIG_MODULE_UNLOAD) are set to "y". -You may also want to ensure that CONFIG_KALLSYMS and perhaps even -CONFIG_KALLSYMS_ALL are set to "y", since kallsyms_lookup_name() -is a handy, version-independent way to find a function's address. +Also make sure that CONFIG_KALLSYMS and perhaps even CONFIG_KALLSYMS_ALL +are set to "y", since kallsyms_lookup_name() is used by the in-kernel +kprobe address resolution code. If you need to insert a probe in the middle of a function, you may find it useful to "Compile the kernel with debug info" (CONFIG_DEBUG_INFO), @@ -179,6 +179,27 @@ occurs during execution of kp->pre_handler or kp->post_handler, or during single-stepping of the probed instruction, Kprobes calls kp->fault_handler. Any or all handlers can be NULL. +NOTE: +1. With the introduction of the "symbol_name" field to struct kprobe, +the probepoint address resolution will now be taken care of by the kernel. +The following will now work: + + kp.symbol_name = "symbol_name"; + +(64-bit powerpc intricacies such as function descriptors are handled +transparently) + +2. Use the "offset" field of struct kprobe if the offset into the symbol +to install a probepoint is known. This field is used to calculate the +probepoint. + +3. Specify either the kprobe "symbol_name" OR the "addr". If both are +specified, kprobe registration will fail with -EINVAL. + +4. With CISC architectures (such as i386 and x86_64), the kprobes code +does not validate if the kprobe.addr is at an instruction boundary. +Use "offset" with caution. + register_kprobe() returns 0 on success, or a negative errno otherwise. User's pre-handler (kp->pre_handler): @@ -225,6 +246,12 @@ control to Kprobes.) If the probed function is declared asmlinkage, fastcall, or anything else that affects how args are passed, the handler's declaration must match. +NOTE: A macro JPROBE_ENTRY is provided to handle architecture-specific +aliasing of jp->entry. In the interest of portability, it is advised +to use: + + jp->entry = JPROBE_ENTRY(handler); + register_jprobe() returns 0 on success, or a negative errno otherwise. 4.3 register_kretprobe @@ -251,6 +278,11 @@ of interest: - ret_addr: the return address - rp: points to the corresponding kretprobe object - task: points to the corresponding task struct + +The regs_return_value(regs) macro provides a simple abstraction to +extract the return value from the appropriate register as defined by +the architecture's ABI. + The handler's return value is currently ignored. 4.4 unregister_*probe @@ -369,7 +401,6 @@ stack trace and selected i386 registers when do_fork() is called. #include <linux/kernel.h> #include <linux/module.h> #include <linux/kprobes.h> -#include <linux/kallsyms.h> #include <linux/sched.h> /*For each probe you need to allocate a kprobe structure*/ @@ -403,18 +434,14 @@ int handler_fault(struct kprobe *p, struct pt_regs *regs, int trapnr) return 0; } -int init_module(void) +static int __init kprobe_init(void) { int ret; kp.pre_handler = handler_pre; kp.post_handler = handler_post; kp.fault_handler = handler_fault; - kp.addr = (kprobe_opcode_t*) kallsyms_lookup_name("do_fork"); - /* register the kprobe now */ - if (!kp.addr) { - printk("Couldn't find %s to plant kprobe\n", "do_fork"); - return -1; - } + kp.symbol_name = "do_fork"; + if ((ret = register_kprobe(&kp) < 0)) { printk("register_kprobe failed, returned %d\n", ret); return -1; @@ -423,12 +450,14 @@ int init_module(void) return 0; } -void cleanup_module(void) +static void __exit kprobe_exit(void) { unregister_kprobe(&kp); printk("kprobe unregistered\n"); } +module_init(kprobe_init) +module_exit(kprobe_exit) MODULE_LICENSE("GPL"); ----- cut here ----- @@ -463,7 +492,6 @@ the arguments of do_fork(). #include <linux/fs.h> #include <linux/uio.h> #include <linux/kprobes.h> -#include <linux/kallsyms.h> /* * Jumper probe for do_fork. @@ -485,17 +513,13 @@ long jdo_fork(unsigned long clone_flags, unsigned long stack_start, } static struct jprobe my_jprobe = { - .entry = (kprobe_opcode_t *) jdo_fork + .entry = JPROBE_ENTRY(jdo_fork) }; -int init_module(void) +static int __init jprobe_init(void) { int ret; - my_jprobe.kp.addr = (kprobe_opcode_t *) kallsyms_lookup_name("do_fork"); - if (!my_jprobe.kp.addr) { - printk("Couldn't find %s to plant jprobe\n", "do_fork"); - return -1; - } + my_jprobe.kp.symbol_name = "do_fork"; if ((ret = register_jprobe(&my_jprobe)) <0) { printk("register_jprobe failed, returned %d\n", ret); @@ -506,12 +530,14 @@ int init_module(void) return 0; } -void cleanup_module(void) +static void __exit jprobe_exit(void) { unregister_jprobe(&my_jprobe); printk("jprobe unregistered\n"); } +module_init(jprobe_init) +module_exit(jprobe_exit) MODULE_LICENSE("GPL"); ----- cut here ----- @@ -530,16 +556,13 @@ report failed calls to sys_open(). #include <linux/kernel.h> #include <linux/module.h> #include <linux/kprobes.h> -#include <linux/kallsyms.h> static const char *probed_func = "sys_open"; /* Return-probe handler: If the probed function fails, log the return value. */ static int ret_handler(struct kretprobe_instance *ri, struct pt_regs *regs) { - // Substitute the appropriate register name for your architecture -- - // e.g., regs->rax for x86_64, regs->gpr[3] for ppc64. - int retval = (int) regs->eax; + int retval = regs_return_value(regs); if (retval < 0) { printk("%s returns %d\n", probed_func, retval); } @@ -552,15 +575,11 @@ static struct kretprobe my_kretprobe = { .maxactive = 20 }; -int init_module(void) +static int __init kretprobe_init(void) { int ret; - my_kretprobe.kp.addr = - (kprobe_opcode_t *) kallsyms_lookup_name(probed_func); - if (!my_kretprobe.kp.addr) { - printk("Couldn't find %s to plant return probe\n", probed_func); - return -1; - } + my_kretprobe.kp.symbol_name = (char *)probed_func; + if ((ret = register_kretprobe(&my_kretprobe)) < 0) { printk("register_kretprobe failed, returned %d\n", ret); return -1; @@ -569,7 +588,7 @@ int init_module(void) return 0; } -void cleanup_module(void) +static void __exit kretprobe_exit(void) { unregister_kretprobe(&my_kretprobe); printk("kretprobe unregistered\n"); @@ -578,6 +597,8 @@ void cleanup_module(void) my_kretprobe.nmissed, probed_func); } +module_init(kretprobe_init) +module_exit(kretprobe_exit) MODULE_LICENSE("GPL"); ----- cut here ----- @@ -590,3 +611,5 @@ messages.) For additional information on Kprobes, refer to the following URLs: http://www-106.ibm.com/developerworks/library/l-kprobes.html?ca=dgr-lnxw42Kprobe http://www.redhat.com/magazine/005mar05/features/kprobes/ +http://www-users.cs.umn.edu/~boutcher/kprobes/ +http://www.linuxsymposium.org/2006/linuxsymposium_procv2.pdf (pages 101-115) diff --git a/arch/alpha/kernel/alpha_ksyms.c b/arch/alpha/kernel/alpha_ksyms.c index f042cc42b00..dbe327d32b6 100644 --- a/arch/alpha/kernel/alpha_ksyms.c +++ b/arch/alpha/kernel/alpha_ksyms.c @@ -36,7 +36,6 @@ #include <asm/cacheflush.h> #include <asm/vga.h> -#define __KERNEL_SYSCALLS__ #include <asm/unistd.h> extern struct hwrpb_struct *hwrpb; @@ -116,7 +115,7 @@ EXPORT_SYMBOL(sys_dup); EXPORT_SYMBOL(sys_exit); EXPORT_SYMBOL(sys_write); EXPORT_SYMBOL(sys_lseek); -EXPORT_SYMBOL(execve); +EXPORT_SYMBOL(kernel_execve); EXPORT_SYMBOL(sys_setsid); EXPORT_SYMBOL(sys_wait4); diff --git a/arch/alpha/kernel/entry.S b/arch/alpha/kernel/entry.S index 01ecd09d4a6..c95e95e1ab0 100644 --- a/arch/alpha/kernel/entry.S +++ b/arch/alpha/kernel/entry.S @@ -655,12 +655,12 @@ kernel_thread: .end kernel_thread /* - * execve(path, argv, envp) + * kernel_execve(path, argv, envp) */ .align 4 - .globl execve - .ent execve -execve: + .globl kernel_execve + .ent kernel_execve +kernel_execve: /* We can be called from a module. */ ldgp $gp, 0($27) lda $sp, -(32+SIZEOF_PT_REGS+8)($sp) @@ -704,7 +704,7 @@ execve: 1: lda $sp, 32+SIZEOF_PT_REGS+8($sp) ret -.end execve +.end kernel_execve /* diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c index 73c7622b529..8a31fc1bfb1 100644 --- a/arch/alpha/kernel/osf_sys.c +++ b/arch/alpha/kernel/osf_sys.c @@ -402,15 +402,15 @@ osf_utsname(char __user *name) down_read(&uts_sem); error = -EFAULT; - if (copy_to_user(name + 0, system_utsname.sysname, 32)) + if (copy_to_user(name + 0, utsname()->sysname, 32)) goto out; - if (copy_to_user(name + 32, system_utsname.nodename, 32)) + if (copy_to_user(name + 32, utsname()->nodename, 32)) goto out; - if (copy_to_user(name + 64, system_utsname.release, 32)) + if (copy_to_user(name + 64, utsname()->release, 32)) goto out; - if (copy_to_user(name + 96, system_utsname.version, 32)) + if (copy_to_user(name + 96, utsname()->version, 32)) goto out; - if (copy_to_user(name + 128, system_utsname.machine, 32)) + if (copy_to_user(name + 128, utsname()->machine, 32)) goto out; error = 0; @@ -449,8 +449,8 @@ osf_getdomainname(char __user *name, int namelen) down_read(&uts_sem); for (i = 0; i < len; ++i) { - __put_user(system_utsname.domainname[i], name + i); - if (system_utsname.domainname[i] == '\0') + __put_user(utsname()->domainname[i], name + i); + if (utsname()->domainname[i] == '\0') break; } up_read(&uts_sem); @@ -607,12 +607,12 @@ osf_sigstack(struct sigstack __user *uss, struct sigstack __user *uoss) asmlinkage long osf_sysinfo(int command, char __user *buf, long count) { - static char * sysinfo_table[] = { - system_utsname.sysname, - system_utsname.nodename, - system_utsname.release, - system_utsname.version, - system_utsname.machine, + char *sysinfo_table[] = { + utsname()->sysname, + utsname()->nodename, + utsname()->release, + utsname()->version, + utsname()->machine, "alpha", /* instruction set architecture */ "dummy", /* hardware serial number */ "dummy", /* hardware manufacturer */ diff --git a/arch/alpha/kernel/srmcons.c b/arch/alpha/kernel/srmcons.c index 9d7dff27f81..75692320386 100644 --- a/arch/alpha/kernel/srmcons.c +++ b/arch/alpha/kernel/srmcons.c @@ -229,7 +229,7 @@ srmcons_close(struct tty_struct *tty, struct file *filp) static struct tty_driver *srmcons_driver; -static struct tty_operations srmcons_ops = { +static const struct tty_operations srmcons_ops = { .open = srmcons_open, .close = srmcons_close, .write = srmcons_write, diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c index 0a722e77c14..6bbd93dd186 100644 --- a/arch/arm/kernel/setup.c +++ b/arch/arm/kernel/setup.c @@ -348,7 +348,7 @@ static void __init setup_processor(void) cpu_name, processor_id, (int)processor_id & 15, proc_arch[cpu_architecture()], cr_alignment); - sprintf(system_utsname.machine, "%s%c", list->arch_name, ENDIANNESS); + sprintf(init_utsname()->machine, "%s%c", list->arch_name, ENDIANNESS); sprintf(elf_platform, "%s%c", list->elf_name, ENDIANNESS); elf_hwcap = list->elf_hwcap; #ifndef CONFIG_ARM_THUMB diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index 68e9634d260..421329f5e18 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -36,7 +36,9 @@ * The online bitmask indicates that the CPU is up and running. */ cpumask_t cpu_possible_map; +EXPORT_SYMBOL(cpu_possible_map); cpumask_t cpu_online_map; +EXPORT_SYMBOL(cpu_online_map); /* * as from 2.5, kernels no longer have an init_tasks structure diff --git a/arch/arm/kernel/sys_arm.c b/arch/arm/kernel/sys_arm.c index 8170af47143..00c18d35913 100644 --- a/arch/arm/kernel/sys_arm.c +++ b/arch/arm/kernel/sys_arm.c @@ -279,7 +279,7 @@ out: return error; } -long execve(const char *filename, char **argv, char **envp) +int kernel_execve(const char *filename, char *const argv[], char *const envp[]) { struct pt_regs regs; int ret; @@ -317,7 +317,7 @@ long execve(const char *filename, char **argv, char **envp) out: return ret; } -EXPORT_SYMBOL(execve); +EXPORT_SYMBOL(kernel_execve); /* * Since loff_t is a 64 bit type we avoid a lot of ABI hastle diff --git a/arch/arm26/kernel/setup.c b/arch/arm26/kernel/setup.c index e7eb070f794..466ddb54b44 100644 --- a/arch/arm26/kernel/setup.c +++ b/arch/arm26/kernel/setup.c @@ -143,7 +143,7 @@ static void __init setup_processor(void) dump_cpu_info(); - sprintf(system_utsname.machine, "%s", list->arch_name); + sprintf(init_utsname()->machine, "%s", list->arch_name); sprintf(elf_platform, "%s", list->elf_name); elf_hwcap = list->elf_hwcap; diff --git a/arch/arm26/kernel/sys_arm.c b/arch/arm26/kernel/sys_arm.c index 85457897b8a..dc05aba58ba 100644 --- a/arch/arm26/kernel/sys_arm.c +++ b/arch/arm26/kernel/sys_arm.c @@ -283,7 +283,7 @@ out: } /* FIXME - see if this is correct for arm26 */ -long execve(const char *filename, char **argv, char **envp) +int kernel_execve(const char *filename, char *const argv[], char *const envp[]) { struct pt_regs regs; int ret; @@ -320,4 +320,4 @@ long execve(const char *filename, char **argv, char **envp) return ret; } -EXPORT_SYMBOL(execve); +EXPORT_SYMBOL(kernel_execve); diff --git a/arch/avr32/kernel/sys_avr32.c b/arch/avr32/kernel/sys_avr32.c index 6ec5693da44..8deb6003ee6 100644 --- a/arch/avr32/kernel/sys_avr32.c +++ b/arch/avr32/kernel/sys_avr32.c @@ -49,3 +49,17 @@ asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, fput(file); return error; } + +int kernel_execve(const char *file, char **argv, char **envp) +{ + register long scno asm("r8") = __NR_execve; + register long sc1 asm("r12") = (long)file; + register long sc2 asm("r11") = (long)argv; + register long sc3 asm("r10") = (long)envp; + + asm volatile("scall" + : "=r"(sc1) + : "r"(scno), "0"(sc1), "r"(sc2), "r"(sc3) + : "cc", "memory"); + return sc1; +} diff --git a/arch/cris/arch-v32/kernel/smp.c b/arch/cris/arch-v32/kernel/smp.c index 464ecaec3bc..2d0023f2d49 100644 --- a/arch/cris/arch-v32/kernel/smp.c +++ b/arch/cris/arch-v32/kernel/smp.c @@ -28,6 +28,7 @@ spinlock_t cris_atomic_locks[] = { [0 ... LOCK_COUNT - 1] = SPIN_LOCK_UNLOCKED}; /* CPU masks */ cpumask_t cpu_online_map = CPU_MASK_NONE; +EXPORT_SYMBOL(cpu_online_map); cpumask_t phys_cpu_present_map = CPU_MASK_NONE; EXPORT_SYMBOL(phys_cpu_present_map); diff --git a/arch/cris/kernel/setup.c b/arch/cris/kernel/setup.c index 7af3d5d43e4..ca8b45a0fe2 100644 --- a/arch/cris/kernel/setup.c +++ b/arch/cris/kernel/setup.c @@ -160,7 +160,7 @@ setup_arch(char **cmdline_p) show_etrax_copyright(); /* Setup utsname */ - strcpy(system_utsname.machine, cris_machine_name); + strcpy(init_utsname()->machine, cris_machine_name); } static void *c_start(struct seq_file *m, loff_t *pos) diff --git a/arch/frv/kernel/Makefile b/arch/frv/kernel/Makefile index 32db3499c46..e8f73ed28b5 100644 --- a/arch/frv/kernel/Makefile +++ b/arch/frv/kernel/Makefile @@ -8,7 +8,7 @@ heads-$(CONFIG_MMU) := head-mmu-fr451.o extra-y:= head.o init_task.o vmlinux.lds obj-y := $(heads-y) entry.o entry-table.o break.o switch_to.o kernel_thread.o \ - process.o traps.o ptrace.o signal.o dma.o \ + kernel_execve.o process.o traps.o ptrace.o signal.o dma.o \ sys_frv.o time.o semaphore.o setup.o frv_ksyms.o \ debug-stub.o irq.o sleep.o uaccess.o diff --git a/arch/frv/kernel/kernel_execve.S b/arch/frv/kernel/kernel_execve.S new file mode 100644 index 00000000000..9b074a16a05 --- /dev/null +++ b/arch/frv/kernel/kernel_execve.S @@ -0,0 +1,33 @@ +/* in-kernel program execution + * + * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/linkage.h> +#include <asm/unistd.h> + +############################################################################### +# +# Do a system call from kernel instead of calling sys_execve so we end up with +# proper pt_regs. +# +# int kernel_execve(const char *filename, char *const argv[], char *const envp[]) +# +# On entry: GR8/GR9/GR10: arguments to function +# On return: GR8: syscall return. +# +############################################################################### + .globl kernel_execve + .type kernel_execve,@function +kernel_execve: + setlos __NR_execve,gr7 + tira gr0,#0 + bralr + + .size kernel_execve,.-kernel_execve diff --git a/arch/h8300/kernel/sys_h8300.c b/arch/h8300/kernel/sys_h8300.c index 0f61b7ad69a..302a2dfe634 100644 --- a/arch/h8300/kernel/sys_h8300.c +++ b/arch/h8300/kernel/sys_h8300.c @@ -25,6 +25,7 @@ #include <asm/cachectl.h> #include <asm/traps.h> #include <asm/ipc.h> +#include <asm/unistd.h> /* * sys_pipe() is the normal C calling standard for creating @@ -280,3 +281,26 @@ asmlinkage void syscall_print(void *dummy,...) ((regs->pc)&0xffffff)-2,regs->orig_er0,regs->er1,regs->er2,regs->er3,regs->er0); } #endif + +/* + * Do a system call from kernel instead of calling sys_execve so we + * end up with proper pt_regs. + */ +int kernel_execve(const char *filename, char *const argv[], char *const envp[]) +{ + register long res __asm__("er0"); + register const char * _a __asm__("er1") = filename; + register void *_b __asm__("er2") = argv; + register void *_c __asm__("er3") = envp; + __asm__ __volatile__ ("mov.l %1,er0\n\t" + "trapa #0\n\t" + : "=r" (res) + : "g" (__NR_execve), + "g" (_a), + "g" (_b), + "g" (_c) + : "cc", "memory"); + return res; +} + + diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig index 3fd2f256f2b..af219e51734 100644 --- a/arch/i386/Kconfig +++ b/arch/i386/Kconfig @@ -1142,7 +1142,7 @@ source "arch/i386/oprofile/Kconfig" config KPROBES bool "Kprobes (EXPERIMENTAL)" - depends on EXPERIMENTAL && MODULES + depends on KALLSYMS && EXPERIMENTAL && MODULES help Kprobes allows you to trap at almost any kernel address and execute a callback function. register_kprobe() establishes diff --git a/arch/i386/kernel/kprobes.c b/arch/i386/kernel/kprobes.c index afe6505ca0b..d98e44b16fe 100644 --- a/arch/i386/kernel/kprobes.c +++ b/arch/i386/kernel/kprobes.c @@ -230,20 +230,20 @@ void __kprobes arch_prepare_kretprobe(struct kretprobe *rp, struct pt_regs *regs) { unsigned long *sara = (unsigned long *)®s->esp; - struct kretprobe_instance *ri; - if ((ri = get_free_rp_inst(rp)) != NULL) { - ri->rp = rp; - ri->task = current; + struct kretprobe_instance *ri; + + if ((ri = get_free_rp_inst(rp)) != NULL) { + ri->rp = rp; + ri->task = current; ri->ret_addr = (kprobe_opcode_t *) *sara; /* Replace the return addr with trampoline addr */ *sara = (unsigned long) &kretprobe_trampoline; - - add_rp_inst(ri); - } else { - rp->nmissed++; - } + add_rp_inst(ri); + } else { + rp->nmissed++; + } } /* @@ -359,7 +359,7 @@ no_kprobe: void __kprobes kretprobe_trampoline_holder(void) { asm volatile ( ".global kretprobe_trampoline\n" - "kretprobe_trampoline: \n" + "kretprobe_trampoline: \n" " pushf\n" /* skip cs, eip, orig_eax, es, ds */ " subl $20, %esp\n" @@ -395,14 +395,15 @@ no_kprobe: */ fastcall void *__kprobes trampoline_handler(struct pt_regs *regs) { - struct kretprobe_instance *ri = NULL; - struct hlist_head *head; - struct hlist_node *node, *tmp; + struct kretprobe_instance *ri = NULL; + struct hlist_head *head, empty_rp; + struct hlist_node *node, *tmp; unsigned long flags, orig_ret_address = 0; unsigned long trampoline_address =(unsigned long)&kretprobe_trampoline; + INIT_HLIST_HEAD(&empty_rp); spin_lock_irqsave(&kretprobe_lock, flags); - head = kretprobe_inst_table_head(current); + head = kretprobe_inst_table_head(current); /* * It is possible to have multiple instances associated with a given @@ -413,14 +414,14 @@ fastcall void *__kprobes trampoline_handler(struct pt_regs *regs) * We can handle this because: * - instances are always inserted at the head of the list * - when multiple return probes are registered for the same - * function, the first instance's ret_addr will point to the + * function, the first instance's ret_addr will point to the * real return address, and all the rest will point to * kretprobe_trampoline */ hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { - if (ri->task != current) + if (ri->task != current) /* another task is sharing our hash bucket */ - continue; + continue; if (ri->rp && ri->rp->handler){ __get_cpu_var(current_kprobe) = &ri->rp->kp; @@ -429,7 +430,7 @@ fastcall void *__kprobes trampoline_handler(struct pt_regs *regs) } orig_ret_address = (unsigned long)ri->ret_addr; - recycle_rp_inst(ri); + recycle_rp_inst(ri, &empty_rp); if (orig_ret_address != trampoline_address) /* @@ -444,6 +445,10 @@ fastcall void *__kprobes trampoline_handler(struct pt_regs *regs) spin_unlock_irqrestore(&kretprobe_lock, flags); + hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) { + hlist_del(&ri->hlist); + kfree(ri); + } return (void*)orig_ret_address; } diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c index 96cd0232e1e..dad02a960e0 100644 --- a/arch/i386/kernel/process.c +++ b/arch/i386/kernel/process.c @@ -297,9 +297,9 @@ void show_regs(struct pt_regs * regs) if (user_mode_vm(regs)) printk(" ESP: %04x:%08lx",0xffff & regs->xss,regs->esp); printk(" EFLAGS: %08lx %s (%s %.*s)\n", - regs->eflags, print_tainted(), system_utsname.release, - (int)strcspn(system_utsname.version, " "), - system_utsname.version); + regs->eflags, print_tainted(), init_utsname()->release, + (int)strcspn(init_utsname()->version, " "), + init_utsname()->version); printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", regs->eax,regs->ebx,regs->ecx,regs->edx); printk("ESI: %08lx EDI: %08lx EBP: %08lx", diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c index 0831f709f77..9d93ecf6d99 100644 --- a/arch/i386/kernel/smpboot.c +++ b/arch/i386/kernel/smpboot.c @@ -612,6 +612,7 @@ extern struct { /* which logical CPUs are on which nodes */ cpumask_t node_2_cpu_mask[MAX_NUMNODES] __read_mostly = { [0 ... MAX_NUMNODES-1] = CPU_MASK_NONE }; +EXPORT_SYMBOL(node_2_cpu_mask); /* which node each logical CPU is on */ int cpu_2_node[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = 0 }; EXPORT_SYMBOL(cpu_2_node); diff --git a/arch/i386/kernel/sys_i386.c b/arch/i386/kernel/sys_i386.c index 8fdb1fb17a5..4048397f174 100644 --- a/arch/i386/kernel/sys_i386.c +++ b/arch/i386/kernel/sys_i386.c @@ -21,6 +21,7 @@ #include <linux/utsname.h> #include <asm/uaccess.h> +#include <asm/unistd.h> #include <asm/ipc.h> /* @@ -210,7 +211,7 @@ asmlinkage int sys_uname(struct old_utsname __user * name) if (!name) return -EFAULT; down_read(&uts_sem); - err=copy_to_user(name, &system_utsname, sizeof (*name)); + err = copy_to_user(name, utsname(), sizeof (*name)); up_read(&uts_sem); return err?-EFAULT:0; } @@ -226,16 +227,21 @@ asmlinkage int sys_olduname(struct oldold_utsname __user * name) down_read(&uts_sem); - error = __copy_to_user(&name->sysname,&system_utsname.sysname,__OLD_UTS_LEN); - error |= __put_user(0,name->sysname+__OLD_UTS_LEN); - error |= __copy_to_user(&name->nodename,&system_utsname.nodename,__OLD_UTS_LEN); - error |= __put_user(0,name->nodename+__OLD_UTS_LEN); - error |= __copy_to_user(&name->release,&system_utsname.release,__OLD_UTS_LEN); - error |= __put_user(0,name->release+__OLD_UTS_LEN); - error |= __copy_to_user(&name->version,&system_utsname.version,__OLD_UTS_LEN); - error |= __put_user(0,name->version+__OLD_UTS_LEN); - error |= __copy_to_user(&name->machine,&system_utsname.machine,__OLD_UTS_LEN); - error |= __put_user(0,name->machine+__OLD_UTS_LEN); + error = __copy_to_user(&name->sysname, &utsname()->sysname, + __OLD_UTS_LEN); + error |= __put_user(0, name->sysname + __OLD_UTS_LEN); + error |= __copy_to_user(&name->nodename, &utsname()->nodename, + __OLD_UTS_LEN); + error |= __put_user(0, name->nodename + __OLD_UTS_LEN); + error |= __copy_to_user(&name->release, &utsname()->release, + __OLD_UTS_LEN); + error |= __put_user(0, name->release + __OLD_UTS_LEN); + error |= __copy_to_user(&name->version, &utsname()->version, + __OLD_UTS_LEN); + error |= __put_user(0, name->version + __OLD_UTS_LEN); + error |= __copy_to_user(&name->machine, &utsname()->machine, + __OLD_UTS_LEN); + error |= __put_user(0, name->machine + __OLD_UTS_LEN); up_read(&uts_sem); @@ -243,3 +249,17 @@ asmlinkage int sys_olduname(struct oldold_utsname __user * name) return error; } + + +/* + * Do a system call from kernel instead of calling sys_execve so we + * end up with proper pt_regs. + */ +int kernel_execve(const char *filename, char *const argv[], char *const envp[]) +{ + long __res; + asm volatile ("push %%ebx ; movl %2,%%ebx ; int $0x80 ; pop %%ebx" + : "=a" (__res) + : "0" (__NR_execve),"ri" (filename),"c" (argv), "d" (envp) : "memory"); + return __res; +} diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c index 6820b8d643c..00489b706d2 100644 --- a/arch/i386/kernel/traps.c +++ b/arch/i386/kernel/traps.c @@ -357,9 +357,9 @@ void show_registers(struct pt_regs *regs) KERN_EMERG "EIP: %04x:[<%08lx>] %s VLI\n" KERN_EMERG "EFLAGS: %08lx (%s %.*s)\n", smp_processor_id(), 0xffff & regs->xcs, regs->eip, - print_tainted(), regs->eflags, system_utsname.release, - (int)strcspn(system_utsname.version, " "), - system_utsname.version); + print_tainted(), regs->eflags, init_utsname()->release, + (int)strcspn(init_utsname()->version, " "), + init_utsname()->version); print_symbol(KERN_EMERG "EIP is at %s\n", regs->eip); printk(KERN_EMERG "eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n", regs->eax, regs->ebx, regs->ecx, regs->edx); diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 0b7f701d5cf..70f7eb9fed3 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -516,7 +516,7 @@ source "arch/ia64/oprofile/Kconfig" config KPROBES bool "Kprobes (EXPERIMENTAL)" - depends on EXPERIMENTAL && MODULES + depends on KALLSYMS && EXPERIMENTAL && MODULES help Kprobes allows you to trap at almost any kernel address and execute a callback function. register_kprobe() establishes diff --git a/arch/ia64/hp/sim/simserial.c b/arch/ia64/hp/sim/simserial.c index 0daacc20ed3..246eb3d3757 100644 --- a/arch/ia64/hp/sim/simserial.c +++ b/arch/ia64/hp/sim/simserial.c @@ -940,7 +940,7 @@ static inline void show_serial_version(void) printk(KERN_INFO " no serial options enabled\n"); } -static struct tty_operations hp_ops = { +static const struct tty_operations hp_ops = { .open = rs_open, .close = rs_close, .write = rs_write, diff --git a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S index 12701cf32d9..e5b1be51b19 100644 --- a/arch/ia64/kernel/entry.S +++ b/arch/ia64/kernel/entry.S @@ -492,11 +492,11 @@ GLOBAL_ENTRY(prefetch_stack) br.ret.sptk.many rp END(prefetch_stack) -GLOBAL_ENTRY(execve) +GLOBAL_ENTRY(kernel_execve) mov r15=__NR_execve // put syscall number in place break __BREAK_SYSCALL br.ret.sptk.many rp -END(execve) +END(kernel_execve) GLOBAL_ENTRY(clone) mov r15=__NR_clone // put syscall number in place diff --git a/arch/ia64/kernel/kprobes.c b/arch/ia64/kernel/kprobes.c index 169ec3a7156..51217d63285 100644 --- a/arch/ia64/kernel/kprobes.c +++ b/arch/ia64/kernel/kprobes.c @@ -90,7 +90,7 @@ static void __kprobes update_kprobe_inst_flag(uint template, uint slot, p->ainsn.target_br_reg = 0; /* Check for Break instruction - * Bits 37:40 Major opcode to be zero + * Bits 37:40 Major opcode to be zero * Bits 27:32 X6 to be zero * Bits 32:35 X3 to be zero */ @@ -104,19 +104,19 @@ static void __kprobes update_kprobe_inst_flag(uint template, uint slot, switch (major_opcode) { case INDIRECT_CALL_OPCODE: p->ainsn.inst_flag |= INST_FLAG_FIX_BRANCH_REG; - p->ainsn.target_br_reg = ((kprobe_inst >> 6) & 0x7); - break; + p->ainsn.target_br_reg = ((kprobe_inst >> 6) & 0x7); + break; case IP_RELATIVE_PREDICT_OPCODE: case IP_RELATIVE_BRANCH_OPCODE: p->ainsn.inst_flag |= INST_FLAG_FIX_RELATIVE_IP_ADDR; - break; + break; case IP_RELATIVE_CALL_OPCODE: - p->ainsn.inst_flag |= INST_FLAG_FIX_RELATIVE_IP_ADDR; - p->ainsn.inst_flag |= INST_FLAG_FIX_BRANCH_REG; - p->ainsn.target_br_reg = ((kprobe_inst >> 6) & 0x7); - break; + p->ainsn.inst_flag |= INST_FLAG_FIX_RELATIVE_IP_ADDR; + p->ainsn.inst_flag |= INST_FLAG_FIX_BRANCH_REG; + p->ainsn.target_br_reg = ((kprobe_inst >> 6) & 0x7); + break; } - } else if (bundle_encoding[template][slot] == X) { + } else if (bundle_encoding[template][slot] == X) { switch (major_opcode) { case LONG_CALL_OPCODE: p->ainsn.inst_flag |= INST_FLAG_FIX_BRANCH_REG; @@ -258,18 +258,18 @@ static void __kprobes get_kprobe_inst(bundle_t *bundle, uint slot, switch (slot) { case 0: - *major_opcode = (bundle->quad0.slot0 >> SLOT0_OPCODE_SHIFT); - *kprobe_inst = bundle->quad0.slot0; - break; + *major_opcode = (bundle->quad0.slot0 >> SLOT0_OPCODE_SHIFT); + *kprobe_inst = bundle->quad0.slot0; + break; case 1: - *major_opcode = (bundle->quad1.slot1_p1 >> SLOT1_p1_OPCODE_SHIFT); - kprobe_inst_p0 = bundle->quad0.slot1_p0; - kprobe_inst_p1 = bundle->quad1.slot1_p1; - *kprobe_inst = kprobe_inst_p0 | (kprobe_inst_p1 << (64-46)); + *major_opcode = (bundle->quad1.slot1_p1 >> SLOT1_p1_OPCODE_SHIFT); + kprobe_inst_p0 = bundle->quad0.slot1_p0; + kprobe_inst_p1 = bundle->quad1.slot1_p1; + *kprobe_inst = kprobe_inst_p0 | (kprobe_inst_p1 << (64-46)); break; case 2: - *major_opcode = (bundle->quad1.slot2 >> SLOT2_OPCODE_SHIFT); - *kprobe_inst = bundle->quad1.slot2; + *major_opcode = (bundle->quad1.slot2 >> SLOT2_OPCODE_SHIFT); + *kprobe_inst = bundle->quad1.slot2; break; } } @@ -290,11 +290,11 @@ static int __kprobes valid_kprobe_addr(int template, int slot, return -EINVAL; } - if (in_ivt_functions(addr)) { - printk(KERN_WARNING "Kprobes can't be inserted inside " + if (in_ivt_functions(addr)) { + printk(KERN_WARNING "Kprobes can't be inserted inside " "IVT functions at 0x%lx\n", addr); - return -EINVAL; - } + return -EINVAL; + } if (slot == 1 && bundle_encoding[template][1] != L) { printk(KERN_WARNING "Inserting kprobes on slot #1 " @@ -338,12 +338,13 @@ static void kretprobe_trampoline(void) int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) { struct kretprobe_instance *ri = NULL; - struct hlist_head *head; + struct hlist_head *head, empty_rp; struct hlist_node *node, *tmp; unsigned long flags, orig_ret_address = 0; unsigned long trampoline_address = ((struct fnptr *)kretprobe_trampoline)->ip; + INIT_HLIST_HEAD(&empty_rp); spin_lock_irqsave(&kretprobe_lock, flags); head = kretprobe_inst_table_head(current); @@ -369,7 +370,7 @@ int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) ri->rp->handler(ri, regs); orig_ret_address = (unsigned long)ri->ret_addr; - recycle_rp_inst(ri); + recycle_rp_inst(ri, &empty_rp); if (orig_ret_address != trampoline_address) /* @@ -387,6 +388,10 @@ int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) spin_unlock_irqrestore(&kretprobe_lock, flags); preempt_enable_no_resched(); + hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) { + hlist_del(&ri->hlist); + kfree(ri); + } /* * By returning a non-zero value, we are telling * kprobe_handler() that we don't want the post_handler @@ -424,14 +429,14 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p) bundle_t *bundle; bundle = &((kprobe_opcode_t *)kprobe_addr)->bundle; - template = bundle->quad0.template; + template = bundle->quad0.template; if(valid_kprobe_addr(template, slot, addr)) return -EINVAL; /* Move to slot 2, if bundle is MLX type and kprobe slot is 1 */ - if (slot == 1 && bundle_encoding[template][1] == L) - slot++; + if (slot == 1 && bundle_encoding[template][1] == L) + slot++; /* Get kprobe_inst and major_opcode from the bundle */ get_kprobe_inst(bundle, slot, &kprobe_inst, &major_opcode); @@ -489,21 +494,22 @@ void __kprobes arch_remove_kprobe(struct kprobe *p) */ static void __kprobes resume_execution(struct kprobe *p, struct pt_regs *regs) { - unsigned long bundle_addr = (unsigned long) (&p->ainsn.insn->bundle); - unsigned long resume_addr = (unsigned long)p->addr & ~0xFULL; - unsigned long template; - int slot = ((unsigned long)p->addr & 0xf); + unsigned long bundle_addr = (unsigned long) (&p->ainsn.insn->bundle); + unsigned long resume_addr = (unsigned long)p->addr & ~0xFULL; + unsigned long template; + int slot = ((unsigned long)p->addr & 0xf); template = p->ainsn.insn->bundle.quad0.template; - if (slot == 1 && bundle_encoding[template][1] == L) - slot = 2; + if (slot == 1 && bundle_encoding[template][1] == L) + slot = 2; if (p->ainsn.inst_flag) { if (p->ainsn.inst_flag & INST_FLAG_FIX_RELATIVE_IP_ADDR) { /* Fix relative IP address */ - regs->cr_iip = (regs->cr_iip - bundle_addr) + resume_addr; + regs->cr_iip = (regs->cr_iip - bundle_addr) + + resume_addr; } if (p->ainsn.inst_flag & INST_FLAG_FIX_BRANCH_REG) { @@ -540,18 +546,18 @@ static void __kprobes resume_execution(struct kprobe *p, struct pt_regs *regs) } if (slot == 2) { - if (regs->cr_iip == bundle_addr + 0x10) { - regs->cr_iip = resume_addr + 0x10; - } - } else { - if (regs->cr_iip == bundle_addr) { - regs->cr_iip = resume_addr; - } + if (regs->cr_iip == bundle_addr + 0x10) { + regs->cr_iip = resume_addr + 0x10; + } + } else { + if (regs->cr_iip == bundle_addr) { + regs->cr_iip = resume_addr; + } } turn_ss_off: - /* Turn off Single Step bit */ - ia64_psr(regs)->ss = 0; + /* Turn off Single Step bit */ + ia64_psr(regs)->ss = 0; } static void __kprobes prepare_ss(struct kprobe *p, struct pt_regs *regs) @@ -587,7 +593,7 @@ static int __kprobes is_ia64_break_inst(struct pt_regs *regs) /* Move to slot 2, if bundle is MLX type and kprobe slot is 1 */ if (slot == 1 && bundle_encoding[template][1] == L) - slot++; + slot++; /* Get Kprobe probe instruction at given slot*/ get_kprobe_inst(&bundle, slot, &kprobe_inst, &major_opcode); @@ -627,7 +633,7 @@ static int __kprobes pre_kprobes_handler(struct die_args *args) if (p) { if ((kcb->kprobe_status == KPROBE_HIT_SS) && (p->ainsn.inst_flag == INST_FLAG_BREAK_INST)) { - ia64_psr(regs)->ss = 0; + ia64_psr(regs)->ss = 0; goto no_kprobe; } /* We have reentered the pre_kprobe_handler(), since @@ -887,7 +893,7 @@ int __kprobes setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs) * fix the return address to our jprobe_inst_return() function * in the jprobes.S file */ - regs->b0 = ((struct fnptr *)(jprobe_inst_return))->ip; + regs->b0 = ((struct fnptr *)(jprobe_inst_return))->ip; return 1; } diff --git a/arch/ia64/kernel/numa.c b/arch/ia64/kernel/numa.c index 20340631179..a78b45f5fe2 100644 --- a/arch/ia64/kernel/numa.c +++ b/arch/ia64/kernel/numa.c @@ -28,6 +28,7 @@ u16 cpu_to_node_map[NR_CPUS] __cacheline_aligned; EXPORT_SYMBOL(cpu_to_node_map); cpumask_t node_to_cpu_mask[MAX_NUMNODES] __cacheline_aligned; +EXPORT_SYMBOL(node_to_cpu_mask); void __cpuinit map_cpu_to_node(int cpu, int nid) { diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c index ea914cc6812..51922b98086 100644 --- a/arch/ia64/kernel/process.c +++ b/arch/ia64/kernel/process.c @@ -8,8 +8,6 @@ * 2005-10-07 Keith Owens <kaos@sgi.com> * Add notify_die() hooks. */ -#define __KERNEL_SYSCALLS__ /* see <asm/unistd.h> */ - #include <linux/cpu.h> #include <linux/pm.h> #include <linux/elf.h> diff --git a/arch/ia64/sn/kernel/sn2/sn_hwperf.c b/arch/ia64/sn/kernel/sn2/sn_hwperf.c index b632b9c1e3b..462ea178f49 100644 --- a/arch/ia64/sn/kernel/sn2/sn_hwperf.c +++ b/arch/ia64/sn/kernel/sn2/sn_hwperf.c @@ -423,7 +423,7 @@ static int sn_topology_show(struct seq_file *s, void *d) "coherency_domain %d, " "region_size %d\n", - partid, system_utsname.nodename, + partid, utsname()->nodename, shubtype ? "shub2" : "shub1", (u64)nasid_mask << nasid_shift, nasid_msb, nasid_shift, system_size, sharing_size, coher, region_size); diff --git a/arch/m32r/kernel/sys_m32r.c b/arch/m32r/kernel/sys_m32r.c index a9cea32eb82..b567351f3c5 100644 --- a/arch/m32r/kernel/sys_m32r.c +++ b/arch/m32r/kernel/sys_m32r.c @@ -25,6 +25,8 @@ #include <asm/cachectl.h> #include <asm/cacheflush.h> #include <asm/ipc.h> +#include <asm/syscall.h> +#include <asm/unistd.h> /* * sys_tas() - test-and-set @@ -205,7 +207,7 @@ asmlinkage int sys_uname(struct old_utsname * name) if (!name) return -EFAULT; down_read(&uts_sem); - err=copy_to_user(name, &system_utsname, sizeof (*name)); + err = copy_to_user(name, utsname(), sizeof (*name)); up_read(&uts_sem); return err?-EFAULT:0; } @@ -223,3 +225,21 @@ asmlinkage int sys_cachectl(char *addr, int nbytes, int op) return -ENOSYS; } +/* + * Do a system call from kernel instead of calling sys_execve so we + * end up with proper pt_regs. + */ +int kernel_execve(const char *filename, char *const argv[], char *const envp[]) +{ + register long __scno __asm__ ("r7") = __NR_execve; + register long __arg3 __asm__ ("r2") = (long)(envp); + register long __arg2 __asm__ ("r1") = (long)(argv); + register long __res __asm__ ("r0") = (long)(filename); + __asm__ __volatile__ ( + "trap #" SYSCALL_VECTOR "|| nop" + : "=r" (__res) + : "r" (__scno), "0" (__res), "r" (__arg2), + "r" (__arg3) + : "memory"); + return __res; +} diff --git a/arch/m68k/kernel/sys_m68k.c b/arch/m68k/kernel/sys_m68k.c index 143c552d38f..90238a8c9e1 100644 --- a/arch/m68k/kernel/sys_m68k.c +++ b/arch/m68k/kernel/sys_m68k.c @@ -27,6 +27,7 @@ #include <asm/traps.h> #include <asm/ipc.h> #include <asm/page.h> +#include <asm/unistd.h> /* * sys_pipe() is the normal C calling standard for creating @@ -663,3 +664,18 @@ asmlinkage int sys_getpagesize(void) { return PAGE_SIZE; } + +/* + * Do a system call from kernel instead of calling sys_execve so we + * end up with proper pt_regs. + */ +int kernel_execve(const char *filename, char *const argv[], char *const envp[]) +{ + register long __res asm ("%d0") = __NR_execve; + register long __a asm ("%d1") = (long)(filename); + register long __b asm ("%d2") = (long)(argv); + register long __c asm ("%d3") = (long)(envp); + asm volatile ("trap #0" : "+d" (__res) + : "d" (__a), "d" (__b), "d" (__c)); + return __res; +} diff --git a/arch/m68knommu/kernel/sys_m68k.c b/arch/m68knommu/kernel/sys_m68k.c index d87e1e0a133..c3494b8447d 100644 --- a/arch/m68knommu/kernel/sys_m68k.c +++ b/arch/m68knommu/kernel/sys_m68k.c @@ -26,6 +26,7 @@ #include <asm/traps.h> #include <asm/ipc.h> #include <asm/cacheflush.h> +#include <asm/unistd.h> /* * sys_pipe() is the normal C calling standard for creating @@ -206,3 +207,17 @@ asmlinkage int sys_getpagesize(void) return PAGE_SIZE; } +/* + * Do a system call from kernel instead of calling sys_execve so we + * end up with proper pt_regs. + */ +int kernel_execve(const char *filename, char *const argv[], char *const envp[]) +{ + register long __res asm ("%d0") = __NR_execve; + register long __a asm ("%d1") = (long)(filename); + register long __b asm ("%d2") = (long)(argv); + register long __c asm ("%d3") = (long)(envp); + asm volatile ("trap #0" : "+d" (__res) + : "d" (__a), "d" (__b), "d" (__c)); + return __res; +} diff --git a/arch/mips/kernel/linux32.c b/arch/mips/kernel/linux32.c index 43b1162d714..52cada45b35 100644 --- a/arch/mips/kernel/linux32.c +++ b/arch/mips/kernel/linux32.c @@ -1039,7 +1039,7 @@ asmlinkage long sys32_newuname(struct new_utsname __user * name) int ret = 0; down_read(&uts_sem); - if (copy_to_user(name,&system_utsname,sizeof *name)) + if (copy_to_user(name, utsname(), sizeof *name)) ret = -EFAULT; up_read(&uts_sem); diff --git a/arch/mips/kernel/signal_n32.c b/arch/mips/kernel/signal_n32.c index 50c17eaa7f2..477c5334ec1 100644 --- a/arch/mips/kernel/signal_n32.c +++ b/arch/mips/kernel/signal_n32.c @@ -42,8 +42,6 @@ #include "signal-common.h" -extern void sigset_from_compat(sigset_t *set, compat_sigset_t *compat); - /* * Including <asm/unistd.h> would give use the 64-bit syscall numbers ... */ @@ -83,6 +81,8 @@ struct rt_sigframe_n32 { #endif }; +extern void sigset_from_compat (sigset_t *set, compat_sigset_t *compat); + save_static_function(sysn32_rt_sigsuspend); __attribute_used__ noinline static int _sysn32_rt_sigsuspend(nabi_no_regargs struct pt_regs regs) diff --git a/arch/mips/kernel/syscall.c b/arch/mips/kernel/syscall.c index 9951240cc3f..26e1a7e78d1 100644 --- a/arch/mips/kernel/syscall.c +++ b/arch/mips/kernel/syscall.c @@ -231,7 +231,7 @@ out: */ asmlinkage int sys_uname(struct old_utsname __user * name) { - if (name && !copy_to_user(name, &system_utsname, sizeof (*name))) + if (name && !copy_to_user(name, utsname(), sizeof (*name))) return 0; return -EFAULT; } @@ -248,16 +248,21 @@ asmlinkage int sys_olduname(struct oldold_utsname __user * name) if (!access_ok(VERIFY_WRITE,name,sizeof(struct oldold_utsname))) return -EFAULT; - error = __copy_to_user(&name->sysname,&system_utsname.sysname,__OLD_UTS_LEN); - error -= __put_user(0,name->sysname+__OLD_UTS_LEN); - error -= __copy_to_user(&name->nodename,&system_utsname.nodename,__OLD_UTS_LEN); - error -= __put_user(0,name->nodename+__OLD_UTS_LEN); - error -= __copy_to_user(&name->release,&system_utsname.release,__OLD_UTS_LEN); - error -= __put_user(0,name->release+__OLD_UTS_LEN); - error -= __copy_to_user(&name->version,&system_utsname.version,__OLD_UTS_LEN); - error -= __put_user(0,name->version+__OLD_UTS_LEN); - error -= __copy_to_user(&name->machine,&system_utsname.machine,__OLD_UTS_LEN); - error = __put_user(0,name->machine+__OLD_UTS_LEN); + error = __copy_to_user(&name->sysname, &utsname()->sysname, + __OLD_UTS_LEN); + error -= __put_user(0, name->sysname + __OLD_UTS_LEN); + error -= __copy_to_user(&name->nodename, &utsname()->nodename, + __OLD_UTS_LEN); + error -= __put_user(0, name->nodename + __OLD_UTS_LEN); + error -= __copy_to_user(&name->release, &utsname()->release, + __OLD_UTS_LEN); + error -= __put_user(0, name->release + __OLD_UTS_LEN); + error -= __copy_to_user(&name->version, &utsname()->version, + __OLD_UTS_LEN); + error -= __put_user(0, name->version + __OLD_UTS_LEN); + error -= __copy_to_user(&name->machine, &utsname()->machine, + __OLD_UTS_LEN); + error = __put_user(0, name->machine + __OLD_UTS_LEN); error = error ? -EFAULT : 0; return error; @@ -401,3 +406,32 @@ asmlinkage void bad_stack(void) { do_exit(SIGSEGV); } + +/* + * Do a system call from kernel instead of calling sys_execve so we + * end up with proper pt_regs. + */ +int kernel_execve(const char *filename, char *const argv[], char *const envp[]) +{ + register unsigned long __a0 asm("$4") = (unsigned long) filename; + register unsigned long __a1 asm("$5") = (unsigned long) argv; + register unsigned long __a2 asm("$6") = (unsigned long) envp; + register unsigned long __a3 asm("$7"); + unsigned long __v0; + + __asm__ volatile (" \n" + " .set noreorder \n" + " li $2, %5 # __NR_execve \n" + " syscall \n" + " move %0, $2 \n" + " .set reorder \n" + : "=&r" (__v0), "=r" (__a3) + : "r" (__a0), "r" (__a1), "r" (__a2), "i" (__NR_execve) + : "$2", "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$15", "$24", + "memory"); + + if (__a3 == 0) + return __v0; + + return -__v0; +} diff --git a/arch/mips/kernel/sysirix.c b/arch/mips/kernel/sysirix.c index 1137dd6ea7a..11bb9717497 100644 --- a/arch/mips/kernel/sysirix.c +++ b/arch/mips/kernel/sysirix.c @@ -884,7 +884,7 @@ asmlinkage int irix_getdomainname(char __user *name, int len) down_read(&uts_sem); if (len > __NEW_UTS_LEN) len = __NEW_UTS_LEN; - err = copy_to_user(name, system_utsname.domainname, len) ? -EFAULT : 0; + err = copy_to_user(name, utsname()->domainname, len) ? -EFAULT : 0; up_read(&uts_sem); return err; @@ -1127,11 +1127,11 @@ struct iuname { asmlinkage int irix_uname(struct iuname __user *buf) { down_read(&uts_sem); - if (copy_from_user(system_utsname.sysname, buf->sysname, 65) - || copy_from_user(system_utsname.nodename, buf->nodename, 65) - || copy_from_user(system_utsname.release, buf->release, 65) - || copy_from_user(system_utsname.version, buf->version, 65) - || copy_from_user(system_utsname.machine, buf->machine, 65)) { + if (copy_from_user(utsname()->sysname, buf->sysname, 65) + || copy_from_user(utsname()->nodename, buf->nodename, 65) + || copy_from_user(utsname()->release, buf->release, 65) + || copy_from_user(utsname()->version, buf->version, 65) + || copy_from_user(utsname()->machine, buf->machine, 65)) { return -EFAULT; } up_read(&uts_sem); diff --git a/arch/mips/sgi-ip22/ip22-reset.c b/arch/mips/sgi-ip22/ip22-reset.c index 8134220ed60..7a941ecff3b 100644 --- a/arch/mips/sgi-ip22/ip22-reset.c +++ b/arch/mips/sgi-ip22/ip22-reset.c @@ -123,7 +123,8 @@ static inline void power_button(void) if (machine_state & MACHINE_PANICED) return; - if ((machine_state & MACHINE_SHUTTING_DOWN) || kill_proc(1,SIGINT,1)) { + if ((machine_state & MACHINE_SHUTTING_DOWN) || + kill_cad_pid(SIGINT, 1)) { /* No init process or button pressed twice. */ sgi_machine_power_off(); } diff --git a/arch/mips/sgi-ip32/ip32-reset.c b/arch/mips/sgi-ip32/ip32-reset.c index 79ddb460565..fd0932b2d52 100644 --- a/arch/mips/sgi-ip32/ip32-reset.c +++ b/arch/mips/sgi-ip32/ip32-reset.c @@ -120,7 +120,7 @@ static inline void ip32_power_button(void) if (has_panicked) return; - if (shuting_down || kill_proc(1, SIGINT, 1)) { + if (shuting_down || kill_cad_pid(SIGINT, 1)) { /* No init process or button pressed twice. */ ip32_machine_power_off(); } diff --git a/arch/parisc/hpux/sys_hpux.c b/arch/parisc/hpux/sys_hpux.c index cb69727027a..2e2dc4f2c85 100644 --- a/arch/parisc/hpux/sys_hpux.c +++ b/arch/parisc/hpux/sys_hpux.c @@ -266,16 +266,21 @@ static int hpux_uname(struct hpux_utsname *name) down_read(&uts_sem); - error = __copy_to_user(&name->sysname,&system_utsname.sysname,HPUX_UTSLEN-1); - error |= __put_user(0,name->sysname+HPUX_UTSLEN-1); - error |= __copy_to_user(&name->nodename,&system_utsname.nodename,HPUX_UTSLEN-1); - error |= __put_user(0,name->nodename+HPUX_UTSLEN-1); - error |= __copy_to_user(&name->release,&system_utsname.release,HPUX_UTSLEN-1); - error |= __put_user(0,name->release+HPUX_UTSLEN-1); - error |= __copy_to_user(&name->version,&system_utsname.version,HPUX_UTSLEN-1); - error |= __put_user(0,name->version+HPUX_UTSLEN-1); - error |= __copy_to_user(&name->machine,&system_utsname.machine,HPUX_UTSLEN-1); - error |= __put_user(0,name->machine+HPUX_UTSLEN-1); + error = __copy_to_user(&name->sysname, &utsname()->sysname, + HPUX_UTSLEN - 1); + error |= __put_user(0, name->sysname + HPUX_UTSLEN - 1); + error |= __copy_to_user(&name->nodename, &utsname()->nodename, + HPUX_UTSLEN - 1); + error |= __put_user(0, name->nodename + HPUX_UTSLEN - 1); + error |= __copy_to_user(&name->release, &utsname()->release, + HPUX_UTSLEN - 1); + error |= __put_user(0, name->release + HPUX_UTSLEN - 1); + error |= __copy_to_user(&name->version, &utsname()->version, + HPUX_UTSLEN - 1); + error |= __put_user(0, name->version + HPUX_UTSLEN - 1); + error |= __copy_to_user(&name->machine, &utsname()->machine, + HPUX_UTSLEN - 1); + error |= __put_user(0, name->machine + HPUX_UTSLEN - 1); up_read(&uts_sem); @@ -373,8 +378,8 @@ int hpux_utssys(char *ubuf, int n, int type) /* TODO: print a warning about using this? */ down_write(&uts_sem); error = -EFAULT; - if (!copy_from_user(system_utsname.sysname, ubuf, len)) { - system_utsname.sysname[len] = 0; + if (!copy_from_user(utsname()->sysname, ubuf, len)) { + utsname()->sysname[len] = 0; error = 0; } up_write(&uts_sem); @@ -400,8 +405,8 @@ int hpux_utssys(char *ubuf, int n, int type) /* TODO: print a warning about this? */ down_write(&uts_sem); error = -EFAULT; - if (!copy_from_user(system_utsname.release, ubuf, len)) { - system_utsname.release[len] = 0; + if (!copy_from_user(utsname()->release, ubuf, len)) { + utsname()->release[len] = 0; error = 0; } up_write(&uts_sem); @@ -422,13 +427,13 @@ int hpux_getdomainname(char *name, int len) down_read(&uts_sem); - nlen = strlen(system_utsname.domainname) + 1; + nlen = strlen(utsname()->domainname) + 1; if (nlen < len) len = nlen; if(len > __NEW_UTS_LEN) goto done; - if(copy_to_user(name, system_utsname.domainname, len)) + if(copy_to_user(name, utsname()->domainname, len)) goto done; err = 0; done: diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c index 0b485ef4be8..2f9f9dfa66f 100644 --- a/arch/parisc/kernel/process.c +++ b/arch/parisc/kernel/process.c @@ -368,7 +368,14 @@ out: return error; } -unsigned long +extern int __execve(const char *filename, char *const argv[], + char *const envp[], struct task_struct *task); +int kernel_execve(const char *filename, char *const argv[], char *const envp[]) +{ + return __execve(filename, argv, envp, current); +} + +unsigned long get_wchan(struct task_struct *p) { struct unwind_frame_info info; diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index a0dd1b0ee48..032e6ab5d3c 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -1069,7 +1069,7 @@ source "arch/powerpc/oprofile/Kconfig" config KPROBES bool "Kprobes (EXPERIMENTAL)" - depends on PPC64 && EXPERIMENTAL && MODULES + depends on PPC64 && KALLSYMS && EXPERIMENTAL && MODULES help Kprobes allows you to trap at almost any kernel address and execute a callback function. register_kprobe() establishes diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c index cd65c367b8b..7b8d12b9026 100644 --- a/arch/powerpc/kernel/kprobes.c +++ b/arch/powerpc/kernel/kprobes.c @@ -259,14 +259,15 @@ void kretprobe_trampoline_holder(void) */ int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) { - struct kretprobe_instance *ri = NULL; - struct hlist_head *head; - struct hlist_node *node, *tmp; + struct kretprobe_instance *ri = NULL; + struct hlist_head *head, empty_rp; + struct hlist_node *node, *tmp; unsigned long flags, orig_ret_address = 0; unsigned long trampoline_address =(unsigned long)&kretprobe_trampoline; + INIT_HLIST_HEAD(&empty_rp); spin_lock_irqsave(&kretprobe_lock, flags); - head = kretprobe_inst_table_head(current); + head = kretprobe_inst_table_head(current); /* * It is possible to have multiple instances associated with a given @@ -277,20 +278,20 @@ int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) * We can handle this because: * - instances are always inserted at the head of the list * - when multiple return probes are registered for the same - * function, the first instance's ret_addr will point to the + * function, the first instance's ret_addr will point to the * real return address, and all the rest will point to * kretprobe_trampoline */ hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { - if (ri->task != current) + if (ri->task != current) /* another task is sharing our hash bucket */ - continue; + continue; if (ri->rp && ri->rp->handler) ri->rp->handler(ri, regs); orig_ret_address = (unsigned long)ri->ret_addr; - recycle_rp_inst(ri); + recycle_rp_inst(ri, &empty_rp); if (orig_ret_address != trampoline_address) /* @@ -308,12 +309,16 @@ int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) spin_unlock_irqrestore(&kretprobe_lock, flags); preempt_enable_no_resched(); - /* - * By returning a non-zero value, we are telling - * kprobe_handler() that we don't want the post_handler - * to run (and have re-enabled preemption) - */ - return 1; + hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) { + hlist_del(&ri->hlist); + kfree(ri); + } + /* + * By returning a non-zero value, we are telling + * kprobe_handler() that we don't want the post_handler + * to run (and have re-enabled preemption) + */ + return 1; } /* diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S index 58758d88336..88fd73fdf04 100644 --- a/arch/powerpc/kernel/misc_32.S +++ b/arch/powerpc/kernel/misc_32.S @@ -843,7 +843,7 @@ _GLOBAL(kernel_thread) addi r1,r1,16 blr -_GLOBAL(execve) +_GLOBAL(kernel_execve) li r0,__NR_execve sc bnslr diff --git a/arch/powerpc/kernel/misc_64.S b/arch/powerpc/kernel/misc_64.S index e3ed21cd3d9..9c54eccad99 100644 --- a/arch/powerpc/kernel/misc_64.S +++ b/arch/powerpc/kernel/misc_64.S @@ -556,7 +556,7 @@ _GLOBAL(giveup_altivec) #endif /* CONFIG_ALTIVEC */ -_GLOBAL(execve) +_GLOBAL(kernel_execve) li r0,__NR_execve sc bnslr diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index a127a1e3c09..7b2f6452ba7 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -424,7 +424,7 @@ void show_regs(struct pt_regs * regs) printk("NIP: "REG" LR: "REG" CTR: "REG"\n", regs->nip, regs->link, regs->ctr); printk("REGS: %p TRAP: %04lx %s (%s)\n", - regs, regs->trap, print_tainted(), system_utsname.release); + regs, regs->trap, print_tainted(), init_utsname()->release); printk("MSR: "REG" ", regs->msr); printbits(regs->msr, msr_bits); printk(" CR: %08lX XER: %08lX\n", regs->ccr, regs->xer); diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 962ad5ebc76..cda2dbe70a7 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -414,7 +414,7 @@ void __init setup_system(void) smp_release_cpus(); #endif - printk("Starting Linux PPC64 %s\n", system_utsname.version); + printk("Starting Linux PPC64 %s\n", init_utsname()->version); printk("-----------------------------------------------------\n"); printk("ppc64_pft_size = 0x%lx\n", ppc64_pft_size); diff --git a/arch/powerpc/kernel/syscalls.c b/arch/powerpc/kernel/syscalls.c index 9b69d99a910..d358866b880 100644 --- a/arch/powerpc/kernel/syscalls.c +++ b/arch/powerpc/kernel/syscalls.c @@ -260,7 +260,7 @@ long ppc_newuname(struct new_utsname __user * name) int err = 0; down_read(&uts_sem); - if (copy_to_user(name, &system_utsname, sizeof(*name))) + if (copy_to_user(name, utsname(), sizeof(*name))) err = -EFAULT; up_read(&uts_sem); if (!err) @@ -273,7 +273,7 @@ int sys_uname(struct old_utsname __user *name) int err = 0; down_read(&uts_sem); - if (copy_to_user(name, &system_utsname, sizeof(*name))) + if (copy_to_user(name, utsname(), sizeof(*name))) err = -EFAULT; up_read(&uts_sem); if (!err) @@ -289,19 +289,19 @@ int sys_olduname(struct oldold_utsname __user *name) return -EFAULT; down_read(&uts_sem); - error = __copy_to_user(&name->sysname, &system_utsname.sysname, + error = __copy_to_user(&name->sysname, &utsname()->sysname, __OLD_UTS_LEN); error |= __put_user(0, name->sysname + __OLD_UTS_LEN); - error |= __copy_to_user(&name->nodename, &system_utsname.nodename, + error |= __copy_to_user(&name->nodename, &utsname()->nodename, __OLD_UTS_LEN); error |= __put_user(0, name->nodename + __OLD_UTS_LEN); - error |= __copy_to_user(&name->release, &system_utsname.release, + error |= __copy_to_user(&name->release, &utsname()->release, __OLD_UTS_LEN); error |= __put_user(0, name->release + __OLD_UTS_LEN); - error |= __copy_to_user(&name->version, &system_utsname.version, + error |= __copy_to_user(&name->version, &utsname()->version, __OLD_UTS_LEN); error |= __put_user(0, name->version + __OLD_UTS_LEN); - error |= __copy_to_user(&name->machine, &system_utsname.machine, + error |= __copy_to_user(&name->machine, &utsname()->machine, __OLD_UTS_LEN); error |= override_machine(name->machine); up_read(&uts_sem); diff --git a/arch/powerpc/platforms/iseries/mf.c b/arch/powerpc/platforms/iseries/mf.c index 1a2c2a50f92..1983b640bac 100644 --- a/arch/powerpc/platforms/iseries/mf.c +++ b/arch/powerpc/platforms/iseries/mf.c @@ -357,7 +357,7 @@ static int dma_and_signal_ce_msg(char *ce_msg, */ static int shutdown(void) { - int rc = kill_proc(1, SIGINT, 1); + int rc = kill_cad_pid(SIGINT, 1); if (rc) { printk(KERN_ALERT "mf.c: SIGINT to init failed (%d), " diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c index 8ed36214045..98189d8efac 100644 --- a/arch/powerpc/platforms/pseries/setup.c +++ b/arch/powerpc/platforms/pseries/setup.c @@ -342,7 +342,7 @@ static int __init pSeries_init_panel(void) { /* Manually leave the kernel version on the panel. */ ppc_md.progress("Linux ppc64\n", 0); - ppc_md.progress(system_utsname.release, 0); + ppc_md.progress(init_utsname()->version, 0); return 0; } diff --git a/arch/ppc/4xx_io/serial_sicc.c b/arch/ppc/4xx_io/serial_sicc.c index b81a367dc27..87fe9a89dba 100644 --- a/arch/ppc/4xx_io/serial_sicc.c +++ b/arch/ppc/4xx_io/serial_sicc.c @@ -1720,7 +1720,7 @@ static int siccuart_open(struct tty_struct *tty, struct file *filp) return 0; } -static struct tty_operations sicc_ops = { +static const struct tty_operations sicc_ops = { .open = siccuart_open, .close = siccuart_close, .write = siccuart_write, diff --git a/arch/ppc/kernel/misc.S b/arch/ppc/kernel/misc.S index 50b4bbd0680..5f6684012de 100644 --- a/arch/ppc/kernel/misc.S +++ b/arch/ppc/kernel/misc.S @@ -942,20 +942,16 @@ _GLOBAL(kernel_thread) addi r1,r1,16 blr +_GLOBAL(kernel_execve) + li r0,__NR_execve + sc + bnslr + neg r3,r3 + blr + /* * This routine is just here to keep GCC happy - sigh... */ _GLOBAL(__main) blr -#define SYSCALL(name) \ -_GLOBAL(name) \ - li r0,__NR_##name; \ - sc; \ - bnslr; \ - lis r4,errno@ha; \ - stw r3,errno@l(r4); \ - li r3,-1; \ - blr - -SYSCALL(execve) diff --git a/arch/s390/kernel/kprobes.c b/arch/s390/kernel/kprobes.c index ca28fb0b379..4d9ff5ce4cb 100644 --- a/arch/s390/kernel/kprobes.c +++ b/arch/s390/kernel/kprobes.c @@ -369,11 +369,12 @@ void __kprobes kretprobe_trampoline_holder(void) int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) { struct kretprobe_instance *ri = NULL; - struct hlist_head *head; + struct hlist_head *head, empty_rp; struct hlist_node *node, *tmp; unsigned long flags, orig_ret_address = 0; unsigned long trampoline_address = (unsigned long)&kretprobe_trampoline; + INIT_HLIST_HEAD(&empty_rp); spin_lock_irqsave(&kretprobe_lock, flags); head = kretprobe_inst_table_head(current); @@ -399,7 +400,7 @@ int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) ri->rp->handler(ri, regs); orig_ret_address = (unsigned long)ri->ret_addr; - recycle_rp_inst(ri); + recycle_rp_inst(ri, &empty_rp); if (orig_ret_address != trampoline_address) { /* @@ -417,6 +418,10 @@ int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) spin_unlock_irqrestore(&kretprobe_lock, flags); preempt_enable_no_resched(); + hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) { + hlist_del(&ri->hlist); + kfree(ri); + } /* * By returning a non-zero value, we are telling * kprobe_handler() that we don't want the post_handler diff --git a/arch/s390/kernel/sys_s390.c b/arch/s390/kernel/sys_s390.c index e351780bb66..584ed95f338 100644 --- a/arch/s390/kernel/sys_s390.c +++ b/arch/s390/kernel/sys_s390.c @@ -27,6 +27,7 @@ #include <linux/file.h> #include <linux/utsname.h> #include <linux/personality.h> +#include <linux/unistd.h> #include <asm/uaccess.h> #include <asm/ipc.h> @@ -266,3 +267,22 @@ s390_fadvise64_64(struct fadvise64_64_args __user *args) return sys_fadvise64_64(a.fd, a.offset, a.len, a.advice); } +/* + * Do a system call from kernel instead of calling sys_execve so we + * end up with proper pt_regs. + */ +int kernel_execve(const char *filename, char *const argv[], char *const envp[]) +{ + register const char *__arg1 asm("2") = filename; + register char *const*__arg2 asm("3") = argv; + register char *const*__arg3 asm("4") = envp; + register long __svcres asm("2"); + asm volatile( + "svc %b1" + : "=d" (__svcres) + : "i" (__NR_execve), + "0" (__arg1), + "d" (__arg2), + "d" (__arg3) : "memory"); + return __svcres; +} diff --git a/arch/sh/kernel/setup.c b/arch/sh/kernel/setup.c index 5f587332234..77491cf9b25 100644 --- a/arch/sh/kernel/setup.c +++ b/arch/sh/kernel/setup.c @@ -459,7 +459,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) seq_printf(m, "machine\t\t: %s\n", get_system_type()); seq_printf(m, "processor\t: %d\n", cpu); - seq_printf(m, "cpu family\t: %s\n", system_utsname.machine); + seq_printf(m, "cpu family\t: %s\n", init_utsname()->machine); seq_printf(m, "cpu type\t: %s\n", get_cpu_subtype()); show_cpuflags(m); diff --git a/arch/sh/kernel/smp.c b/arch/sh/kernel/smp.c index 6c0fb7c4af1..dbebaddcfe3 100644 --- a/arch/sh/kernel/smp.c +++ b/arch/sh/kernel/smp.c @@ -42,6 +42,7 @@ cpumask_t cpu_possible_map; EXPORT_SYMBOL(cpu_possible_map); cpumask_t cpu_online_map; +EXPORT_SYMBOL(cpu_online_map); static atomic_t cpus_booted = ATOMIC_INIT(0); /* These are defined by the board-specific code. */ diff --git a/arch/sh/kernel/sys_sh.c b/arch/sh/kernel/sys_sh.c index b68ff705f06..8fde95001c3 100644 --- a/arch/sh/kernel/sys_sh.c +++ b/arch/sh/kernel/sys_sh.c @@ -25,6 +25,7 @@ #include <asm/cacheflush.h> #include <asm/uaccess.h> #include <asm/ipc.h> +#include <asm/unistd.h> /* * sys_pipe() is the normal C calling standard for creating @@ -281,7 +282,7 @@ asmlinkage int sys_uname(struct old_utsname * name) if (!name) return -EFAULT; down_read(&uts_sem); - err=copy_to_user(name, &system_utsname, sizeof (*name)); + err = copy_to_user(name, utsname(), sizeof (*name)); up_read(&uts_sem); return err?-EFAULT:0; } @@ -309,3 +310,19 @@ asmlinkage int sys_fadvise64_64_wrapper(int fd, u32 offset0, u32 offset1, (u64)len0 << 32 | len1, advice); #endif } + +/* + * Do a system call from kernel instead of calling sys_execve so we + * end up with proper pt_regs. + */ +int kernel_execve(const char *filename, char *const argv[], char *const envp[]) +{ + register long __sc0 __asm__ ("r3") = __NR_execve; + register long __sc4 __asm__ ("r4") = (long) filename; + register long __sc5 __asm__ ("r5") = (long) argv; + register long __sc6 __asm__ ("r6") = (long) envp; + __asm__ __volatile__ ("trapa #0x13" : "=z" (__sc0) + : "0" (__sc0), "r" (__sc4), "r" (__sc5), "r" (__sc6) + : "memory"); + return __sc0; +} diff --git a/arch/sh64/kernel/process.c b/arch/sh64/kernel/process.c index db475b7833f..525d0ec19b7 100644 --- a/arch/sh64/kernel/process.c +++ b/arch/sh64/kernel/process.c @@ -20,261 +20,16 @@ /* * This file handles the architecture-dependent parts of process handling.. */ - -/* Temporary flags/tests. All to be removed/undefined. BEGIN */ -#define IDLE_TRACE -#define VM_SHOW_TABLES -#define VM_TEST_FAULT -#define VM_TEST_RTLBMISS -#define VM_TEST_WTLBMISS - -#undef VM_SHOW_TABLES -#undef IDLE_TRACE -/* Temporary flags/tests. All to be removed/undefined. END */ - -#define __KERNEL_SYSCALLS__ -#include <stdarg.h> - -#include <linux/kernel.h> -#include <linux/rwsem.h> #include <linux/mm.h> -#include <linux/smp.h> -#include <linux/smp_lock.h> #include <linux/ptrace.h> -#include <linux/slab.h> -#include <linux/vmalloc.h> -#include <linux/user.h> -#include <linux/a.out.h> -#include <linux/interrupt.h> -#include <linux/unistd.h> -#include <linux/delay.h> #include <linux/reboot.h> #include <linux/init.h> - +#include <linux/module.h> #include <asm/uaccess.h> #include <asm/pgtable.h> -#include <asm/system.h> -#include <asm/io.h> -#include <asm/processor.h> /* includes also <asm/registers.h> */ -#include <asm/mmu_context.h> -#include <asm/elf.h> -#include <asm/page.h> - -#include <linux/irq.h> struct task_struct *last_task_used_math = NULL; -#ifdef IDLE_TRACE -#ifdef VM_SHOW_TABLES -/* For testing */ -static void print_PTE(long base) -{ - int i, skip=0; - long long x, y, *p = (long long *) base; - - for (i=0; i< 512; i++, p++){ - if (*p == 0) { - if (!skip) { - skip++; - printk("(0s) "); - } - } else { - skip=0; - x = (*p) >> 32; - y = (*p) & 0xffffffff; - printk("%08Lx%08Lx ", x, y); - if (!((i+1)&0x3)) printk("\n"); - } - } -} - -/* For testing */ -static void print_DIR(long base) -{ - int i, skip=0; - long *p = (long *) base; - - for (i=0; i< 512; i++, p++){ - if (*p == 0) { - if (!skip) { - skip++; - printk("(0s) "); - } - } else { - skip=0; - printk("%08lx ", *p); - if (!((i+1)&0x7)) printk("\n"); - } - } -} - -/* For testing */ -static void print_vmalloc_first_tables(void) -{ - -#define PRESENT 0x800 /* Bit 11 */ - - /* - * Do it really dirty by looking at raw addresses, - * raw offsets, no types. If we used pgtable/pgalloc - * macros/definitions we could hide potential bugs. - * - * Note that pointers are 32-bit for CDC. - */ - long pgdt, pmdt, ptet; - - pgdt = (long) &swapper_pg_dir; - printk("-->PGD (0x%08lx):\n", pgdt); - print_DIR(pgdt); - printk("\n"); - - /* VMALLOC pool is mapped at 0xc0000000, second (pointer) entry in PGD */ - pgdt += 4; - pmdt = (long) (* (long *) pgdt); - if (!(pmdt & PRESENT)) { - printk("No PMD\n"); - return; - } else pmdt &= 0xfffff000; - - printk("-->PMD (0x%08lx):\n", pmdt); - print_DIR(pmdt); - printk("\n"); - - /* Get the pmdt displacement for 0xc0000000 */ - pmdt += 2048; - - /* just look at first two address ranges ... */ - /* ... 0xc0000000 ... */ - ptet = (long) (* (long *) pmdt); - if (!(ptet & PRESENT)) { - printk("No PTE0\n"); - return; - } else ptet &= 0xfffff000; - - printk("-->PTE0 (0x%08lx):\n", ptet); - print_PTE(ptet); - printk("\n"); - - /* ... 0xc0001000 ... */ - ptet += 4; - if (!(ptet & PRESENT)) { - printk("No PTE1\n"); - return; - } else ptet &= 0xfffff000; - printk("-->PTE1 (0x%08lx):\n", ptet); - print_PTE(ptet); - printk("\n"); -} -#else -#define print_vmalloc_first_tables() -#endif /* VM_SHOW_TABLES */ - -static void test_VM(void) -{ - void *a, *b, *c; - -#ifdef VM_SHOW_TABLES - printk("Initial PGD/PMD/PTE\n"); -#endif - print_vmalloc_first_tables(); - - printk("Allocating 2 bytes\n"); - a = vmalloc(2); - print_vmalloc_first_tables(); - - printk("Allocating 4100 bytes\n"); - b = vmalloc(4100); - print_vmalloc_first_tables(); - - printk("Allocating 20234 bytes\n"); - c = vmalloc(20234); - print_vmalloc_first_tables(); - -#ifdef VM_TEST_FAULT - /* Here you may want to fault ! */ - -#ifdef VM_TEST_RTLBMISS - printk("Ready to fault upon read.\n"); - if (* (char *) a) { - printk("RTLBMISSed on area a !\n"); - } - printk("RTLBMISSed on area a !\n"); -#endif - -#ifdef VM_TEST_WTLBMISS - printk("Ready to fault upon write.\n"); - *((char *) b) = 'L'; - printk("WTLBMISSed on area b !\n"); -#endif - -#endif /* VM_TEST_FAULT */ - - printk("Deallocating the 4100 byte chunk\n"); - vfree(b); - print_vmalloc_first_tables(); - - printk("Deallocating the 2 byte chunk\n"); - vfree(a); - print_vmalloc_first_tables(); - - printk("Deallocating the last chunk\n"); - vfree(c); - print_vmalloc_first_tables(); -} - -extern unsigned long volatile jiffies; -int once = 0; -unsigned long old_jiffies; -int pid = -1, pgid = -1; - -void idle_trace(void) -{ - - _syscall0(int, getpid) - _syscall1(int, getpgid, int, pid) - - if (!once) { - /* VM allocation/deallocation simple test */ - test_VM(); - pid = getpid(); - - printk("Got all through to Idle !!\n"); - printk("I'm now going to loop forever ...\n"); - printk("Any ! below is a timer tick.\n"); - printk("Any . below is a getpgid system call from pid = %d.\n", pid); - - - old_jiffies = jiffies; - once++; - } - - if (old_jiffies != jiffies) { - old_jiffies = jiffies - old_jiffies; - switch (old_jiffies) { - case 1: - printk("!"); - break; - case 2: - printk("!!"); - break; - case 3: - printk("!!!"); - break; - case 4: - printk("!!!!"); - break; - default: - printk("(%d!)", (int) old_jiffies); - } - old_jiffies = jiffies; - } - pgid = getpgid(pid); - printk("."); -} -#else -#define idle_trace() do { } while (0) -#endif /* IDLE_TRACE */ - static int hlt_counter = 1; #define HARD_IDLE_TIMEOUT (HZ / 3) @@ -323,7 +78,6 @@ void cpu_idle(void) local_irq_disable(); while (!need_resched()) { local_irq_enable(); - idle_trace(); hlt(); local_irq_disable(); } @@ -622,6 +376,10 @@ void free_task_struct(struct task_struct *p) /* * Create a kernel thread */ +ATTRIB_NORET void kernel_thread_helper(void *arg, int (*fn)(void *)) +{ + do_exit(fn(arg)); +} /* * This is the mechanism for creating a new kernel thread. @@ -633,19 +391,17 @@ void free_task_struct(struct task_struct *p) */ int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) { - /* A bit less processor dependent than older sh ... */ - unsigned int reply; + struct pt_regs regs; -static __inline__ _syscall2(int,clone,unsigned long,flags,unsigned long,newsp) -static __inline__ _syscall1(int,exit,int,ret) + memset(®s, 0, sizeof(regs)); + regs.regs[2] = (unsigned long)arg; + regs.regs[3] = (unsigned long)fn; - reply = clone(flags | CLONE_VM, 0); - if (!reply) { - /* Child */ - reply = exit(fn(arg)); - } + regs.pc = (unsigned long)kernel_thread_helper; + regs.sr = (1 << 30); - return reply; + return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, + ®s, 0, NULL, NULL); } /* diff --git a/arch/sh64/kernel/sys_sh64.c b/arch/sh64/kernel/sys_sh64.c index 58ff7d522d8..ad0fa4e003e 100644 --- a/arch/sh64/kernel/sys_sh64.c +++ b/arch/sh64/kernel/sys_sh64.c @@ -32,6 +32,7 @@ #include <asm/uaccess.h> #include <asm/ipc.h> #include <asm/ptrace.h> +#include <asm/unistd.h> #define REG_3 3 @@ -279,7 +280,25 @@ asmlinkage int sys_uname(struct old_utsname * name) if (!name) return -EFAULT; down_read(&uts_sem); - err=copy_to_user(name, &system_utsname, sizeof (*name)); + err = copy_to_user(name, utsname(), sizeof (*name)); up_read(&uts_sem); return err?-EFAULT:0; } + +/* + * Do a system call from kernel instead of calling sys_execve so we + * end up with proper pt_regs. + */ +int kernel_execve(const char *filename, char *const argv[], char *const envp[]) +{ + register unsigned long __sc0 __asm__ ("r9") = ((0x13 << 16) | __NR_execve); + register unsigned long __sc2 __asm__ ("r2") = (unsigned long) filename; + register unsigned long __sc3 __asm__ ("r3") = (unsigned long) argv; + register unsigned long __sc4 __asm__ ("r4") = (unsigned long) envp; + __asm__ __volatile__ ("trapa %1 !\t\t\t execve(%2,%3,%4)" + : "=r" (__sc0) + : "r" (__sc0), "r" (__sc2), "r" (__sc3), "r" (__sc4) ); + __asm__ __volatile__ ("!dummy %0 %1 %2 %3" + : : "r" (__sc0), "r" (__sc2), "r" (__sc3), "r" (__sc4) : "memory"); + return __sc0; +} diff --git a/arch/sparc/kernel/sys_sparc.c b/arch/sparc/kernel/sys_sparc.c index 896863fb208..a954a0c0000 100644 --- a/arch/sparc/kernel/sys_sparc.c +++ b/arch/sparc/kernel/sys_sparc.c @@ -24,6 +24,7 @@ #include <asm/uaccess.h> #include <asm/ipc.h> +#include <asm/unistd.h> /* #define DEBUG_UNIMP_SYSCALL */ @@ -475,16 +476,38 @@ asmlinkage int sys_getdomainname(char __user *name, int len) down_read(&uts_sem); - nlen = strlen(system_utsname.domainname) + 1; + nlen = strlen(utsname()->domainname) + 1; err = -EINVAL; if (nlen > len) goto out; err = -EFAULT; - if (!copy_to_user(name, system_utsname.domainname, nlen)) + if (!copy_to_user(name, utsname()->domainname, nlen)) err = 0; out: up_read(&uts_sem); return err; } + +/* + * Do a system call from kernel instead of calling sys_execve so we + * end up with proper pt_regs. + */ +int kernel_execve(const char *filename, char *const argv[], char *const envp[]) +{ + long __res; + register long __g1 __asm__ ("g1") = __NR_execve; + register long __o0 __asm__ ("o0") = (long)(filename); + register long __o1 __asm__ ("o1") = (long)(argv); + register long __o2 __asm__ ("o2") = (long)(envp); + asm volatile ("t 0x10\n\t" + "bcc 1f\n\t" + "mov %%o0, %0\n\t" + "sub %%g0, %%o0, %0\n\t" + "1:\n\t" + : "=r" (__res), "=&r" (__o0) + : "1" (__o0), "r" (__o1), "r" (__o2), "r" (__g1) + : "cc"); + return __res; +} diff --git a/arch/sparc/kernel/sys_sunos.c b/arch/sparc/kernel/sys_sunos.c index aa0fb2efb61..9d2cd97d1c3 100644 --- a/arch/sparc/kernel/sys_sunos.c +++ b/arch/sparc/kernel/sys_sunos.c @@ -483,13 +483,18 @@ asmlinkage int sunos_uname(struct sunos_utsname __user *name) { int ret; down_read(&uts_sem); - ret = copy_to_user(&name->sname[0], &system_utsname.sysname[0], sizeof(name->sname) - 1); + ret = copy_to_user(&name->sname[0], &utsname()->sysname[0], + sizeof(name->sname) - 1); if (!ret) { - ret |= __copy_to_user(&name->nname[0], &system_utsname.nodename[0], sizeof(name->nname) - 1); + ret |= __copy_to_user(&name->nname[0], &utsname()->nodename[0], + sizeof(name->nname) - 1); ret |= __put_user('\0', &name->nname[8]); - ret |= __copy_to_user(&name->rel[0], &system_utsname.release[0], sizeof(name->rel) - 1); - ret |= __copy_to_user(&name->ver[0], &system_utsname.version[0], sizeof(name->ver) - 1); - ret |= __copy_to_user(&name->mach[0], &system_utsname.machine[0], sizeof(name->mach) - 1); + ret |= __copy_to_user(&name->rel[0], &utsname()->release[0], + sizeof(name->rel) - 1); + ret |= __copy_to_user(&name->ver[0], &utsname()->version[0], + sizeof(name->ver) - 1); + ret |= __copy_to_user(&name->mach[0], &utsname()->machine[0], + sizeof(name->mach) - 1); } up_read(&uts_sem); return ret ? -EFAULT : 0; diff --git a/arch/sparc64/Kconfig b/arch/sparc64/Kconfig index 8d8ca716f7a..b627f8dbcaa 100644 --- a/arch/sparc64/Kconfig +++ b/arch/sparc64/Kconfig @@ -420,7 +420,7 @@ source "arch/sparc64/oprofile/Kconfig" config KPROBES bool "Kprobes (EXPERIMENTAL)" - depends on EXPERIMENTAL && MODULES + depends on KALLSYMS && EXPERIMENTAL && MODULES help Kprobes allows you to trap at almost any kernel address and execute a callback function. register_kprobe() establishes diff --git a/arch/sparc64/kernel/power.c b/arch/sparc64/kernel/power.c index e55466c77b6..0b9c70627ce 100644 --- a/arch/sparc64/kernel/power.c +++ b/arch/sparc64/kernel/power.c @@ -4,8 +4,6 @@ * Copyright (C) 1999 David S. Miller (davem@redhat.com) */ -#define __KERNEL_SYSCALLS__ - #include <linux/kernel.h> #include <linux/module.h> #include <linux/init.h> @@ -14,6 +12,7 @@ #include <linux/delay.h> #include <linux/interrupt.h> #include <linux/pm.h> +#include <linux/syscalls.h> #include <asm/system.h> #include <asm/auxio.h> @@ -98,7 +97,7 @@ again: /* Ok, down we go... */ button_pressed = 0; - if (execve("/sbin/shutdown", argv, envp) < 0) { + if (kernel_execve("/sbin/shutdown", argv, envp) < 0) { printk("powerd: shutdown execution failed\n"); add_wait_queue(&powerd_wait, &wait); goto again; diff --git a/arch/sparc64/kernel/sys_sparc.c b/arch/sparc64/kernel/sys_sparc.c index c608c947e6c..a53d4abb4b4 100644 --- a/arch/sparc64/kernel/sys_sparc.c +++ b/arch/sparc64/kernel/sys_sparc.c @@ -31,6 +31,7 @@ #include <asm/utrap.h> #include <asm/perfctr.h> #include <asm/a.out.h> +#include <asm/unistd.h> /* #define DEBUG_UNIMP_SYSCALL */ @@ -712,13 +713,13 @@ asmlinkage long sys_getdomainname(char __user *name, int len) down_read(&uts_sem); - nlen = strlen(system_utsname.domainname) + 1; + nlen = strlen(utsname()->domainname) + 1; err = -EINVAL; if (nlen > len) goto out; err = -EFAULT; - if (!copy_to_user(name, system_utsname.domainname, nlen)) + if (!copy_to_user(name, utsname()->domainname, nlen)) err = 0; out: @@ -963,3 +964,23 @@ asmlinkage long sys_perfctr(int opcode, unsigned long arg0, unsigned long arg1, }; return err; } + +/* + * Do a system call from kernel instead of calling sys_execve so we + * end up with proper pt_regs. + */ +int kernel_execve(const char *filename, char *const argv[], char *const envp[]) +{ + long __res; + register long __g1 __asm__ ("g1") = __NR_execve; + register long __o0 __asm__ ("o0") = (long)(filename); + register long __o1 __asm__ ("o1") = (long)(argv); + register long __o2 __asm__ ("o2") = (long)(envp); + asm volatile ("t 0x6d\n\t" + "sub %%g0, %%o0, %0\n\t" + "movcc %%xcc, %%o0, %0\n\t" + : "=r" (__res), "=&r" (__o0) + : "1" (__o0), "r" (__o1), "r" (__o2), "r" (__g1) + : "cc"); + return __res; +} diff --git a/arch/sparc64/kernel/sys_sunos32.c b/arch/sparc64/kernel/sys_sunos32.c index 87ebdf858a3..953296b73f3 100644 --- a/arch/sparc64/kernel/sys_sunos32.c +++ b/arch/sparc64/kernel/sys_sunos32.c @@ -439,16 +439,16 @@ asmlinkage int sunos_uname(struct sunos_utsname __user *name) int ret; down_read(&uts_sem); - ret = copy_to_user(&name->sname[0], &system_utsname.sysname[0], + ret = copy_to_user(&name->sname[0], &utsname()->sysname[0], sizeof(name->sname) - 1); - ret |= copy_to_user(&name->nname[0], &system_utsname.nodename[0], + ret |= copy_to_user(&name->nname[0], &utsname()->nodename[0], sizeof(name->nname) - 1); ret |= put_user('\0', &name->nname[8]); - ret |= copy_to_user(&name->rel[0], &system_utsname.release[0], + ret |= copy_to_user(&name->rel[0], &utsname()->release[0], sizeof(name->rel) - 1); - ret |= copy_to_user(&name->ver[0], &system_utsname.version[0], + ret |= copy_to_user(&name->ver[0], &utsname()->version[0], sizeof(name->ver) - 1); - ret |= copy_to_user(&name->mach[0], &system_utsname.machine[0], + ret |= copy_to_user(&name->mach[0], &utsname()->machine[0], sizeof(name->mach) - 1); up_read(&uts_sem); return (ret ? -EFAULT : 0); diff --git a/arch/sparc64/solaris/misc.c b/arch/sparc64/solaris/misc.c index 9c581328e76..9ed997982f8 100644 --- a/arch/sparc64/solaris/misc.c +++ b/arch/sparc64/solaris/misc.c @@ -249,7 +249,7 @@ asmlinkage int solaris_utssys(u32 buf, u32 flags, int which, u32 buf2) /* Let's cheat */ err = set_utsfield(v->sysname, "SunOS", 1, 0); down_read(&uts_sem); - err |= set_utsfield(v->nodename, system_utsname.nodename, + err |= set_utsfield(v->nodename, utsname()->nodename, 1, 1); up_read(&uts_sem); err |= set_utsfield(v->release, "2.6", 0, 0); @@ -273,7 +273,7 @@ asmlinkage int solaris_utsname(u32 buf) /* Why should we not lie a bit? */ down_read(&uts_sem); err = set_utsfield(v->sysname, "SunOS", 0, 0); - err |= set_utsfield(v->nodename, system_utsname.nodename, 1, 1); + err |= set_utsfield(v->nodename, utsname()->nodename, 1, 1); err |= set_utsfield(v->release, "5.6", 0, 0); err |= set_utsfield(v->version, "Generic", 0, 0); err |= set_utsfield(v->machine, machine(), 0, 0); @@ -305,7 +305,7 @@ asmlinkage int solaris_sysinfo(int cmd, u32 buf, s32 count) case SI_HOSTNAME: r = buffer + 256; down_read(&uts_sem); - for (p = system_utsname.nodename, q = buffer; + for (p = utsname()->nodename, q = buffer; q < r && *p && *p != '.'; *q++ = *p++); up_read(&uts_sem); *q = 0; diff --git a/arch/um/drivers/line.c b/arch/um/drivers/line.c index 563ce7690a1..24747a41378 100644 --- a/arch/um/drivers/line.c +++ b/arch/um/drivers/line.c @@ -642,9 +642,9 @@ int line_remove(struct line *lines, unsigned int num, int n) } struct tty_driver *line_register_devfs(struct lines *set, - struct line_driver *line_driver, - struct tty_operations *ops, struct line *lines, - int nlines) + struct line_driver *line_driver, + const struct tty_operations *ops, + struct line *lines, int nlines) { int i; struct tty_driver *driver = alloc_tty_driver(nlines); diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c index 773a134e7fd..a67dcbd78de 100644 --- a/arch/um/drivers/mconsole_kern.c +++ b/arch/um/drivers/mconsole_kern.c @@ -106,9 +106,9 @@ void mconsole_version(struct mc_request *req) { char version[256]; - sprintf(version, "%s %s %s %s %s", system_utsname.sysname, - system_utsname.nodename, system_utsname.release, - system_utsname.version, system_utsname.machine); + sprintf(version, "%s %s %s %s %s", utsname()->sysname, + utsname()->nodename, utsname()->release, + utsname()->version, utsname()->machine); mconsole_reply(req, version, 0, 0); } diff --git a/arch/um/include/line.h b/arch/um/include/line.h index 642c9a0320f..7be24811bb3 100644 --- a/arch/um/include/line.h +++ b/arch/um/include/line.h @@ -91,10 +91,9 @@ extern int line_setup_irq(int fd, int input, int output, struct line *line, void *data); extern void line_close_chan(struct line *line); extern struct tty_driver * line_register_devfs(struct lines *set, - struct line_driver *line_driver, - struct tty_operations *driver, - struct line *lines, - int nlines); + struct line_driver *line_driver, + const struct tty_operations *driver, + struct line *lines, int nlines); extern void lines_init(struct line *lines, int nlines, struct chan_opts *opts); extern void close_lines(struct line *lines, int nlines); diff --git a/arch/um/kernel/syscall.c b/arch/um/kernel/syscall.c index 48cf88dd02d..f5ed8624648 100644 --- a/arch/um/kernel/syscall.c +++ b/arch/um/kernel/syscall.c @@ -110,7 +110,7 @@ long sys_uname(struct old_utsname __user * name) if (!name) return -EFAULT; down_read(&uts_sem); - err = copy_to_user(name, &system_utsname, sizeof (*name)); + err = copy_to_user(name, utsname(), sizeof (*name)); up_read(&uts_sem); return err?-EFAULT:0; } @@ -126,21 +126,21 @@ long sys_olduname(struct oldold_utsname __user * name) down_read(&uts_sem); - error = __copy_to_user(&name->sysname,&system_utsname.sysname, + error = __copy_to_user(&name->sysname, &utsname()->sysname, __OLD_UTS_LEN); - error |= __put_user(0,name->sysname+__OLD_UTS_LEN); - error |= __copy_to_user(&name->nodename,&system_utsname.nodename, + error |= __put_user(0, name->sysname + __OLD_UTS_LEN); + error |= __copy_to_user(&name->nodename, &utsname()->nodename, __OLD_UTS_LEN); - error |= __put_user(0,name->nodename+__OLD_UTS_LEN); - error |= __copy_to_user(&name->release,&system_utsname.release, + error |= __put_user(0, name->nodename + __OLD_UTS_LEN); + error |= __copy_to_user(&name->release, &utsname()->release, __OLD_UTS_LEN); - error |= __put_user(0,name->release+__OLD_UTS_LEN); - error |= __copy_to_user(&name->version,&system_utsname.version, + error |= __put_user(0, name->release + __OLD_UTS_LEN); + error |= __copy_to_user(&name->version, &utsname()->version, __OLD_UTS_LEN); - error |= __put_user(0,name->version+__OLD_UTS_LEN); - error |= __copy_to_user(&name->machine,&system_utsname.machine, + error |= __put_user(0, name->version + __OLD_UTS_LEN); + error |= __copy_to_user(&name->machine, &utsname()->machine, __OLD_UTS_LEN); - error |= __put_user(0,name->machine+__OLD_UTS_LEN); + error |= __put_user(0, name->machine + __OLD_UTS_LEN); up_read(&uts_sem); @@ -164,3 +164,16 @@ int next_syscall_index(int limit) spin_unlock(&syscall_lock); return(ret); } + +int kernel_execve(const char *filename, char *const argv[], char *const envp[]) +{ + mm_segment_t fs; + int ret; + + fs = get_fs(); + set_fs(KERNEL_DS); + ret = um_execve(filename, argv, envp); + set_fs(fs); + + return ret; +} diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c index 55005710dcb..97d88e7902f 100644 --- a/arch/um/kernel/um_arch.c +++ b/arch/um/kernel/um_arch.c @@ -167,7 +167,7 @@ static char *usage_string = static int __init uml_version_setup(char *line, int *add) { - printf("%s\n", system_utsname.release); + printf("%s\n", init_utsname()->release); exit(0); return 0; @@ -278,7 +278,7 @@ static int __init Usage(char *line, int *add) { const char **p; - printf(usage_string, system_utsname.release); + printf(usage_string, init_utsname()->release); p = &__uml_help_start; while (p < &__uml_help_end) { printf("%s", *p); @@ -403,7 +403,7 @@ int linux_main(int argc, char **argv) /* Reserve up to 4M after the current brk */ uml_reserved = ROUND_4M(brk_start) + (1 << 22); - setup_machinename(system_utsname.machine); + setup_machinename(init_utsname()->machine); #ifdef CONFIG_CMDLINE_ON_HOST argv1_begin = argv[1]; diff --git a/arch/um/os-Linux/process.c b/arch/um/os-Linux/process.c index ff203625a4b..51f0893640a 100644 --- a/arch/um/os-Linux/process.c +++ b/arch/um/os-Linux/process.c @@ -11,6 +11,7 @@ #include <sys/mman.h> #include <sys/wait.h> #include <sys/mman.h> +#include <sys/syscall.h> #include "ptrace_user.h" #include "os.h" #include "user.h" @@ -140,11 +141,9 @@ void os_usr1_process(int pid) * syscalls, and also breaks with clone(), which does not unshare the TLS. */ -inline _syscall0(pid_t, getpid) - int os_getpid(void) { - return(getpid()); + return(syscall(__NR_getpid)); } int os_getpgrp(void) diff --git a/arch/um/os-Linux/sys-i386/tls.c b/arch/um/os-Linux/sys-i386/tls.c index 120abbe4e3c..6e945ab4584 100644 --- a/arch/um/os-Linux/sys-i386/tls.c +++ b/arch/um/os-Linux/sys-i386/tls.c @@ -1,10 +1,9 @@ #include <errno.h> #include <linux/unistd.h> +#include <sys/syscall.h> #include "sysdep/tls.h" #include "user_util.h" -static _syscall1(int, get_thread_area, user_desc_t *, u_info); - /* Checks whether host supports TLS, and sets *tls_min according to the value * valid on the host. * i386 host have it == 6; x86_64 host have it == 12, for i386 emulation. */ @@ -17,7 +16,7 @@ void check_host_supports_tls(int *supports_tls, int *tls_min) { user_desc_t info; info.entry_number = val[i]; - if (get_thread_area(&info) == 0) { + if (syscall(__NR_get_thread_area, &info) == 0) { *tls_min = val[i]; *supports_tls = 1; return; diff --git a/arch/um/os-Linux/tls.c b/arch/um/os-Linux/tls.c index 9cb09a45546..a2de2580b8a 100644 --- a/arch/um/os-Linux/tls.c +++ b/arch/um/os-Linux/tls.c @@ -1,5 +1,6 @@ #include <errno.h> #include <sys/ptrace.h> +#include <sys/syscall.h> #include <asm/ldt.h> #include "sysdep/tls.h" #include "uml-config.h" @@ -48,14 +49,11 @@ int os_get_thread_area(user_desc_t *info, int pid) #ifdef UML_CONFIG_MODE_TT #include "linux/unistd.h" -static _syscall1(int, get_thread_area, user_desc_t *, u_info); -static _syscall1(int, set_thread_area, user_desc_t *, u_info); - int do_set_thread_area_tt(user_desc_t *info) { int ret; - ret = set_thread_area(info); + ret = syscall(__NR_set_thread_area,info); if (ret < 0) { ret = -errno; } @@ -66,7 +64,7 @@ int do_get_thread_area_tt(user_desc_t *info) { int ret; - ret = get_thread_area(info); + ret = syscall(__NR_get_thread_area,info); if (ret < 0) { ret = -errno; } diff --git a/arch/um/sys-i386/unmap.c b/arch/um/sys-i386/unmap.c index 1b0ad0e4adc..8e55cd5d3d0 100644 --- a/arch/um/sys-i386/unmap.c +++ b/arch/um/sys-i386/unmap.c @@ -5,20 +5,17 @@ #include <linux/mman.h> #include <asm/unistd.h> +#include <sys/syscall.h> -static int errno; - -static inline _syscall2(int,munmap,void *,start,size_t,len) -static inline _syscall6(void *,mmap2,void *,addr,size_t,len,int,prot,int,flags,int,fd,off_t,offset) int switcheroo(int fd, int prot, void *from, void *to, int size) { - if(munmap(to, size) < 0){ + if (syscall(__NR_munmap, to, size) < 0){ return(-1); } - if(mmap2(to, size, prot, MAP_SHARED | MAP_FIXED, fd, 0) == (void*) -1 ){ + if (syscall(__NR_mmap2, to, size, prot, MAP_SHARED | MAP_FIXED, fd, 0) == (void*) -1 ){ return(-1); } - if(munmap(from, size) < 0){ + if (syscall(__NR_munmap, from, size) < 0){ return(-1); } return(0); diff --git a/arch/um/sys-x86_64/syscalls.c b/arch/um/sys-x86_64/syscalls.c index 6fce9f45dfd..73ce4463f70 100644 --- a/arch/um/sys-x86_64/syscalls.c +++ b/arch/um/sys-x86_64/syscalls.c @@ -21,7 +21,7 @@ asmlinkage long sys_uname64(struct new_utsname __user * name) { int err; down_read(&uts_sem); - err = copy_to_user(name, &system_utsname, sizeof (*name)); + err = copy_to_user(name, utsname(), sizeof (*name)); up_read(&uts_sem); if (personality(current->personality) == PER_LINUX32) err |= copy_to_user(&name->machine, "i686", 5); diff --git a/arch/um/sys-x86_64/sysrq.c b/arch/um/sys-x86_64/sysrq.c index d0a25af19a5..ce3e07fcf28 100644 --- a/arch/um/sys-x86_64/sysrq.c +++ b/arch/um/sys-x86_64/sysrq.c @@ -16,7 +16,7 @@ void __show_regs(struct pt_regs * regs) printk("\n"); print_modules(); printk("Pid: %d, comm: %.20s %s %s\n", - current->pid, current->comm, print_tainted(), system_utsname.release); + current->pid, current->comm, print_tainted(), init_utsname()->release); printk("RIP: %04lx:[<%016lx>] ", PT_REGS_CS(regs) & 0xffff, PT_REGS_RIP(regs)); printk("\nRSP: %016lx EFLAGS: %08lx\n", PT_REGS_RSP(regs), diff --git a/arch/um/sys-x86_64/unmap.c b/arch/um/sys-x86_64/unmap.c index f4a4bffd8a1..57c9286a701 100644 --- a/arch/um/sys-x86_64/unmap.c +++ b/arch/um/sys-x86_64/unmap.c @@ -5,20 +5,17 @@ #include <linux/mman.h> #include <asm/unistd.h> +#include <sys/syscall.h> -static int errno; - -static inline _syscall2(int,munmap,void *,start,size_t,len) -static inline _syscall6(void *,mmap,void *,addr,size_t,len,int,prot,int,flags,int,fd,off_t,offset) int switcheroo(int fd, int prot, void *from, void *to, int size) { - if(munmap(to, size) < 0){ + if (syscall(__NR_munmap, to, size) < 0){ return(-1); } - if(mmap(to, size, prot, MAP_SHARED | MAP_FIXED, fd, 0) == (void*) -1){ + if (syscall(__NR_mmap, to, size, prot, MAP_SHARED | MAP_FIXED, fd, 0) == (void*) -1){ return(-1); } - if(munmap(from, size) < 0){ + if (syscall(__NR_munmap, from, size) < 0){ return(-1); } return(0); diff --git a/arch/v850/kernel/memcons.c b/arch/v850/kernel/memcons.c index 815f8a43926..92f514fdcc7 100644 --- a/arch/v850/kernel/memcons.c +++ b/arch/v850/kernel/memcons.c @@ -104,7 +104,7 @@ int memcons_tty_chars_in_buffer (struct tty_struct *tty) return 0; } -static struct tty_operations ops = { +static const struct tty_operations ops = { .open = memcons_tty_open, .write = memcons_tty_write, .write_room = memcons_tty_write_room, diff --git a/arch/v850/kernel/simcons.c b/arch/v850/kernel/simcons.c index 3975aa02cef..9973596ae30 100644 --- a/arch/v850/kernel/simcons.c +++ b/arch/v850/kernel/simcons.c @@ -77,7 +77,7 @@ int simcons_tty_chars_in_buffer (struct tty_struct *tty) return 0; } -static struct tty_operations ops = { +static const struct tty_operations ops = { .open = simcons_tty_open, .write = simcons_tty_write, .write_room = simcons_tty_write_room, diff --git a/arch/v850/kernel/syscalls.c b/arch/v850/kernel/syscalls.c index 2ec0700fc46..d2b1fb19d24 100644 --- a/arch/v850/kernel/syscalls.c +++ b/arch/v850/kernel/syscalls.c @@ -33,6 +33,7 @@ #include <asm/uaccess.h> #include <asm/ipc.h> #include <asm/semaphore.h> +#include <asm/unistd.h> /* * sys_ipc() is the de-multiplexer for the SysV IPC calls.. @@ -194,3 +195,22 @@ unsigned long sys_mmap (unsigned long addr, size_t len, out: return err; } + +/* + * Do a system call from kernel instead of calling sys_execve so we + * end up with proper pt_regs. + */ +int kernel_execve(const char *filename, char *const argv[], char *const envp[]) +{ + register char *__a __asm__ ("r6") = filename; + register void *__b __asm__ ("r7") = argv; + register void *__c __asm__ ("r8") = envp; + register unsigned long __syscall __asm__ ("r12") = __NR_execve; + register unsigned long __ret __asm__ ("r10"); + __asm__ __volatile__ ("trap 0" + : "=r" (__ret), "=r" (__syscall) + : "1" (__syscall), "r" (__a), "r" (__b), "r" (__c) + : "r1", "r5", "r11", "r13", "r14", + "r15", "r16", "r17", "r18", "r19"); + return __ret; +} diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig index b87a19f0d58..0a5d8e659aa 100644 --- a/arch/x86_64/Kconfig +++ b/arch/x86_64/Kconfig @@ -690,7 +690,7 @@ source "arch/x86_64/oprofile/Kconfig" config KPROBES bool "Kprobes (EXPERIMENTAL)" - depends on EXPERIMENTAL && MODULES + depends on KALLSYMS && EXPERIMENTAL && MODULES help Kprobes allows you to trap at almost any kernel address and execute a callback function. register_kprobe() establishes diff --git a/arch/x86_64/ia32/sys_ia32.c b/arch/x86_64/ia32/sys_ia32.c index f280d3665f4..26a01717cc1 100644 --- a/arch/x86_64/ia32/sys_ia32.c +++ b/arch/x86_64/ia32/sys_ia32.c @@ -784,36 +784,36 @@ asmlinkage long sys32_olduname(struct oldold_utsname __user * name) if (!name) return -EFAULT; - if (!access_ok(VERIFY_WRITE,name,sizeof(struct oldold_utsname))) + if (!access_ok(VERIFY_WRITE, name, sizeof(struct oldold_utsname))) return -EFAULT; down_read(&uts_sem); - - err = __copy_to_user(&name->sysname,&system_utsname.sysname, + + err = __copy_to_user(&name->sysname,&utsname()->sysname, __OLD_UTS_LEN); err |= __put_user(0,name->sysname+__OLD_UTS_LEN); - err |= __copy_to_user(&name->nodename,&system_utsname.nodename, + err |= __copy_to_user(&name->nodename,&utsname()->nodename, __OLD_UTS_LEN); err |= __put_user(0,name->nodename+__OLD_UTS_LEN); - err |= __copy_to_user(&name->release,&system_utsname.release, + err |= __copy_to_user(&name->release,&utsname()->release, __OLD_UTS_LEN); err |= __put_user(0,name->release+__OLD_UTS_LEN); - err |= __copy_to_user(&name->version,&system_utsname.version, + err |= __copy_to_user(&name->version,&utsname()->version, __OLD_UTS_LEN); err |= __put_user(0,name->version+__OLD_UTS_LEN); - { - char *arch = "x86_64"; - if (personality(current->personality) == PER_LINUX32) - arch = "i686"; + { + char *arch = "x86_64"; + if (personality(current->personality) == PER_LINUX32) + arch = "i686"; - err |= __copy_to_user(&name->machine,arch,strlen(arch)+1); - } - - up_read(&uts_sem); - - err = err ? -EFAULT : 0; - - return err; + err |= __copy_to_user(&name->machine, arch, strlen(arch)+1); + } + + up_read(&uts_sem); + + err = err ? -EFAULT : 0; + + return err; } long sys32_uname(struct old_utsname __user * name) @@ -822,7 +822,7 @@ long sys32_uname(struct old_utsname __user * name) if (!name) return -EFAULT; down_read(&uts_sem); - err=copy_to_user(name, &system_utsname, sizeof (*name)); + err = copy_to_user(name, utsname(), sizeof (*name)); up_read(&uts_sem); if (personality(current->personality) == PER_LINUX32) err |= copy_to_user(&name->machine, "i686", 5); diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S index 2802524104f..b8285cf1a9c 100644 --- a/arch/x86_64/kernel/entry.S +++ b/arch/x86_64/kernel/entry.S @@ -1023,7 +1023,7 @@ ENDPROC(child_rip) * do_sys_execve asm fallback arguments: * rdi: name, rsi: argv, rdx: envp, fake frame on the stack */ -ENTRY(execve) +ENTRY(kernel_execve) CFI_STARTPROC FAKE_STACK_FRAME $0 SAVE_ALL @@ -1036,7 +1036,7 @@ ENTRY(execve) UNFAKE_STACK_FRAME ret CFI_ENDPROC -ENDPROC(execve) +ENDPROC(kernel_execve) KPROBE_ENTRY(page_fault) errorentry do_page_fault diff --git a/arch/x86_64/kernel/kprobes.c b/arch/x86_64/kernel/kprobes.c index ffc73ac7248..ac241567e68 100644 --- a/arch/x86_64/kernel/kprobes.c +++ b/arch/x86_64/kernel/kprobes.c @@ -270,20 +270,19 @@ void __kprobes arch_prepare_kretprobe(struct kretprobe *rp, struct pt_regs *regs) { unsigned long *sara = (unsigned long *)regs->rsp; - struct kretprobe_instance *ri; + struct kretprobe_instance *ri; - if ((ri = get_free_rp_inst(rp)) != NULL) { - ri->rp = rp; - ri->task = current; + if ((ri = get_free_rp_inst(rp)) != NULL) { + ri->rp = rp; + ri->task = current; ri->ret_addr = (kprobe_opcode_t *) *sara; /* Replace the return addr with trampoline addr */ *sara = (unsigned long) &kretprobe_trampoline; - - add_rp_inst(ri); - } else { - rp->nmissed++; - } + add_rp_inst(ri); + } else { + rp->nmissed++; + } } int __kprobes kprobe_handler(struct pt_regs *regs) @@ -405,14 +404,15 @@ no_kprobe: */ int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) { - struct kretprobe_instance *ri = NULL; - struct hlist_head *head; - struct hlist_node *node, *tmp; + struct kretprobe_instance *ri = NULL; + struct hlist_head *head, empty_rp; + struct hlist_node *node, *tmp; unsigned long flags, orig_ret_address = 0; unsigned long trampoline_address =(unsigned long)&kretprobe_trampoline; + INIT_HLIST_HEAD(&empty_rp); spin_lock_irqsave(&kretprobe_lock, flags); - head = kretprobe_inst_table_head(current); + head = kretprobe_inst_table_head(current); /* * It is possible to have multiple instances associated with a given @@ -423,20 +423,20 @@ int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) * We can handle this because: * - instances are always inserted at the head of the list * - when multiple return probes are registered for the same - * function, the first instance's ret_addr will point to the + * function, the first instance's ret_addr will point to the * real return address, and all the rest will point to * kretprobe_trampoline */ hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { - if (ri->task != current) + if (ri->task != current) /* another task is sharing our hash bucket */ - continue; + continue; if (ri->rp && ri->rp->handler) ri->rp->handler(ri, regs); orig_ret_address = (unsigned long)ri->ret_addr; - recycle_rp_inst(ri); + recycle_rp_inst(ri, &empty_rp); if (orig_ret_address != trampoline_address) /* @@ -454,12 +454,16 @@ int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) spin_unlock_irqrestore(&kretprobe_lock, flags); preempt_enable_no_resched(); - /* - * By returning a non-zero value, we are telling - * kprobe_handler() that we don't want the post_handler + hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) { + hlist_del(&ri->hlist); + kfree(ri); + } + /* + * By returning a non-zero value, we are telling + * kprobe_handler() that we don't want the post_handler * to run (and have re-enabled preemption) - */ - return 1; + */ + return 1; } /* diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c index 458006ae19f..de10cb8a2c9 100644 --- a/arch/x86_64/kernel/process.c +++ b/arch/x86_64/kernel/process.c @@ -294,9 +294,9 @@ void __show_regs(struct pt_regs * regs) print_modules(); printk("Pid: %d, comm: %.20s %s %s %.*s\n", current->pid, current->comm, print_tainted(), - system_utsname.release, - (int)strcspn(system_utsname.version, " "), - system_utsname.version); + init_utsname()->release, + (int)strcspn(init_utsname()->version, " "), + init_utsname()->version); printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip); printk_address(regs->rip); printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp, diff --git a/arch/x86_64/kernel/sys_x86_64.c b/arch/x86_64/kernel/sys_x86_64.c index 6449ea8fe75..76bf7c241fe 100644 --- a/arch/x86_64/kernel/sys_x86_64.c +++ b/arch/x86_64/kernel/sys_x86_64.c @@ -148,7 +148,7 @@ asmlinkage long sys_uname(struct new_utsname __user * name) { int err; down_read(&uts_sem); - err = copy_to_user(name, &system_utsname, sizeof (*name)); + err = copy_to_user(name, utsname(), sizeof (*name)); up_read(&uts_sem); if (personality(current->personality) == PER_LINUX32) err |= copy_to_user(&name->machine, "i686", 5); diff --git a/arch/xtensa/kernel/syscalls.c b/arch/xtensa/kernel/syscalls.c index 4688ba2db84..d9285d4d556 100644 --- a/arch/xtensa/kernel/syscalls.c +++ b/arch/xtensa/kernel/syscalls.c @@ -128,7 +128,7 @@ out: int sys_uname(struct old_utsname * name) { - if (name && !copy_to_user(name, &system_utsname, sizeof (*name))) + if (name && !copy_to_user(name, utsname(), sizeof (*name))) return 0; return -EFAULT; } @@ -266,3 +266,23 @@ void system_call (struct pt_regs *regs) regs->areg[2] = res; do_syscall_trace(); } + +/* + * Do a system call from kernel instead of calling sys_execve so we + * end up with proper pt_regs. + */ +int kernel_execve(const char *filename, char *const argv[], char *const envp[]) +{ + long __res; + asm volatile ( + " mov a5, %2 \n" + " mov a4, %4 \n" + " mov a3, %3 \n" + " movi a2, %1 \n" + " syscall \n" + " mov %0, a2 \n" + : "=a" (__res) + : "i" (__NR_execve), "a" (filename), "a" (argv), "a" (envp) + : "a2", "a3", "a4", "a5"); + return __res; +} diff --git a/arch/xtensa/platform-iss/console.c b/arch/xtensa/platform-iss/console.c index 22d3c571a7b..5c947cae752 100644 --- a/arch/xtensa/platform-iss/console.c +++ b/arch/xtensa/platform-iss/console.c @@ -191,7 +191,7 @@ static int rs_read_proc(char *page, char **start, off_t off, int count, } -static struct tty_operations serial_ops = { +static const struct tty_operations serial_ops = { .open = rs_open, .close = rs_close, .write = rs_write, diff --git a/drivers/char/amiserial.c b/drivers/char/amiserial.c index 9d6713a93ed..d0e92ed0a36 100644 --- a/drivers/char/amiserial.c +++ b/drivers/char/amiserial.c @@ -1958,7 +1958,7 @@ static void show_serial_version(void) } -static struct tty_operations serial_ops = { +static const struct tty_operations serial_ops = { .open = rs_open, .close = rs_close, .write = rs_write, diff --git a/drivers/char/cyclades.c b/drivers/char/cyclades.c index c1c67281750..f85b4eb1661 100644 --- a/drivers/char/cyclades.c +++ b/drivers/char/cyclades.c @@ -5205,7 +5205,7 @@ done: extra ports are ignored. */ -static struct tty_operations cy_ops = { +static const struct tty_operations cy_ops = { .open = cy_open, .close = cy_close, .write = cy_write, diff --git a/drivers/char/epca.c b/drivers/char/epca.c index 86d290e9f30..3baa2ab8cbd 100644 --- a/drivers/char/epca.c +++ b/drivers/char/epca.c @@ -1125,7 +1125,7 @@ static void __exit epca_module_exit(void) module_exit(epca_module_exit); -static struct tty_operations pc_ops = { +static const struct tty_operations pc_ops = { .open = pc_open, .close = pc_close, .write = pc_write, diff --git a/drivers/char/esp.c b/drivers/char/esp.c index afcd83d9984..05788c75d7f 100644 --- a/drivers/char/esp.c +++ b/drivers/char/esp.c @@ -2376,7 +2376,7 @@ static inline int autoconfig(struct esp_struct * info) return (port_detected); } -static struct tty_operations esp_ops = { +static const struct tty_operations esp_ops = { .open = esp_open, .close = rs_close, .write = rs_write, diff --git a/drivers/char/hvc_console.c b/drivers/char/hvc_console.c index a76d2c40dd5..4053d1cd393 100644 --- a/drivers/char/hvc_console.c +++ b/drivers/char/hvc_console.c @@ -696,7 +696,7 @@ int khvcd(void *unused) return 0; } -static struct tty_operations hvc_ops = { +static const struct tty_operations hvc_ops = { .open = hvc_open, .close = hvc_close, .write = hvc_write, diff --git a/drivers/char/hvcs.c b/drivers/char/hvcs.c index 4589ff302b0..0b89bcde8c5 100644 --- a/drivers/char/hvcs.c +++ b/drivers/char/hvcs.c @@ -1306,7 +1306,7 @@ static int hvcs_chars_in_buffer(struct tty_struct *tty) return hvcsd->chars_in_buffer; } -static struct tty_operations hvcs_ops = { +static const struct tty_operations hvcs_ops = { .open = hvcs_open, .close = hvcs_close, .hangup = hvcs_hangup, diff --git a/drivers/char/hvsi.c b/drivers/char/hvsi.c index a89a95fb5e4..c07dc58d5c1 100644 --- a/drivers/char/hvsi.c +++ b/drivers/char/hvsi.c @@ -1130,7 +1130,7 @@ static int hvsi_tiocmset(struct tty_struct *tty, struct file *file, } -static struct tty_operations hvsi_ops = { +static const struct tty_operations hvsi_ops = { .open = hvsi_open, .close = hvsi_close, .write = hvsi_write, diff --git a/drivers/char/ip2/ip2main.c b/drivers/char/ip2/ip2main.c index 331f447e622..4828bc914ce 100644 --- a/drivers/char/ip2/ip2main.c +++ b/drivers/char/ip2/ip2main.c @@ -458,7 +458,7 @@ cleanup_module(void) } #endif /* MODULE */ -static struct tty_operations ip2_ops = { +static const struct tty_operations ip2_ops = { .open = ip2_open, .close = ip2_close, .write = ip2_write, diff --git a/drivers/char/isicom.c b/drivers/char/isicom.c index 2e1da632aee..ea2bbf80ad3 100644 --- a/drivers/char/isicom.c +++ b/drivers/char/isicom.c @@ -1550,7 +1550,7 @@ static void isicom_unregister_ioregion(struct pci_dev *pdev) board->base = 0; } -static struct tty_operations isicom_ops = { +static const struct tty_operations isicom_ops = { .open = isicom_open, .close = isicom_close, .write = isicom_write, diff --git a/drivers/char/istallion.c b/drivers/char/istallion.c index 6b4d82a4565..d6e031542c6 100644 --- a/drivers/char/istallion.c +++ b/drivers/char/istallion.c @@ -4636,7 +4636,7 @@ static int stli_memioctl(struct inode *ip, struct file *fp, unsigned int cmd, un return rc; } -static struct tty_operations stli_ops = { +static const struct tty_operations stli_ops = { .open = stli_open, .close = stli_close, .write = stli_write, diff --git a/drivers/char/keyboard.c b/drivers/char/keyboard.c index 3e90aac3751..99fb070fdbf 100644 --- a/drivers/char/keyboard.c +++ b/drivers/char/keyboard.c @@ -108,7 +108,11 @@ const int NR_TYPES = ARRAY_SIZE(max_vals); struct kbd_struct kbd_table[MAX_NR_CONSOLES]; static struct kbd_struct *kbd = kbd_table; -int spawnpid, spawnsig; +struct vt_spawn_console vt_spawn_con = { + .lock = SPIN_LOCK_UNLOCKED, + .pid = NULL, + .sig = 0, +}; /* * Variables exported for vt.c @@ -578,9 +582,13 @@ static void fn_compose(struct vc_data *vc, struct pt_regs *regs) static void fn_spawn_con(struct vc_data *vc, struct pt_regs *regs) { - if (spawnpid) - if (kill_proc(spawnpid, spawnsig, 1)) - spawnpid = 0; + spin_lock(&vt_spawn_con.lock); + if (vt_spawn_con.pid) + if (kill_pid(vt_spawn_con.pid, vt_spawn_con.sig, 1)) { + put_pid(vt_spawn_con.pid); + vt_spawn_con.pid = NULL; + } + spin_unlock(&vt_spawn_con.lock); } static void fn_SAK(struct vc_data *vc, struct pt_regs *regs) diff --git a/drivers/char/moxa.c b/drivers/char/moxa.c index c1a6d3c48da..b401383808c 100644 --- a/drivers/char/moxa.c +++ b/drivers/char/moxa.c @@ -281,7 +281,7 @@ static int moxa_get_serial_info(struct moxa_str *, struct serial_struct __user * static int moxa_set_serial_info(struct moxa_str *, struct serial_struct __user *); static void MoxaSetFifo(int port, int enable); -static struct tty_operations moxa_ops = { +static const struct tty_operations moxa_ops = { .open = moxa_open, .close = moxa_close, .write = moxa_write, diff --git a/drivers/char/mxser.c b/drivers/char/mxser.c index 27a65377204..8253fca8efd 100644 --- a/drivers/char/mxser.c +++ b/drivers/char/mxser.c @@ -453,7 +453,7 @@ static int CheckIsMoxaMust(int io) /* above is modified by Victor Yu. 08-15-2002 */ -static struct tty_operations mxser_ops = { +static const struct tty_operations mxser_ops = { .open = mxser_open, .close = mxser_close, .write = mxser_write, diff --git a/drivers/char/nwbutton.c b/drivers/char/nwbutton.c index 7c57ebfa864..ea1aa7764f8 100644 --- a/drivers/char/nwbutton.c +++ b/drivers/char/nwbutton.c @@ -127,9 +127,8 @@ static void button_consume_callbacks (int bpcount) static void button_sequence_finished (unsigned long parameters) { #ifdef CONFIG_NWBUTTON_REBOOT /* Reboot using button is enabled */ - if (button_press_count == reboot_count) { - kill_proc (1, SIGINT, 1); /* Ask init to reboot us */ - } + if (button_press_count == reboot_count) + kill_cad_pid(SIGINT, 1); /* Ask init to reboot us */ #endif /* CONFIG_NWBUTTON_REBOOT */ button_consume_callbacks (button_press_count); bcount = sprintf (button_output_buffer, "%d\n", button_press_count); diff --git a/drivers/char/pcmcia/synclink_cs.c b/drivers/char/pcmcia/synclink_cs.c index 00f574cbb0d..dd845cbefe9 100644 --- a/drivers/char/pcmcia/synclink_cs.c +++ b/drivers/char/pcmcia/synclink_cs.c @@ -3010,7 +3010,7 @@ static struct pcmcia_driver mgslpc_driver = { .resume = mgslpc_resume, }; -static struct tty_operations mgslpc_ops = { +static const struct tty_operations mgslpc_ops = { .open = mgslpc_open, .close = mgslpc_close, .write = mgslpc_write, diff --git a/drivers/char/pty.c b/drivers/char/pty.c index 34dd4c38110..80d3eedd7f9 100644 --- a/drivers/char/pty.c +++ b/drivers/char/pty.c @@ -224,7 +224,7 @@ static void pty_set_termios(struct tty_struct *tty, struct termios *old_termios) tty->termios->c_cflag |= (CS8 | CREAD); } -static struct tty_operations pty_ops = { +static const struct tty_operations pty_ops = { .open = pty_open, .close = pty_close, .write = pty_write, diff --git a/drivers/char/random.c b/drivers/char/random.c index b430a12eb81..07f47a0208a 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -889,8 +889,8 @@ static void init_std_data(struct entropy_store *r) do_gettimeofday(&tv); add_entropy_words(r, (__u32 *)&tv, sizeof(tv)/4); - add_entropy_words(r, (__u32 *)&system_utsname, - sizeof(system_utsname)/4); + add_entropy_words(r, (__u32 *)utsname(), + sizeof(*(utsname()))/4); } static int __init rand_initialize(void) diff --git a/drivers/char/rio/rio_linux.c b/drivers/char/rio/rio_linux.c index 3fa80aaf452..202a3b0945b 100644 --- a/drivers/char/rio/rio_linux.c +++ b/drivers/char/rio/rio_linux.c @@ -727,7 +727,7 @@ static struct vpd_prom *get_VPD_PROM(struct Host *hp) return &vpdp; } -static struct tty_operations rio_ops = { +static const struct tty_operations rio_ops = { .open = riotopen, .close = gs_close, .write = gs_write, diff --git a/drivers/char/riscom8.c b/drivers/char/riscom8.c index 06b9f78a95d..214d850112f 100644 --- a/drivers/char/riscom8.c +++ b/drivers/char/riscom8.c @@ -1583,7 +1583,7 @@ static void do_softint(void *private_) } } -static struct tty_operations riscom_ops = { +static const struct tty_operations riscom_ops = { .open = rc_open, .close = rc_close, .write = rc_write, diff --git a/drivers/char/rocket.c b/drivers/char/rocket.c index 0ac13188132..bac80056f7e 100644 --- a/drivers/char/rocket.c +++ b/drivers/char/rocket.c @@ -2334,7 +2334,7 @@ static int __init init_ISA(int i) return (1); } -static struct tty_operations rocket_ops = { +static const struct tty_operations rocket_ops = { .open = rp_open, .close = rp_close, .write = rp_write, diff --git a/drivers/char/ser_a2232.c b/drivers/char/ser_a2232.c index 510bd3e0e88..65c751d0d64 100644 --- a/drivers/char/ser_a2232.c +++ b/drivers/char/ser_a2232.c @@ -661,7 +661,7 @@ static void a2232_init_portstructs(void) } } -static struct tty_operations a2232_ops = { +static const struct tty_operations a2232_ops = { .open = a2232_open, .close = gs_close, .write = gs_write, diff --git a/drivers/char/serial167.c b/drivers/char/serial167.c index 21a710cb4bb..b4ea1266b66 100644 --- a/drivers/char/serial167.c +++ b/drivers/char/serial167.c @@ -2158,7 +2158,7 @@ mvme167_serial_console_setup(int cflag) rcor >> 5, rbpr); } /* serial_console_init */ -static struct tty_operations cy_ops = { +static const struct tty_operations cy_ops = { .open = cy_open, .close = cy_close, .write = cy_write, diff --git a/drivers/char/snsc_event.c b/drivers/char/snsc_event.c index d12d4f629ce..864854c5886 100644 --- a/drivers/char/snsc_event.c +++ b/drivers/char/snsc_event.c @@ -220,7 +220,7 @@ scdrv_dispatch_event(char *event, int len) " Sending SIGPWR to init...\n"); /* give a SIGPWR signal to init proc */ - kill_proc(1, SIGPWR, 0); + kill_cad_pid(SIGPWR, 0); } else { /* print to system log */ printk("%s|$(0x%x)%s\n", severity, esp_code, desc); diff --git a/drivers/char/specialix.c b/drivers/char/specialix.c index f52c7c31bad..902c48dca3b 100644 --- a/drivers/char/specialix.c +++ b/drivers/char/specialix.c @@ -2363,7 +2363,7 @@ static void do_softint(void *private_) func_exit(); } -static struct tty_operations sx_ops = { +static const struct tty_operations sx_ops = { .open = sx_open, .close = sx_close, .write = sx_write, diff --git a/drivers/char/stallion.c b/drivers/char/stallion.c index 3beb2203d24..bd711537ec4 100644 --- a/drivers/char/stallion.c +++ b/drivers/char/stallion.c @@ -2993,7 +2993,7 @@ static int stl_memioctl(struct inode *ip, struct file *fp, unsigned int cmd, uns return(rc); } -static struct tty_operations stl_ops = { +static const struct tty_operations stl_ops = { .open = stl_open, .close = stl_close, .write = stl_write, diff --git a/drivers/char/sx.c b/drivers/char/sx.c index e1cd2bc4b1e..57e31e5eaed 100644 --- a/drivers/char/sx.c +++ b/drivers/char/sx.c @@ -2226,7 +2226,7 @@ static int probe_si (struct sx_board *board) return 1; } -static struct tty_operations sx_ops = { +static const struct tty_operations sx_ops = { .break_ctl = sx_break, .open = sx_open, .close = gs_close, diff --git a/drivers/char/synclink.c b/drivers/char/synclink.c index 78b1b1a2732..244dc308c77 100644 --- a/drivers/char/synclink.c +++ b/drivers/char/synclink.c @@ -4360,7 +4360,7 @@ static struct mgsl_struct* mgsl_allocate_device(void) } /* end of mgsl_allocate_device()*/ -static struct tty_operations mgsl_ops = { +static const struct tty_operations mgsl_ops = { .open = mgsl_open, .close = mgsl_close, .write = mgsl_write, diff --git a/drivers/char/synclink_gt.c b/drivers/char/synclink_gt.c index 78bc85180c8..bdc7cb248b8 100644 --- a/drivers/char/synclink_gt.c +++ b/drivers/char/synclink_gt.c @@ -3441,7 +3441,7 @@ static void __devexit remove_one(struct pci_dev *dev) { } -static struct tty_operations ops = { +static const struct tty_operations ops = { .open = open, .close = close, .write = write, diff --git a/drivers/char/synclinkmp.c b/drivers/char/synclinkmp.c index 66f3754fbbd..6eb75dcd796 100644 --- a/drivers/char/synclinkmp.c +++ b/drivers/char/synclinkmp.c @@ -3929,7 +3929,7 @@ void device_init(int adapter_num, struct pci_dev *pdev) } } -static struct tty_operations ops = { +static const struct tty_operations ops = { .open = open, .close = close, .write = write, diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c index 333741770f1..e90ea39c7c4 100644 --- a/drivers/char/tty_io.c +++ b/drivers/char/tty_io.c @@ -3680,7 +3680,8 @@ void put_tty_driver(struct tty_driver *driver) kfree(driver); } -void tty_set_operations(struct tty_driver *driver, struct tty_operations *op) +void tty_set_operations(struct tty_driver *driver, + const struct tty_operations *op) { driver->open = op->open; driver->close = op->close; diff --git a/drivers/char/viocons.c b/drivers/char/viocons.c index f3efeaf2826..a362ee9c92d 100644 --- a/drivers/char/viocons.c +++ b/drivers/char/viocons.c @@ -1047,7 +1047,7 @@ static int send_open(HvLpIndex remoteLp, void *sem) 0, 0, 0, 0); } -static struct tty_operations serial_ops = { +static const struct tty_operations serial_ops = { .open = viotty_open, .close = viotty_close, .write = viotty_write, diff --git a/drivers/char/vme_scc.c b/drivers/char/vme_scc.c index bfe5ea948f6..c2ca31eb850 100644 --- a/drivers/char/vme_scc.c +++ b/drivers/char/vme_scc.c @@ -113,7 +113,7 @@ static struct real_driver scc_real_driver = { }; -static struct tty_operations scc_ops = { +static const struct tty_operations scc_ops = { .open = scc_open, .close = gs_close, .write = gs_write, diff --git a/drivers/char/vt.c b/drivers/char/vt.c index fb75da940b5..ec0c070bf15 100644 --- a/drivers/char/vt.c +++ b/drivers/char/vt.c @@ -903,6 +903,7 @@ void vc_deallocate(unsigned int currcons) if (vc_cons_allocated(currcons)) { struct vc_data *vc = vc_cons[currcons].d; vc->vc_sw->con_deinit(vc); + put_pid(vc->vt_pid); module_put(vc->vc_sw->owner); if (vc->vc_kmalloced) kfree(vc->vc_screenbuf); @@ -2674,7 +2675,7 @@ static int __init con_init(void) } console_initcall(con_init); -static struct tty_operations con_ops = { +static const struct tty_operations con_ops = { .open = con_open, .close = con_close, .write = con_write, diff --git a/drivers/char/vt_ioctl.c b/drivers/char/vt_ioctl.c index a53e382cc10..ac5d60edbaf 100644 --- a/drivers/char/vt_ioctl.c +++ b/drivers/char/vt_ioctl.c @@ -645,13 +645,16 @@ int vt_ioctl(struct tty_struct *tty, struct file * file, */ case KDSIGACCEPT: { - extern int spawnpid, spawnsig; if (!perm || !capable(CAP_KILL)) return -EPERM; if (!valid_signal(arg) || arg < 1 || arg == SIGKILL) return -EINVAL; - spawnpid = current->pid; - spawnsig = arg; + + spin_lock_irq(&vt_spawn_con.lock); + put_pid(vt_spawn_con.pid); + vt_spawn_con.pid = get_pid(task_pid(current)); + vt_spawn_con.sig = arg; + spin_unlock_irq(&vt_spawn_con.lock); return 0; } @@ -669,7 +672,7 @@ int vt_ioctl(struct tty_struct *tty, struct file * file, vc->vt_mode = tmp; /* the frsig is ignored, so we set it to 0 */ vc->vt_mode.frsig = 0; - vc->vt_pid = current->pid; + put_pid(xchg(&vc->vt_pid, get_pid(task_pid(current)))); /* no switch is required -- saw@shade.msu.ru */ vc->vt_newvt = -1; release_console_sem(); @@ -1060,7 +1063,7 @@ void reset_vc(struct vc_data *vc) vc->vt_mode.relsig = 0; vc->vt_mode.acqsig = 0; vc->vt_mode.frsig = 0; - vc->vt_pid = -1; + put_pid(xchg(&vc->vt_pid, NULL)); vc->vt_newvt = -1; if (!in_interrupt()) /* Via keyboard.c:SAK() - akpm */ reset_palette(vc); @@ -1111,7 +1114,7 @@ static void complete_change_console(struct vc_data *vc) * tell us if the process has gone or something else * is awry */ - if (kill_proc(vc->vt_pid, vc->vt_mode.acqsig, 1) != 0) { + if (kill_pid(vc->vt_pid, vc->vt_mode.acqsig, 1) != 0) { /* * The controlling process has died, so we revert back to * normal operation. In this case, we'll also change back @@ -1171,7 +1174,7 @@ void change_console(struct vc_data *new_vc) * tell us if the process has gone or something else * is awry */ - if (kill_proc(vc->vt_pid, vc->vt_mode.relsig, 1) == 0) { + if (kill_pid(vc->vt_pid, vc->vt_mode.relsig, 1) == 0) { /* * It worked. Mark the vt to switch to and * return. The process needs to send us a diff --git a/drivers/infiniband/hw/ipath/ipath_verbs.c b/drivers/infiniband/hw/ipath/ipath_verbs.c index 42eaed88c28..a5456108dba 100644 --- a/drivers/infiniband/hw/ipath/ipath_verbs.c +++ b/drivers/infiniband/hw/ipath/ipath_verbs.c @@ -1601,7 +1601,7 @@ int ipath_register_ib_device(struct ipath_devdata *dd) dev->mmap = ipath_mmap; snprintf(dev->node_desc, sizeof(dev->node_desc), - IPATH_IDSTR " %s", system_utsname.nodename); + IPATH_IDSTR " %s", init_utsname()->nodename); ret = ib_register_device(dev); if (ret) diff --git a/drivers/isdn/capi/capi.c b/drivers/isdn/capi/capi.c index 669f76393b5..11844bbfe93 100644 --- a/drivers/isdn/capi/capi.c +++ b/drivers/isdn/capi/capi.c @@ -1298,7 +1298,7 @@ static int capinc_tty_read_proc(char *page, char **start, off_t off, static struct tty_driver *capinc_tty_driver; -static struct tty_operations capinc_ops = { +static const struct tty_operations capinc_ops = { .open = capinc_tty_open, .close = capinc_tty_close, .write = capinc_tty_write, diff --git a/drivers/isdn/gigaset/interface.c b/drivers/isdn/gigaset/interface.c index bd2e4267528..596f3aebe2f 100644 --- a/drivers/isdn/gigaset/interface.c +++ b/drivers/isdn/gigaset/interface.c @@ -134,7 +134,7 @@ static int if_tiocmset(struct tty_struct *tty, struct file *file, static int if_write(struct tty_struct *tty, const unsigned char *buf, int count); -static struct tty_operations if_ops = { +static const struct tty_operations if_ops = { .open = if_open, .close = if_close, .ioctl = if_ioctl, diff --git a/drivers/isdn/gigaset/proc.c b/drivers/isdn/gigaset/proc.c index 9ae3a7f3e7b..9ad840e95db 100644 --- a/drivers/isdn/gigaset/proc.c +++ b/drivers/isdn/gigaset/proc.c @@ -83,5 +83,6 @@ void gigaset_init_dev_sysfs(struct cardstate *cs) return; gig_dbg(DEBUG_INIT, "setting up sysfs"); - class_device_create_file(cs->class, &class_device_attr_cidmode); + if (class_device_create_file(cs->class, &class_device_attr_cidmode)) + dev_err(cs->dev, "could not create sysfs attribute\n"); } diff --git a/drivers/isdn/hisax/hisax.h b/drivers/isdn/hisax/hisax.h index 75920aa0a3c..2f9d5118cea 100644 --- a/drivers/isdn/hisax/hisax.h +++ b/drivers/isdn/hisax/hisax.h @@ -1316,7 +1316,18 @@ void dlogframe(struct IsdnCardState *cs, struct sk_buff *skb, int dir); void iecpy(u_char * dest, u_char * iestart, int ieoffset); #endif /* __KERNEL__ */ -#define HZDELAY(jiffs) {int tout = jiffs; while (tout--) udelay(1000000/HZ);} +/* + * Busywait delay for `jiffs' jiffies + */ +#define HZDELAY(jiffs) do { \ + int tout = jiffs; \ + \ + while (tout--) { \ + int loops = USEC_PER_SEC / HZ; \ + while (loops--) \ + udelay(1); \ + } \ + } while (0) int ll_run(struct IsdnCardState *cs, int addfeatures); int CallcNew(void); diff --git a/drivers/isdn/i4l/isdn_tty.c b/drivers/isdn/i4l/isdn_tty.c index 9ab66e8960d..2b91bb07fc7 100644 --- a/drivers/isdn/i4l/isdn_tty.c +++ b/drivers/isdn/i4l/isdn_tty.c @@ -1860,7 +1860,7 @@ modem_write_profile(atemu * m) send_sig(SIGIO, dev->profd, 1); } -static struct tty_operations modem_ops = { +static const struct tty_operations modem_ops = { .open = isdn_tty_open, .close = isdn_tty_close, .write = isdn_tty_write, diff --git a/drivers/media/dvb/dvb-core/dvb_ringbuffer.c b/drivers/media/dvb/dvb-core/dvb_ringbuffer.c index c972fe014c5..9878183ba3f 100644 --- a/drivers/media/dvb/dvb-core/dvb_ringbuffer.c +++ b/drivers/media/dvb/dvb-core/dvb_ringbuffer.c @@ -26,7 +26,6 @@ -#define __KERNEL_SYSCALLS__ #include <linux/errno.h> #include <linux/kernel.h> #include <linux/module.h> diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile index 19c2b85249c..c1bf1fb04c5 100644 --- a/drivers/misc/Makefile +++ b/drivers/misc/Makefile @@ -3,5 +3,6 @@ # obj- := misc.o # Dummy rule to force built-in.o to be made -obj-$(CONFIG_IBM_ASM) += ibmasm/ +obj-$(CONFIG_IBM_ASM) += ibmasm/ obj-$(CONFIG_HDPU_FEATURES) += hdpuftrs/ +obj-$(CONFIG_LKDTM) += lkdtm.o diff --git a/drivers/misc/lkdtm.c b/drivers/misc/lkdtm.c new file mode 100644 index 00000000000..e689ee94ac3 --- /dev/null +++ b/drivers/misc/lkdtm.c @@ -0,0 +1,342 @@ +/* + * Kprobe module for testing crash dumps + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2006 + * + * Author: Ankita Garg <ankita@in.ibm.com> + * + * This module induces system failures at predefined crashpoints to + * evaluate the reliability of crash dumps obtained using different dumping + * solutions. + * + * It is adapted from the Linux Kernel Dump Test Tool by + * Fernando Luis Vazquez Cao <http://lkdtt.sourceforge.net> + * + * Usage : insmod lkdtm.ko [recur_count={>0}] cpoint_name=<> cpoint_type=<> + * [cpoint_count={>0}] + * + * recur_count : Recursion level for the stack overflow test. Default is 10. + * + * cpoint_name : Crash point where the kernel is to be crashed. It can be + * one of INT_HARDWARE_ENTRY, INT_HW_IRQ_EN, INT_TASKLET_ENTRY, + * FS_DEVRW, MEM_SWAPOUT, TIMERADD, SCSI_DISPATCH_CMD, + * IDE_CORE_CP + * + * cpoint_type : Indicates the action to be taken on hitting the crash point. + * It can be one of PANIC, BUG, EXCEPTION, LOOP, OVERFLOW + * + * cpoint_count : Indicates the number of times the crash point is to be hit + * to trigger an action. The default is 10. + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/kprobes.h> +#include <linux/kallsyms.h> +#include <linux/init.h> +#include <linux/irq.h> +#include <linux/interrupt.h> +#include <scsi/scsi_cmnd.h> + +#ifdef CONFIG_IDE +#include <linux/ide.h> +#endif + +#define NUM_CPOINTS 8 +#define NUM_CPOINT_TYPES 5 +#define DEFAULT_COUNT 10 +#define REC_NUM_DEFAULT 10 + +enum cname { + INVALID, + INT_HARDWARE_ENTRY, + INT_HW_IRQ_EN, + INT_TASKLET_ENTRY, + FS_DEVRW, + MEM_SWAPOUT, + TIMERADD, + SCSI_DISPATCH_CMD, + IDE_CORE_CP +}; + +enum ctype { + NONE, + PANIC, + BUG, + EXCEPTION, + LOOP, + OVERFLOW +}; + +static char* cp_name[] = { + "INT_HARDWARE_ENTRY", + "INT_HW_IRQ_EN", + "INT_TASKLET_ENTRY", + "FS_DEVRW", + "MEM_SWAPOUT", + "TIMERADD", + "SCSI_DISPATCH_CMD", + "IDE_CORE_CP" +}; + +static char* cp_type[] = { + "PANIC", + "BUG", + "EXCEPTION", + "LOOP", + "OVERFLOW" +}; + +static struct jprobe lkdtm; + +static int lkdtm_parse_commandline(void); +static void lkdtm_handler(void); + +static char* cpoint_name = INVALID; +static char* cpoint_type = NONE; +static int cpoint_count = DEFAULT_COUNT; +static int recur_count = REC_NUM_DEFAULT; + +static enum cname cpoint = INVALID; +static enum ctype cptype = NONE; +static int count = DEFAULT_COUNT; + +module_param(recur_count, int, 0644); +MODULE_PARM_DESC(recur_count, "Recurcion level for the stack overflow test,\ + default is 10"); +module_param(cpoint_name, charp, 0644); +MODULE_PARM_DESC(cpoint_name, "Crash Point, where kernel is to be crashed"); +module_param(cpoint_type, charp, 06444); +MODULE_PARM_DESC(cpoint_type, "Crash Point Type, action to be taken on\ + hitting the crash point"); +module_param(cpoint_count, int, 06444); +MODULE_PARM_DESC(cpoint_count, "Crash Point Count, number of times the \ + crash point is to be hit to trigger action"); + +unsigned int jp_do_irq(unsigned int irq, struct pt_regs *regs) +{ + lkdtm_handler(); + jprobe_return(); + return 0; +} + +irqreturn_t jp_handle_irq_event(unsigned int irq, struct pt_regs *regs, + struct irqaction *action) +{ + lkdtm_handler(); + jprobe_return(); + return 0; +} + +void jp_tasklet_action(struct softirq_action *a) +{ + lkdtm_handler(); + jprobe_return(); +} + +void jp_ll_rw_block(int rw, int nr, struct buffer_head *bhs[]) +{ + lkdtm_handler(); + jprobe_return(); +} + +struct scan_control; + +unsigned long jp_shrink_page_list(struct list_head *page_list, + struct scan_control *sc) +{ + lkdtm_handler(); + jprobe_return(); + return 0; +} + +int jp_hrtimer_start(struct hrtimer *timer, ktime_t tim, + const enum hrtimer_mode mode) +{ + lkdtm_handler(); + jprobe_return(); + return 0; +} + +int jp_scsi_dispatch_cmd(struct scsi_cmnd *cmd) +{ + lkdtm_handler(); + jprobe_return(); + return 0; +} + +#ifdef CONFIG_IDE +int jp_generic_ide_ioctl(ide_drive_t *drive, struct file *file, + struct block_device *bdev, unsigned int cmd, + unsigned long arg) +{ + lkdtm_handler(); + jprobe_return(); + return 0; +} +#endif + +static int lkdtm_parse_commandline(void) +{ + int i; + + if (cpoint_name == INVALID || cpoint_type == NONE || + cpoint_count < 1 || recur_count < 1) + return -EINVAL; + + for (i = 0; i < NUM_CPOINTS; ++i) { + if (!strcmp(cpoint_name, cp_name[i])) { + cpoint = i + 1; + break; + } + } + + for (i = 0; i < NUM_CPOINT_TYPES; ++i) { + if (!strcmp(cpoint_type, cp_type[i])) { + cptype = i + 1; + break; + } + } + + if (cpoint == INVALID || cptype == NONE) + return -EINVAL; + + count = cpoint_count; + + return 0; +} + +static int recursive_loop(int a) +{ + char buf[1024]; + + memset(buf,0xFF,1024); + recur_count--; + if (!recur_count) + return 0; + else + return recursive_loop(a); +} + +void lkdtm_handler(void) +{ + printk(KERN_INFO "lkdtm : Crash point %s of type %s hit\n", + cpoint_name, cpoint_type); + --count; + + if (count == 0) { + switch (cptype) { + case NONE: + break; + case PANIC: + printk(KERN_INFO "lkdtm : PANIC\n"); + panic("dumptest"); + break; + case BUG: + printk(KERN_INFO "lkdtm : BUG\n"); + BUG(); + break; + case EXCEPTION: + printk(KERN_INFO "lkdtm : EXCEPTION\n"); + *((int *) 0) = 0; + break; + case LOOP: + printk(KERN_INFO "lkdtm : LOOP\n"); + for (;;); + break; + case OVERFLOW: + printk(KERN_INFO "lkdtm : OVERFLOW\n"); + (void) recursive_loop(0); + break; + default: + break; + } + count = cpoint_count; + } +} + +int lkdtm_module_init(void) +{ + int ret; + + if (lkdtm_parse_commandline() == -EINVAL) { + printk(KERN_INFO "lkdtm : Invalid command\n"); + return -EINVAL; + } + + switch (cpoint) { + case INT_HARDWARE_ENTRY: + lkdtm.kp.symbol_name = "__do_IRQ"; + lkdtm.entry = (kprobe_opcode_t*) jp_do_irq; + break; + case INT_HW_IRQ_EN: + lkdtm.kp.symbol_name = "handle_IRQ_event"; + lkdtm.entry = (kprobe_opcode_t*) jp_handle_irq_event; + break; + case INT_TASKLET_ENTRY: + lkdtm.kp.symbol_name = "tasklet_action"; + lkdtm.entry = (kprobe_opcode_t*) jp_tasklet_action; + break; + case FS_DEVRW: + lkdtm.kp.symbol_name = "ll_rw_block"; + lkdtm.entry = (kprobe_opcode_t*) jp_ll_rw_block; + break; + case MEM_SWAPOUT: + lkdtm.kp.symbol_name = "shrink_page_list"; + lkdtm.entry = (kprobe_opcode_t*) jp_shrink_page_list; + break; + case TIMERADD: + lkdtm.kp.symbol_name = "hrtimer_start"; + lkdtm.entry = (kprobe_opcode_t*) jp_hrtimer_start; + break; + case SCSI_DISPATCH_CMD: + lkdtm.kp.symbol_name = "scsi_dispatch_cmd"; + lkdtm.entry = (kprobe_opcode_t*) jp_scsi_dispatch_cmd; + break; + case IDE_CORE_CP: +#ifdef CONFIG_IDE + lkdtm.kp.symbol_name = "generic_ide_ioctl"; + lkdtm.entry = (kprobe_opcode_t*) jp_generic_ide_ioctl; +#else + printk(KERN_INFO "lkdtm : Crash point not available\n"); +#endif + break; + default: + printk(KERN_INFO "lkdtm : Invalid Crash Point\n"); + break; + } + + if ((ret = register_jprobe(&lkdtm)) < 0) { + printk(KERN_INFO "lkdtm : Couldn't register jprobe\n"); + return ret; + } + + printk(KERN_INFO "lkdtm : Crash point %s of type %s registered\n", + cpoint_name, cpoint_type); + return 0; +} + +void lkdtm_module_exit(void) +{ + unregister_jprobe(&lkdtm); + printk(KERN_INFO "lkdtm : Crash point unregistered\n"); +} + +module_init(lkdtm_module_init); +module_exit(lkdtm_module_exit); + +MODULE_LICENSE("GPL"); diff --git a/drivers/net/tun.c b/drivers/net/tun.c index bc0face01d2..151a2e10e4f 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -697,7 +697,7 @@ static int tun_chr_fasync(int fd, struct file *file, int on) return ret; if (on) { - ret = f_setown(file, current->pid, 0); + ret = __f_setown(file, task_pid(current), PIDTYPE_PID, 0); if (ret) return ret; tun->flags |= TUN_FASYNC; diff --git a/drivers/net/wireless/ipw2100.c b/drivers/net/wireless/ipw2100.c index 97937809de0..eddfa8786e8 100644 --- a/drivers/net/wireless/ipw2100.c +++ b/drivers/net/wireless/ipw2100.c @@ -150,7 +150,6 @@ that only one external action is invoked at a time. #include <linux/skbuff.h> #include <asm/uaccess.h> #include <asm/io.h> -#define __KERNEL_SYSCALLS__ #include <linux/fs.h> #include <linux/mm.h> #include <linux/slab.h> diff --git a/drivers/parisc/led.c b/drivers/parisc/led.c index bf00fa2537b..8dac2ba82bb 100644 --- a/drivers/parisc/led.c +++ b/drivers/parisc/led.c @@ -684,7 +684,7 @@ int __init led_init(void) int ret; snprintf(lcd_text_default, sizeof(lcd_text_default), - "Linux %s", system_utsname.release); + "Linux %s", init_utsname()->release); /* Work around the buggy PDC of KittyHawk-machines */ switch (CPU_HVERSION) { diff --git a/drivers/parisc/power.c b/drivers/parisc/power.c index fad5a33bf0f..4a9f025a6b5 100644 --- a/drivers/parisc/power.c +++ b/drivers/parisc/power.c @@ -84,8 +84,7 @@ static void deferred_poweroff(void *dummy) { - extern int cad_pid; /* from kernel/sys.c */ - if (kill_proc(cad_pid, SIGINT, 1)) { + if (kill_cad_pid(SIGINT, 1)) { /* just in case killing init process failed */ machine_power_off(); } diff --git a/drivers/s390/char/con3215.c b/drivers/s390/char/con3215.c index 2fa566fa6da..d7de175d53f 100644 --- a/drivers/s390/char/con3215.c +++ b/drivers/s390/char/con3215.c @@ -1103,7 +1103,7 @@ tty3215_start(struct tty_struct *tty) } } -static struct tty_operations tty3215_ops = { +static const struct tty_operations tty3215_ops = { .open = tty3215_open, .close = tty3215_close, .write = tty3215_write, diff --git a/drivers/s390/char/fs3270.c b/drivers/s390/char/fs3270.c index b4557fa3085..78f8bda81da 100644 --- a/drivers/s390/char/fs3270.c +++ b/drivers/s390/char/fs3270.c @@ -27,7 +27,7 @@ struct raw3270_fn fs3270_fn; struct fs3270 { struct raw3270_view view; - pid_t fs_pid; /* Pid of controlling program. */ + struct pid *fs_pid; /* Pid of controlling program. */ int read_command; /* ccw command to use for reads. */ int write_command; /* ccw command to use for writes. */ int attention; /* Got attention. */ @@ -102,7 +102,7 @@ fs3270_restore_callback(struct raw3270_request *rq, void *data) fp = (struct fs3270 *) rq->view; if (rq->rc != 0 || rq->rescnt != 0) { if (fp->fs_pid) - kill_proc(fp->fs_pid, SIGHUP, 1); + kill_pid(fp->fs_pid, SIGHUP, 1); } fp->rdbuf_size = 0; raw3270_request_reset(rq); @@ -173,7 +173,7 @@ fs3270_save_callback(struct raw3270_request *rq, void *data) */ if (rq->rc != 0 || rq->rescnt == 0) { if (fp->fs_pid) - kill_proc(fp->fs_pid, SIGHUP, 1); + kill_pid(fp->fs_pid, SIGHUP, 1); fp->rdbuf_size = 0; } else fp->rdbuf_size = fp->rdbuf->size - rq->rescnt; @@ -442,7 +442,7 @@ fs3270_open(struct inode *inode, struct file *filp) return PTR_ERR(fp); init_waitqueue_head(&fp->wait); - fp->fs_pid = current->pid; + fp->fs_pid = get_pid(task_pid(current)); rc = raw3270_add_view(&fp->view, &fs3270_fn, minor); if (rc) { fs3270_free_view(&fp->view); @@ -480,7 +480,8 @@ fs3270_close(struct inode *inode, struct file *filp) fp = filp->private_data; filp->private_data = NULL; if (fp) { - fp->fs_pid = 0; + put_pid(fp->fs_pid); + fp->fs_pid = NULL; raw3270_reset(&fp->view); raw3270_put_view(&fp->view); raw3270_del_view(&fp->view); diff --git a/drivers/s390/char/sclp_tty.c b/drivers/s390/char/sclp_tty.c index f6cf9023039..6f43e04dbef 100644 --- a/drivers/s390/char/sclp_tty.c +++ b/drivers/s390/char/sclp_tty.c @@ -711,7 +711,7 @@ static struct sclp_register sclp_input_event = .receiver_fn = sclp_tty_receiver }; -static struct tty_operations sclp_ops = { +static const struct tty_operations sclp_ops = { .open = sclp_tty_open, .close = sclp_tty_close, .write = sclp_tty_write, diff --git a/drivers/s390/char/sclp_vt220.c b/drivers/s390/char/sclp_vt220.c index 54fba6f1718..723bf4191bf 100644 --- a/drivers/s390/char/sclp_vt220.c +++ b/drivers/s390/char/sclp_vt220.c @@ -655,7 +655,7 @@ __sclp_vt220_init(int early) return 0; } -static struct tty_operations sclp_vt220_ops = { +static const struct tty_operations sclp_vt220_ops = { .open = sclp_vt220_open, .close = sclp_vt220_close, .write = sclp_vt220_write, diff --git a/drivers/s390/char/tty3270.c b/drivers/s390/char/tty3270.c index 06e2eeec847..4717c361160 100644 --- a/drivers/s390/char/tty3270.c +++ b/drivers/s390/char/tty3270.c @@ -1737,7 +1737,7 @@ tty3270_ioctl(struct tty_struct *tty, struct file *file, return kbd_ioctl(tp->kbd, file, cmd, arg); } -static struct tty_operations tty3270_ops = { +static const struct tty_operations tty3270_ops = { .open = tty3270_open, .close = tty3270_close, .write = tty3270_write, diff --git a/drivers/s390/s390mach.c b/drivers/s390/s390mach.c index 479364d0332..e088b5e2871 100644 --- a/drivers/s390/s390mach.c +++ b/drivers/s390/s390mach.c @@ -208,7 +208,7 @@ s390_handle_mcck(void) */ __ctl_clear_bit(14, 24); /* Disable WARNING MCH */ if (xchg(&mchchk_wng_posted, 1) == 0) - kill_proc(1, SIGPWR, 1); + kill_cad_pid(SIGPWR, 1); } #endif diff --git a/drivers/sbus/char/aurora.c b/drivers/sbus/char/aurora.c index 4fdb2c93221..a305d409154 100644 --- a/drivers/sbus/char/aurora.c +++ b/drivers/sbus/char/aurora.c @@ -2187,7 +2187,7 @@ static void do_softint(void *private_) #endif } -static struct tty_operations aurora_ops = { +static const struct tty_operations aurora_ops = { .open = aurora_open, .close = aurora_close, .write = aurora_write, diff --git a/drivers/sbus/char/bbc_envctrl.c b/drivers/sbus/char/bbc_envctrl.c index 1cc706e1111..d27e4f6d704 100644 --- a/drivers/sbus/char/bbc_envctrl.c +++ b/drivers/sbus/char/bbc_envctrl.c @@ -4,9 +4,6 @@ * Copyright (C) 2001 David S. Miller (davem@redhat.com) */ -#define __KERNEL_SYSCALLS__ -static int errno; - #include <linux/kernel.h> #include <linux/kthread.h> #include <linux/sched.h> @@ -200,7 +197,7 @@ static void do_envctrl_shutdown(struct bbc_cpu_temperature *tp) printk(KERN_CRIT "kenvctrld: Shutting down the system now.\n"); shutting_down = 1; - if (execve("/sbin/shutdown", argv, envp) < 0) + if (kernel_execve("/sbin/shutdown", argv, envp) < 0) printk(KERN_CRIT "envctrl: shutdown execution failed\n"); } diff --git a/drivers/sbus/char/envctrl.c b/drivers/sbus/char/envctrl.c index 063e676a3ac..728a133d0fc 100644 --- a/drivers/sbus/char/envctrl.c +++ b/drivers/sbus/char/envctrl.c @@ -19,9 +19,6 @@ * Daniele Bellucci <bellucda@tiscali.it> */ -#define __KERNEL_SYSCALLS__ -static int errno; - #include <linux/module.h> #include <linux/sched.h> #include <linux/kthread.h> @@ -976,13 +973,15 @@ static void envctrl_do_shutdown(void) "HOME=/", "TERM=linux", "PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL }; char *argv[] = { "/sbin/shutdown", "-h", "now", NULL }; + int ret; if (inprog != 0) return; inprog = 1; printk(KERN_CRIT "kenvctrld: WARNING: Shutting down the system now.\n"); - if (0 > execve("/sbin/shutdown", argv, envp)) { + ret = kernel_execve("/sbin/shutdown", argv, envp); + if (ret < 0) { printk(KERN_CRIT "kenvctrld: WARNING: system shutdown failed!\n"); inprog = 0; /* unlikely to succeed, but we could try again */ } diff --git a/drivers/scsi/lpfc/lpfc_ct.c b/drivers/scsi/lpfc/lpfc_ct.c index ae410645899..1b53afb1cb5 100644 --- a/drivers/scsi/lpfc/lpfc_ct.c +++ b/drivers/scsi/lpfc/lpfc_ct.c @@ -933,8 +933,8 @@ lpfc_fdmi_cmd(struct lpfc_hba * phba, struct lpfc_nodelist * ndlp, int cmdcode) ae = (ATTRIBUTE_ENTRY *) ((uint8_t *) rh + size); ae->ad.bits.AttrType = be16_to_cpu(OS_NAME_VERSION); sprintf(ae->un.OsNameVersion, "%s %s %s", - system_utsname.sysname, system_utsname.release, - system_utsname.version); + init_utsname()->sysname, init_utsname()->release, + init_utsname()->version); len = strlen(ae->un.OsNameVersion); len += (len & 3) ? (4 - (len & 3)) : 4; ae->ad.bits.AttrLen = be16_to_cpu(FOURBYTES + len); @@ -1052,7 +1052,7 @@ lpfc_fdmi_cmd(struct lpfc_hba * phba, struct lpfc_nodelist * ndlp, int cmdcode) size); ae->ad.bits.AttrType = be16_to_cpu(HOST_NAME); sprintf(ae->un.HostName, "%s", - system_utsname.nodename); + init_utsname()->nodename); len = strlen(ae->un.HostName); len += (len & 3) ? (4 - (len & 3)) : 4; ae->ad.bits.AttrLen = @@ -1140,7 +1140,7 @@ lpfc_fdmi_tmo_handler(struct lpfc_hba *phba) ndlp = lpfc_findnode_did(phba, NLP_SEARCH_ALL, FDMI_DID); if (ndlp) { - if (system_utsname.nodename[0] != '\0') { + if (init_utsname()->nodename[0] != '\0') { lpfc_fdmi_cmd(phba, ndlp, SLI_MGMT_DHBA); } else { mod_timer(&phba->fc_fdmitmo, jiffies + HZ * 60); diff --git a/drivers/serial/68328serial.c b/drivers/serial/68328serial.c index 993a702422e..bac853c5abb 100644 --- a/drivers/serial/68328serial.c +++ b/drivers/serial/68328serial.c @@ -1378,7 +1378,7 @@ void startup_console(void) #endif /* CONFIG_PM_LEGACY */ -static struct tty_operations rs_ops = { +static const struct tty_operations rs_ops = { .open = rs_open, .close = rs_close, .write = rs_write, diff --git a/drivers/serial/68360serial.c b/drivers/serial/68360serial.c index e80e70e9b12..1b299e8c57c 100644 --- a/drivers/serial/68360serial.c +++ b/drivers/serial/68360serial.c @@ -2424,7 +2424,7 @@ long console_360_init(long kmem_start, long kmem_end) */ static int baud_idx; -static struct tty_operations rs_360_ops = { +static const struct tty_operations rs_360_ops = { .owner = THIS_MODULE, .open = rs_360_open, .close = rs_360_close, diff --git a/drivers/serial/crisv10.c b/drivers/serial/crisv10.c index cabd048c863..9851d9eff02 100644 --- a/drivers/serial/crisv10.c +++ b/drivers/serial/crisv10.c @@ -4825,7 +4825,7 @@ show_serial_version(void) /* rs_init inits the driver at boot (using the module_init chain) */ -static struct tty_operations rs_ops = { +static const struct tty_operations rs_ops = { .open = rs_open, .close = rs_close, .write = rs_write, diff --git a/drivers/serial/mcfserial.c b/drivers/serial/mcfserial.c index 832abd3c470..00d7859c167 100644 --- a/drivers/serial/mcfserial.c +++ b/drivers/serial/mcfserial.c @@ -1666,7 +1666,7 @@ static void show_serial_version(void) printk(mcfrs_drivername); } -static struct tty_operations mcfrs_ops = { +static const struct tty_operations mcfrs_ops = { .open = mcfrs_open, .close = mcfrs_close, .write = mcfrs_write, diff --git a/drivers/serial/serial_core.c b/drivers/serial/serial_core.c index 5f7ba1adb30..de5e8930a6f 100644 --- a/drivers/serial/serial_core.c +++ b/drivers/serial/serial_core.c @@ -2111,7 +2111,7 @@ uart_configure_port(struct uart_driver *drv, struct uart_state *state, } } -static struct tty_operations uart_ops = { +static const struct tty_operations uart_ops = { .open = uart_open, .close = uart_close, .write = uart_write, diff --git a/drivers/tc/zs.c b/drivers/tc/zs.c index 5e8a27620f6..622881f2676 100644 --- a/drivers/tc/zs.c +++ b/drivers/tc/zs.c @@ -1701,7 +1701,7 @@ static void __init probe_sccs(void) spin_unlock_irqrestore(&zs_lock, flags); } -static struct tty_operations serial_ops = { +static const struct tty_operations serial_ops = { .open = rs_open, .close = rs_close, .write = rs_write, diff --git a/drivers/usb/class/cdc-acm.c b/drivers/usb/class/cdc-acm.c index ca90326f2f5..71288295df2 100644 --- a/drivers/usb/class/cdc-acm.c +++ b/drivers/usb/class/cdc-acm.c @@ -1120,7 +1120,7 @@ static struct usb_driver acm_driver = { * TTY driver structures. */ -static struct tty_operations acm_ops = { +static const struct tty_operations acm_ops = { .open = acm_tty_open, .close = acm_tty_close, .write = acm_tty_write, diff --git a/drivers/usb/core/devio.c b/drivers/usb/core/devio.c index a94c63bef63..3f509beb88e 100644 --- a/drivers/usb/core/devio.c +++ b/drivers/usb/core/devio.c @@ -65,7 +65,7 @@ DEFINE_MUTEX(usbfs_mutex); struct async { struct list_head asynclist; struct dev_state *ps; - pid_t pid; + struct pid *pid; uid_t uid, euid; unsigned int signr; unsigned int ifnum; @@ -225,6 +225,7 @@ static struct async *alloc_async(unsigned int numisoframes) static void free_async(struct async *as) { + put_pid(as->pid); kfree(as->urb->transfer_buffer); kfree(as->urb->setup_packet); usb_free_urb(as->urb); @@ -317,7 +318,7 @@ static void async_completed(struct urb *urb, struct pt_regs *regs) sinfo.si_errno = as->urb->status; sinfo.si_code = SI_ASYNCIO; sinfo.si_addr = as->userurb; - kill_proc_info_as_uid(as->signr, &sinfo, as->pid, as->uid, + kill_pid_info_as_uid(as->signr, &sinfo, as->pid, as->uid, as->euid, as->secid); } snoop(&urb->dev->dev, "urb complete\n"); @@ -573,7 +574,7 @@ static int usbdev_open(struct inode *inode, struct file *file) INIT_LIST_HEAD(&ps->async_completed); init_waitqueue_head(&ps->wait); ps->discsignr = 0; - ps->disc_pid = current->pid; + ps->disc_pid = get_pid(task_pid(current)); ps->disc_uid = current->uid; ps->disc_euid = current->euid; ps->disccontext = NULL; @@ -611,6 +612,7 @@ static int usbdev_release(struct inode *inode, struct file *file) usb_autosuspend_device(dev, 1); usb_unlock_device(dev); usb_put_dev(dev); + put_pid(ps->disc_pid); kfree(ps); return 0; } @@ -1063,7 +1065,7 @@ static int proc_do_submiturb(struct dev_state *ps, struct usbdevfs_urb *uurb, as->userbuffer = NULL; as->signr = uurb->signr; as->ifnum = ifnum; - as->pid = current->pid; + as->pid = get_pid(task_pid(current)); as->uid = current->uid; as->euid = current->euid; security_task_getsecid(current, &as->secid); diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c index 37f9f5e7425..e658089f7b5 100644 --- a/drivers/usb/core/hcd.c +++ b/drivers/usb/core/hcd.c @@ -318,8 +318,8 @@ static int rh_string ( // id 3 == vendor description } else if (id == 3) { - snprintf (buf, sizeof buf, "%s %s %s", system_utsname.sysname, - system_utsname.release, hcd->driver->description); + snprintf (buf, sizeof buf, "%s %s %s", init_utsname()->sysname, + init_utsname()->release, hcd->driver->description); // unsupported IDs --> "protocol stall" } else diff --git a/drivers/usb/core/inode.c b/drivers/usb/core/inode.c index 7c77c2d8d30..b5d6a79af0b 100644 --- a/drivers/usb/core/inode.c +++ b/drivers/usb/core/inode.c @@ -699,7 +699,7 @@ static void usbfs_remove_device(struct usb_device *dev) sinfo.si_errno = EPIPE; sinfo.si_code = SI_ASYNCIO; sinfo.si_addr = ds->disccontext; - kill_proc_info_as_uid(ds->discsignr, &sinfo, ds->disc_pid, ds->disc_uid, ds->disc_euid, ds->secid); + kill_pid_info_as_uid(ds->discsignr, &sinfo, ds->disc_pid, ds->disc_uid, ds->disc_euid, ds->secid); } } } diff --git a/drivers/usb/core/usb.h b/drivers/usb/core/usb.h index f69df137ec0..13322e33f91 100644 --- a/drivers/usb/core/usb.h +++ b/drivers/usb/core/usb.h @@ -139,7 +139,7 @@ struct dev_state { struct list_head async_completed; wait_queue_head_t wait; /* wake up if a request completed */ unsigned int discsignr; - pid_t disc_pid; + struct pid *disc_pid; uid_t disc_uid, disc_euid; void __user *disccontext; unsigned long ifclaimed; diff --git a/drivers/usb/gadget/ether.c b/drivers/usb/gadget/ether.c index 366dc0a9e52..1c17d26d03b 100644 --- a/drivers/usb/gadget/ether.c +++ b/drivers/usb/gadget/ether.c @@ -2260,7 +2260,7 @@ eth_bind (struct usb_gadget *gadget) return -ENODEV; } snprintf (manufacturer, sizeof manufacturer, "%s %s/%s", - system_utsname.sysname, system_utsname.release, + init_utsname()->sysname, init_utsname()->release, gadget->name); /* If there's an RNDIS configuration, that's what Windows wants to diff --git a/drivers/usb/gadget/file_storage.c b/drivers/usb/gadget/file_storage.c index c83d3b6c68f..8b975d15538 100644 --- a/drivers/usb/gadget/file_storage.c +++ b/drivers/usb/gadget/file_storage.c @@ -4001,7 +4001,7 @@ static int __init fsg_bind(struct usb_gadget *gadget) usb_gadget_set_selfpowered(gadget); snprintf(manufacturer, sizeof manufacturer, "%s %s with %s", - system_utsname.sysname, system_utsname.release, + init_utsname()->sysname, init_utsname()->release, gadget->name); /* On a real device, serial[] would be loaded from permanent diff --git a/drivers/usb/gadget/gmidi.c b/drivers/usb/gadget/gmidi.c index b68cecd5741..83601d4009e 100644 --- a/drivers/usb/gadget/gmidi.c +++ b/drivers/usb/gadget/gmidi.c @@ -1189,7 +1189,7 @@ static int __devinit gmidi_bind(struct usb_gadget *gadget) strlcpy(manufacturer, iManufacturer, sizeof(manufacturer)); } else { snprintf(manufacturer, sizeof(manufacturer), "%s %s with %s", - system_utsname.sysname, system_utsname.release, + init_utsname()->sysname, init_utsname()->release, gadget->name); } if (iProduct) { diff --git a/drivers/usb/gadget/serial.c b/drivers/usb/gadget/serial.c index b893e3118e1..208e55a667a 100644 --- a/drivers/usb/gadget/serial.c +++ b/drivers/usb/gadget/serial.c @@ -271,7 +271,7 @@ static unsigned int use_acm = GS_DEFAULT_USE_ACM; /* tty driver struct */ -static struct tty_operations gs_tty_ops = { +static const struct tty_operations gs_tty_ops = { .open = gs_open, .close = gs_close, .write = gs_write, @@ -1434,7 +1434,7 @@ static int __init gs_bind(struct usb_gadget *gadget) return -ENOMEM; snprintf(manufacturer, sizeof(manufacturer), "%s %s with %s", - system_utsname.sysname, system_utsname.release, + init_utsname()->sysname, init_utsname()->release, gadget->name); memset(dev, 0, sizeof(struct gs_dev)); diff --git a/drivers/usb/gadget/zero.c b/drivers/usb/gadget/zero.c index b7018ee487e..0f809dd6849 100644 --- a/drivers/usb/gadget/zero.c +++ b/drivers/usb/gadget/zero.c @@ -1242,7 +1242,7 @@ autoconf_fail: EP_OUT_NAME, EP_IN_NAME); snprintf (manufacturer, sizeof manufacturer, "%s %s with %s", - system_utsname.sysname, system_utsname.release, + init_utsname()->sysname, init_utsname()->release, gadget->name); return 0; diff --git a/drivers/usb/serial/usb-serial.c b/drivers/usb/serial/usb-serial.c index 0222d92842b..8006e51c34b 100644 --- a/drivers/usb/serial/usb-serial.c +++ b/drivers/usb/serial/usb-serial.c @@ -1015,7 +1015,7 @@ void usb_serial_disconnect(struct usb_interface *interface) dev_info(dev, "device disconnected\n"); } -static struct tty_operations serial_ops = { +static const struct tty_operations serial_ops = { .open = serial_open, .close = serial_close, .write = serial_write, diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 0e9ba0b9d71..c78762051da 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -772,12 +772,12 @@ cifs_parse_mount_options(char *options, const char *devname,struct smb_vol *vol) separator[1] = 0; memset(vol->source_rfc1001_name,0x20,15); - for(i=0;i < strnlen(system_utsname.nodename,15);i++) { + for(i=0;i < strnlen(utsname()->nodename,15);i++) { /* does not have to be a perfect mapping since the field is informational, only used for servers that do not support port 445 and it can be overridden at mount time */ vol->source_rfc1001_name[i] = - toupper(system_utsname.nodename[i]); + toupper(utsname()->nodename[i]); } vol->source_rfc1001_name[15] = 0; /* null target name indicates to use *SMBSERVR default called name @@ -2153,7 +2153,7 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses, 32, nls_codepage); bcc_ptr += 2 * bytes_returned; bytes_returned = - cifs_strtoUCS((__le16 *) bcc_ptr, system_utsname.release, + cifs_strtoUCS((__le16 *) bcc_ptr, utsname()->release, 32, nls_codepage); bcc_ptr += 2 * bytes_returned; bcc_ptr += 2; @@ -2180,8 +2180,8 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses, } strcpy(bcc_ptr, "Linux version "); bcc_ptr += strlen("Linux version "); - strcpy(bcc_ptr, system_utsname.release); - bcc_ptr += strlen(system_utsname.release) + 1; + strcpy(bcc_ptr, utsname()->release); + bcc_ptr += strlen(utsname()->release) + 1; strcpy(bcc_ptr, CIFS_NETWORK_OPSYS); bcc_ptr += strlen(CIFS_NETWORK_OPSYS) + 1; } @@ -2445,7 +2445,7 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid, 32, nls_codepage); bcc_ptr += 2 * bytes_returned; bytes_returned = - cifs_strtoUCS((__le16 *) bcc_ptr, system_utsname.release, 32, + cifs_strtoUCS((__le16 *) bcc_ptr, utsname()->release, 32, nls_codepage); bcc_ptr += 2 * bytes_returned; bcc_ptr += 2; /* null terminate Linux version */ @@ -2462,8 +2462,8 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid, } else { /* ASCII */ strcpy(bcc_ptr, "Linux version "); bcc_ptr += strlen("Linux version "); - strcpy(bcc_ptr, system_utsname.release); - bcc_ptr += strlen(system_utsname.release) + 1; + strcpy(bcc_ptr, utsname()->release); + bcc_ptr += strlen(utsname()->release) + 1; strcpy(bcc_ptr, CIFS_NETWORK_OPSYS); bcc_ptr += strlen(CIFS_NETWORK_OPSYS) + 1; bcc_ptr++; /* empty domain field */ @@ -2836,7 +2836,7 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xid, struct cifsSesInfo *ses, 32, nls_codepage); bcc_ptr += 2 * bytes_returned; bytes_returned = - cifs_strtoUCS((__le16 *) bcc_ptr, system_utsname.release, 32, + cifs_strtoUCS((__le16 *) bcc_ptr, utsname()->release, 32, nls_codepage); bcc_ptr += 2 * bytes_returned; bcc_ptr += 2; /* null term version string */ @@ -2888,8 +2888,8 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xid, struct cifsSesInfo *ses, strcpy(bcc_ptr, "Linux version "); bcc_ptr += strlen("Linux version "); - strcpy(bcc_ptr, system_utsname.release); - bcc_ptr += strlen(system_utsname.release) + 1; + strcpy(bcc_ptr, utsname()->release); + bcc_ptr += strlen(utsname()->release) + 1; strcpy(bcc_ptr, CIFS_NETWORK_OPSYS); bcc_ptr += strlen(CIFS_NETWORK_OPSYS) + 1; bcc_ptr++; /* null domain */ diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index d1705ab8136..22b4c35dcfe 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -111,7 +111,7 @@ static void unicode_ssetup_strings(char ** pbcc_area, struct cifsSesInfo *ses, bytes_ret = cifs_strtoUCS((__le16 *)bcc_ptr, "Linux version ", 32, nls_cp); bcc_ptr += 2 * bytes_ret; - bytes_ret = cifs_strtoUCS((__le16 *) bcc_ptr, system_utsname.release, + bytes_ret = cifs_strtoUCS((__le16 *) bcc_ptr, init_utsname()->release, 32, nls_cp); bcc_ptr += 2 * bytes_ret; bcc_ptr += 2; /* trailing null */ @@ -158,8 +158,8 @@ static void ascii_ssetup_strings(char ** pbcc_area, struct cifsSesInfo *ses, strcpy(bcc_ptr, "Linux version "); bcc_ptr += strlen("Linux version "); - strcpy(bcc_ptr, system_utsname.release); - bcc_ptr += strlen(system_utsname.release) + 1; + strcpy(bcc_ptr, init_utsname()->release); + bcc_ptr += strlen(init_utsname()->release) + 1; strcpy(bcc_ptr, CIFS_NETWORK_OPSYS); bcc_ptr += strlen(CIFS_NETWORK_OPSYS) + 1; diff --git a/fs/compat.c b/fs/compat.c index 13fb08d096c..d98c96f4a44 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -56,8 +56,6 @@ int compat_log = 1; -extern void sigset_from_compat(sigset_t *set, compat_sigset_t *compat); - int compat_printk(const char *fmt, ...) { va_list ap; diff --git a/fs/dnotify.c b/fs/dnotify.c index f932591df5a..2b0442db67e 100644 --- a/fs/dnotify.c +++ b/fs/dnotify.c @@ -92,7 +92,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg) prev = &odn->dn_next; } - error = f_setown(filp, current->pid, 0); + error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0); if (error) goto out_free; diff --git a/fs/exec.c b/fs/exec.c index 6270f8f20a6..d993ea1a81a 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1318,7 +1318,7 @@ static void format_corename(char *corename, const char *pattern, long signr) case 'h': down_read(&uts_sem); rc = snprintf(out_ptr, out_end - out_ptr, - "%s", system_utsname.nodename); + "%s", utsname()->nodename); up_read(&uts_sem); if (rc > out_end - out_ptr) goto out; diff --git a/fs/fcntl.c b/fs/fcntl.c index d35cbc6bc11..e4f26165f12 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -250,19 +250,22 @@ static int setfl(int fd, struct file * filp, unsigned long arg) return error; } -static void f_modown(struct file *filp, unsigned long pid, +static void f_modown(struct file *filp, struct pid *pid, enum pid_type type, uid_t uid, uid_t euid, int force) { write_lock_irq(&filp->f_owner.lock); if (force || !filp->f_owner.pid) { - filp->f_owner.pid = pid; + put_pid(filp->f_owner.pid); + filp->f_owner.pid = get_pid(pid); + filp->f_owner.pid_type = type; filp->f_owner.uid = uid; filp->f_owner.euid = euid; } write_unlock_irq(&filp->f_owner.lock); } -int f_setown(struct file *filp, unsigned long arg, int force) +int __f_setown(struct file *filp, struct pid *pid, enum pid_type type, + int force) { int err; @@ -270,15 +273,44 @@ int f_setown(struct file *filp, unsigned long arg, int force) if (err) return err; - f_modown(filp, arg, current->uid, current->euid, force); + f_modown(filp, pid, type, current->uid, current->euid, force); return 0; } +EXPORT_SYMBOL(__f_setown); +int f_setown(struct file *filp, unsigned long arg, int force) +{ + enum pid_type type; + struct pid *pid; + int who = arg; + int result; + type = PIDTYPE_PID; + if (who < 0) { + type = PIDTYPE_PGID; + who = -who; + } + rcu_read_lock(); + pid = find_pid(who); + result = __f_setown(filp, pid, type, force); + rcu_read_unlock(); + return result; +} EXPORT_SYMBOL(f_setown); void f_delown(struct file *filp) { - f_modown(filp, 0, 0, 0, 1); + f_modown(filp, NULL, PIDTYPE_PID, 0, 0, 1); +} + +pid_t f_getown(struct file *filp) +{ + pid_t pid; + read_lock(&filp->f_owner.lock); + pid = pid_nr(filp->f_owner.pid); + if (filp->f_owner.pid_type == PIDTYPE_PGID) + pid = -pid; + read_unlock(&filp->f_owner.lock); + return pid; } static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, @@ -319,7 +351,7 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, * current syscall conventions, the only way * to fix this will be in libc. */ - err = filp->f_owner.pid; + err = f_getown(filp); force_successful_syscall_return(); break; case F_SETOWN: @@ -470,24 +502,19 @@ static void send_sigio_to_task(struct task_struct *p, void send_sigio(struct fown_struct *fown, int fd, int band) { struct task_struct *p; - int pid; + enum pid_type type; + struct pid *pid; read_lock(&fown->lock); + type = fown->pid_type; pid = fown->pid; if (!pid) goto out_unlock_fown; read_lock(&tasklist_lock); - if (pid > 0) { - p = find_task_by_pid(pid); - if (p) { - send_sigio_to_task(p, fown, fd, band); - } - } else { - do_each_task_pid(-pid, PIDTYPE_PGID, p) { - send_sigio_to_task(p, fown, fd, band); - } while_each_task_pid(-pid, PIDTYPE_PGID, p); - } + do_each_pid_task(pid, type, p) { + send_sigio_to_task(p, fown, fd, band); + } while_each_pid_task(pid, type, p); read_unlock(&tasklist_lock); out_unlock_fown: read_unlock(&fown->lock); @@ -503,9 +530,12 @@ static void send_sigurg_to_task(struct task_struct *p, int send_sigurg(struct fown_struct *fown) { struct task_struct *p; - int pid, ret = 0; + enum pid_type type; + struct pid *pid; + int ret = 0; read_lock(&fown->lock); + type = fown->pid_type; pid = fown->pid; if (!pid) goto out_unlock_fown; @@ -513,16 +543,9 @@ int send_sigurg(struct fown_struct *fown) ret = 1; read_lock(&tasklist_lock); - if (pid > 0) { - p = find_task_by_pid(pid); - if (p) { - send_sigurg_to_task(p, fown); - } - } else { - do_each_task_pid(-pid, PIDTYPE_PGID, p) { - send_sigurg_to_task(p, fown); - } while_each_task_pid(-pid, PIDTYPE_PGID, p); - } + do_each_pid_task(pid, type, p) { + send_sigurg_to_task(p, fown); + } while_each_pid_task(pid, type, p); read_unlock(&tasklist_lock); out_unlock_fown: read_unlock(&fown->lock); diff --git a/fs/file_table.c b/fs/file_table.c index bc35a40417d..24f25a057d9 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -174,6 +174,7 @@ void fastcall __fput(struct file *file) fops_put(file->f_op); if (file->f_mode & FMODE_WRITE) put_write_access(inode); + put_pid(file->f_owner.pid); file_kill(file); file->f_dentry = NULL; file->f_vfsmnt = NULL; diff --git a/fs/inode.c b/fs/inode.c index ada7643104e..bf6bec4e54f 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -657,7 +657,7 @@ static struct inode * get_new_inode_fast(struct super_block *sb, struct hlist_he return inode; } -static inline unsigned long hash(struct super_block *sb, unsigned long hashval) +static unsigned long hash(struct super_block *sb, unsigned long hashval) { unsigned long tmp; @@ -1003,7 +1003,7 @@ void generic_delete_inode(struct inode *inode) list_del_init(&inode->i_list); list_del_init(&inode->i_sb_list); - inode->i_state|=I_FREEING; + inode->i_state |= I_FREEING; inodes_stat.nr_inodes--; spin_unlock(&inode_lock); @@ -1210,13 +1210,15 @@ void file_update_time(struct file *file) return; now = current_fs_time(inode->i_sb); - if (!timespec_equal(&inode->i_mtime, &now)) + if (!timespec_equal(&inode->i_mtime, &now)) { + inode->i_mtime = now; sync_it = 1; - inode->i_mtime = now; + } - if (!timespec_equal(&inode->i_ctime, &now)) + if (!timespec_equal(&inode->i_ctime, &now)) { + inode->i_ctime = now; sync_it = 1; - inode->i_ctime = now; + } if (sync_it) mark_inode_dirty_sync(inode); diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c index f95cc3f3c42..87e1d03e826 100644 --- a/fs/lockd/clntlock.c +++ b/fs/lockd/clntlock.c @@ -202,7 +202,7 @@ reclaimer(void *ptr) /* This one ensures that our parent doesn't terminate while the * reclaim is in progress */ lock_kernel(); - lockd_up(); + lockd_up(0); /* note: this cannot fail as lockd is already running */ nlmclnt_prepare_reclaim(host); /* First, reclaim all locks that have been marked. */ diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index 271e2165fff..0116729cec5 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -129,11 +129,11 @@ static void nlmclnt_setlockargs(struct nlm_rqst *req, struct file_lock *fl) nlmclnt_next_cookie(&argp->cookie); argp->state = nsm_local_state; memcpy(&lock->fh, NFS_FH(fl->fl_file->f_dentry->d_inode), sizeof(struct nfs_fh)); - lock->caller = system_utsname.nodename; + lock->caller = utsname()->nodename; lock->oh.data = req->a_owner; lock->oh.len = snprintf(req->a_owner, sizeof(req->a_owner), "%u@%s", (unsigned int)fl->fl_u.nfs_fl.owner->pid, - system_utsname.nodename); + utsname()->nodename); lock->svid = fl->fl_u.nfs_fl.owner->pid; lock->fl.fl_start = fl->fl_start; lock->fl.fl_end = fl->fl_end; diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c index 5954dcb497e..a816b920d43 100644 --- a/fs/lockd/mon.c +++ b/fs/lockd/mon.c @@ -145,7 +145,7 @@ xdr_encode_common(struct rpc_rqst *rqstp, u32 *p, struct nsm_args *argp) */ sprintf(buffer, "%u.%u.%u.%u", NIPQUAD(argp->addr)); if (!(p = xdr_encode_string(p, buffer)) - || !(p = xdr_encode_string(p, system_utsname.nodename))) + || !(p = xdr_encode_string(p, utsname()->nodename))) return ERR_PTR(-EIO); *p++ = htonl(argp->prog); *p++ = htonl(argp->vers); diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 9a991b52c64..3cc369e5693 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -31,6 +31,7 @@ #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/svc.h> #include <linux/sunrpc/svcsock.h> +#include <net/ip.h> #include <linux/lockd/lockd.h> #include <linux/nfs.h> @@ -46,6 +47,7 @@ EXPORT_SYMBOL(nlmsvc_ops); static DEFINE_MUTEX(nlmsvc_mutex); static unsigned int nlmsvc_users; static pid_t nlmsvc_pid; +static struct svc_serv *nlmsvc_serv; int nlmsvc_grace_period; unsigned long nlmsvc_timeout; @@ -96,7 +98,6 @@ static inline void clear_grace_period(void) static void lockd(struct svc_rqst *rqstp) { - struct svc_serv *serv = rqstp->rq_server; int err = 0; unsigned long grace_period_expire; @@ -112,6 +113,7 @@ lockd(struct svc_rqst *rqstp) * Let our maker know we're running. */ nlmsvc_pid = current->pid; + nlmsvc_serv = rqstp->rq_server; complete(&lockd_start_done); daemonize("lockd"); @@ -161,7 +163,7 @@ lockd(struct svc_rqst *rqstp) * Find a socket with data available and call its * recvfrom routine. */ - err = svc_recv(serv, rqstp, timeout); + err = svc_recv(rqstp, timeout); if (err == -EAGAIN || err == -EINTR) continue; if (err < 0) { @@ -174,7 +176,7 @@ lockd(struct svc_rqst *rqstp) dprintk("lockd: request from %08x\n", (unsigned)ntohl(rqstp->rq_addr.sin_addr.s_addr)); - svc_process(serv, rqstp); + svc_process(rqstp); } @@ -189,6 +191,7 @@ lockd(struct svc_rqst *rqstp) nlmsvc_invalidate_all(); nlm_shutdown_hosts(); nlmsvc_pid = 0; + nlmsvc_serv = NULL; } else printk(KERN_DEBUG "lockd: new process, skipping host shutdown\n"); @@ -205,54 +208,77 @@ lockd(struct svc_rqst *rqstp) module_put_and_exit(0); } + +static int find_socket(struct svc_serv *serv, int proto) +{ + struct svc_sock *svsk; + int found = 0; + list_for_each_entry(svsk, &serv->sv_permsocks, sk_list) + if (svsk->sk_sk->sk_protocol == proto) { + found = 1; + break; + } + return found; +} + +static int make_socks(struct svc_serv *serv, int proto) +{ + /* Make any sockets that are needed but not present. + * If nlm_udpport or nlm_tcpport were set as module + * options, make those sockets unconditionally + */ + static int warned; + int err = 0; + if (proto == IPPROTO_UDP || nlm_udpport) + if (!find_socket(serv, IPPROTO_UDP)) + err = svc_makesock(serv, IPPROTO_UDP, nlm_udpport); + if (err == 0 && (proto == IPPROTO_TCP || nlm_tcpport)) + if (!find_socket(serv, IPPROTO_TCP)) + err= svc_makesock(serv, IPPROTO_TCP, nlm_tcpport); + if (!err) + warned = 0; + else if (warned++ == 0) + printk(KERN_WARNING + "lockd_up: makesock failed, error=%d\n", err); + return err; +} + /* * Bring up the lockd process if it's not already up. */ int -lockd_up(void) +lockd_up(int proto) /* Maybe add a 'family' option when IPv6 is supported ?? */ { - static int warned; struct svc_serv * serv; int error = 0; mutex_lock(&nlmsvc_mutex); /* - * Unconditionally increment the user count ... this is - * the number of clients who _want_ a lockd process. - */ - nlmsvc_users++; - /* * Check whether we're already up and running. */ - if (nlmsvc_pid) + if (nlmsvc_pid) { + if (proto) + error = make_socks(nlmsvc_serv, proto); goto out; + } /* * Sanity check: if there's no pid, * we should be the first user ... */ - if (nlmsvc_users > 1) + if (nlmsvc_users) printk(KERN_WARNING "lockd_up: no pid, %d users??\n", nlmsvc_users); error = -ENOMEM; - serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE); + serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, NULL); if (!serv) { printk(KERN_WARNING "lockd_up: create service failed\n"); goto out; } - if ((error = svc_makesock(serv, IPPROTO_UDP, nlm_udpport)) < 0 -#ifdef CONFIG_NFSD_TCP - || (error = svc_makesock(serv, IPPROTO_TCP, nlm_tcpport)) < 0 -#endif - ) { - if (warned++ == 0) - printk(KERN_WARNING - "lockd_up: makesock failed, error=%d\n", error); + if ((error = make_socks(serv, proto)) < 0) goto destroy_and_out; - } - warned = 0; /* * Create the kernel thread and wait for it to start. @@ -272,6 +298,8 @@ lockd_up(void) destroy_and_out: svc_destroy(serv); out: + if (!error) + nlmsvc_users++; mutex_unlock(&nlmsvc_mutex); return error; } diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index c9d419703cf..93c00ee7189 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -325,7 +325,7 @@ static int nlmsvc_setgrantargs(struct nlm_rqst *call, struct nlm_lock *lock) { locks_copy_lock(&call->a_args.lock.fl, &lock->fl); memcpy(&call->a_args.lock.fh, &lock->fh, sizeof(call->a_args.lock.fh)); - call->a_args.lock.caller = system_utsname.nodename; + call->a_args.lock.caller = utsname()->nodename; call->a_args.lock.oh.len = lock->oh.len; /* set default data area */ diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c index 033ea4ac2c3..61c46facf25 100644 --- a/fs/lockd/xdr.c +++ b/fs/lockd/xdr.c @@ -515,7 +515,7 @@ nlmclt_decode_res(struct rpc_rqst *req, u32 *p, struct nlm_res *resp) */ #define NLM_void_sz 0 #define NLM_cookie_sz 1+XDR_QUADLEN(NLM_MAXCOOKIELEN) -#define NLM_caller_sz 1+XDR_QUADLEN(sizeof(system_utsname.nodename)) +#define NLM_caller_sz 1+XDR_QUADLEN(sizeof(utsname()->nodename)) #define NLM_netobj_sz 1+XDR_QUADLEN(XDR_MAX_NETOBJ) /* #define NLM_owner_sz 1+XDR_QUADLEN(NLM_MAXOWNER) */ #define NLM_fhandle_sz 1+XDR_QUADLEN(NFS2_FHSIZE) diff --git a/fs/locks.c b/fs/locks.c index 21dfadfca2b..e0b6a80649a 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1514,7 +1514,7 @@ int fcntl_setlease(unsigned int fd, struct file *filp, long arg) goto out_unlock; } - error = f_setown(filp, current->pid, 0); + error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0); out_unlock: unlock_kernel(); return error; diff --git a/fs/namespace.c b/fs/namespace.c index 66d921e14fe..55442a6cf22 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -133,7 +133,7 @@ struct vfsmount *lookup_mnt(struct vfsmount *mnt, struct dentry *dentry) static inline int check_mnt(struct vfsmount *mnt) { - return mnt->mnt_namespace == current->namespace; + return mnt->mnt_namespace == current->nsproxy->namespace; } static void touch_namespace(struct namespace *ns) @@ -830,7 +830,7 @@ static int attach_recursive_mnt(struct vfsmount *source_mnt, if (parent_nd) { detach_mnt(source_mnt, parent_nd); attach_mnt(source_mnt, nd); - touch_namespace(current->namespace); + touch_namespace(current->nsproxy->namespace); } else { mnt_set_mountpoint(dest_mnt, dest_dentry, source_mnt); commit_tree(source_mnt); @@ -1441,7 +1441,7 @@ dput_out: */ struct namespace *dup_namespace(struct task_struct *tsk, struct fs_struct *fs) { - struct namespace *namespace = tsk->namespace; + struct namespace *namespace = tsk->nsproxy->namespace; struct namespace *new_ns; struct vfsmount *rootmnt = NULL, *pwdmnt = NULL, *altrootmnt = NULL; struct vfsmount *p, *q; @@ -1508,7 +1508,7 @@ struct namespace *dup_namespace(struct task_struct *tsk, struct fs_struct *fs) int copy_namespace(int flags, struct task_struct *tsk) { - struct namespace *namespace = tsk->namespace; + struct namespace *namespace = tsk->nsproxy->namespace; struct namespace *new_ns; int err = 0; @@ -1531,7 +1531,7 @@ int copy_namespace(int flags, struct task_struct *tsk) goto out; } - tsk->namespace = new_ns; + tsk->nsproxy->namespace = new_ns; out: put_namespace(namespace); @@ -1754,7 +1754,7 @@ asmlinkage long sys_pivot_root(const char __user * new_root, detach_mnt(user_nd.mnt, &root_parent); attach_mnt(user_nd.mnt, &old_nd); /* mount old root on put_old */ attach_mnt(new_nd.mnt, &root_parent); /* mount new_root on / */ - touch_namespace(current->namespace); + touch_namespace(current->nsproxy->namespace); spin_unlock(&vfsmount_lock); chroot_fs_refs(&user_nd, &new_nd); security_sb_post_pivotroot(&user_nd, &new_nd); @@ -1780,7 +1780,6 @@ static void __init init_mount_tree(void) { struct vfsmount *mnt; struct namespace *namespace; - struct task_struct *g, *p; mnt = do_kern_mount("rootfs", 0, "rootfs", NULL); if (IS_ERR(mnt)) @@ -1796,13 +1795,8 @@ static void __init init_mount_tree(void) namespace->root = mnt; mnt->mnt_namespace = namespace; - init_task.namespace = namespace; - read_lock(&tasklist_lock); - do_each_thread(g, p) { - get_namespace(namespace); - p->namespace = namespace; - } while_each_thread(g, p); - read_unlock(&tasklist_lock); + init_task.nsproxy->namespace = namespace; + get_namespace(namespace); set_fs_pwd(current->fs, namespace->root, namespace->root->mnt_root); set_fs_root(current->fs, namespace->root, namespace->root->mnt_root); diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index a3ee11364db..7933e2e99db 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -58,7 +58,6 @@ module_param_call(callback_tcpport, param_set_port, param_get_int, */ static void nfs_callback_svc(struct svc_rqst *rqstp) { - struct svc_serv *serv = rqstp->rq_server; int err; __module_get(THIS_MODULE); @@ -80,7 +79,7 @@ static void nfs_callback_svc(struct svc_rqst *rqstp) /* * Listen for a request on the socket */ - err = svc_recv(serv, rqstp, MAX_SCHEDULE_TIMEOUT); + err = svc_recv(rqstp, MAX_SCHEDULE_TIMEOUT); if (err == -EAGAIN || err == -EINTR) continue; if (err < 0) { @@ -91,7 +90,7 @@ static void nfs_callback_svc(struct svc_rqst *rqstp) } dprintk("%s: request from %u.%u.%u.%u\n", __FUNCTION__, NIPQUAD(rqstp->rq_addr.sin_addr.s_addr)); - svc_process(serv, rqstp); + svc_process(rqstp); } svc_exit_thread(rqstp); @@ -116,7 +115,7 @@ int nfs_callback_up(void) goto out; init_completion(&nfs_callback_info.started); init_completion(&nfs_callback_info.stopped); - serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE); + serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, NULL); ret = -ENOMEM; if (!serv) goto out_err; diff --git a/fs/nfs/client.c b/fs/nfs/client.c index ec1938d4b81..8106f3b29e4 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -460,7 +460,8 @@ static int nfs_start_lockd(struct nfs_server *server) goto out; if (server->flags & NFS_MOUNT_NONLM) goto out; - error = lockd_up(); + error = lockd_up((server->flags & NFS_MOUNT_TCP) ? + IPPROTO_TCP : IPPROTO_UDP); if (error < 0) server->flags |= NFS_MOUNT_NONLM; else diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c index c0a754ecdee..1d656a64519 100644 --- a/fs/nfs/nfsroot.c +++ b/fs/nfs/nfsroot.c @@ -312,7 +312,7 @@ static int __init root_nfs_name(char *name) /* Override them by options set on kernel command-line */ root_nfs_parse(name, buf); - cp = system_utsname.nodename; + cp = utsname()->nodename; if (strlen(buf) + strlen(cp) > NFS_MAXPATHLEN) { printk(KERN_ERR "Root-NFS: Pathname for remote directory too long.\n"); return -1; diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 01bc68c628a..cfe141e5d75 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -370,7 +370,7 @@ static int check_export(struct inode *inode, int flags) */ if (!(inode->i_sb->s_type->fs_flags & FS_REQUIRES_DEV) && !(flags & NFSEXP_FSID)) { - dprintk("exp_export: export of non-dev fs without fsid"); + dprintk("exp_export: export of non-dev fs without fsid\n"); return -EINVAL; } if (!inode->i_sb->s_export_op) { @@ -1078,6 +1078,7 @@ exp_pseudoroot(struct auth_domain *clp, struct svc_fh *fhp, /* Iterator */ static void *e_start(struct seq_file *m, loff_t *pos) + __acquires(svc_export_cache.hash_lock) { loff_t n = *pos; unsigned hash, export; @@ -1086,7 +1087,7 @@ static void *e_start(struct seq_file *m, loff_t *pos) exp_readlock(); read_lock(&svc_export_cache.hash_lock); if (!n--) - return (void *)1; + return SEQ_START_TOKEN; hash = n >> 32; export = n & ((1LL<<32) - 1); @@ -1110,7 +1111,7 @@ static void *e_next(struct seq_file *m, void *p, loff_t *pos) struct cache_head *ch = p; int hash = (*pos >> 32); - if (p == (void *)1) + if (p == SEQ_START_TOKEN) hash = 0; else if (ch->next == NULL) { hash++; @@ -1131,6 +1132,7 @@ static void *e_next(struct seq_file *m, void *p, loff_t *pos) } static void e_stop(struct seq_file *m, void *p) + __releases(svc_export_cache.hash_lock) { read_unlock(&svc_export_cache.hash_lock); exp_readunlock(); @@ -1178,15 +1180,13 @@ static int e_show(struct seq_file *m, void *p) { struct cache_head *cp = p; struct svc_export *exp = container_of(cp, struct svc_export, h); - svc_client *clp; - if (p == (void *)1) { + if (p == SEQ_START_TOKEN) { seq_puts(m, "# Version 1.1\n"); seq_puts(m, "# Path Client(Flags) # IPs\n"); return 0; } - clp = exp->ex_client; cache_get(&exp->h); if (cache_check(&svc_export_cache, &exp->h, NULL)) return 0; diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 8583d99ee74..f6ca9fb3fc6 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -131,7 +131,7 @@ xdr_error: \ #define READ_BUF(nbytes) do { \ p = xdr_inline_decode(xdr, nbytes); \ if (!p) { \ - dprintk("NFSD: %s: reply buffer overflowed in line %d.", \ + dprintk("NFSD: %s: reply buffer overflowed in line %d.\n", \ __FUNCTION__, __LINE__); \ return -EIO; \ } \ diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index ee4eff27aed..15ded7a30a7 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -600,7 +600,7 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_se &setattr->sa_stateid, CHECK_FH | WR_STATE, NULL); nfs4_unlock_state(); if (status) { - dprintk("NFSD: nfsd4_setattr: couldn't process stateid!"); + dprintk("NFSD: nfsd4_setattr: couldn't process stateid!\n"); return status; } } diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 7046ac9cf97..5c6a477c20e 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -23,10 +23,14 @@ #include <linux/pagemap.h> #include <linux/init.h> #include <linux/string.h> +#include <linux/smp_lock.h> +#include <linux/ctype.h> #include <linux/nfs.h> #include <linux/nfsd_idmap.h> +#include <linux/lockd/bind.h> #include <linux/sunrpc/svc.h> +#include <linux/sunrpc/svcsock.h> #include <linux/nfsd/nfsd.h> #include <linux/nfsd/cache.h> #include <linux/nfsd/xdr.h> @@ -35,8 +39,6 @@ #include <asm/uaccess.h> -unsigned int nfsd_versbits = ~0; - /* * We have a single directory with 9 nodes in it. */ @@ -52,7 +54,9 @@ enum { NFSD_List, NFSD_Fh, NFSD_Threads, + NFSD_Pool_Threads, NFSD_Versions, + NFSD_Ports, /* * The below MUST come last. Otherwise we leave a hole in nfsd_files[] * with !CONFIG_NFSD_V4 and simple_fill_super() goes oops @@ -75,7 +79,9 @@ static ssize_t write_getfd(struct file *file, char *buf, size_t size); static ssize_t write_getfs(struct file *file, char *buf, size_t size); static ssize_t write_filehandle(struct file *file, char *buf, size_t size); static ssize_t write_threads(struct file *file, char *buf, size_t size); +static ssize_t write_pool_threads(struct file *file, char *buf, size_t size); static ssize_t write_versions(struct file *file, char *buf, size_t size); +static ssize_t write_ports(struct file *file, char *buf, size_t size); #ifdef CONFIG_NFSD_V4 static ssize_t write_leasetime(struct file *file, char *buf, size_t size); static ssize_t write_recoverydir(struct file *file, char *buf, size_t size); @@ -91,7 +97,9 @@ static ssize_t (*write_op[])(struct file *, char *, size_t) = { [NFSD_Getfs] = write_getfs, [NFSD_Fh] = write_filehandle, [NFSD_Threads] = write_threads, + [NFSD_Pool_Threads] = write_pool_threads, [NFSD_Versions] = write_versions, + [NFSD_Ports] = write_ports, #ifdef CONFIG_NFSD_V4 [NFSD_Leasetime] = write_leasetime, [NFSD_RecoveryDir] = write_recoverydir, @@ -358,6 +366,72 @@ static ssize_t write_threads(struct file *file, char *buf, size_t size) return strlen(buf); } +extern int nfsd_nrpools(void); +extern int nfsd_get_nrthreads(int n, int *); +extern int nfsd_set_nrthreads(int n, int *); + +static ssize_t write_pool_threads(struct file *file, char *buf, size_t size) +{ + /* if size > 0, look for an array of number of threads per node + * and apply them then write out number of threads per node as reply + */ + char *mesg = buf; + int i; + int rv; + int len; + int npools = nfsd_nrpools(); + int *nthreads; + + if (npools == 0) { + /* + * NFS is shut down. The admin can start it by + * writing to the threads file but NOT the pool_threads + * file, sorry. Report zero threads. + */ + strcpy(buf, "0\n"); + return strlen(buf); + } + + nthreads = kcalloc(npools, sizeof(int), GFP_KERNEL); + if (nthreads == NULL) + return -ENOMEM; + + if (size > 0) { + for (i = 0; i < npools; i++) { + rv = get_int(&mesg, &nthreads[i]); + if (rv == -ENOENT) + break; /* fewer numbers than pools */ + if (rv) + goto out_free; /* syntax error */ + rv = -EINVAL; + if (nthreads[i] < 0) + goto out_free; + } + rv = nfsd_set_nrthreads(i, nthreads); + if (rv) + goto out_free; + } + + rv = nfsd_get_nrthreads(npools, nthreads); + if (rv) + goto out_free; + + mesg = buf; + size = SIMPLE_TRANSACTION_LIMIT; + for (i = 0; i < npools && size > 0; i++) { + snprintf(mesg, size, "%d%c", nthreads[i], (i == npools-1 ? '\n' : ' ')); + len = strlen(mesg); + size -= len; + mesg += len; + } + + return (mesg-buf); + +out_free: + kfree(nthreads); + return rv; +} + static ssize_t write_versions(struct file *file, char *buf, size_t size) { /* @@ -372,6 +446,10 @@ static ssize_t write_versions(struct file *file, char *buf, size_t size) if (size>0) { if (nfsd_serv) + /* Cannot change versions without updating + * nfsd_serv->sv_xdrsize, and reallocing + * rq_argp and rq_resp + */ return -EBUSY; if (buf[size-1] != '\n') return -EINVAL; @@ -390,10 +468,7 @@ static ssize_t write_versions(struct file *file, char *buf, size_t size) case 2: case 3: case 4: - if (sign != '-') - NFSCTL_VERSET(nfsd_versbits, num); - else - NFSCTL_VERUNSET(nfsd_versbits, num); + nfsd_vers(num, sign == '-' ? NFSD_CLEAR : NFSD_SET); break; default: return -EINVAL; @@ -404,16 +479,15 @@ static ssize_t write_versions(struct file *file, char *buf, size_t size) /* If all get turned off, turn them back on, as * having no versions is BAD */ - if ((nfsd_versbits & NFSCTL_VERALL)==0) - nfsd_versbits = NFSCTL_VERALL; + nfsd_reset_versions(); } /* Now write current state into reply buffer */ len = 0; sep = ""; for (num=2 ; num <= 4 ; num++) - if (NFSCTL_VERISSET(NFSCTL_VERALL, num)) { + if (nfsd_vers(num, NFSD_AVAIL)) { len += sprintf(buf+len, "%s%c%d", sep, - NFSCTL_VERISSET(nfsd_versbits, num)?'+':'-', + nfsd_vers(num, NFSD_TEST)?'+':'-', num); sep = " "; } @@ -421,6 +495,62 @@ static ssize_t write_versions(struct file *file, char *buf, size_t size) return len; } +static ssize_t write_ports(struct file *file, char *buf, size_t size) +{ + if (size == 0) { + int len = 0; + lock_kernel(); + if (nfsd_serv) + len = svc_sock_names(buf, nfsd_serv, NULL); + unlock_kernel(); + return len; + } + /* Either a single 'fd' number is written, in which + * case it must be for a socket of a supported family/protocol, + * and we use it as an nfsd socket, or + * A '-' followed by the 'name' of a socket in which case + * we close the socket. + */ + if (isdigit(buf[0])) { + char *mesg = buf; + int fd; + int err; + err = get_int(&mesg, &fd); + if (err) + return -EINVAL; + if (fd < 0) + return -EINVAL; + err = nfsd_create_serv(); + if (!err) { + int proto = 0; + err = lockd_up(proto); + if (!err) { + err = svc_addsock(nfsd_serv, fd, buf, &proto); + if (err) + lockd_down(); + } + /* Decrease the count, but don't shutdown the + * the service + */ + nfsd_serv->sv_nrthreads--; + } + return err; + } + if (buf[0] == '-') { + char *toclose = kstrdup(buf+1, GFP_KERNEL); + int len = 0; + if (!toclose) + return -ENOMEM; + lock_kernel(); + if (nfsd_serv) + len = svc_sock_names(buf, nfsd_serv, toclose); + unlock_kernel(); + kfree(toclose); + return len; + } + return -EINVAL; +} + #ifdef CONFIG_NFSD_V4 extern time_t nfs4_leasetime(void); @@ -483,7 +613,9 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent) [NFSD_List] = {"exports", &exports_operations, S_IRUGO}, [NFSD_Fh] = {"filehandle", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_Threads] = {"threads", &transaction_ops, S_IWUSR|S_IRUSR}, + [NFSD_Pool_Threads] = {"pool_threads", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_Versions] = {"versions", &transaction_ops, S_IWUSR|S_IRUSR}, + [NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO}, #ifdef CONFIG_NFSD_V4 [NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_RecoveryDir] = {"nfsv4recoverydir", &transaction_ops, S_IWUSR|S_IRUSR}, diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index ec1decf29ba..19443056ec3 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -57,12 +57,6 @@ static atomic_t nfsd_busy; static unsigned long nfsd_last_call; static DEFINE_SPINLOCK(nfsd_call_lock); -struct nfsd_list { - struct list_head list; - struct task_struct *task; -}; -static struct list_head nfsd_list = LIST_HEAD_INIT(nfsd_list); - #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) static struct svc_stat nfsd_acl_svcstats; static struct svc_version * nfsd_acl_version[] = { @@ -117,6 +111,32 @@ struct svc_program nfsd_program = { }; +int nfsd_vers(int vers, enum vers_op change) +{ + if (vers < NFSD_MINVERS || vers >= NFSD_NRVERS) + return -1; + switch(change) { + case NFSD_SET: + nfsd_versions[vers] = nfsd_version[vers]; + break; +#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) + if (vers < NFSD_ACL_NRVERS) + nfsd_acl_version[vers] = nfsd_acl_version[vers]; +#endif + case NFSD_CLEAR: + nfsd_versions[vers] = NULL; +#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) + if (vers < NFSD_ACL_NRVERS) + nfsd_acl_version[vers] = NULL; +#endif + break; + case NFSD_TEST: + return nfsd_versions[vers] != NULL; + case NFSD_AVAIL: + return nfsd_version[vers] != NULL; + } + return 0; +} /* * Maximum number of nfsd processes */ @@ -130,16 +150,175 @@ int nfsd_nrthreads(void) return nfsd_serv->sv_nrthreads; } +static int killsig; /* signal that was used to kill last nfsd */ +static void nfsd_last_thread(struct svc_serv *serv) +{ + /* When last nfsd thread exits we need to do some clean-up */ + struct svc_sock *svsk; + list_for_each_entry(svsk, &serv->sv_permsocks, sk_list) + lockd_down(); + nfsd_serv = NULL; + nfsd_racache_shutdown(); + nfs4_state_shutdown(); + + printk(KERN_WARNING "nfsd: last server has exited\n"); + if (killsig != SIG_NOCLEAN) { + printk(KERN_WARNING "nfsd: unexporting all filesystems\n"); + nfsd_export_flush(); + } +} + +void nfsd_reset_versions(void) +{ + int found_one = 0; + int i; + + for (i = NFSD_MINVERS; i < NFSD_NRVERS; i++) { + if (nfsd_program.pg_vers[i]) + found_one = 1; + } + + if (!found_one) { + for (i = NFSD_MINVERS; i < NFSD_NRVERS; i++) + nfsd_program.pg_vers[i] = nfsd_version[i]; +#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) + for (i = NFSD_ACL_MINVERS; i < NFSD_ACL_NRVERS; i++) + nfsd_acl_program.pg_vers[i] = + nfsd_acl_version[i]; +#endif + } +} + +int nfsd_create_serv(void) +{ + int err = 0; + lock_kernel(); + if (nfsd_serv) { + svc_get(nfsd_serv); + unlock_kernel(); + return 0; + } + + atomic_set(&nfsd_busy, 0); + nfsd_serv = svc_create_pooled(&nfsd_program, NFSD_BUFSIZE, + nfsd_last_thread, + nfsd, SIG_NOCLEAN, THIS_MODULE); + if (nfsd_serv == NULL) + err = -ENOMEM; + unlock_kernel(); + do_gettimeofday(&nfssvc_boot); /* record boot time */ + return err; +} + +static int nfsd_init_socks(int port) +{ + int error; + if (!list_empty(&nfsd_serv->sv_permsocks)) + return 0; + + error = lockd_up(IPPROTO_UDP); + if (error >= 0) { + error = svc_makesock(nfsd_serv, IPPROTO_UDP, port); + if (error < 0) + lockd_down(); + } + if (error < 0) + return error; + +#ifdef CONFIG_NFSD_TCP + error = lockd_up(IPPROTO_TCP); + if (error >= 0) { + error = svc_makesock(nfsd_serv, IPPROTO_TCP, port); + if (error < 0) + lockd_down(); + } + if (error < 0) + return error; +#endif + return 0; +} + +int nfsd_nrpools(void) +{ + if (nfsd_serv == NULL) + return 0; + else + return nfsd_serv->sv_nrpools; +} + +int nfsd_get_nrthreads(int n, int *nthreads) +{ + int i = 0; + + if (nfsd_serv != NULL) { + for (i = 0; i < nfsd_serv->sv_nrpools && i < n; i++) + nthreads[i] = nfsd_serv->sv_pools[i].sp_nrthreads; + } + + return 0; +} + +int nfsd_set_nrthreads(int n, int *nthreads) +{ + int i = 0; + int tot = 0; + int err = 0; + + if (nfsd_serv == NULL || n <= 0) + return 0; + + if (n > nfsd_serv->sv_nrpools) + n = nfsd_serv->sv_nrpools; + + /* enforce a global maximum number of threads */ + tot = 0; + for (i = 0; i < n; i++) { + if (nthreads[i] > NFSD_MAXSERVS) + nthreads[i] = NFSD_MAXSERVS; + tot += nthreads[i]; + } + if (tot > NFSD_MAXSERVS) { + /* total too large: scale down requested numbers */ + for (i = 0; i < n && tot > 0; i++) { + int new = nthreads[i] * NFSD_MAXSERVS / tot; + tot -= (nthreads[i] - new); + nthreads[i] = new; + } + for (i = 0; i < n && tot > 0; i++) { + nthreads[i]--; + tot--; + } + } + + /* + * There must always be a thread in pool 0; the admin + * can't shut down NFS completely using pool_threads. + */ + if (nthreads[0] == 0) + nthreads[0] = 1; + + /* apply the new numbers */ + lock_kernel(); + svc_get(nfsd_serv); + for (i = 0; i < n; i++) { + err = svc_set_num_threads(nfsd_serv, &nfsd_serv->sv_pools[i], + nthreads[i]); + if (err) + break; + } + svc_destroy(nfsd_serv); + unlock_kernel(); + + return err; +} + int nfsd_svc(unsigned short port, int nrservs) { int error; - int none_left, found_one, i; - struct list_head *victim; lock_kernel(); - dprintk("nfsd: creating service: vers 0x%x\n", - nfsd_versbits); + dprintk("nfsd: creating service\n"); error = -EINVAL; if (nrservs <= 0) nrservs = 0; @@ -153,91 +332,20 @@ nfsd_svc(unsigned short port, int nrservs) error = nfs4_state_start(); if (error<0) goto out; - if (!nfsd_serv) { - /* - * Use the nfsd_ctlbits to define which - * versions that will be advertised. - * If nfsd_ctlbits doesn't list any version, - * export them all. - */ - found_one = 0; - - for (i = NFSD_MINVERS; i < NFSD_NRVERS; i++) { - if (NFSCTL_VERISSET(nfsd_versbits, i)) { - nfsd_program.pg_vers[i] = nfsd_version[i]; - found_one = 1; - } else - nfsd_program.pg_vers[i] = NULL; - } - if (!found_one) { - for (i = NFSD_MINVERS; i < NFSD_NRVERS; i++) - nfsd_program.pg_vers[i] = nfsd_version[i]; - } + nfsd_reset_versions(); + error = nfsd_create_serv(); -#if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) - found_one = 0; - - for (i = NFSD_ACL_MINVERS; i < NFSD_ACL_NRVERS; i++) { - if (NFSCTL_VERISSET(nfsd_versbits, i)) { - nfsd_acl_program.pg_vers[i] = - nfsd_acl_version[i]; - found_one = 1; - } else - nfsd_acl_program.pg_vers[i] = NULL; - } - - if (!found_one) { - for (i = NFSD_ACL_MINVERS; i < NFSD_ACL_NRVERS; i++) - nfsd_acl_program.pg_vers[i] = - nfsd_acl_version[i]; - } -#endif - - atomic_set(&nfsd_busy, 0); - error = -ENOMEM; - nfsd_serv = svc_create(&nfsd_program, NFSD_BUFSIZE); - if (nfsd_serv == NULL) - goto out; - error = svc_makesock(nfsd_serv, IPPROTO_UDP, port); - if (error < 0) - goto failure; + if (error) + goto out; + error = nfsd_init_socks(port); + if (error) + goto failure; -#ifdef CONFIG_NFSD_TCP - error = svc_makesock(nfsd_serv, IPPROTO_TCP, port); - if (error < 0) - goto failure; -#endif - do_gettimeofday(&nfssvc_boot); /* record boot time */ - } else - nfsd_serv->sv_nrthreads++; - nrservs -= (nfsd_serv->sv_nrthreads-1); - while (nrservs > 0) { - nrservs--; - __module_get(THIS_MODULE); - error = svc_create_thread(nfsd, nfsd_serv); - if (error < 0) { - module_put(THIS_MODULE); - break; - } - } - victim = nfsd_list.next; - while (nrservs < 0 && victim != &nfsd_list) { - struct nfsd_list *nl = - list_entry(victim,struct nfsd_list, list); - victim = victim->next; - send_sig(SIG_NOCLEAN, nl->task, 1); - nrservs++; - } + error = svc_set_num_threads(nfsd_serv, NULL, nrservs); failure: - none_left = (nfsd_serv->sv_nrthreads == 1); svc_destroy(nfsd_serv); /* Release server */ - if (none_left) { - nfsd_serv = NULL; - nfsd_racache_shutdown(); - nfs4_state_shutdown(); - } out: unlock_kernel(); return error; @@ -270,10 +378,8 @@ update_thread_usage(int busy_threads) static void nfsd(struct svc_rqst *rqstp) { - struct svc_serv *serv = rqstp->rq_server; struct fs_struct *fsp; int err; - struct nfsd_list me; sigset_t shutdown_mask, allowed_mask; /* Lock module and set up kernel thread */ @@ -297,10 +403,7 @@ nfsd(struct svc_rqst *rqstp) nfsdstats.th_cnt++; - lockd_up(); /* start lockd */ - - me.task = current; - list_add(&me.list, &nfsd_list); + rqstp->rq_task = current; unlock_kernel(); @@ -322,8 +425,7 @@ nfsd(struct svc_rqst *rqstp) * Find a socket with data available and call its * recvfrom routine. */ - while ((err = svc_recv(serv, rqstp, - 60*60*HZ)) == -EAGAIN) + while ((err = svc_recv(rqstp, 60*60*HZ)) == -EAGAIN) ; if (err < 0) break; @@ -336,7 +438,7 @@ nfsd(struct svc_rqst *rqstp) /* Process request with signals blocked. */ sigprocmask(SIG_SETMASK, &allowed_mask, NULL); - svc_process(serv, rqstp); + svc_process(rqstp); /* Unlock export hash tables */ exp_readunlock(); @@ -353,29 +455,13 @@ nfsd(struct svc_rqst *rqstp) if (sigismember(¤t->pending.signal, signo) && !sigismember(¤t->blocked, signo)) break; - err = signo; + killsig = signo; } - /* Clear signals before calling lockd_down() and svc_exit_thread() */ + /* Clear signals before calling svc_exit_thread() */ flush_signals(current); lock_kernel(); - /* Release lockd */ - lockd_down(); - - /* Check if this is last thread */ - if (serv->sv_nrthreads==1) { - - printk(KERN_WARNING "nfsd: last server has exited\n"); - if (err != SIG_NOCLEAN) { - printk(KERN_WARNING "nfsd: unexporting all filesystems\n"); - nfsd_export_flush(); - } - nfsd_serv = NULL; - nfsd_racache_shutdown(); /* release read-ahead cache */ - nfs4_state_shutdown(); - } - list_del(&me.list); nfsdstats.th_cnt --; out: diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index c9e3b5a8fe0..443ebc52e38 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1114,7 +1114,7 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, */ if (!resfhp->fh_dentry) { /* called from nfsd_proc_mkdir, or possibly nfsd3_proc_create */ - fh_lock(fhp); + fh_lock_nested(fhp, I_MUTEX_PARENT); dchild = lookup_one_len(fname, dentry, flen); err = PTR_ERR(dchild); if (IS_ERR(dchild)) @@ -1240,7 +1240,7 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp, err = nfserr_notdir; if(!dirp->i_op || !dirp->i_op->lookup) goto out; - fh_lock(fhp); + fh_lock_nested(fhp, I_MUTEX_PARENT); /* * Compose the response file handle. @@ -1494,7 +1494,7 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, if (isdotent(name, len)) goto out; - fh_lock(ffhp); + fh_lock_nested(ffhp, I_MUTEX_PARENT); ddir = ffhp->fh_dentry; dirp = ddir->d_inode; @@ -1644,7 +1644,7 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, if (err) goto out; - fh_lock(fhp); + fh_lock_nested(fhp, I_MUTEX_PARENT); dentry = fhp->fh_dentry; dirp = dentry->d_inode; diff --git a/fs/nls/nls_base.c b/fs/nls/nls_base.c index 9de6b495f11..b1317ad5ca1 100644 --- a/fs/nls/nls_base.c +++ b/fs/nls/nls_base.c @@ -163,8 +163,6 @@ int register_nls(struct nls_table * nls) { struct nls_table ** tmp = &tables; - if (!nls) - return -EINVAL; if (nls->next) return -EBUSY; diff --git a/fs/proc/array.c b/fs/proc/array.c index c0e554971df..25e917fb473 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -162,7 +162,7 @@ static inline char * task_state(struct task_struct *p, char *buffer) int g; struct fdtable *fdt = NULL; - read_lock(&tasklist_lock); + rcu_read_lock(); buffer += sprintf(buffer, "State:\t%s\n" "SleepAVG:\t%lu%%\n" @@ -174,14 +174,13 @@ static inline char * task_state(struct task_struct *p, char *buffer) "Gid:\t%d\t%d\t%d\t%d\n", get_task_state(p), (p->sleep_avg/1024)*100/(1020000000/1024), - p->tgid, - p->pid, pid_alive(p) ? p->group_leader->real_parent->tgid : 0, - pid_alive(p) && p->ptrace ? p->parent->pid : 0, + p->tgid, p->pid, + pid_alive(p) ? rcu_dereference(p->real_parent)->tgid : 0, + pid_alive(p) && p->ptrace ? rcu_dereference(p->parent)->pid : 0, p->uid, p->euid, p->suid, p->fsuid, p->gid, p->egid, p->sgid, p->fsgid); - read_unlock(&tasklist_lock); + task_lock(p); - rcu_read_lock(); if (p->files) fdt = files_fdtable(p->files); buffer += sprintf(buffer, @@ -244,6 +243,7 @@ static void collect_sigign_sigcatch(struct task_struct *p, sigset_t *ign, static inline char * task_sig(struct task_struct *p, char *buffer) { + unsigned long flags; sigset_t pending, shpending, blocked, ignored, caught; int num_threads = 0; unsigned long qsize = 0; @@ -255,10 +255,8 @@ static inline char * task_sig(struct task_struct *p, char *buffer) sigemptyset(&ignored); sigemptyset(&caught); - /* Gather all the data with the appropriate locks held */ - read_lock(&tasklist_lock); - if (p->sighand) { - spin_lock_irq(&p->sighand->siglock); + rcu_read_lock(); + if (lock_task_sighand(p, &flags)) { pending = p->pending.signal; shpending = p->signal->shared_pending.signal; blocked = p->blocked; @@ -266,9 +264,9 @@ static inline char * task_sig(struct task_struct *p, char *buffer) num_threads = atomic_read(&p->signal->count); qsize = atomic_read(&p->user->sigpending); qlim = p->signal->rlim[RLIMIT_SIGPENDING].rlim_cur; - spin_unlock_irq(&p->sighand->siglock); + unlock_task_sighand(p, &flags); } - read_unlock(&tasklist_lock); + rcu_read_unlock(); buffer += sprintf(buffer, "Threads:\t%d\n", num_threads); buffer += sprintf(buffer, "SigQ:\t%lu/%lu\n", qsize, qlim); @@ -322,7 +320,7 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole) sigset_t sigign, sigcatch; char state; int res; - pid_t ppid, pgid = -1, sid = -1; + pid_t ppid = 0, pgid = -1, sid = -1; int num_threads = 0; struct mm_struct *mm; unsigned long long start_time; @@ -330,8 +328,8 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole) unsigned long min_flt = 0, maj_flt = 0; cputime_t cutime, cstime, utime, stime; unsigned long rsslim = 0; - struct task_struct *t; char tcomm[sizeof(task->comm)]; + unsigned long flags; state = *get_task_state(task); vsize = eip = esp = 0; @@ -349,15 +347,33 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole) cutime = cstime = utime = stime = cputime_zero; mutex_lock(&tty_mutex); - read_lock(&tasklist_lock); - if (task->sighand) { - spin_lock_irq(&task->sighand->siglock); - num_threads = atomic_read(&task->signal->count); + rcu_read_lock(); + if (lock_task_sighand(task, &flags)) { + struct signal_struct *sig = task->signal; + struct tty_struct *tty = sig->tty; + + if (tty) { + /* + * sig->tty is not stable, but tty_mutex + * protects us from release_dev(tty) + */ + barrier(); + tty_pgrp = tty->pgrp; + tty_nr = new_encode_dev(tty_devnum(tty)); + } + + num_threads = atomic_read(&sig->count); collect_sigign_sigcatch(task, &sigign, &sigcatch); + cmin_flt = sig->cmin_flt; + cmaj_flt = sig->cmaj_flt; + cutime = sig->cutime; + cstime = sig->cstime; + rsslim = sig->rlim[RLIMIT_RSS].rlim_cur; + /* add up live thread stats at the group level */ if (whole) { - t = task; + struct task_struct *t = task; do { min_flt += t->min_flt; maj_flt += t->maj_flt; @@ -365,31 +381,20 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole) stime = cputime_add(stime, t->stime); t = next_thread(t); } while (t != task); - } - spin_unlock_irq(&task->sighand->siglock); - } - if (task->signal) { - if (task->signal->tty) { - tty_pgrp = task->signal->tty->pgrp; - tty_nr = new_encode_dev(tty_devnum(task->signal->tty)); + min_flt += sig->min_flt; + maj_flt += sig->maj_flt; + utime = cputime_add(utime, sig->utime); + stime = cputime_add(stime, sig->stime); } + + sid = sig->session; pgid = process_group(task); - sid = task->signal->session; - cmin_flt = task->signal->cmin_flt; - cmaj_flt = task->signal->cmaj_flt; - cutime = task->signal->cutime; - cstime = task->signal->cstime; - rsslim = task->signal->rlim[RLIMIT_RSS].rlim_cur; - if (whole) { - min_flt += task->signal->min_flt; - maj_flt += task->signal->maj_flt; - utime = cputime_add(utime, task->signal->utime); - stime = cputime_add(stime, task->signal->stime); - } + ppid = rcu_dereference(task->real_parent)->tgid; + + unlock_task_sighand(task, &flags); } - ppid = pid_alive(task) ? task->group_leader->real_parent->tgid : 0; - read_unlock(&tasklist_lock); + rcu_read_unlock(); mutex_unlock(&tty_mutex); if (!whole || num_threads<2) diff --git a/fs/proc/base.c b/fs/proc/base.c index 89c20d9d50b..82da55b5cff 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -71,6 +71,7 @@ #include <linux/cpuset.h> #include <linux/audit.h> #include <linux/poll.h> +#include <linux/nsproxy.h> #include "internal.h" /* NOTE: @@ -83,262 +84,44 @@ * in /proc for a task before it execs a suid executable. */ -/* - * For hysterical raisins we keep the same inumbers as in the old procfs. - * Feel free to change the macro below - just keep the range distinct from - * inumbers of the rest of procfs (currently those are in 0x0000--0xffff). - * As soon as we'll get a separate superblock we will be able to forget - * about magical ranges too. - */ - -#define fake_ino(pid,ino) (((pid)<<16)|(ino)) - -enum pid_directory_inos { - PROC_TGID_INO = 2, - PROC_TGID_TASK, - PROC_TGID_STATUS, - PROC_TGID_MEM, -#ifdef CONFIG_SECCOMP - PROC_TGID_SECCOMP, -#endif - PROC_TGID_CWD, - PROC_TGID_ROOT, - PROC_TGID_EXE, - PROC_TGID_FD, - PROC_TGID_ENVIRON, - PROC_TGID_AUXV, - PROC_TGID_CMDLINE, - PROC_TGID_STAT, - PROC_TGID_STATM, - PROC_TGID_MAPS, - PROC_TGID_NUMA_MAPS, - PROC_TGID_MOUNTS, - PROC_TGID_MOUNTSTATS, - PROC_TGID_WCHAN, -#ifdef CONFIG_MMU - PROC_TGID_SMAPS, -#endif -#ifdef CONFIG_SCHEDSTATS - PROC_TGID_SCHEDSTAT, -#endif -#ifdef CONFIG_CPUSETS - PROC_TGID_CPUSET, -#endif -#ifdef CONFIG_SECURITY - PROC_TGID_ATTR, - PROC_TGID_ATTR_CURRENT, - PROC_TGID_ATTR_PREV, - PROC_TGID_ATTR_EXEC, - PROC_TGID_ATTR_FSCREATE, - PROC_TGID_ATTR_KEYCREATE, - PROC_TGID_ATTR_SOCKCREATE, -#endif -#ifdef CONFIG_AUDITSYSCALL - PROC_TGID_LOGINUID, -#endif - PROC_TGID_OOM_SCORE, - PROC_TGID_OOM_ADJUST, - PROC_TID_INO, - PROC_TID_STATUS, - PROC_TID_MEM, -#ifdef CONFIG_SECCOMP - PROC_TID_SECCOMP, -#endif - PROC_TID_CWD, - PROC_TID_ROOT, - PROC_TID_EXE, - PROC_TID_FD, - PROC_TID_ENVIRON, - PROC_TID_AUXV, - PROC_TID_CMDLINE, - PROC_TID_STAT, - PROC_TID_STATM, - PROC_TID_MAPS, - PROC_TID_NUMA_MAPS, - PROC_TID_MOUNTS, - PROC_TID_MOUNTSTATS, - PROC_TID_WCHAN, -#ifdef CONFIG_MMU - PROC_TID_SMAPS, -#endif -#ifdef CONFIG_SCHEDSTATS - PROC_TID_SCHEDSTAT, -#endif -#ifdef CONFIG_CPUSETS - PROC_TID_CPUSET, -#endif -#ifdef CONFIG_SECURITY - PROC_TID_ATTR, - PROC_TID_ATTR_CURRENT, - PROC_TID_ATTR_PREV, - PROC_TID_ATTR_EXEC, - PROC_TID_ATTR_FSCREATE, - PROC_TID_ATTR_KEYCREATE, - PROC_TID_ATTR_SOCKCREATE, -#endif -#ifdef CONFIG_AUDITSYSCALL - PROC_TID_LOGINUID, -#endif - PROC_TID_OOM_SCORE, - PROC_TID_OOM_ADJUST, - - /* Add new entries before this */ - PROC_TID_FD_DIR = 0x8000, /* 0x8000-0xffff */ -}; /* Worst case buffer size needed for holding an integer. */ #define PROC_NUMBUF 10 struct pid_entry { - int type; int len; char *name; mode_t mode; + struct inode_operations *iop; + struct file_operations *fop; + union proc_op op; }; -#define E(type,name,mode) {(type),sizeof(name)-1,(name),(mode)} - -static struct pid_entry tgid_base_stuff[] = { - E(PROC_TGID_TASK, "task", S_IFDIR|S_IRUGO|S_IXUGO), - E(PROC_TGID_FD, "fd", S_IFDIR|S_IRUSR|S_IXUSR), - E(PROC_TGID_ENVIRON, "environ", S_IFREG|S_IRUSR), - E(PROC_TGID_AUXV, "auxv", S_IFREG|S_IRUSR), - E(PROC_TGID_STATUS, "status", S_IFREG|S_IRUGO), - E(PROC_TGID_CMDLINE, "cmdline", S_IFREG|S_IRUGO), - E(PROC_TGID_STAT, "stat", S_IFREG|S_IRUGO), - E(PROC_TGID_STATM, "statm", S_IFREG|S_IRUGO), - E(PROC_TGID_MAPS, "maps", S_IFREG|S_IRUGO), -#ifdef CONFIG_NUMA - E(PROC_TGID_NUMA_MAPS, "numa_maps", S_IFREG|S_IRUGO), -#endif - E(PROC_TGID_MEM, "mem", S_IFREG|S_IRUSR|S_IWUSR), -#ifdef CONFIG_SECCOMP - E(PROC_TGID_SECCOMP, "seccomp", S_IFREG|S_IRUSR|S_IWUSR), -#endif - E(PROC_TGID_CWD, "cwd", S_IFLNK|S_IRWXUGO), - E(PROC_TGID_ROOT, "root", S_IFLNK|S_IRWXUGO), - E(PROC_TGID_EXE, "exe", S_IFLNK|S_IRWXUGO), - E(PROC_TGID_MOUNTS, "mounts", S_IFREG|S_IRUGO), - E(PROC_TGID_MOUNTSTATS, "mountstats", S_IFREG|S_IRUSR), -#ifdef CONFIG_MMU - E(PROC_TGID_SMAPS, "smaps", S_IFREG|S_IRUGO), -#endif -#ifdef CONFIG_SECURITY - E(PROC_TGID_ATTR, "attr", S_IFDIR|S_IRUGO|S_IXUGO), -#endif -#ifdef CONFIG_KALLSYMS - E(PROC_TGID_WCHAN, "wchan", S_IFREG|S_IRUGO), -#endif -#ifdef CONFIG_SCHEDSTATS - E(PROC_TGID_SCHEDSTAT, "schedstat", S_IFREG|S_IRUGO), -#endif -#ifdef CONFIG_CPUSETS - E(PROC_TGID_CPUSET, "cpuset", S_IFREG|S_IRUGO), -#endif - E(PROC_TGID_OOM_SCORE, "oom_score",S_IFREG|S_IRUGO), - E(PROC_TGID_OOM_ADJUST,"oom_adj", S_IFREG|S_IRUGO|S_IWUSR), -#ifdef CONFIG_AUDITSYSCALL - E(PROC_TGID_LOGINUID, "loginuid", S_IFREG|S_IWUSR|S_IRUGO), -#endif - {0,0,NULL,0} -}; -static struct pid_entry tid_base_stuff[] = { - E(PROC_TID_FD, "fd", S_IFDIR|S_IRUSR|S_IXUSR), - E(PROC_TID_ENVIRON, "environ", S_IFREG|S_IRUSR), - E(PROC_TID_AUXV, "auxv", S_IFREG|S_IRUSR), - E(PROC_TID_STATUS, "status", S_IFREG|S_IRUGO), - E(PROC_TID_CMDLINE, "cmdline", S_IFREG|S_IRUGO), - E(PROC_TID_STAT, "stat", S_IFREG|S_IRUGO), - E(PROC_TID_STATM, "statm", S_IFREG|S_IRUGO), - E(PROC_TID_MAPS, "maps", S_IFREG|S_IRUGO), -#ifdef CONFIG_NUMA - E(PROC_TID_NUMA_MAPS, "numa_maps", S_IFREG|S_IRUGO), -#endif - E(PROC_TID_MEM, "mem", S_IFREG|S_IRUSR|S_IWUSR), -#ifdef CONFIG_SECCOMP - E(PROC_TID_SECCOMP, "seccomp", S_IFREG|S_IRUSR|S_IWUSR), -#endif - E(PROC_TID_CWD, "cwd", S_IFLNK|S_IRWXUGO), - E(PROC_TID_ROOT, "root", S_IFLNK|S_IRWXUGO), - E(PROC_TID_EXE, "exe", S_IFLNK|S_IRWXUGO), - E(PROC_TID_MOUNTS, "mounts", S_IFREG|S_IRUGO), -#ifdef CONFIG_MMU - E(PROC_TID_SMAPS, "smaps", S_IFREG|S_IRUGO), -#endif -#ifdef CONFIG_SECURITY - E(PROC_TID_ATTR, "attr", S_IFDIR|S_IRUGO|S_IXUGO), -#endif -#ifdef CONFIG_KALLSYMS - E(PROC_TID_WCHAN, "wchan", S_IFREG|S_IRUGO), -#endif -#ifdef CONFIG_SCHEDSTATS - E(PROC_TID_SCHEDSTAT, "schedstat",S_IFREG|S_IRUGO), -#endif -#ifdef CONFIG_CPUSETS - E(PROC_TID_CPUSET, "cpuset", S_IFREG|S_IRUGO), -#endif - E(PROC_TID_OOM_SCORE, "oom_score",S_IFREG|S_IRUGO), - E(PROC_TID_OOM_ADJUST, "oom_adj", S_IFREG|S_IRUGO|S_IWUSR), -#ifdef CONFIG_AUDITSYSCALL - E(PROC_TID_LOGINUID, "loginuid", S_IFREG|S_IWUSR|S_IRUGO), -#endif - {0,0,NULL,0} -}; - -#ifdef CONFIG_SECURITY -static struct pid_entry tgid_attr_stuff[] = { - E(PROC_TGID_ATTR_CURRENT, "current", S_IFREG|S_IRUGO|S_IWUGO), - E(PROC_TGID_ATTR_PREV, "prev", S_IFREG|S_IRUGO), - E(PROC_TGID_ATTR_EXEC, "exec", S_IFREG|S_IRUGO|S_IWUGO), - E(PROC_TGID_ATTR_FSCREATE, "fscreate", S_IFREG|S_IRUGO|S_IWUGO), - E(PROC_TGID_ATTR_KEYCREATE, "keycreate", S_IFREG|S_IRUGO|S_IWUGO), - E(PROC_TGID_ATTR_SOCKCREATE, "sockcreate", S_IFREG|S_IRUGO|S_IWUGO), - {0,0,NULL,0} -}; -static struct pid_entry tid_attr_stuff[] = { - E(PROC_TID_ATTR_CURRENT, "current", S_IFREG|S_IRUGO|S_IWUGO), - E(PROC_TID_ATTR_PREV, "prev", S_IFREG|S_IRUGO), - E(PROC_TID_ATTR_EXEC, "exec", S_IFREG|S_IRUGO|S_IWUGO), - E(PROC_TID_ATTR_FSCREATE, "fscreate", S_IFREG|S_IRUGO|S_IWUGO), - E(PROC_TID_ATTR_KEYCREATE, "keycreate", S_IFREG|S_IRUGO|S_IWUGO), - E(PROC_TID_ATTR_SOCKCREATE, "sockcreate", S_IFREG|S_IRUGO|S_IWUGO), - {0,0,NULL,0} -}; -#endif - -#undef E - -static int proc_fd_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt) -{ - struct task_struct *task = get_proc_task(inode); - struct files_struct *files = NULL; - struct file *file; - int fd = proc_fd(inode); - - if (task) { - files = get_files_struct(task); - put_task_struct(task); - } - if (files) { - /* - * We are not taking a ref to the file structure, so we must - * hold ->file_lock. - */ - spin_lock(&files->file_lock); - file = fcheck_files(files, fd); - if (file) { - *mnt = mntget(file->f_vfsmnt); - *dentry = dget(file->f_dentry); - spin_unlock(&files->file_lock); - put_files_struct(files); - return 0; - } - spin_unlock(&files->file_lock); - put_files_struct(files); - } - return -ENOENT; +#define NOD(NAME, MODE, IOP, FOP, OP) { \ + .len = sizeof(NAME) - 1, \ + .name = (NAME), \ + .mode = MODE, \ + .iop = IOP, \ + .fop = FOP, \ + .op = OP, \ } +#define DIR(NAME, MODE, OTYPE) \ + NOD(NAME, (S_IFDIR|(MODE)), \ + &proc_##OTYPE##_inode_operations, &proc_##OTYPE##_operations, \ + {} ) +#define LNK(NAME, OTYPE) \ + NOD(NAME, (S_IFLNK|S_IRWXUGO), \ + &proc_pid_link_inode_operations, NULL, \ + { .proc_get_link = &proc_##OTYPE##_link } ) +#define REG(NAME, MODE, OTYPE) \ + NOD(NAME, (S_IFREG|(MODE)), NULL, \ + &proc_##OTYPE##_operations, {}) +#define INF(NAME, MODE, OTYPE) \ + NOD(NAME, (S_IFREG|(MODE)), \ + NULL, &proc_info_file_operations, \ + { .proc_read = &proc_##OTYPE } ) + static struct fs_struct *get_fs_struct(struct task_struct *task) { struct fs_struct *fs; @@ -587,7 +370,7 @@ static int mounts_open(struct inode *inode, struct file *file) if (task) { task_lock(task); - namespace = task->namespace; + namespace = task->nsproxy->namespace; if (namespace) get_namespace(namespace); task_unlock(task); @@ -658,7 +441,7 @@ static int mountstats_open(struct inode *inode, struct file *file) if (task) { task_lock(task); - namespace = task->namespace; + namespace = task->nsproxy->namespace; if (namespace) get_namespace(namespace); task_unlock(task); @@ -1137,143 +920,6 @@ static struct inode_operations proc_pid_link_inode_operations = { .setattr = proc_setattr, }; -static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir) -{ - struct dentry *dentry = filp->f_dentry; - struct inode *inode = dentry->d_inode; - struct task_struct *p = get_proc_task(inode); - unsigned int fd, tid, ino; - int retval; - char buf[PROC_NUMBUF]; - struct files_struct * files; - struct fdtable *fdt; - - retval = -ENOENT; - if (!p) - goto out_no_task; - retval = 0; - tid = p->pid; - - fd = filp->f_pos; - switch (fd) { - case 0: - if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0) - goto out; - filp->f_pos++; - case 1: - ino = parent_ino(dentry); - if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0) - goto out; - filp->f_pos++; - default: - files = get_files_struct(p); - if (!files) - goto out; - rcu_read_lock(); - fdt = files_fdtable(files); - for (fd = filp->f_pos-2; - fd < fdt->max_fds; - fd++, filp->f_pos++) { - unsigned int i,j; - - if (!fcheck_files(files, fd)) - continue; - rcu_read_unlock(); - - j = PROC_NUMBUF; - i = fd; - do { - j--; - buf[j] = '0' + (i % 10); - i /= 10; - } while (i); - - ino = fake_ino(tid, PROC_TID_FD_DIR + fd); - if (filldir(dirent, buf+j, PROC_NUMBUF-j, fd+2, ino, DT_LNK) < 0) { - rcu_read_lock(); - break; - } - rcu_read_lock(); - } - rcu_read_unlock(); - put_files_struct(files); - } -out: - put_task_struct(p); -out_no_task: - return retval; -} - -static int proc_pident_readdir(struct file *filp, - void *dirent, filldir_t filldir, - struct pid_entry *ents, unsigned int nents) -{ - int i; - int pid; - struct dentry *dentry = filp->f_dentry; - struct inode *inode = dentry->d_inode; - struct task_struct *task = get_proc_task(inode); - struct pid_entry *p; - ino_t ino; - int ret; - - ret = -ENOENT; - if (!task) - goto out; - - ret = 0; - pid = task->pid; - put_task_struct(task); - i = filp->f_pos; - switch (i) { - case 0: - ino = inode->i_ino; - if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0) - goto out; - i++; - filp->f_pos++; - /* fall through */ - case 1: - ino = parent_ino(dentry); - if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0) - goto out; - i++; - filp->f_pos++; - /* fall through */ - default: - i -= 2; - if (i >= nents) { - ret = 1; - goto out; - } - p = ents + i; - while (p->name) { - if (filldir(dirent, p->name, p->len, filp->f_pos, - fake_ino(pid, p->type), p->mode >> 12) < 0) - goto out; - filp->f_pos++; - p++; - } - } - - ret = 1; -out: - return ret; -} - -static int proc_tgid_base_readdir(struct file * filp, - void * dirent, filldir_t filldir) -{ - return proc_pident_readdir(filp,dirent,filldir, - tgid_base_stuff,ARRAY_SIZE(tgid_base_stuff)); -} - -static int proc_tid_base_readdir(struct file * filp, - void * dirent, filldir_t filldir) -{ - return proc_pident_readdir(filp,dirent,filldir, - tid_base_stuff,ARRAY_SIZE(tid_base_stuff)); -} /* building an inode */ @@ -1293,13 +939,13 @@ static int task_dumpable(struct task_struct *task) } -static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task, int ino) +static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task) { struct inode * inode; struct proc_inode *ei; /* We need a new inode */ - + inode = new_inode(sb); if (!inode) goto out; @@ -1307,13 +953,12 @@ static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_st /* Common stuff */ ei = PROC_I(inode); inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; - inode->i_ino = fake_ino(task->pid, ino); inode->i_op = &proc_def_inode_operations; /* * grab the reference to task. */ - ei->pid = get_pid(task->pids[PIDTYPE_PID].pid); + ei->pid = get_task_pid(task, PIDTYPE_PID); if (!ei->pid) goto out_unlock; @@ -1333,6 +978,27 @@ out_unlock: return NULL; } +static int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +{ + struct inode *inode = dentry->d_inode; + struct task_struct *task; + generic_fillattr(inode, stat); + + rcu_read_lock(); + stat->uid = 0; + stat->gid = 0; + task = pid_task(proc_pid(inode), PIDTYPE_PID); + if (task) { + if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) || + task_dumpable(task)) { + stat->uid = task->euid; + stat->gid = task->egid; + } + } + rcu_read_unlock(); + return 0; +} + /* dentry stuff */ /* @@ -1372,25 +1038,130 @@ static int pid_revalidate(struct dentry *dentry, struct nameidata *nd) return 0; } -static int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +static int pid_delete_dentry(struct dentry * dentry) { - struct inode *inode = dentry->d_inode; - struct task_struct *task; - generic_fillattr(inode, stat); + /* Is the task we represent dead? + * If so, then don't put the dentry on the lru list, + * kill it immediately. + */ + return !proc_pid(dentry->d_inode)->tasks[PIDTYPE_PID].first; +} + +static struct dentry_operations pid_dentry_operations = +{ + .d_revalidate = pid_revalidate, + .d_delete = pid_delete_dentry, +}; + +/* Lookups */ + +typedef struct dentry *instantiate_t(struct inode *, struct dentry *, struct task_struct *, void *); + +/* + * Fill a directory entry. + * + * If possible create the dcache entry and derive our inode number and + * file type from dcache entry. + * + * Since all of the proc inode numbers are dynamically generated, the inode + * numbers do not exist until the inode is cache. This means creating the + * the dcache entry in readdir is necessary to keep the inode numbers + * reported by readdir in sync with the inode numbers reported + * by stat. + */ +static int proc_fill_cache(struct file *filp, void *dirent, filldir_t filldir, + char *name, int len, + instantiate_t instantiate, struct task_struct *task, void *ptr) +{ + struct dentry *child, *dir = filp->f_dentry; + struct inode *inode; + struct qstr qname; + ino_t ino = 0; + unsigned type = DT_UNKNOWN; + + qname.name = name; + qname.len = len; + qname.hash = full_name_hash(name, len); + + child = d_lookup(dir, &qname); + if (!child) { + struct dentry *new; + new = d_alloc(dir, &qname); + if (new) { + child = instantiate(dir->d_inode, new, task, ptr); + if (child) + dput(new); + else + child = new; + } + } + if (!child || IS_ERR(child) || !child->d_inode) + goto end_instantiate; + inode = child->d_inode; + if (inode) { + ino = inode->i_ino; + type = inode->i_mode >> 12; + } + dput(child); +end_instantiate: + if (!ino) + ino = find_inode_number(dir, &qname); + if (!ino) + ino = 1; + return filldir(dirent, name, len, filp->f_pos, ino, type); +} + +static unsigned name_to_int(struct dentry *dentry) +{ + const char *name = dentry->d_name.name; + int len = dentry->d_name.len; + unsigned n = 0; + + if (len > 1 && *name == '0') + goto out; + while (len-- > 0) { + unsigned c = *name++ - '0'; + if (c > 9) + goto out; + if (n >= (~0U-9)/10) + goto out; + n *= 10; + n += c; + } + return n; +out: + return ~0U; +} + +static int proc_fd_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt) +{ + struct task_struct *task = get_proc_task(inode); + struct files_struct *files = NULL; + struct file *file; + int fd = proc_fd(inode); - rcu_read_lock(); - stat->uid = 0; - stat->gid = 0; - task = pid_task(proc_pid(inode), PIDTYPE_PID); if (task) { - if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) || - task_dumpable(task)) { - stat->uid = task->euid; - stat->gid = task->egid; + files = get_files_struct(task); + put_task_struct(task); + } + if (files) { + /* + * We are not taking a ref to the file structure, so we must + * hold ->file_lock. + */ + spin_lock(&files->file_lock); + file = fcheck_files(files, fd); + if (file) { + *mnt = mntget(file->f_vfsmnt); + *dentry = dget(file->f_dentry); + spin_unlock(&files->file_lock); + put_files_struct(files); + return 0; } + spin_unlock(&files->file_lock); + put_files_struct(files); } - rcu_read_unlock(); - return 0; + return -ENOENT; } static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd) @@ -1428,75 +1199,30 @@ static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd) return 0; } -static int pid_delete_dentry(struct dentry * dentry) -{ - /* Is the task we represent dead? - * If so, then don't put the dentry on the lru list, - * kill it immediately. - */ - return !proc_pid(dentry->d_inode)->tasks[PIDTYPE_PID].first; -} - static struct dentry_operations tid_fd_dentry_operations = { .d_revalidate = tid_fd_revalidate, .d_delete = pid_delete_dentry, }; -static struct dentry_operations pid_dentry_operations = -{ - .d_revalidate = pid_revalidate, - .d_delete = pid_delete_dentry, -}; - -/* Lookups */ - -static unsigned name_to_int(struct dentry *dentry) -{ - const char *name = dentry->d_name.name; - int len = dentry->d_name.len; - unsigned n = 0; - - if (len > 1 && *name == '0') - goto out; - while (len-- > 0) { - unsigned c = *name++ - '0'; - if (c > 9) - goto out; - if (n >= (~0U-9)/10) - goto out; - n *= 10; - n += c; - } - return n; -out: - return ~0U; -} - -/* SMP-safe */ -static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry, struct nameidata *nd) +static struct dentry *proc_fd_instantiate(struct inode *dir, + struct dentry *dentry, struct task_struct *task, void *ptr) { - struct task_struct *task = get_proc_task(dir); - unsigned fd = name_to_int(dentry); - struct dentry *result = ERR_PTR(-ENOENT); - struct file * file; - struct files_struct * files; - struct inode *inode; - struct proc_inode *ei; - - if (!task) - goto out_no_task; - if (fd == ~0U) - goto out; + unsigned fd = *(unsigned *)ptr; + struct file *file; + struct files_struct *files; + struct inode *inode; + struct proc_inode *ei; + struct dentry *error = ERR_PTR(-ENOENT); - inode = proc_pid_make_inode(dir->i_sb, task, PROC_TID_FD_DIR+fd); + inode = proc_pid_make_inode(dir->i_sb, task); if (!inode) goto out; ei = PROC_I(inode); ei->fd = fd; files = get_files_struct(task); if (!files) - goto out_unlock; + goto out_iput; inode->i_mode = S_IFLNK; /* @@ -1506,13 +1232,14 @@ static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry, spin_lock(&files->file_lock); file = fcheck_files(files, fd); if (!file) - goto out_unlock2; + goto out_unlock; if (file->f_mode & 1) inode->i_mode |= S_IRUSR | S_IXUSR; if (file->f_mode & 2) inode->i_mode |= S_IWUSR | S_IXUSR; spin_unlock(&files->file_lock); put_files_struct(files); + inode->i_op = &proc_pid_link_inode_operations; inode->i_size = 64; ei->op.proc_get_link = proc_fd_link; @@ -1520,34 +1247,106 @@ static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry, d_add(dentry, inode); /* Close the race of the process dying before we return the dentry */ if (tid_fd_revalidate(dentry, NULL)) - result = NULL; -out: - put_task_struct(task); -out_no_task: - return result; + error = NULL; -out_unlock2: + out: + return error; +out_unlock: spin_unlock(&files->file_lock); put_files_struct(files); -out_unlock: +out_iput: iput(inode); goto out; } -static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldir); -static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd); -static int proc_task_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); +static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry, struct nameidata *nd) +{ + struct task_struct *task = get_proc_task(dir); + unsigned fd = name_to_int(dentry); + struct dentry *result = ERR_PTR(-ENOENT); + + if (!task) + goto out_no_task; + if (fd == ~0U) + goto out; + + result = proc_fd_instantiate(dir, dentry, task, &fd); +out: + put_task_struct(task); +out_no_task: + return result; +} + +static int proc_fd_fill_cache(struct file *filp, void *dirent, filldir_t filldir, + struct task_struct *task, int fd) +{ + char name[PROC_NUMBUF]; + int len = snprintf(name, sizeof(name), "%d", fd); + return proc_fill_cache(filp, dirent, filldir, name, len, + proc_fd_instantiate, task, &fd); +} + +static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir) +{ + struct dentry *dentry = filp->f_dentry; + struct inode *inode = dentry->d_inode; + struct task_struct *p = get_proc_task(inode); + unsigned int fd, tid, ino; + int retval; + struct files_struct * files; + struct fdtable *fdt; + + retval = -ENOENT; + if (!p) + goto out_no_task; + retval = 0; + tid = p->pid; + + fd = filp->f_pos; + switch (fd) { + case 0: + if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0) + goto out; + filp->f_pos++; + case 1: + ino = parent_ino(dentry); + if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0) + goto out; + filp->f_pos++; + default: + files = get_files_struct(p); + if (!files) + goto out; + rcu_read_lock(); + fdt = files_fdtable(files); + for (fd = filp->f_pos-2; + fd < fdt->max_fds; + fd++, filp->f_pos++) { + + if (!fcheck_files(files, fd)) + continue; + rcu_read_unlock(); + + if (proc_fd_fill_cache(filp, dirent, filldir, p, fd) < 0) { + rcu_read_lock(); + break; + } + rcu_read_lock(); + } + rcu_read_unlock(); + put_files_struct(files); + } +out: + put_task_struct(p); +out_no_task: + return retval; +} static struct file_operations proc_fd_operations = { .read = generic_read_dir, .readdir = proc_readfd, }; -static struct file_operations proc_task_operations = { - .read = generic_read_dir, - .readdir = proc_task_readdir, -}; - /* * proc directories can do almost nothing.. */ @@ -1556,11 +1355,137 @@ static struct inode_operations proc_fd_inode_operations = { .setattr = proc_setattr, }; -static struct inode_operations proc_task_inode_operations = { - .lookup = proc_task_lookup, - .getattr = proc_task_getattr, - .setattr = proc_setattr, -}; +static struct dentry *proc_pident_instantiate(struct inode *dir, + struct dentry *dentry, struct task_struct *task, void *ptr) +{ + struct pid_entry *p = ptr; + struct inode *inode; + struct proc_inode *ei; + struct dentry *error = ERR_PTR(-EINVAL); + + inode = proc_pid_make_inode(dir->i_sb, task); + if (!inode) + goto out; + + ei = PROC_I(inode); + inode->i_mode = p->mode; + if (S_ISDIR(inode->i_mode)) + inode->i_nlink = 2; /* Use getattr to fix if necessary */ + if (p->iop) + inode->i_op = p->iop; + if (p->fop) + inode->i_fop = p->fop; + ei->op = p->op; + dentry->d_op = &pid_dentry_operations; + d_add(dentry, inode); + /* Close the race of the process dying before we return the dentry */ + if (pid_revalidate(dentry, NULL)) + error = NULL; +out: + return error; +} + +static struct dentry *proc_pident_lookup(struct inode *dir, + struct dentry *dentry, + struct pid_entry *ents, + unsigned int nents) +{ + struct inode *inode; + struct dentry *error; + struct task_struct *task = get_proc_task(dir); + struct pid_entry *p, *last; + + error = ERR_PTR(-ENOENT); + inode = NULL; + + if (!task) + goto out_no_task; + + /* + * Yes, it does not scale. And it should not. Don't add + * new entries into /proc/<tgid>/ without very good reasons. + */ + last = &ents[nents - 1]; + for (p = ents; p <= last; p++) { + if (p->len != dentry->d_name.len) + continue; + if (!memcmp(dentry->d_name.name, p->name, p->len)) + break; + } + if (p > last) + goto out; + + error = proc_pident_instantiate(dir, dentry, task, p); +out: + put_task_struct(task); +out_no_task: + return error; +} + +static int proc_pident_fill_cache(struct file *filp, void *dirent, filldir_t filldir, + struct task_struct *task, struct pid_entry *p) +{ + return proc_fill_cache(filp, dirent, filldir, p->name, p->len, + proc_pident_instantiate, task, p); +} + +static int proc_pident_readdir(struct file *filp, + void *dirent, filldir_t filldir, + struct pid_entry *ents, unsigned int nents) +{ + int i; + int pid; + struct dentry *dentry = filp->f_dentry; + struct inode *inode = dentry->d_inode; + struct task_struct *task = get_proc_task(inode); + struct pid_entry *p, *last; + ino_t ino; + int ret; + + ret = -ENOENT; + if (!task) + goto out_no_task; + + ret = 0; + pid = task->pid; + i = filp->f_pos; + switch (i) { + case 0: + ino = inode->i_ino; + if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0) + goto out; + i++; + filp->f_pos++; + /* fall through */ + case 1: + ino = parent_ino(dentry); + if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0) + goto out; + i++; + filp->f_pos++; + /* fall through */ + default: + i -= 2; + if (i >= nents) { + ret = 1; + goto out; + } + p = ents + i; + last = &ents[nents - 1]; + while (p <= last) { + if (proc_pident_fill_cache(filp, dirent, filldir, task, p) < 0) + goto out; + filp->f_pos++; + p++; + } + } + + ret = 1; +out: + put_task_struct(task); +out_no_task: + return ret; +} #ifdef CONFIG_SECURITY static ssize_t proc_pid_attr_read(struct file * file, char __user * buf, @@ -1581,8 +1506,8 @@ static ssize_t proc_pid_attr_read(struct file * file, char __user * buf, if (!(page = __get_free_page(GFP_KERNEL))) goto out; - length = security_getprocattr(task, - (char*)file->f_dentry->d_name.name, + length = security_getprocattr(task, + (char*)file->f_dentry->d_name.name, (void*)page, count); if (length >= 0) length = simple_read_from_buffer(buf, count, ppos, (char *)page, length); @@ -1595,17 +1520,17 @@ out_no_task: static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf, size_t count, loff_t *ppos) -{ +{ struct inode * inode = file->f_dentry->d_inode; - char *page; - ssize_t length; + char *page; + ssize_t length; struct task_struct *task = get_proc_task(inode); length = -ESRCH; if (!task) goto out_no_task; - if (count > PAGE_SIZE) - count = PAGE_SIZE; + if (count > PAGE_SIZE) + count = PAGE_SIZE; /* No partial writes. */ length = -EINVAL; @@ -1613,16 +1538,16 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf, goto out; length = -ENOMEM; - page = (char*)__get_free_page(GFP_USER); - if (!page) + page = (char*)__get_free_page(GFP_USER); + if (!page) goto out; - length = -EFAULT; - if (copy_from_user(page, buf, count)) + length = -EFAULT; + if (copy_from_user(page, buf, count)) goto out_free; - length = security_setprocattr(task, - (char*)file->f_dentry->d_name.name, + length = security_setprocattr(task, + (char*)file->f_dentry->d_name.name, (void*)page, count); out_free: free_page((unsigned long) page); @@ -1630,330 +1555,263 @@ out: put_task_struct(task); out_no_task: return length; -} +} static struct file_operations proc_pid_attr_operations = { .read = proc_pid_attr_read, .write = proc_pid_attr_write, }; -static struct file_operations proc_tid_attr_operations; -static struct inode_operations proc_tid_attr_inode_operations; -static struct file_operations proc_tgid_attr_operations; -static struct inode_operations proc_tgid_attr_inode_operations; +static struct pid_entry attr_dir_stuff[] = { + REG("current", S_IRUGO|S_IWUGO, pid_attr), + REG("prev", S_IRUGO, pid_attr), + REG("exec", S_IRUGO|S_IWUGO, pid_attr), + REG("fscreate", S_IRUGO|S_IWUGO, pid_attr), + REG("keycreate", S_IRUGO|S_IWUGO, pid_attr), + REG("sockcreate", S_IRUGO|S_IWUGO, pid_attr), +}; + +static int proc_attr_dir_readdir(struct file * filp, + void * dirent, filldir_t filldir) +{ + return proc_pident_readdir(filp,dirent,filldir, + attr_dir_stuff,ARRAY_SIZE(attr_dir_stuff)); +} + +static struct file_operations proc_attr_dir_operations = { + .read = generic_read_dir, + .readdir = proc_attr_dir_readdir, +}; + +static struct dentry *proc_attr_dir_lookup(struct inode *dir, + struct dentry *dentry, struct nameidata *nd) +{ + return proc_pident_lookup(dir, dentry, + attr_dir_stuff, ARRAY_SIZE(attr_dir_stuff)); +} + +static struct inode_operations proc_attr_dir_inode_operations = { + .lookup = proc_attr_dir_lookup, + .getattr = pid_getattr, + .setattr = proc_setattr, +}; + #endif -/* SMP-safe */ -static struct dentry *proc_pident_lookup(struct inode *dir, - struct dentry *dentry, - struct pid_entry *ents) +/* + * /proc/self: + */ +static int proc_self_readlink(struct dentry *dentry, char __user *buffer, + int buflen) +{ + char tmp[PROC_NUMBUF]; + sprintf(tmp, "%d", current->tgid); + return vfs_readlink(dentry,buffer,buflen,tmp); +} + +static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd) { + char tmp[PROC_NUMBUF]; + sprintf(tmp, "%d", current->tgid); + return ERR_PTR(vfs_follow_link(nd,tmp)); +} + +static struct inode_operations proc_self_inode_operations = { + .readlink = proc_self_readlink, + .follow_link = proc_self_follow_link, +}; + +/* + * proc base + * + * These are the directory entries in the root directory of /proc + * that properly belong to the /proc filesystem, as they describe + * describe something that is process related. + */ +static struct pid_entry proc_base_stuff[] = { + NOD("self", S_IFLNK|S_IRWXUGO, + &proc_self_inode_operations, NULL, {}), +}; + +/* + * Exceptional case: normally we are not allowed to unhash a busy + * directory. In this case, however, we can do it - no aliasing problems + * due to the way we treat inodes. + */ +static int proc_base_revalidate(struct dentry *dentry, struct nameidata *nd) +{ + struct inode *inode = dentry->d_inode; + struct task_struct *task = get_proc_task(inode); + if (task) { + put_task_struct(task); + return 1; + } + d_drop(dentry); + return 0; +} + +static struct dentry_operations proc_base_dentry_operations = +{ + .d_revalidate = proc_base_revalidate, + .d_delete = pid_delete_dentry, +}; + +static struct dentry *proc_base_instantiate(struct inode *dir, + struct dentry *dentry, struct task_struct *task, void *ptr) +{ + struct pid_entry *p = ptr; struct inode *inode; + struct proc_inode *ei; + struct dentry *error = ERR_PTR(-EINVAL); + + /* Allocate the inode */ + error = ERR_PTR(-ENOMEM); + inode = new_inode(dir->i_sb); + if (!inode) + goto out; + + /* Initialize the inode */ + ei = PROC_I(inode); + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + + /* + * grab the reference to the task. + */ + ei->pid = get_task_pid(task, PIDTYPE_PID); + if (!ei->pid) + goto out_iput; + + inode->i_uid = 0; + inode->i_gid = 0; + inode->i_mode = p->mode; + if (S_ISDIR(inode->i_mode)) + inode->i_nlink = 2; + if (S_ISLNK(inode->i_mode)) + inode->i_size = 64; + if (p->iop) + inode->i_op = p->iop; + if (p->fop) + inode->i_fop = p->fop; + ei->op = p->op; + dentry->d_op = &proc_base_dentry_operations; + d_add(dentry, inode); + error = NULL; +out: + return error; +out_iput: + iput(inode); + goto out; +} + +static struct dentry *proc_base_lookup(struct inode *dir, struct dentry *dentry) +{ struct dentry *error; struct task_struct *task = get_proc_task(dir); - struct pid_entry *p; - struct proc_inode *ei; + struct pid_entry *p, *last; error = ERR_PTR(-ENOENT); - inode = NULL; if (!task) goto out_no_task; - for (p = ents; p->name; p++) { + /* Lookup the directory entry */ + last = &proc_base_stuff[ARRAY_SIZE(proc_base_stuff) - 1]; + for (p = proc_base_stuff; p <= last; p++) { if (p->len != dentry->d_name.len) continue; if (!memcmp(dentry->d_name.name, p->name, p->len)) break; } - if (!p->name) + if (p > last) goto out; - error = ERR_PTR(-EINVAL); - inode = proc_pid_make_inode(dir->i_sb, task, p->type); - if (!inode) - goto out; + error = proc_base_instantiate(dir, dentry, task, p); - ei = PROC_I(inode); - inode->i_mode = p->mode; - /* - * Yes, it does not scale. And it should not. Don't add - * new entries into /proc/<tgid>/ without very good reasons. - */ - switch(p->type) { - case PROC_TGID_TASK: - inode->i_nlink = 2; - inode->i_op = &proc_task_inode_operations; - inode->i_fop = &proc_task_operations; - break; - case PROC_TID_FD: - case PROC_TGID_FD: - inode->i_nlink = 2; - inode->i_op = &proc_fd_inode_operations; - inode->i_fop = &proc_fd_operations; - break; - case PROC_TID_EXE: - case PROC_TGID_EXE: - inode->i_op = &proc_pid_link_inode_operations; - ei->op.proc_get_link = proc_exe_link; - break; - case PROC_TID_CWD: - case PROC_TGID_CWD: - inode->i_op = &proc_pid_link_inode_operations; - ei->op.proc_get_link = proc_cwd_link; - break; - case PROC_TID_ROOT: - case PROC_TGID_ROOT: - inode->i_op = &proc_pid_link_inode_operations; - ei->op.proc_get_link = proc_root_link; - break; - case PROC_TID_ENVIRON: - case PROC_TGID_ENVIRON: - inode->i_fop = &proc_info_file_operations; - ei->op.proc_read = proc_pid_environ; - break; - case PROC_TID_AUXV: - case PROC_TGID_AUXV: - inode->i_fop = &proc_info_file_operations; - ei->op.proc_read = proc_pid_auxv; - break; - case PROC_TID_STATUS: - case PROC_TGID_STATUS: - inode->i_fop = &proc_info_file_operations; - ei->op.proc_read = proc_pid_status; - break; - case PROC_TID_STAT: - inode->i_fop = &proc_info_file_operations; - ei->op.proc_read = proc_tid_stat; - break; - case PROC_TGID_STAT: - inode->i_fop = &proc_info_file_operations; - ei->op.proc_read = proc_tgid_stat; - break; - case PROC_TID_CMDLINE: - case PROC_TGID_CMDLINE: - inode->i_fop = &proc_info_file_operations; - ei->op.proc_read = proc_pid_cmdline; - break; - case PROC_TID_STATM: - case PROC_TGID_STATM: - inode->i_fop = &proc_info_file_operations; - ei->op.proc_read = proc_pid_statm; - break; - case PROC_TID_MAPS: - case PROC_TGID_MAPS: - inode->i_fop = &proc_maps_operations; - break; +out: + put_task_struct(task); +out_no_task: + return error; +} + +static int proc_base_fill_cache(struct file *filp, void *dirent, filldir_t filldir, + struct task_struct *task, struct pid_entry *p) +{ + return proc_fill_cache(filp, dirent, filldir, p->name, p->len, + proc_base_instantiate, task, p); +} + +/* + * Thread groups + */ +static struct file_operations proc_task_operations; +static struct inode_operations proc_task_inode_operations; + +static struct pid_entry tgid_base_stuff[] = { + DIR("task", S_IRUGO|S_IXUGO, task), + DIR("fd", S_IRUSR|S_IXUSR, fd), + INF("environ", S_IRUSR, pid_environ), + INF("auxv", S_IRUSR, pid_auxv), + INF("status", S_IRUGO, pid_status), + INF("cmdline", S_IRUGO, pid_cmdline), + INF("stat", S_IRUGO, tgid_stat), + INF("statm", S_IRUGO, pid_statm), + REG("maps", S_IRUGO, maps), #ifdef CONFIG_NUMA - case PROC_TID_NUMA_MAPS: - case PROC_TGID_NUMA_MAPS: - inode->i_fop = &proc_numa_maps_operations; - break; + REG("numa_maps", S_IRUGO, numa_maps), #endif - case PROC_TID_MEM: - case PROC_TGID_MEM: - inode->i_fop = &proc_mem_operations; - break; + REG("mem", S_IRUSR|S_IWUSR, mem), #ifdef CONFIG_SECCOMP - case PROC_TID_SECCOMP: - case PROC_TGID_SECCOMP: - inode->i_fop = &proc_seccomp_operations; - break; -#endif /* CONFIG_SECCOMP */ - case PROC_TID_MOUNTS: - case PROC_TGID_MOUNTS: - inode->i_fop = &proc_mounts_operations; - break; + REG("seccomp", S_IRUSR|S_IWUSR, seccomp), +#endif + LNK("cwd", cwd), + LNK("root", root), + LNK("exe", exe), + REG("mounts", S_IRUGO, mounts), + REG("mountstats", S_IRUSR, mountstats), #ifdef CONFIG_MMU - case PROC_TID_SMAPS: - case PROC_TGID_SMAPS: - inode->i_fop = &proc_smaps_operations; - break; + REG("smaps", S_IRUGO, smaps), #endif - case PROC_TID_MOUNTSTATS: - case PROC_TGID_MOUNTSTATS: - inode->i_fop = &proc_mountstats_operations; - break; #ifdef CONFIG_SECURITY - case PROC_TID_ATTR: - inode->i_nlink = 2; - inode->i_op = &proc_tid_attr_inode_operations; - inode->i_fop = &proc_tid_attr_operations; - break; - case PROC_TGID_ATTR: - inode->i_nlink = 2; - inode->i_op = &proc_tgid_attr_inode_operations; - inode->i_fop = &proc_tgid_attr_operations; - break; - case PROC_TID_ATTR_CURRENT: - case PROC_TGID_ATTR_CURRENT: - case PROC_TID_ATTR_PREV: - case PROC_TGID_ATTR_PREV: - case PROC_TID_ATTR_EXEC: - case PROC_TGID_ATTR_EXEC: - case PROC_TID_ATTR_FSCREATE: - case PROC_TGID_ATTR_FSCREATE: - case PROC_TID_ATTR_KEYCREATE: - case PROC_TGID_ATTR_KEYCREATE: - case PROC_TID_ATTR_SOCKCREATE: - case PROC_TGID_ATTR_SOCKCREATE: - inode->i_fop = &proc_pid_attr_operations; - break; + DIR("attr", S_IRUGO|S_IXUGO, attr_dir), #endif #ifdef CONFIG_KALLSYMS - case PROC_TID_WCHAN: - case PROC_TGID_WCHAN: - inode->i_fop = &proc_info_file_operations; - ei->op.proc_read = proc_pid_wchan; - break; + INF("wchan", S_IRUGO, pid_wchan), #endif #ifdef CONFIG_SCHEDSTATS - case PROC_TID_SCHEDSTAT: - case PROC_TGID_SCHEDSTAT: - inode->i_fop = &proc_info_file_operations; - ei->op.proc_read = proc_pid_schedstat; - break; + INF("schedstat", S_IRUGO, pid_schedstat), #endif #ifdef CONFIG_CPUSETS - case PROC_TID_CPUSET: - case PROC_TGID_CPUSET: - inode->i_fop = &proc_cpuset_operations; - break; + REG("cpuset", S_IRUGO, cpuset), #endif - case PROC_TID_OOM_SCORE: - case PROC_TGID_OOM_SCORE: - inode->i_fop = &proc_info_file_operations; - ei->op.proc_read = proc_oom_score; - break; - case PROC_TID_OOM_ADJUST: - case PROC_TGID_OOM_ADJUST: - inode->i_fop = &proc_oom_adjust_operations; - break; + INF("oom_score", S_IRUGO, oom_score), + REG("oom_adj", S_IRUGO|S_IWUSR, oom_adjust), #ifdef CONFIG_AUDITSYSCALL - case PROC_TID_LOGINUID: - case PROC_TGID_LOGINUID: - inode->i_fop = &proc_loginuid_operations; - break; + REG("loginuid", S_IWUSR|S_IRUGO, loginuid), #endif - default: - printk("procfs: impossible type (%d)",p->type); - iput(inode); - error = ERR_PTR(-EINVAL); - goto out; - } - dentry->d_op = &pid_dentry_operations; - d_add(dentry, inode); - /* Close the race of the process dying before we return the dentry */ - if (pid_revalidate(dentry, NULL)) - error = NULL; -out: - put_task_struct(task); -out_no_task: - return error; -} - -static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd){ - return proc_pident_lookup(dir, dentry, tgid_base_stuff); -} - -static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd){ - return proc_pident_lookup(dir, dentry, tid_base_stuff); -} - -static struct file_operations proc_tgid_base_operations = { - .read = generic_read_dir, - .readdir = proc_tgid_base_readdir, }; -static struct file_operations proc_tid_base_operations = { - .read = generic_read_dir, - .readdir = proc_tid_base_readdir, -}; - -static struct inode_operations proc_tgid_base_inode_operations = { - .lookup = proc_tgid_base_lookup, - .getattr = pid_getattr, - .setattr = proc_setattr, -}; - -static struct inode_operations proc_tid_base_inode_operations = { - .lookup = proc_tid_base_lookup, - .getattr = pid_getattr, - .setattr = proc_setattr, -}; - -#ifdef CONFIG_SECURITY -static int proc_tgid_attr_readdir(struct file * filp, - void * dirent, filldir_t filldir) -{ - return proc_pident_readdir(filp,dirent,filldir, - tgid_attr_stuff,ARRAY_SIZE(tgid_attr_stuff)); -} - -static int proc_tid_attr_readdir(struct file * filp, +static int proc_tgid_base_readdir(struct file * filp, void * dirent, filldir_t filldir) { return proc_pident_readdir(filp,dirent,filldir, - tid_attr_stuff,ARRAY_SIZE(tid_attr_stuff)); + tgid_base_stuff,ARRAY_SIZE(tgid_base_stuff)); } -static struct file_operations proc_tgid_attr_operations = { - .read = generic_read_dir, - .readdir = proc_tgid_attr_readdir, -}; - -static struct file_operations proc_tid_attr_operations = { +static struct file_operations proc_tgid_base_operations = { .read = generic_read_dir, - .readdir = proc_tid_attr_readdir, + .readdir = proc_tgid_base_readdir, }; -static struct dentry *proc_tgid_attr_lookup(struct inode *dir, - struct dentry *dentry, struct nameidata *nd) -{ - return proc_pident_lookup(dir, dentry, tgid_attr_stuff); -} - -static struct dentry *proc_tid_attr_lookup(struct inode *dir, - struct dentry *dentry, struct nameidata *nd) -{ - return proc_pident_lookup(dir, dentry, tid_attr_stuff); +static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd){ + return proc_pident_lookup(dir, dentry, + tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff)); } -static struct inode_operations proc_tgid_attr_inode_operations = { - .lookup = proc_tgid_attr_lookup, - .getattr = pid_getattr, - .setattr = proc_setattr, -}; - -static struct inode_operations proc_tid_attr_inode_operations = { - .lookup = proc_tid_attr_lookup, +static struct inode_operations proc_tgid_base_inode_operations = { + .lookup = proc_tgid_base_lookup, .getattr = pid_getattr, .setattr = proc_setattr, }; -#endif - -/* - * /proc/self: - */ -static int proc_self_readlink(struct dentry *dentry, char __user *buffer, - int buflen) -{ - char tmp[PROC_NUMBUF]; - sprintf(tmp, "%d", current->tgid); - return vfs_readlink(dentry,buffer,buflen,tmp); -} - -static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd) -{ - char tmp[PROC_NUMBUF]; - sprintf(tmp, "%d", current->tgid); - return ERR_PTR(vfs_follow_link(nd,tmp)); -} - -static struct inode_operations proc_self_inode_operations = { - .readlink = proc_self_readlink, - .follow_link = proc_self_follow_link, -}; /** * proc_flush_task - Remove dcache entries for @task from the /proc dcache. @@ -2022,54 +1880,23 @@ out: return; } -/* SMP-safe */ -struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) +struct dentry *proc_pid_instantiate(struct inode *dir, + struct dentry * dentry, struct task_struct *task, void *ptr) { - struct dentry *result = ERR_PTR(-ENOENT); - struct task_struct *task; + struct dentry *error = ERR_PTR(-ENOENT); struct inode *inode; - struct proc_inode *ei; - unsigned tgid; - - if (dentry->d_name.len == 4 && !memcmp(dentry->d_name.name,"self",4)) { - inode = new_inode(dir->i_sb); - if (!inode) - return ERR_PTR(-ENOMEM); - ei = PROC_I(inode); - inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; - inode->i_ino = fake_ino(0, PROC_TGID_INO); - ei->pde = NULL; - inode->i_mode = S_IFLNK|S_IRWXUGO; - inode->i_uid = inode->i_gid = 0; - inode->i_size = 64; - inode->i_op = &proc_self_inode_operations; - d_add(dentry, inode); - return NULL; - } - tgid = name_to_int(dentry); - if (tgid == ~0U) - goto out; - rcu_read_lock(); - task = find_task_by_pid(tgid); - if (task) - get_task_struct(task); - rcu_read_unlock(); - if (!task) - goto out; - - inode = proc_pid_make_inode(dir->i_sb, task, PROC_TGID_INO); + inode = proc_pid_make_inode(dir->i_sb, task); if (!inode) - goto out_put_task; + goto out; inode->i_mode = S_IFDIR|S_IRUGO|S_IXUGO; inode->i_op = &proc_tgid_base_inode_operations; inode->i_fop = &proc_tgid_base_operations; inode->i_flags|=S_IMMUTABLE; -#ifdef CONFIG_SECURITY - inode->i_nlink = 5; -#else inode->i_nlink = 4; +#ifdef CONFIG_SECURITY + inode->i_nlink += 1; #endif dentry->d_op = &pid_dentry_operations; @@ -2077,179 +1904,251 @@ struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct d_add(dentry, inode); /* Close the race of the process dying before we return the dentry */ if (pid_revalidate(dentry, NULL)) - result = NULL; - -out_put_task: - put_task_struct(task); + error = NULL; out: - return result; + return error; } -/* SMP-safe */ -static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) +struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) { struct dentry *result = ERR_PTR(-ENOENT); struct task_struct *task; - struct task_struct *leader = get_proc_task(dir); - struct inode *inode; - unsigned tid; + unsigned tgid; - if (!leader) - goto out_no_task; + result = proc_base_lookup(dir, dentry); + if (!IS_ERR(result) || PTR_ERR(result) != -ENOENT) + goto out; - tid = name_to_int(dentry); - if (tid == ~0U) + tgid = name_to_int(dentry); + if (tgid == ~0U) goto out; rcu_read_lock(); - task = find_task_by_pid(tid); + task = find_task_by_pid(tgid); if (task) get_task_struct(task); rcu_read_unlock(); if (!task) goto out; - if (leader->tgid != task->tgid) - goto out_drop_task; - - inode = proc_pid_make_inode(dir->i_sb, task, PROC_TID_INO); - - - if (!inode) - goto out_drop_task; - inode->i_mode = S_IFDIR|S_IRUGO|S_IXUGO; - inode->i_op = &proc_tid_base_inode_operations; - inode->i_fop = &proc_tid_base_operations; - inode->i_flags|=S_IMMUTABLE; -#ifdef CONFIG_SECURITY - inode->i_nlink = 4; -#else - inode->i_nlink = 3; -#endif - - dentry->d_op = &pid_dentry_operations; - - d_add(dentry, inode); - /* Close the race of the process dying before we return the dentry */ - if (pid_revalidate(dentry, NULL)) - result = NULL; -out_drop_task: + result = proc_pid_instantiate(dir, dentry, task, NULL); put_task_struct(task); out: - put_task_struct(leader); -out_no_task: return result; } /* - * Find the first tgid to return to user space. - * - * Usually this is just whatever follows &init_task, but if the users - * buffer was too small to hold the full list or there was a seek into - * the middle of the directory we have more work to do. - * - * In the case of a short read we start with find_task_by_pid. + * Find the first task with tgid >= tgid * - * In the case of a seek we start with &init_task and walk nr - * threads past it. */ -static struct task_struct *first_tgid(int tgid, unsigned int nr) +static struct task_struct *next_tgid(unsigned int tgid) { - struct task_struct *pos; - rcu_read_lock(); - if (tgid && nr) { - pos = find_task_by_pid(tgid); - if (pos && thread_group_leader(pos)) - goto found; - } - /* If nr exceeds the number of processes get out quickly */ - pos = NULL; - if (nr && nr >= nr_processes()) - goto done; + struct task_struct *task; + struct pid *pid; - /* If we haven't found our starting place yet start with - * the init_task and walk nr tasks forward. - */ - for (pos = next_task(&init_task); nr > 0; --nr) { - pos = next_task(pos); - if (pos == &init_task) { - pos = NULL; - goto done; - } + rcu_read_lock(); +retry: + task = NULL; + pid = find_ge_pid(tgid); + if (pid) { + tgid = pid->nr + 1; + task = pid_task(pid, PIDTYPE_PID); + /* What we to know is if the pid we have find is the + * pid of a thread_group_leader. Testing for task + * being a thread_group_leader is the obvious thing + * todo but there is a window when it fails, due to + * the pid transfer logic in de_thread. + * + * So we perform the straight forward test of seeing + * if the pid we have found is the pid of a thread + * group leader, and don't worry if the task we have + * found doesn't happen to be a thread group leader. + * As we don't care in the case of readdir. + */ + if (!task || !has_group_leader_pid(task)) + goto retry; + get_task_struct(task); } -found: - get_task_struct(pos); -done: rcu_read_unlock(); - return pos; + return task; } -/* - * Find the next task in the task list. - * Return NULL if we loop or there is any error. - * - * The reference to the input task_struct is released. - */ -static struct task_struct *next_tgid(struct task_struct *start) +#define TGID_OFFSET (FIRST_PROCESS_ENTRY + ARRAY_SIZE(proc_base_stuff)) + +static int proc_pid_fill_cache(struct file *filp, void *dirent, filldir_t filldir, + struct task_struct *task, int tgid) { - struct task_struct *pos; - rcu_read_lock(); - pos = start; - if (pid_alive(start)) - pos = next_task(start); - if (pid_alive(pos) && (pos != &init_task)) { - get_task_struct(pos); - goto done; - } - pos = NULL; -done: - rcu_read_unlock(); - put_task_struct(start); - return pos; + char name[PROC_NUMBUF]; + int len = snprintf(name, sizeof(name), "%d", tgid); + return proc_fill_cache(filp, dirent, filldir, name, len, + proc_pid_instantiate, task, NULL); } /* for the /proc/ directory itself, after non-process stuff has been done */ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) { - char buf[PROC_NUMBUF]; unsigned int nr = filp->f_pos - FIRST_PROCESS_ENTRY; + struct task_struct *reaper = get_proc_task(filp->f_dentry->d_inode); struct task_struct *task; int tgid; - if (!nr) { - ino_t ino = fake_ino(0,PROC_TGID_INO); - if (filldir(dirent, "self", 4, filp->f_pos, ino, DT_LNK) < 0) - return 0; - filp->f_pos++; - nr++; + if (!reaper) + goto out_no_task; + + for (; nr < ARRAY_SIZE(proc_base_stuff); filp->f_pos++, nr++) { + struct pid_entry *p = &proc_base_stuff[nr]; + if (proc_base_fill_cache(filp, dirent, filldir, reaper, p) < 0) + goto out; } - nr -= 1; - /* f_version caches the tgid value that the last readdir call couldn't - * return. lseek aka telldir automagically resets f_version to 0. - */ - tgid = filp->f_version; - filp->f_version = 0; - for (task = first_tgid(tgid, nr); + tgid = filp->f_pos - TGID_OFFSET; + for (task = next_tgid(tgid); task; - task = next_tgid(task), filp->f_pos++) { - int len; - ino_t ino; + put_task_struct(task), task = next_tgid(tgid + 1)) { tgid = task->pid; - len = snprintf(buf, sizeof(buf), "%d", tgid); - ino = fake_ino(tgid, PROC_TGID_INO); - if (filldir(dirent, buf, len, filp->f_pos, ino, DT_DIR) < 0) { - /* returning this tgid failed, save it as the first - * pid for the next readir call */ - filp->f_version = tgid; + filp->f_pos = tgid + TGID_OFFSET; + if (proc_pid_fill_cache(filp, dirent, filldir, task, tgid) < 0) { put_task_struct(task); - break; + goto out; } } + filp->f_pos = PID_MAX_LIMIT + TGID_OFFSET; +out: + put_task_struct(reaper); +out_no_task: return 0; } /* + * Tasks + */ +static struct pid_entry tid_base_stuff[] = { + DIR("fd", S_IRUSR|S_IXUSR, fd), + INF("environ", S_IRUSR, pid_environ), + INF("auxv", S_IRUSR, pid_auxv), + INF("status", S_IRUGO, pid_status), + INF("cmdline", S_IRUGO, pid_cmdline), + INF("stat", S_IRUGO, tid_stat), + INF("statm", S_IRUGO, pid_statm), + REG("maps", S_IRUGO, maps), +#ifdef CONFIG_NUMA + REG("numa_maps", S_IRUGO, numa_maps), +#endif + REG("mem", S_IRUSR|S_IWUSR, mem), +#ifdef CONFIG_SECCOMP + REG("seccomp", S_IRUSR|S_IWUSR, seccomp), +#endif + LNK("cwd", cwd), + LNK("root", root), + LNK("exe", exe), + REG("mounts", S_IRUGO, mounts), +#ifdef CONFIG_MMU + REG("smaps", S_IRUGO, smaps), +#endif +#ifdef CONFIG_SECURITY + DIR("attr", S_IRUGO|S_IXUGO, attr_dir), +#endif +#ifdef CONFIG_KALLSYMS + INF("wchan", S_IRUGO, pid_wchan), +#endif +#ifdef CONFIG_SCHEDSTATS + INF("schedstat", S_IRUGO, pid_schedstat), +#endif +#ifdef CONFIG_CPUSETS + REG("cpuset", S_IRUGO, cpuset), +#endif + INF("oom_score", S_IRUGO, oom_score), + REG("oom_adj", S_IRUGO|S_IWUSR, oom_adjust), +#ifdef CONFIG_AUDITSYSCALL + REG("loginuid", S_IWUSR|S_IRUGO, loginuid), +#endif +}; + +static int proc_tid_base_readdir(struct file * filp, + void * dirent, filldir_t filldir) +{ + return proc_pident_readdir(filp,dirent,filldir, + tid_base_stuff,ARRAY_SIZE(tid_base_stuff)); +} + +static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd){ + return proc_pident_lookup(dir, dentry, + tid_base_stuff, ARRAY_SIZE(tid_base_stuff)); +} + +static struct file_operations proc_tid_base_operations = { + .read = generic_read_dir, + .readdir = proc_tid_base_readdir, +}; + +static struct inode_operations proc_tid_base_inode_operations = { + .lookup = proc_tid_base_lookup, + .getattr = pid_getattr, + .setattr = proc_setattr, +}; + +static struct dentry *proc_task_instantiate(struct inode *dir, + struct dentry *dentry, struct task_struct *task, void *ptr) +{ + struct dentry *error = ERR_PTR(-ENOENT); + struct inode *inode; + inode = proc_pid_make_inode(dir->i_sb, task); + + if (!inode) + goto out; + inode->i_mode = S_IFDIR|S_IRUGO|S_IXUGO; + inode->i_op = &proc_tid_base_inode_operations; + inode->i_fop = &proc_tid_base_operations; + inode->i_flags|=S_IMMUTABLE; + inode->i_nlink = 3; +#ifdef CONFIG_SECURITY + inode->i_nlink += 1; +#endif + + dentry->d_op = &pid_dentry_operations; + + d_add(dentry, inode); + /* Close the race of the process dying before we return the dentry */ + if (pid_revalidate(dentry, NULL)) + error = NULL; +out: + return error; +} + +static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) +{ + struct dentry *result = ERR_PTR(-ENOENT); + struct task_struct *task; + struct task_struct *leader = get_proc_task(dir); + unsigned tid; + + if (!leader) + goto out_no_task; + + tid = name_to_int(dentry); + if (tid == ~0U) + goto out; + + rcu_read_lock(); + task = find_task_by_pid(tid); + if (task) + get_task_struct(task); + rcu_read_unlock(); + if (!task) + goto out; + if (leader->tgid != task->tgid) + goto out_drop_task; + + result = proc_task_instantiate(dir, dentry, task, NULL); +out_drop_task: + put_task_struct(task); +out: + put_task_struct(leader); +out_no_task: + return result; +} + +/* * Find the first tid of a thread group to return to user space. * * Usually this is just the thread group leader, but if the users @@ -2318,10 +2217,18 @@ static struct task_struct *next_tid(struct task_struct *start) return pos; } +static int proc_task_fill_cache(struct file *filp, void *dirent, filldir_t filldir, + struct task_struct *task, int tid) +{ + char name[PROC_NUMBUF]; + int len = snprintf(name, sizeof(name), "%d", tid); + return proc_fill_cache(filp, dirent, filldir, name, len, + proc_task_instantiate, task, NULL); +} + /* for the /proc/TGID/task/ directories */ static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldir) { - char buf[PROC_NUMBUF]; struct dentry *dentry = filp->f_dentry; struct inode *inode = dentry->d_inode; struct task_struct *leader = get_proc_task(inode); @@ -2358,11 +2265,8 @@ static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldi for (task = first_tid(leader, tid, pos - 2); task; task = next_tid(task), pos++) { - int len; tid = task->pid; - len = snprintf(buf, sizeof(buf), "%d", tid); - ino = fake_ino(tid, PROC_TID_INO); - if (filldir(dirent, buf, len, pos, ino, DT_DIR < 0)) { + if (proc_task_fill_cache(filp, dirent, filldir, task, tid) < 0) { /* returning this tgid failed, save it as the first * pid for the next readir call */ filp->f_version = tid; @@ -2392,3 +2296,14 @@ static int proc_task_getattr(struct vfsmount *mnt, struct dentry *dentry, struct return 0; } + +static struct inode_operations proc_task_inode_operations = { + .lookup = proc_task_lookup, + .getattr = proc_task_getattr, + .setattr = proc_setattr, +}; + +static struct file_operations proc_task_operations = { + .read = generic_read_dir, + .readdir = proc_task_readdir, +}; diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index 66bc425f2f3..8d88e58ed5c 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -45,6 +45,7 @@ #include <linux/sysrq.h> #include <linux/vmalloc.h> #include <linux/crash_dump.h> +#include <linux/pspace.h> #include <asm/uaccess.h> #include <asm/pgtable.h> #include <asm/io.h> @@ -91,7 +92,7 @@ static int loadavg_read_proc(char *page, char **start, off_t off, LOAD_INT(a), LOAD_FRAC(a), LOAD_INT(b), LOAD_FRAC(b), LOAD_INT(c), LOAD_FRAC(c), - nr_running(), nr_threads, last_pid); + nr_running(), nr_threads, init_pspace.last_pid); return proc_calc_metrics(page, start, off, count, eof, len); } diff --git a/fs/proc/root.c b/fs/proc/root.c index 8901c65caca..ffe66c38488 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -16,6 +16,7 @@ #include <linux/module.h> #include <linux/bitops.h> #include <linux/smp_lock.h> +#include <linux/mount.h> #include "internal.h" @@ -28,6 +29,17 @@ struct proc_dir_entry *proc_sys_root; static int proc_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, struct vfsmount *mnt) { + if (proc_mnt) { + /* Seed the root directory with a pid so it doesn't need + * to be special in base.c. I would do this earlier but + * the only task alive when /proc is mounted the first time + * is the init_task and it doesn't have any pids. + */ + struct proc_inode *ei; + ei = PROC_I(proc_mnt->mnt_sb->s_root->d_inode); + if (!ei->pid) + ei->pid = find_get_pid(1); + } return get_sb_single(fs_type, flags, data, proc_fill_super, mnt); } diff --git a/include/asm-alpha/unistd.h b/include/asm-alpha/unistd.h index bc6e6a9259d..2cabbd465c0 100644 --- a/include/asm-alpha/unistd.h +++ b/include/asm-alpha/unistd.h @@ -580,75 +580,6 @@ type name (type1 arg1,type2 arg2,type3 arg3,type4 arg4,type5 arg5, type6 arg6)\ #define __ARCH_WANT_SYS_OLDUMOUNT #define __ARCH_WANT_SYS_SIGPENDING -#ifdef __KERNEL_SYSCALLS__ - -#include <linux/compiler.h> -#include <linux/types.h> -#include <linux/string.h> -#include <linux/signal.h> -#include <linux/syscalls.h> -#include <asm/ptrace.h> - -static inline long open(const char * name, int mode, int flags) -{ - return sys_open(name, mode, flags); -} - -static inline long dup(int fd) -{ - return sys_dup(fd); -} - -static inline long close(int fd) -{ - return sys_close(fd); -} - -static inline off_t lseek(int fd, off_t off, int whence) -{ - return sys_lseek(fd, off, whence); -} - -static inline void _exit(int value) -{ - sys_exit(value); -} - -#define exit(x) _exit(x) - -static inline long write(int fd, const char * buf, size_t nr) -{ - return sys_write(fd, buf, nr); -} - -static inline long read(int fd, char * buf, size_t nr) -{ - return sys_read(fd, buf, nr); -} - -extern int execve(char *, char **, char **); - -static inline long setsid(void) -{ - return sys_setsid(); -} - -static inline pid_t waitpid(int pid, int * wait_stat, int flags) -{ - return sys_wait4(pid, wait_stat, flags, NULL); -} - -asmlinkage int sys_execve(char *ufilename, char **argv, char **envp, - unsigned long a3, unsigned long a4, unsigned long a5, - struct pt_regs regs); -asmlinkage long sys_rt_sigaction(int sig, - const struct sigaction __user *act, - struct sigaction __user *oact, - size_t sigsetsize, - void *restorer); - -#endif /* __KERNEL_SYSCALLS__ */ - /* "Conditional" syscalls. What we want is __attribute__((weak,alias("sys_ni_syscall"))) diff --git a/include/asm-arm/unistd.h b/include/asm-arm/unistd.h index 2ab4078334b..14a87eec5a2 100644 --- a/include/asm-arm/unistd.h +++ b/include/asm-arm/unistd.h @@ -549,30 +549,6 @@ type name(type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5, type6 arg6 #define __ARCH_WANT_SYS_SOCKETCALL #endif -#ifdef __KERNEL_SYSCALLS__ - -#include <linux/compiler.h> -#include <linux/types.h> -#include <linux/syscalls.h> - -extern long execve(const char *file, char **argv, char **envp); - -struct pt_regs; -asmlinkage int sys_execve(char *filenamei, char **argv, char **envp, - struct pt_regs *regs); -asmlinkage int sys_clone(unsigned long clone_flags, unsigned long newsp, - struct pt_regs *regs); -asmlinkage int sys_fork(struct pt_regs *regs); -asmlinkage int sys_vfork(struct pt_regs *regs); -asmlinkage int sys_pipe(unsigned long *fildes); -struct sigaction; -asmlinkage long sys_rt_sigaction(int sig, - const struct sigaction __user *act, - struct sigaction __user *oact, - size_t sigsetsize); - -#endif /* __KERNEL_SYSCALLS__ */ - /* * "Conditional" syscalls * diff --git a/include/asm-arm26/unistd.h b/include/asm-arm26/unistd.h index c6d2436c9d3..25a5eead85b 100644 --- a/include/asm-arm26/unistd.h +++ b/include/asm-arm26/unistd.h @@ -464,30 +464,6 @@ type name(type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5, type6 arg6 #define __ARCH_WANT_SYS_SIGPROCMASK #define __ARCH_WANT_SYS_RT_SIGACTION -#ifdef __KERNEL_SYSCALLS__ - -#include <linux/compiler.h> -#include <linux/types.h> -#include <linux/syscalls.h> - -extern long execve(const char *file, char **argv, char **envp); - -struct pt_regs; -asmlinkage int sys_execve(char *filenamei, char **argv, char **envp, - struct pt_regs *regs); -asmlinkage int sys_clone(unsigned long clone_flags, unsigned long newsp, - struct pt_regs *regs); -asmlinkage int sys_fork(struct pt_regs *regs); -asmlinkage int sys_vfork(struct pt_regs *regs); -asmlinkage int sys_pipe(unsigned long *fildes); -struct sigaction; -asmlinkage long sys_rt_sigaction(int sig, - const struct sigaction __user *act, - struct sigaction __user *oact, - size_t sigsetsize); - -#endif /* __KERNEL_SYSCALLS__ */ - /* * "Conditional" syscalls * diff --git a/include/asm-avr32/unistd.h b/include/asm-avr32/unistd.h index 1f528f92690..a50e5004550 100644 --- a/include/asm-avr32/unistd.h +++ b/include/asm-avr32/unistd.h @@ -281,30 +281,10 @@ #define __NR_tee 263 #define __NR_vmsplice 264 +#ifdef __KERNEL__ #define NR_syscalls 265 -/* - * AVR32 calling convention for system calls: - * - System call number in r8 - * - Parameters in r12 and downwards to r9 as well as r6 and r5. - * - Return value in r12 - */ - -/* - * user-visible error numbers are in the range -1 - -124: see - * <asm-generic/errno.h> - */ - -#define __syscall_return(type, res) do { \ - if ((unsigned long)(res) >= (unsigned long)(-125)) { \ - errno = -(res); \ - res = -1; \ - } \ - return (type) (res); \ - } while (0) - -#ifdef __KERNEL__ #define __ARCH_WANT_IPC_PARSE_VERSION #define __ARCH_WANT_STAT64 #define __ARCH_WANT_SYS_ALARM @@ -319,62 +299,6 @@ #define __ARCH_WANT_SYS_GETPGRP #define __ARCH_WANT_SYS_RT_SIGACTION #define __ARCH_WANT_SYS_RT_SIGSUSPEND -#endif - -#if defined(__KERNEL_SYSCALLS__) || defined(__CHECKER__) - -#include <linux/types.h> -#include <linux/linkage.h> -#include <asm/signal.h> - -struct pt_regs; - -/* - * we need this inline - forking from kernel space will result - * in NO COPY ON WRITE (!!!), until an execve is executed. This - * is no problem, but for the stack. This is handled by not letting - * main() use the stack at all after fork(). Thus, no function - * calls - which means inline code for fork too, as otherwise we - * would use the stack upon exit from 'fork()'. - * - * Actually only pause and fork are needed inline, so that there - * won't be any messing with the stack from main(), but we define - * some others too. - */ -static inline int execve(const char *file, char **argv, char **envp) -{ - register long scno asm("r8") = __NR_execve; - register long sc1 asm("r12") = (long)file; - register long sc2 asm("r11") = (long)argv; - register long sc3 asm("r10") = (long)envp; - int res; - - asm volatile("scall" - : "=r"(sc1) - : "r"(scno), "0"(sc1), "r"(sc2), "r"(sc3) - : "lr", "memory"); - res = sc1; - __syscall_return(int, res); -} - -asmlinkage long sys_rt_sigsuspend(sigset_t __user *unewset, size_t sigsetsize); -asmlinkage int sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, - struct pt_regs *regs); -asmlinkage int sys_rt_sigreturn(struct pt_regs *regs); -asmlinkage int sys_pipe(unsigned long __user *filedes); -asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, off_t offset); -asmlinkage int sys_cacheflush(int operation, void __user *addr, size_t len); -asmlinkage int sys_fork(struct pt_regs *regs); -asmlinkage int sys_clone(unsigned long clone_flags, unsigned long newsp, - unsigned long parent_tidptr, - unsigned long child_tidptr, struct pt_regs *regs); -asmlinkage int sys_vfork(struct pt_regs *regs); -asmlinkage int sys_execve(char __user *ufilename, char __user *__user *uargv, - char __user *__user *uenvp, struct pt_regs *regs); - -#endif /* * "Conditional" syscalls @@ -384,4 +308,6 @@ asmlinkage int sys_execve(char __user *ufilename, char __user *__user *uargv, */ #define cond_syscall(x) asm(".weak\t" #x "\n\t.set\t" #x ",sys_ni_syscall"); +#endif /* __KERNEL__ */ + #endif /* __ASM_AVR32_UNISTD_H */ diff --git a/include/asm-cris/unistd.h b/include/asm-cris/unistd.h index 7372efae051..7c90fa970c3 100644 --- a/include/asm-cris/unistd.h +++ b/include/asm-cris/unistd.h @@ -322,67 +322,6 @@ #define __ARCH_WANT_SYS_SIGPROCMASK #define __ARCH_WANT_SYS_RT_SIGACTION -#ifdef __KERNEL_SYSCALLS__ - -#include <linux/compiler.h> -#include <linux/types.h> -#include <linux/linkage.h> - -/* - * we need this inline - forking from kernel space will result - * in NO COPY ON WRITE (!!!), until an execve is executed. This - * is no problem, but for the stack. This is handled by not letting - * main() use the stack at all after fork(). Thus, no function - * calls - which means inline code for fork too, as otherwise we - * would use the stack upon exit from 'fork()'. - * - * Actually only pause and fork are needed inline, so that there - * won't be any messing with the stack from main(), but we define - * some others too. - */ -#define __NR__exit __NR_exit -static inline _syscall0(pid_t,setsid) -static inline _syscall3(int,write,int,fd,const char *,buf,off_t,count) -static inline _syscall3(int,read,int,fd,char *,buf,off_t,count) -static inline _syscall3(off_t,lseek,int,fd,off_t,offset,int,count) -static inline _syscall1(int,dup,int,fd) -static inline _syscall3(int,execve,const char *,file,char **,argv,char **,envp) -static inline _syscall3(int,open,const char *,file,int,flag,int,mode) -static inline _syscall1(int,close,int,fd) - -struct pt_regs; -asmlinkage long sys_mmap2( - unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff); -asmlinkage int sys_execve(const char *fname, char **argv, char **envp, - long r13, long mof, long srp, struct pt_regs *regs); -asmlinkage int sys_clone(unsigned long newusp, unsigned long flags, - int* parent_tid, int* child_tid, long mof, long srp, - struct pt_regs *regs); -asmlinkage int sys_fork(long r10, long r11, long r12, long r13, - long mof, long srp, struct pt_regs *regs); -asmlinkage int sys_vfork(long r10, long r11, long r12, long r13, - long mof, long srp, struct pt_regs *regs); -asmlinkage int sys_pipe(unsigned long __user *fildes); -struct sigaction; -asmlinkage long sys_rt_sigaction(int sig, - const struct sigaction __user *act, - struct sigaction __user *oact, - size_t sigsetsize); - -/* - * Since we define it "external", it collides with the built-in - * definition, which has the "noreturn" attribute and will cause - * complaints. We don't want to use -fno-builtin, so just use a - * different name when in the kernel. - */ -#define _exit kernel_syscall_exit -static inline _syscall1(int,_exit,int,exitcode) -static inline _syscall3(pid_t,waitpid,pid_t,pid,int *,wait_stat,int,options) -#endif /* __KERNEL_SYSCALLS__ */ - - /* * "Conditional" syscalls * diff --git a/include/asm-frv/unistd.h b/include/asm-frv/unistd.h index d104d1b91d3..725e854928c 100644 --- a/include/asm-frv/unistd.h +++ b/include/asm-frv/unistd.h @@ -440,31 +440,6 @@ type name (type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5, type6 arg __syscall_return(type, __sc0); \ } - -#ifdef __KERNEL_SYSCALLS__ - -#include <linux/compiler.h> -#include <linux/types.h> -#include <linux/linkage.h> -#include <asm/ptrace.h> - -/* - * we need this inline - forking from kernel space will result - * in NO COPY ON WRITE (!!!), until an execve is executed. This - * is no problem, but for the stack. This is handled by not letting - * main() use the stack at all after fork(). Thus, no function - * calls - which means inline code for fork too, as otherwise we - * would use the stack upon exit from 'fork()'. - * - * Actually only pause and fork are needed inline, so that there - * won't be any messing with the stack from main(), but we define - * some others too. - */ -#define __NR__exit __NR_exit -static inline _syscall3(int,execve,const char *,file,char **,argv,char **,envp) - -#endif /* __KERNEL_SYSCALLS__ */ - #define __ARCH_WANT_IPC_PARSE_VERSION /* #define __ARCH_WANT_OLD_READDIR */ #define __ARCH_WANT_OLD_STAT diff --git a/include/asm-h8300/unistd.h b/include/asm-h8300/unistd.h index a2dd90462d8..747788d629a 100644 --- a/include/asm-h8300/unistd.h +++ b/include/asm-h8300/unistd.h @@ -485,57 +485,6 @@ type name(atype a, btype b, ctype c, dtype d, etype e, ftype f) \ #define __ARCH_WANT_SYS_SIGPROCMASK #define __ARCH_WANT_SYS_RT_SIGACTION -#ifdef __KERNEL_SYSCALLS__ - -#include <linux/compiler.h> -#include <linux/types.h> - -/* - * we need this inline - forking from kernel space will result - * in NO COPY ON WRITE (!!!), until an execve is executed. This - * is no problem, but for the stack. This is handled by not letting - * main() use the stack at all after fork(). Thus, no function - * calls - which means inline code for fork too, as otherwise we - * would use the stack upon exit from 'fork()'. - * - * Actually only pause and fork are needed inline, so that there - * won't be any messing with the stack from main(), but we define - * some others too. - */ -#define __NR__exit __NR_exit -static inline _syscall0(int,pause) -static inline _syscall0(int,sync) -static inline _syscall0(pid_t,setsid) -static inline _syscall3(int,write,int,fd,const char *,buf,off_t,count) -static inline _syscall3(int,read,int,fd,char *,buf,off_t,count) -static inline _syscall3(off_t,lseek,int,fd,off_t,offset,int,count) -static inline _syscall1(int,dup,int,fd) -static inline _syscall3(int,execve,const char *,file,char **,argv,char **,envp) -static inline _syscall3(int,open,const char *,file,int,flag,int,mode) -static inline _syscall1(int,close,int,fd) -static inline _syscall1(int,_exit,int,exitcode) -static inline _syscall3(pid_t,waitpid,pid_t,pid,int *,wait_stat,int,options) -static inline _syscall1(int,delete_module,const char *,name) - -static inline pid_t wait(int * wait_stat) -{ - return waitpid(-1,wait_stat,0); -} - -asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff); -asmlinkage int sys_execve(char *name, char **argv, char **envp, - int dummy, ...); -asmlinkage int sys_pipe(unsigned long *fildes); -struct sigaction; -asmlinkage long sys_rt_sigaction(int sig, - const struct sigaction __user *act, - struct sigaction __user *oact, - size_t sigsetsize); - -#endif /* __KERNEL_SYSCALLS__ */ - /* * "Conditional" syscalls */ diff --git a/include/asm-i386/bugs.h b/include/asm-i386/bugs.h index 2a9e4ee5904..592ffeeda45 100644 --- a/include/asm-i386/bugs.h +++ b/include/asm-i386/bugs.h @@ -189,6 +189,6 @@ static void __init check_bugs(void) check_fpu(); check_hlt(); check_popad(); - system_utsname.machine[1] = '0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86); + init_utsname()->machine[1] = '0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86); alternative_instructions(); } diff --git a/include/asm-i386/elf.h b/include/asm-i386/elf.h index db4344d9f73..3a05436f31c 100644 --- a/include/asm-i386/elf.h +++ b/include/asm-i386/elf.h @@ -112,7 +112,7 @@ typedef struct user_fxsr_struct elf_fpxregset_t; For the moment, we have only optimizations for the Intel generations, but that could change... */ -#define ELF_PLATFORM (system_utsname.machine) +#define ELF_PLATFORM (utsname()->machine) #define SET_PERSONALITY(ex, ibcs2) do { } while (0) diff --git a/include/asm-i386/ptrace.h b/include/asm-i386/ptrace.h index a4a0e5207db..d505f501077 100644 --- a/include/asm-i386/ptrace.h +++ b/include/asm-i386/ptrace.h @@ -47,7 +47,10 @@ static inline int user_mode_vm(struct pt_regs *regs) { return ((regs->xcs & SEGMENT_RPL_MASK) | (regs->eflags & VM_MASK)) >= USER_RPL; } + #define instruction_pointer(regs) ((regs)->eip) +#define regs_return_value(regs) ((regs)->eax) + extern unsigned long profile_pc(struct pt_regs *regs); #endif /* __KERNEL__ */ diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h index bd9987087ad..3ca7ab963d7 100644 --- a/include/asm-i386/unistd.h +++ b/include/asm-i386/unistd.h @@ -451,45 +451,6 @@ __syscall_return(type,__res); \ #define __ARCH_WANT_SYS_RT_SIGACTION #define __ARCH_WANT_SYS_RT_SIGSUSPEND -#ifdef __KERNEL_SYSCALLS__ - -#include <linux/compiler.h> -#include <linux/types.h> -#include <linux/linkage.h> -#include <asm/ptrace.h> - -/* - * we need this inline - forking from kernel space will result - * in NO COPY ON WRITE (!!!), until an execve is executed. This - * is no problem, but for the stack. This is handled by not letting - * main() use the stack at all after fork(). Thus, no function - * calls - which means inline code for fork too, as otherwise we - * would use the stack upon exit from 'fork()'. - * - * Actually only pause and fork are needed inline, so that there - * won't be any messing with the stack from main(), but we define - * some others too. - */ -static inline _syscall3(int,execve,const char *,file,char **,argv,char **,envp) - -asmlinkage int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount); -asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff); -asmlinkage int sys_execve(struct pt_regs regs); -asmlinkage int sys_clone(struct pt_regs regs); -asmlinkage int sys_fork(struct pt_regs regs); -asmlinkage int sys_vfork(struct pt_regs regs); -asmlinkage int sys_pipe(unsigned long __user *fildes); -asmlinkage long sys_iopl(unsigned long unused); -struct sigaction; -asmlinkage long sys_rt_sigaction(int sig, - const struct sigaction __user *act, - struct sigaction __user *oact, - size_t sigsetsize); - -#endif /* __KERNEL_SYSCALLS__ */ - /* * "Conditional" syscalls * diff --git a/include/asm-ia64/ptrace.h b/include/asm-ia64/ptrace.h index 1414316efd4..f4ef87a3623 100644 --- a/include/asm-ia64/ptrace.h +++ b/include/asm-ia64/ptrace.h @@ -241,6 +241,9 @@ struct switch_stack { * the canonical representation by adding to instruction pointer. */ # define instruction_pointer(regs) ((regs)->cr_iip + ia64_psr(regs)->ri) + +#define regs_return_value(regs) ((regs)->r8) + /* Conserve space in histogram by encoding slot bits in address * bits 2 and 3 rather than bits 0 and 1. */ diff --git a/include/asm-ia64/unistd.h b/include/asm-ia64/unistd.h index bb0eb727dcd..53c5c0ee122 100644 --- a/include/asm-ia64/unistd.h +++ b/include/asm-ia64/unistd.h @@ -319,78 +319,6 @@ extern long __ia64_syscall (long a0, long a1, long a2, long a3, long a4, long nr); -#ifdef __KERNEL_SYSCALLS__ - -#include <linux/compiler.h> -#include <linux/string.h> -#include <linux/signal.h> -#include <asm/ptrace.h> -#include <linux/stringify.h> -#include <linux/syscalls.h> - -static inline long -open (const char * name, int mode, int flags) -{ - return sys_open(name, mode, flags); -} - -static inline long -dup (int fd) -{ - return sys_dup(fd); -} - -static inline long -close (int fd) -{ - return sys_close(fd); -} - -static inline off_t -lseek (int fd, off_t off, int whence) -{ - return sys_lseek(fd, off, whence); -} - -static inline void -_exit (int value) -{ - sys_exit(value); -} - -#define exit(x) _exit(x) - -static inline long -write (int fd, const char * buf, size_t nr) -{ - return sys_write(fd, buf, nr); -} - -static inline long -read (int fd, char * buf, size_t nr) -{ - return sys_read(fd, buf, nr); -} - - -static inline long -setsid (void) -{ - return sys_setsid(); -} - -static inline pid_t -waitpid (int pid, int * wait_stat, int flags) -{ - return sys_wait4(pid, wait_stat, flags, NULL); -} - - -extern int execve (const char *filename, char *const av[], char *const ep[]); -extern pid_t clone (unsigned long flags, void *sp); - -#endif /* __KERNEL_SYSCALLS__ */ - asmlinkage unsigned long sys_mmap( unsigned long addr, unsigned long len, int prot, int flags, diff --git a/include/asm-m32r/unistd.h b/include/asm-m32r/unistd.h index 5c6a9ac6cf1..95aa34298d8 100644 --- a/include/asm-m32r/unistd.h +++ b/include/asm-m32r/unistd.h @@ -424,43 +424,6 @@ __syscall_return(type,__res); \ #define __ARCH_WANT_SYS_OLDUMOUNT #define __ARCH_WANT_SYS_RT_SIGACTION -#ifdef __KERNEL_SYSCALLS__ - -#include <linux/compiler.h> -#include <linux/types.h> -#include <linux/linkage.h> -#include <asm/ptrace.h> - -/* - * we need this inline - forking from kernel space will result - * in NO COPY ON WRITE (!!!), until an execve is executed. This - * is no problem, but for the stack. This is handled by not letting - * main() use the stack at all after fork(). Thus, no function - * calls - which means inline code for fork too, as otherwise we - * would use the stack upon exit from 'fork()'. - * - * Actually only pause and fork are needed inline, so that there - * won't be any messing with the stack from main(), but we define - * some others too. - */ -static __inline__ _syscall3(int,execve,const char *,file,char **,argv,char **,envp) - -asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff); -asmlinkage int sys_execve(struct pt_regs regs); -asmlinkage int sys_clone(struct pt_regs regs); -asmlinkage int sys_fork(struct pt_regs regs); -asmlinkage int sys_vfork(struct pt_regs regs); -asmlinkage int sys_pipe(unsigned long __user *fildes); -struct sigaction; -asmlinkage long sys_rt_sigaction(int sig, - const struct sigaction __user *act, - struct sigaction __user *oact, - size_t sigsetsize); - -#endif /* __KERNEL_SYSCALLS__ */ - /* * "Conditional" syscalls * diff --git a/include/asm-m68k/unistd.h b/include/asm-m68k/unistd.h index 751632b904d..3ab716f0fc1 100644 --- a/include/asm-m68k/unistd.h +++ b/include/asm-m68k/unistd.h @@ -409,12 +409,6 @@ __syscall_return(type,__res); \ #define __ARCH_WANT_SYS_SIGPROCMASK #define __ARCH_WANT_SYS_RT_SIGACTION -#ifdef __KERNEL_SYSCALLS__ - -static inline _syscall3(int,execve,const char *,file,char **,argv,char **,envp) - -#endif /* __KERNEL_SYSCALLS__ */ - /* * "Conditional" syscalls * diff --git a/include/asm-m68knommu/unistd.h b/include/asm-m68knommu/unistd.h index 21fdc37c5c2..daafb5d43ef 100644 --- a/include/asm-m68knommu/unistd.h +++ b/include/asm-m68knommu/unistd.h @@ -463,61 +463,6 @@ type name(atype a, btype b, ctype c, dtype d, etype e) \ #define __ARCH_WANT_SYS_SIGPROCMASK #define __ARCH_WANT_SYS_RT_SIGACTION -#ifdef __KERNEL_SYSCALLS__ - -#include <linux/compiler.h> -#include <linux/interrupt.h> -#include <linux/types.h> - -/* - * we need this inline - forking from kernel space will result - * in NO COPY ON WRITE (!!!), until an execve is executed. This - * is no problem, but for the stack. This is handled by not letting - * main() use the stack at all after fork(). Thus, no function - * calls - which means inline code for fork too, as otherwise we - * would use the stack upon exit from 'fork()'. - * - * Actually only pause and fork are needed inline, so that there - * won't be any messing with the stack from main(), but we define - * some others too. - */ -#define __NR__exit __NR_exit -static inline _syscall0(int,pause) -static inline _syscall0(int,sync) -static inline _syscall0(pid_t,setsid) -static inline _syscall3(int,write,int,fd,const char *,buf,off_t,count) -static inline _syscall3(int,read,int,fd,char *,buf,off_t,count) -static inline _syscall3(off_t,lseek,int,fd,off_t,offset,int,count) -static inline _syscall1(int,dup,int,fd) -static inline _syscall3(int,execve,const char *,file,char **,argv,char **,envp) -static inline _syscall3(int,open,const char *,file,int,flag,int,mode) -static inline _syscall1(int,close,int,fd) -static inline _syscall1(int,_exit,int,exitcode) -static inline _syscall3(pid_t,waitpid,pid_t,pid,int *,wait_stat,int,options) -static inline _syscall1(int,delete_module,const char *,name) - -static inline pid_t wait(int * wait_stat) -{ - return waitpid(-1,wait_stat,0); -} -asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff); -asmlinkage int sys_execve(char *name, char **argv, char **envp); -asmlinkage int sys_pipe(unsigned long *fildes); -struct pt_regs; -int sys_request_irq(unsigned int, - irqreturn_t (*)(int, void *, struct pt_regs *), - unsigned long, const char *, void *); -void sys_free_irq(unsigned int, void *); -struct sigaction; -asmlinkage long sys_rt_sigaction(int sig, - const struct sigaction __user *act, - struct sigaction __user *oact, - size_t sigsetsize); - -#endif /* __KERNEL_SYSCALLS__ */ - /* * "Conditional" syscalls * diff --git a/include/asm-mips/unistd.h b/include/asm-mips/unistd.h index c39142920fe..685c91467e6 100644 --- a/include/asm-mips/unistd.h +++ b/include/asm-mips/unistd.h @@ -1212,45 +1212,6 @@ type name (atype a,btype b,ctype c,dtype d,etype e,ftype f) \ # define __ARCH_WANT_COMPAT_SYS_TIME # endif -#ifdef __KERNEL_SYSCALLS__ - -#include <linux/compiler.h> -#include <linux/types.h> -#include <linux/linkage.h> -#include <asm/ptrace.h> -#include <asm/sim.h> - -/* - * we need this inline - forking from kernel space will result - * in NO COPY ON WRITE (!!!), until an execve is executed. This - * is no problem, but for the stack. This is handled by not letting - * main() use the stack at all after fork(). Thus, no function - * calls - which means inline code for fork too, as otherwise we - * would use the stack upon exit from 'fork()'. - * - * Actually only pause and fork are needed inline, so that there - * won't be any messing with the stack from main(), but we define - * some others too. - */ -static inline _syscall3(int,execve,const char *,file,char **,argv,char **,envp) - -asmlinkage unsigned long sys_mmap( - unsigned long addr, size_t len, - int prot, int flags, - int fd, off_t offset); -asmlinkage long sys_mmap2( - unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff); -asmlinkage int sys_execve(nabi_no_regargs struct pt_regs regs); -asmlinkage int sys_pipe(nabi_no_regargs struct pt_regs regs); -struct sigaction; -asmlinkage long sys_rt_sigaction(int sig, - const struct sigaction __user *act, - struct sigaction __user *oact, - size_t sigsetsize); - -#endif /* __KERNEL_SYSCALLS__ */ #endif /* !__ASSEMBLY__ */ /* diff --git a/include/asm-parisc/unistd.h b/include/asm-parisc/unistd.h index 27bcfad1c3e..53b0f5d290e 100644 --- a/include/asm-parisc/unistd.h +++ b/include/asm-parisc/unistd.h @@ -952,92 +952,6 @@ type name(type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5) \ #define __ARCH_WANT_SYS_SIGPROCMASK #define __ARCH_WANT_SYS_RT_SIGACTION -/* mmap & mmap2 take 6 arguments */ -#define _syscall6(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4,type5,arg5,type6,arg6) \ -type name(type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5, type6 arg6) \ -{ \ - return K_INLINE_SYSCALL(name, 6, arg1, arg2, arg3, arg4, arg5, arg6); \ -} - -#ifdef __KERNEL_SYSCALLS__ - -#include <asm/current.h> -#include <linux/compiler.h> -#include <linux/types.h> -#include <linux/syscalls.h> - -static inline pid_t setsid(void) -{ - return sys_setsid(); -} - -static inline int write(int fd, const char *buf, off_t count) -{ - return sys_write(fd, buf, count); -} - -static inline int read(int fd, char *buf, off_t count) -{ - return sys_read(fd, buf, count); -} - -static inline off_t lseek(int fd, off_t offset, int count) -{ - return sys_lseek(fd, offset, count); -} - -static inline int dup(int fd) -{ - return sys_dup(fd); -} - -static inline int execve(char *filename, char * argv [], - char * envp[]) -{ - extern int __execve(char *, char **, char **, struct task_struct *); - return __execve(filename, argv, envp, current); -} - -static inline int open(const char *file, int flag, int mode) -{ - return sys_open(file, flag, mode); -} - -static inline int close(int fd) -{ - return sys_close(fd); -} - -static inline void _exit(int exitcode) -{ - sys_exit(exitcode); -} - -static inline pid_t waitpid(pid_t pid, int *wait_stat, int options) -{ - return sys_wait4(pid, wait_stat, options, NULL); -} - -asmlinkage unsigned long sys_mmap(unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long offset); -asmlinkage unsigned long sys_mmap2(unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff); -struct pt_regs; -asmlinkage int sys_execve(struct pt_regs *regs); -int sys_clone(unsigned long clone_flags, unsigned long usp, - struct pt_regs *regs); -int sys_vfork(struct pt_regs *regs); -int sys_pipe(int *fildes); -struct sigaction; -asmlinkage long sys_rt_sigaction(int sig, - const struct sigaction __user *act, - struct sigaction __user *oact, - size_t sigsetsize); - -#endif /* __KERNEL_SYSCALLS__ */ - #endif /* __ASSEMBLY__ */ #undef STR diff --git a/include/asm-powerpc/kprobes.h b/include/asm-powerpc/kprobes.h index 34e1f89a5fa..2dafa376a63 100644 --- a/include/asm-powerpc/kprobes.h +++ b/include/asm-powerpc/kprobes.h @@ -44,6 +44,28 @@ typedef unsigned int kprobe_opcode_t; #define IS_TDI(instr) (((instr) & 0xfc000000) == 0x08000000) #define IS_TWI(instr) (((instr) & 0xfc000000) == 0x0c000000) +/* + * 64bit powerpc uses function descriptors. + * Handle cases where: + * - User passes a <.symbol> or <module:.symbol> + * - User passes a <symbol> or <module:symbol> + * - User passes a non-existant symbol, kallsyms_lookup_name + * returns 0. Don't deref the NULL pointer in that case + */ +#define kprobe_lookup_name(name, addr) \ +{ \ + addr = (kprobe_opcode_t *)kallsyms_lookup_name(name); \ + if (addr) { \ + char *colon; \ + if ((colon = strchr(name, ':')) != NULL) { \ + colon++; \ + if (*colon != '\0' && *colon != '.') \ + addr = *(kprobe_opcode_t **)addr; \ + } else if (name[0] != '.') \ + addr = *(kprobe_opcode_t **)addr; \ + } \ +} + #define JPROBE_ENTRY(pentry) (kprobe_opcode_t *)((func_descr_t *)pentry) #define is_trap(instr) (IS_TW(instr) || IS_TD(instr) || \ diff --git a/include/asm-powerpc/ptrace.h b/include/asm-powerpc/ptrace.h index 4435efe85d0..4ad77a13f86 100644 --- a/include/asm-powerpc/ptrace.h +++ b/include/asm-powerpc/ptrace.h @@ -73,6 +73,8 @@ struct pt_regs { #ifndef __ASSEMBLY__ #define instruction_pointer(regs) ((regs)->nip) +#define regs_return_value(regs) ((regs)->gpr[3]) + #ifdef CONFIG_SMP extern unsigned long profile_pc(struct pt_regs *regs); #else diff --git a/include/asm-powerpc/unistd.h b/include/asm-powerpc/unistd.h index eb66eae6616..464a48cce7f 100644 --- a/include/asm-powerpc/unistd.h +++ b/include/asm-powerpc/unistd.h @@ -479,13 +479,6 @@ type name(type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5, type6 arg6 #endif /* - * System call prototypes. - */ -#ifdef __KERNEL_SYSCALLS__ -extern int execve(const char *file, char **argv, char **envp); -#endif /* __KERNEL_SYSCALLS__ */ - -/* * "Conditional" syscalls * * What we want is __attribute__((weak,alias("sys_ni_syscall"))), diff --git a/include/asm-s390/ptrace.h b/include/asm-s390/ptrace.h index 8d2bf65b0b6..7b768c5c68a 100644 --- a/include/asm-s390/ptrace.h +++ b/include/asm-s390/ptrace.h @@ -472,6 +472,7 @@ struct user_regs_struct #define user_mode(regs) (((regs)->psw.mask & PSW_MASK_PSTATE) != 0) #define instruction_pointer(regs) ((regs)->psw.addr & PSW_ADDR_INSN) +#define regs_return_value(regs)((regs)->gprs[2]) #define profile_pc(regs) instruction_pointer(regs) extern void show_regs(struct pt_regs * regs); #endif diff --git a/include/asm-s390/unistd.h b/include/asm-s390/unistd.h index 0361ac5dcde..0cccfd83c45 100644 --- a/include/asm-s390/unistd.h +++ b/include/asm-s390/unistd.h @@ -523,57 +523,6 @@ type name(type1 arg1, type2 arg2, type3 arg3, type4 arg4, \ # define __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND # endif -#ifdef __KERNEL_SYSCALLS__ - -#include <linux/compiler.h> -#include <linux/types.h> -#include <asm/ptrace.h> -#include <asm/stat.h> -#include <linux/syscalls.h> - -/* - * we need this inline - forking from kernel space will result - * in NO COPY ON WRITE (!!!), until an execve is executed. This - * is no problem, but for the stack. This is handled by not letting - * main() use the stack at all after fork(). Thus, no function - * calls - which means inline code for fork too, as otherwise we - * would use the stack upon exit from 'fork()'. - * - * Actually only pause and fork are needed inline, so that there - * won't be any messing with the stack from main(), but we define - * some others too. - */ -#define __NR__exit __NR_exit -static inline _syscall0(pid_t,setsid) -static inline _syscall3(int,write,int,fd,const char *,buf,off_t,count) -static inline _syscall3(int,read,int,fd,char *,buf,off_t,count) -static inline _syscall3(off_t,lseek,int,fd,off_t,offset,int,count) -static inline _syscall1(int,dup,int,fd) -static inline _syscall3(int,execve,const char *,file,char **,argv,char **,envp) -static inline _syscall3(int,open,const char *,file,int,flag,int,mode) -static inline _syscall1(int,close,int,fd) -static inline _syscall2(long,stat,char *,filename,struct stat *,statbuf) - -static inline pid_t waitpid(int pid, int *wait_stat, int flags) -{ - return sys_wait4(pid, wait_stat, flags, NULL); -} -struct mmap_arg_struct; -asmlinkage long sys_mmap2(struct mmap_arg_struct __user *arg); - -asmlinkage long sys_execve(struct pt_regs regs); -asmlinkage long sys_clone(struct pt_regs regs); -asmlinkage long sys_fork(struct pt_regs regs); -asmlinkage long sys_vfork(struct pt_regs regs); -asmlinkage long sys_pipe(unsigned long __user *fildes); -struct sigaction; -asmlinkage long sys_rt_sigaction(int sig, - const struct sigaction __user *act, - struct sigaction __user *oact, - size_t sigsetsize); - -#endif /* __KERNEL_SYSCALLS__ */ - /* * "Conditional" syscalls * diff --git a/include/asm-sh/bugs.h b/include/asm-sh/bugs.h index b4000c8bf31..beeea40f549 100644 --- a/include/asm-sh/bugs.h +++ b/include/asm-sh/bugs.h @@ -18,7 +18,7 @@ static void __init check_bugs(void) { extern char *get_cpu_subtype(void); extern unsigned long loops_per_jiffy; - char *p= &system_utsname.machine[2]; /* "sh" */ + char *p= &init_utsname()->machine[2]; /* "sh" */ cpu_data->loops_per_jiffy = loops_per_jiffy; diff --git a/include/asm-sh/unistd.h b/include/asm-sh/unistd.h index 5d5e9f94def..f1a0cbc966b 100644 --- a/include/asm-sh/unistd.h +++ b/include/asm-sh/unistd.h @@ -472,76 +472,6 @@ __syscall_return(type,__sc0); \ #define __ARCH_WANT_SYS_RT_SIGACTION #define __ARCH_WANT_SYS_RT_SIGSUSPEND -#ifdef __KERNEL_SYSCALLS__ - -#include <linux/compiler.h> -#include <linux/types.h> -#include <linux/linkage.h> -#include <asm/ptrace.h> - -/* - * we need this inline - forking from kernel space will result - * in NO COPY ON WRITE (!!!), until an execve is executed. This - * is no problem, but for the stack. This is handled by not letting - * main() use the stack at all after fork(). Thus, no function - * calls - which means inline code for fork too, as otherwise we - * would use the stack upon exit from 'fork()'. - * - * Actually only pause and fork are needed inline, so that there - * won't be any messing with the stack from main(), but we define - * some others too. - */ -#define __NR__exit __NR_exit -static __inline__ _syscall0(int,pause) -static __inline__ _syscall0(int,sync) -static __inline__ _syscall0(pid_t,setsid) -static __inline__ _syscall3(int,write,int,fd,const char *,buf,off_t,count) -static __inline__ _syscall3(int,read,int,fd,char *,buf,off_t,count) -static __inline__ _syscall3(off_t,lseek,int,fd,off_t,offset,int,count) -static __inline__ _syscall1(int,dup,int,fd) -static __inline__ _syscall3(int,execve,const char *,file,char **,argv,char **,envp) -static __inline__ _syscall3(int,open,const char *,file,int,flag,int,mode) -static __inline__ _syscall1(int,close,int,fd) -static __inline__ _syscall3(pid_t,waitpid,pid_t,pid,int *,wait_stat,int,options) -static __inline__ _syscall1(int,delete_module,const char *,name) - -static __inline__ pid_t wait(int * wait_stat) -{ - return waitpid(-1,wait_stat,0); -} - -asmlinkage long sys_mmap2( - unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff); -asmlinkage int sys_execve(char *ufilename, char **uargv, - char **uenvp, unsigned long r7, - struct pt_regs regs); -asmlinkage int sys_clone(unsigned long clone_flags, unsigned long newsp, - unsigned long parent_tidptr, - unsigned long child_tidptr, - struct pt_regs regs); -asmlinkage int sys_fork(unsigned long r4, unsigned long r5, - unsigned long r6, unsigned long r7, - struct pt_regs regs); -asmlinkage int sys_vfork(unsigned long r4, unsigned long r5, - unsigned long r6, unsigned long r7, - struct pt_regs regs); -asmlinkage int sys_pipe(unsigned long r4, unsigned long r5, - unsigned long r6, unsigned long r7, - struct pt_regs regs); -asmlinkage ssize_t sys_pread_wrapper(unsigned int fd, char *buf, - size_t count, long dummy, loff_t pos); -asmlinkage ssize_t sys_pwrite_wrapper(unsigned int fd, const char *buf, - size_t count, long dummy, loff_t pos); -struct sigaction; -asmlinkage long sys_rt_sigaction(int sig, - const struct sigaction __user *act, - struct sigaction __user *oact, - size_t sigsetsize); - -#endif /* __KERNEL_SYSCALLS__ */ - /* * "Conditional" syscalls * diff --git a/include/asm-sh64/unistd.h b/include/asm-sh64/unistd.h index c113566bef3..ee7828b27ad 100644 --- a/include/asm-sh64/unistd.h +++ b/include/asm-sh64/unistd.h @@ -513,47 +513,6 @@ __syscall_return(type,__sc0); \ #define __ARCH_WANT_SYS_SIGPROCMASK #define __ARCH_WANT_SYS_RT_SIGACTION -#ifdef __KERNEL_SYSCALLS__ - -/* Copy from sh */ -#include <linux/compiler.h> -#include <linux/types.h> -#include <asm/ptrace.h> - -/* - * we need this inline - forking from kernel space will result - * in NO COPY ON WRITE (!!!), until an execve is executed. This - * is no problem, but for the stack. This is handled by not letting - * main() use the stack at all after fork(). Thus, no function - * calls - which means inline code for fork too, as otherwise we - * would use the stack upon exit from 'fork()'. - * - * Actually only pause and fork are needed inline, so that there - * won't be any messing with the stack from main(), but we define - * some others too. - */ -#define __NR__exit __NR_exit -static inline _syscall0(int,pause) -static inline _syscall1(int,setup,int,magic) -static inline _syscall0(int,sync) -static inline _syscall0(pid_t,setsid) -static inline _syscall3(int,write,int,fd,const char *,buf,off_t,count) -static inline _syscall3(int,read,int,fd,char *,buf,off_t,count) -static inline _syscall3(off_t,lseek,int,fd,off_t,offset,int,count) -static inline _syscall1(int,dup,int,fd) -static inline _syscall3(int,execve,const char *,file,char **,argv,char **,envp) -static inline _syscall3(int,open,const char *,file,int,flag,int,mode) -static inline _syscall1(int,close,int,fd) -static inline _syscall1(int,_exit,int,exitcode) -static inline _syscall3(pid_t,waitpid,pid_t,pid,int *,wait_stat,int,options) -static inline _syscall1(int,delete_module,const char *,name) - -static inline pid_t wait(int * wait_stat) -{ - return waitpid(-1,wait_stat,0); -} -#endif /* __KERNEL_SYSCALLS__ */ - /* * "Conditional" syscalls * diff --git a/include/asm-sparc/unistd.h b/include/asm-sparc/unistd.h index 2553762465c..c7a495afc82 100644 --- a/include/asm-sparc/unistd.h +++ b/include/asm-sparc/unistd.h @@ -478,53 +478,6 @@ return -1; \ #define __ARCH_WANT_SYS_SIGPROCMASK #define __ARCH_WANT_SYS_RT_SIGSUSPEND -#ifdef __KERNEL_SYSCALLS__ - -#include <linux/compiler.h> -#include <linux/types.h> - -/* - * we need this inline - forking from kernel space will result - * in NO COPY ON WRITE (!!!), until an execve is executed. This - * is no problem, but for the stack. This is handled by not letting - * main() use the stack at all after fork(). Thus, no function - * calls - which means inline code for fork too, as otherwise we - * would use the stack upon exit from 'fork()'. - * - * Actually only pause and fork are needed inline, so that there - * won't be any messing with the stack from main(), but we define - * some others too. - */ -#define __NR__exit __NR_exit -static __inline__ _syscall0(pid_t,setsid) -static __inline__ _syscall3(int,write,int,fd,__const__ char *,buf,off_t,count) -static __inline__ _syscall3(int,read,int,fd,char *,buf,off_t,count) -static __inline__ _syscall3(off_t,lseek,int,fd,off_t,offset,int,count) -static __inline__ _syscall1(int,dup,int,fd) -static __inline__ _syscall3(int,execve,__const__ char *,file,char **,argv,char **,envp) -static __inline__ _syscall3(int,open,__const__ char *,file,int,flag,int,mode) -static __inline__ _syscall1(int,close,int,fd) -static __inline__ _syscall3(pid_t,waitpid,pid_t,pid,int *,wait_stat,int,options) - -#include <linux/linkage.h> - -asmlinkage unsigned long sys_mmap( - unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long off); -asmlinkage unsigned long sys_mmap2( - unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff); -struct sigaction; -asmlinkage long sys_rt_sigaction(int sig, - const struct sigaction __user *act, - struct sigaction __user *oact, - void __user *restorer, - size_t sigsetsize); - -#endif /* __KERNEL_SYSCALLS__ */ - /* * "Conditional" syscalls * diff --git a/include/asm-sparc64/unistd.h b/include/asm-sparc64/unistd.h index badc73fdcb9..124cf076717 100644 --- a/include/asm-sparc64/unistd.h +++ b/include/asm-sparc64/unistd.h @@ -445,48 +445,6 @@ if (__res>=0) \ errno = -__res; \ return -1; \ } -#ifdef __KERNEL_SYSCALLS__ - -#include <linux/compiler.h> -#include <linux/types.h> - -/* - * we need this inline - forking from kernel space will result - * in NO COPY ON WRITE (!!!), until an execve is executed. This - * is no problem, but for the stack. This is handled by not letting - * main() use the stack at all after fork(). Thus, no function - * calls - which means inline code for fork too, as otherwise we - * would use the stack upon exit from 'fork()'. - * - * Actually only pause and fork are needed inline, so that there - * won't be any messing with the stack from main(), but we define - * some others too. - */ -#define __NR__exit __NR_exit -static __inline__ _syscall0(pid_t,setsid) -static __inline__ _syscall3(int,write,int,fd,__const__ char *,buf,off_t,count) -static __inline__ _syscall3(int,read,int,fd,char *,buf,off_t,count) -static __inline__ _syscall3(off_t,lseek,int,fd,off_t,offset,int,count) -static __inline__ _syscall1(int,dup,int,fd) -static __inline__ _syscall3(int,execve,__const__ char *,file,char **,argv,char **,envp) -static __inline__ _syscall3(int,open,__const__ char *,file,int,flag,int,mode) -static __inline__ _syscall1(int,close,int,fd) -static __inline__ _syscall3(pid_t,waitpid,pid_t,pid,int *,wait_stat,int,options) - -#include <linux/linkage.h> - -asmlinkage unsigned long sys_mmap( - unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long off); -struct sigaction; -asmlinkage long sys_rt_sigaction(int sig, - const struct sigaction __user *act, - struct sigaction __user *oact, - void __user *restorer, - size_t sigsetsize); - -#endif /* __KERNEL_SYSCALLS__ */ /* sysconf options, for SunOS compatibility */ #define _SC_ARG_MAX 1 diff --git a/include/asm-um/unistd.h b/include/asm-um/unistd.h index afccfcaa9ea..732c83f04c3 100644 --- a/include/asm-um/unistd.h +++ b/include/asm-um/unistd.h @@ -37,34 +37,6 @@ extern int um_execve(const char *file, char *const argv[], char *const env[]); #define __ARCH_WANT_SYS_RT_SIGSUSPEND #endif -#ifdef __KERNEL_SYSCALLS__ - -#include <linux/compiler.h> -#include <linux/types.h> - -static inline int execve(const char *filename, char *const argv[], - char *const envp[]) -{ - mm_segment_t fs; - int ret; - - fs = get_fs(); - set_fs(KERNEL_DS); - ret = um_execve(filename, argv, envp); - set_fs(fs); - - if (ret >= 0) - return ret; - - errno = -(long)ret; - return -1; -} - -int sys_execve(char *file, char **argv, char **env); - -#endif /* __KERNEL_SYSCALLS__ */ - -#undef __KERNEL_SYSCALLS__ #include "asm/arch/unistd.h" #endif /* _UM_UNISTD_H_*/ diff --git a/include/asm-v850/unistd.h b/include/asm-v850/unistd.h index 552b7c873a5..737401e7d3a 100644 --- a/include/asm-v850/unistd.h +++ b/include/asm-v850/unistd.h @@ -387,57 +387,6 @@ type name (atype a, btype b, ctype c, dtype d, etype e, ftype f) \ #define __ARCH_WANT_SYS_SIGPROCMASK #define __ARCH_WANT_SYS_RT_SIGACTION -#ifdef __KERNEL_SYSCALLS__ - -#include <linux/compiler.h> -#include <linux/types.h> - -/* - * we need this inline - forking from kernel space will result - * in NO COPY ON WRITE (!!!), until an execve is executed. This - * is no problem, but for the stack. This is handled by not letting - * main() use the stack at all after fork(). Thus, no function - * calls - which means inline code for fork too, as otherwise we - * would use the stack upon exit from 'fork()'. - * - * Actually only pause and fork are needed inline, so that there - * won't be any messing with the stack from main(), but we define - * some others too. - */ -#define __NR__exit __NR_exit -extern inline _syscall0(pid_t,setsid) -extern inline _syscall3(int,write,int,fd,const char *,buf,off_t,count) -extern inline _syscall3(int,read,int,fd,char *,buf,off_t,count) -extern inline _syscall3(off_t,lseek,int,fd,off_t,offset,int,count) -extern inline _syscall1(int,dup,int,fd) -extern inline _syscall3(int,execve,const char *,file,char **,argv,char **,envp) -extern inline _syscall3(int,open,const char *,file,int,flag,int,mode) -extern inline _syscall1(int,close,int,fd) -extern inline _syscall1(int,_exit,int,exitcode) -extern inline _syscall3(pid_t,waitpid,pid_t,pid,int *,wait_stat,int,options) - -extern inline pid_t wait(int * wait_stat) -{ - return waitpid (-1, wait_stat, 0); -} - -unsigned long sys_mmap(unsigned long addr, size_t len, - unsigned long prot, unsigned long flags, - unsigned long fd, off_t offset); -unsigned long sys_mmap2(unsigned long addr, size_t len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff); -struct pt_regs; -int sys_execve (char *name, char **argv, char **envp, struct pt_regs *regs); -int sys_pipe (int *fildes); -struct sigaction; -asmlinkage long sys_rt_sigaction(int sig, - const struct sigaction __user *act, - struct sigaction __user *oact, - size_t sigsetsize); - -#endif /* __KERNEL_SYSCALLS__ */ - /* * "Conditional" syscalls */ diff --git a/include/asm-x86_64/ptrace.h b/include/asm-x86_64/ptrace.h index ab827dc381d..5ea84dbb1e9 100644 --- a/include/asm-x86_64/ptrace.h +++ b/include/asm-x86_64/ptrace.h @@ -39,6 +39,8 @@ struct pt_regs { #define user_mode(regs) (!!((regs)->cs & 3)) #define user_mode_vm(regs) user_mode(regs) #define instruction_pointer(regs) ((regs)->rip) +#define regs_return_value(regs) ((regs)->rax) + extern unsigned long profile_pc(struct pt_regs *regs); void signal_fault(struct pt_regs *regs, void __user *frame, char *where); diff --git a/include/asm-x86_64/unistd.h b/include/asm-x86_64/unistd.h index 6137146516d..777288eb7e7 100644 --- a/include/asm-x86_64/unistd.h +++ b/include/asm-x86_64/unistd.h @@ -620,10 +620,11 @@ __SYSCALL(__NR_vmsplice, sys_vmsplice) #define __NR_move_pages 279 __SYSCALL(__NR_move_pages, sys_move_pages) -#ifdef __KERNEL__ - #define __NR_syscall_max __NR_move_pages + +#ifdef __KERNEL__ #include <linux/err.h> +#endif #ifndef __NO_STUBS @@ -663,8 +664,6 @@ do { \ #define __ARCH_WANT_SYS_TIME #define __ARCH_WANT_COMPAT_SYS_TIME -#ifndef __KERNEL_SYSCALLS__ - #define __syscall "syscall" #define _syscall0(type,name) \ @@ -746,83 +745,7 @@ __asm__ volatile ("movq %5,%%r10 ; movq %6,%%r8 ; movq %7,%%r9 ; " __syscall \ __syscall_return(type,__res); \ } -#else /* __KERNEL_SYSCALLS__ */ - -#include <linux/syscalls.h> -#include <asm/ptrace.h> - -/* - * we need this inline - forking from kernel space will result - * in NO COPY ON WRITE (!!!), until an execve is executed. This - * is no problem, but for the stack. This is handled by not letting - * main() use the stack at all after fork(). Thus, no function - * calls - which means inline code for fork too, as otherwise we - * would use the stack upon exit from 'fork()'. - * - * Actually only pause and fork are needed inline, so that there - * won't be any messing with the stack from main(), but we define - * some others too. - */ -#define __NR__exit __NR_exit - -static inline pid_t setsid(void) -{ - return sys_setsid(); -} - -static inline ssize_t write(unsigned int fd, char * buf, size_t count) -{ - return sys_write(fd, buf, count); -} - -static inline ssize_t read(unsigned int fd, char * buf, size_t count) -{ - return sys_read(fd, buf, count); -} - -static inline off_t lseek(unsigned int fd, off_t offset, unsigned int origin) -{ - return sys_lseek(fd, offset, origin); -} - -static inline long dup(unsigned int fd) -{ - return sys_dup(fd); -} - -/* implemented in asm in arch/x86_64/kernel/entry.S */ -extern int execve(const char *, char * const *, char * const *); - -static inline long open(const char * filename, int flags, int mode) -{ - return sys_open(filename, flags, mode); -} - -static inline long close(unsigned int fd) -{ - return sys_close(fd); -} - -static inline pid_t waitpid(int pid, int * wait_stat, int flags) -{ - return sys_wait4(pid, wait_stat, flags, NULL); -} - -extern long sys_mmap(unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long off); - -extern int sys_modify_ldt(int func, void *ptr, unsigned long bytecount); - -asmlinkage long sys_execve(char *name, char **argv, char **envp, - struct pt_regs regs); -asmlinkage long sys_clone(unsigned long clone_flags, unsigned long newsp, - void *parent_tid, void *child_tid, - struct pt_regs regs); -asmlinkage long sys_fork(struct pt_regs regs); -asmlinkage long sys_vfork(struct pt_regs regs); -asmlinkage long sys_pipe(int *fildes); - +#ifdef __KERNEL__ #ifndef __ASSEMBLY__ #include <linux/linkage.h> @@ -839,8 +762,8 @@ asmlinkage long sys_rt_sigaction(int sig, size_t sigsetsize); #endif /* __ASSEMBLY__ */ - -#endif /* __KERNEL_SYSCALLS__ */ +#endif /* __KERNEL__ */ +#endif /* __NO_STUBS */ /* * "Conditional" syscalls @@ -850,8 +773,4 @@ asmlinkage long sys_rt_sigaction(int sig, */ #define cond_syscall(x) asm(".weak\t" #x "\n\t.set\t" #x ",sys_ni_syscall") -#endif /* __NO_STUBS */ - -#endif /* __KERNEL__ */ - #endif /* _ASM_X86_64_UNISTD_H_ */ diff --git a/include/asm-xtensa/unistd.h b/include/asm-xtensa/unistd.h index 5e1b99dc4ab..411f810a55c 100644 --- a/include/asm-xtensa/unistd.h +++ b/include/asm-xtensa/unistd.h @@ -402,11 +402,6 @@ __asm__ __volatile__ ( \ __syscall_return(type,__res); \ } - -#ifdef __KERNEL_SYSCALLS__ -static __inline__ _syscall3(int,execve,const char*,file,char**,argv,char**,envp) -#endif - /* * "Conditional" syscalls * diff --git a/include/linux/compat.h b/include/linux/compat.h index 9760753e662..6f110957cc9 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -13,6 +13,7 @@ #include <asm/compat.h> #include <asm/siginfo.h> +#include <asm/signal.h> #define compat_jiffies_to_clock_t(x) \ (((unsigned long)(x) * COMPAT_USER_HZ) / HZ) diff --git a/include/linux/console_struct.h b/include/linux/console_struct.h index 25423f79bf9..ed6c0fee1ac 100644 --- a/include/linux/console_struct.h +++ b/include/linux/console_struct.h @@ -54,7 +54,7 @@ struct vc_data { struct tty_struct *vc_tty; /* TTY we are attached to */ /* data for manual vt switching */ struct vt_mode vt_mode; - int vt_pid; + struct pid *vt_pid; int vt_newvt; wait_queue_head_t paste_wait; /* mode flags */ diff --git a/include/linux/fs.h b/include/linux/fs.h index 2e29a2edaee..91c0b2a32a9 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -684,7 +684,8 @@ extern struct block_device *I_BDEV(struct inode *inode); struct fown_struct { rwlock_t lock; /* protects pid, uid, euid fields */ - int pid; /* pid or -pgrp where SIGIO should be sent */ + struct pid *pid; /* pid or -pgrp where SIGIO should be sent */ + enum pid_type pid_type; /* Kind of process group SIGIO should be sent to */ uid_t uid, euid; /* uid/euid of process setting the owner */ int signum; /* posix.1b rt signal to be delivered on IO */ }; @@ -880,8 +881,10 @@ extern void kill_fasync(struct fasync_struct **, int, int); /* only for net: no internal synchronization */ extern void __kill_fasync(struct fasync_struct *, int, int); +extern int __f_setown(struct file *filp, struct pid *, enum pid_type, int force); extern int f_setown(struct file *filp, unsigned long arg, int force); extern void f_delown(struct file *filp); +extern pid_t f_getown(struct file *filp); extern int send_sigurg(struct fown_struct *fown); /* diff --git a/include/linux/genalloc.h b/include/linux/genalloc.h index 690c42803d2..9869ef3674a 100644 --- a/include/linux/genalloc.h +++ b/include/linux/genalloc.h @@ -31,5 +31,6 @@ struct gen_pool_chunk { extern struct gen_pool *gen_pool_create(int, int); extern int gen_pool_add(struct gen_pool *, unsigned long, size_t, int); +extern void gen_pool_destroy(struct gen_pool *); extern unsigned long gen_pool_alloc(struct gen_pool *, size_t); extern void gen_pool_free(struct gen_pool *, unsigned long, size_t); diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 60aac2cea0c..33c5daacc74 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -4,7 +4,9 @@ #include <linux/file.h> #include <linux/rcupdate.h> #include <linux/irqflags.h> +#include <linux/utsname.h> #include <linux/lockdep.h> +#include <linux/ipc.h> #define INIT_FDTABLE \ { \ @@ -68,6 +70,15 @@ .session = 1, \ } +extern struct nsproxy init_nsproxy; +#define INIT_NSPROXY(nsproxy) { \ + .count = ATOMIC_INIT(1), \ + .nslock = SPIN_LOCK_UNLOCKED, \ + .uts_ns = &init_uts_ns, \ + .namespace = NULL, \ + INIT_IPC_NS(ipc_ns) \ +} + #define INIT_SIGHAND(sighand) { \ .count = ATOMIC_INIT(1), \ .action = { { { .sa_handler = NULL, } }, }, \ @@ -117,6 +128,7 @@ extern struct group_info init_groups; .files = &init_files, \ .signal = &init_signals, \ .sighand = &init_sighand, \ + .nsproxy = &init_nsproxy, \ .pending = { \ .list = LIST_HEAD_INIT(tsk.pending.list), \ .signal = {{0}}}, \ diff --git a/include/linux/ipc.h b/include/linux/ipc.h index b291189737e..d9e2b3f36c3 100644 --- a/include/linux/ipc.h +++ b/include/linux/ipc.h @@ -2,6 +2,7 @@ #define _LINUX_IPC_H #include <linux/types.h> +#include <linux/kref.h> #define IPC_PRIVATE ((__kernel_key_t) 0) @@ -68,6 +69,59 @@ struct kern_ipc_perm void *security; }; +struct ipc_ids; +struct ipc_namespace { + struct kref kref; + struct ipc_ids *ids[3]; + + int sem_ctls[4]; + int used_sems; + + int msg_ctlmax; + int msg_ctlmnb; + int msg_ctlmni; + + size_t shm_ctlmax; + size_t shm_ctlall; + int shm_ctlmni; + int shm_tot; +}; + +extern struct ipc_namespace init_ipc_ns; + +#ifdef CONFIG_SYSVIPC +#define INIT_IPC_NS(ns) .ns = &init_ipc_ns, +#else +#define INIT_IPC_NS(ns) +#endif + +#ifdef CONFIG_IPC_NS +extern void free_ipc_ns(struct kref *kref); +extern int copy_ipcs(unsigned long flags, struct task_struct *tsk); +extern int unshare_ipcs(unsigned long flags, struct ipc_namespace **ns); +#else +static inline int copy_ipcs(unsigned long flags, struct task_struct *tsk) +{ + return 0; +} +#endif + +static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns) +{ +#ifdef CONFIG_IPC_NS + if (ns) + kref_get(&ns->kref); +#endif + return ns; +} + +static inline void put_ipc_ns(struct ipc_namespace *ns) +{ +#ifdef CONFIG_IPC_NS + kref_put(&ns->kref, free_ipc_ns); +#endif +} + #endif /* __KERNEL__ */ #endif /* _LINUX_IPC_H */ diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 8bf6702da2a..ac4c0559f75 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -77,6 +77,12 @@ struct kprobe { /* location of the probe point */ kprobe_opcode_t *addr; + /* Allow user to indicate symbol name of the probe point */ + char *symbol_name; + + /* Offset into the symbol */ + unsigned int offset; + /* Called before addr is executed. */ kprobe_pre_handler_t pre_handler; @@ -196,7 +202,7 @@ void unregister_kretprobe(struct kretprobe *rp); struct kretprobe_instance *get_free_rp_inst(struct kretprobe *rp); void add_rp_inst(struct kretprobe_instance *ri); void kprobe_flush_task(struct task_struct *tk); -void recycle_rp_inst(struct kretprobe_instance *ri); +void recycle_rp_inst(struct kretprobe_instance *ri, struct hlist_head *head); #else /* CONFIG_KPROBES */ #define __kprobes /**/ diff --git a/include/linux/lockd/bind.h b/include/linux/lockd/bind.h index b054debef2e..81e3a185f95 100644 --- a/include/linux/lockd/bind.h +++ b/include/linux/lockd/bind.h @@ -30,7 +30,7 @@ extern struct nlmsvc_binding * nlmsvc_ops; * Functions exported by the lockd module */ extern int nlmclnt_proc(struct inode *, int, struct file_lock *); -extern int lockd_up(void); +extern int lockd_up(int proto); extern void lockd_down(void); #endif /* LINUX_LOCKD_BIND_H */ diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index 0d92c468d55..47b7dbd647a 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -80,7 +80,7 @@ struct nlm_wait; /* * Memory chunk for NLM client RPC request. */ -#define NLMCLNT_OHSIZE (sizeof(system_utsname.nodename)+10) +#define NLMCLNT_OHSIZE (sizeof(utsname()->nodename)+10) struct nlm_rqst { unsigned int a_flags; /* initial RPC task flags */ struct nlm_host * a_host; /* host handle */ diff --git a/include/linux/module.h b/include/linux/module.h index 2c599175c58..4b2d8091a41 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -320,6 +320,8 @@ struct module /* Am I GPL-compatible */ int license_gplok; + unsigned int taints; /* same bits as kernel:tainted */ + #ifdef CONFIG_MODULE_UNLOAD /* Reference counts */ struct module_ref ref[NR_CPUS]; diff --git a/include/linux/namespace.h b/include/linux/namespace.h index 3abc8e3b487..d137009f0b2 100644 --- a/include/linux/namespace.h +++ b/include/linux/namespace.h @@ -4,6 +4,7 @@ #include <linux/mount.h> #include <linux/sched.h> +#include <linux/nsproxy.h> struct namespace { atomic_t count; @@ -26,11 +27,8 @@ static inline void put_namespace(struct namespace *namespace) static inline void exit_namespace(struct task_struct *p) { - struct namespace *namespace = p->namespace; + struct namespace *namespace = p->nsproxy->namespace; if (namespace) { - task_lock(p); - p->namespace = NULL; - task_unlock(p); put_namespace(namespace); } } diff --git a/include/linux/nfsd/nfsd.h b/include/linux/nfsd/nfsd.h index 2dcad295fec..e1dbc86c270 100644 --- a/include/linux/nfsd/nfsd.h +++ b/include/linux/nfsd/nfsd.h @@ -140,6 +140,11 @@ struct posix_acl *nfsd_get_posix_acl(struct svc_fh *, int); int nfsd_set_posix_acl(struct svc_fh *, int, struct posix_acl *); #endif +enum vers_op {NFSD_SET, NFSD_CLEAR, NFSD_TEST, NFSD_AVAIL }; +int nfsd_vers(int vers, enum vers_op change); +void nfsd_reset_versions(void); +int nfsd_create_serv(void); + /* * NFSv4 State diff --git a/include/linux/nfsd/nfsfh.h b/include/linux/nfsd/nfsfh.h index 31a3cb617ce..069257ea99a 100644 --- a/include/linux/nfsd/nfsfh.h +++ b/include/linux/nfsd/nfsfh.h @@ -290,8 +290,9 @@ fill_post_wcc(struct svc_fh *fhp) * vfs.c:nfsd_rename as it needs to grab 2 i_mutex's at once * so, any changes here should be reflected there. */ + static inline void -fh_lock(struct svc_fh *fhp) +fh_lock_nested(struct svc_fh *fhp, unsigned int subclass) { struct dentry *dentry = fhp->fh_dentry; struct inode *inode; @@ -310,11 +311,17 @@ fh_lock(struct svc_fh *fhp) } inode = dentry->d_inode; - mutex_lock(&inode->i_mutex); + mutex_lock_nested(&inode->i_mutex, subclass); fill_pre_wcc(fhp); fhp->fh_locked = 1; } +static inline void +fh_lock(struct svc_fh *fhp) +{ + fh_lock_nested(fhp, I_MUTEX_NORMAL); +} + /* * Unlock a file handle/inode */ diff --git a/include/linux/nfsd/syscall.h b/include/linux/nfsd/syscall.h index dae0faea280..8bcddccb6c4 100644 --- a/include/linux/nfsd/syscall.h +++ b/include/linux/nfsd/syscall.h @@ -38,21 +38,6 @@ #define NFSCTL_GETFD 7 /* get an fh by path (used by mountd) */ #define NFSCTL_GETFS 8 /* get an fh by path with max FH len */ -/* - * Macros used to set version - */ -#define NFSCTL_VERSET(_cltbits, _v) ((_cltbits) |= (1 << (_v))) -#define NFSCTL_VERUNSET(_cltbits, _v) ((_cltbits) &= ~(1 << (_v))) -#define NFSCTL_VERISSET(_cltbits, _v) ((_cltbits) & (1 << (_v))) - -#if defined(CONFIG_NFSD_V4) -#define NFSCTL_VERALL (0x1c /* 0b011100 */) -#elif defined(CONFIG_NFSD_V3) -#define NFSCTL_VERALL (0x0c /* 0b001100 */) -#else -#define NFSCTL_VERALL (0x04 /* 0b000100 */) -#endif - /* SVC */ struct nfsctl_svc { unsigned short svc_port; @@ -134,8 +119,6 @@ extern int exp_delclient(struct nfsctl_client *ncp); extern int exp_export(struct nfsctl_export *nxp); extern int exp_unexport(struct nfsctl_export *nxp); -extern unsigned int nfsd_versbits; - #endif /* __KERNEL__ */ #endif /* NFSD_SYSCALL_H */ diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index 1a9ef3e627d..5dce5c21822 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -352,6 +352,7 @@ extern nodemask_t node_possible_map; #define node_possible(node) node_isset((node), node_possible_map) #define first_online_node first_node(node_online_map) #define next_online_node(nid) next_node((nid), node_online_map) +int highest_possible_node_id(void); #else #define num_online_nodes() 1 #define num_possible_nodes() 1 @@ -359,6 +360,7 @@ extern nodemask_t node_possible_map; #define node_possible(node) ((node) == 0) #define first_online_node 0 #define next_online_node(nid) (MAX_NUMNODES) +#define highest_possible_node_id() 0 #endif #define any_online_node(mask) \ diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h new file mode 100644 index 00000000000..f6baecdeecd --- /dev/null +++ b/include/linux/nsproxy.h @@ -0,0 +1,52 @@ +#ifndef _LINUX_NSPROXY_H +#define _LINUX_NSPROXY_H + +#include <linux/spinlock.h> +#include <linux/sched.h> + +struct namespace; +struct uts_namespace; +struct ipc_namespace; + +/* + * A structure to contain pointers to all per-process + * namespaces - fs (mount), uts, network, sysvipc, etc. + * + * 'count' is the number of tasks holding a reference. + * The count for each namespace, then, will be the number + * of nsproxies pointing to it, not the number of tasks. + * + * The nsproxy is shared by tasks which share all namespaces. + * As soon as a single namespace is cloned or unshared, the + * nsproxy is copied. + */ +struct nsproxy { + atomic_t count; + spinlock_t nslock; + struct uts_namespace *uts_ns; + struct ipc_namespace *ipc_ns; + struct namespace *namespace; +}; +extern struct nsproxy init_nsproxy; + +struct nsproxy *dup_namespaces(struct nsproxy *orig); +int copy_namespaces(int flags, struct task_struct *tsk); +void get_task_namespaces(struct task_struct *tsk); +void free_nsproxy(struct nsproxy *ns); + +static inline void put_nsproxy(struct nsproxy *ns) +{ + if (atomic_dec_and_test(&ns->count)) { + free_nsproxy(ns); + } +} + +static inline void exit_task_namespaces(struct task_struct *p) +{ + struct nsproxy *ns = p->nsproxy; + if (ns) { + put_nsproxy(ns); + p->nsproxy = NULL; + } +} +#endif diff --git a/include/linux/pid.h b/include/linux/pid.h index 93da7e2d9f3..17b9e04d358 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -68,6 +68,8 @@ extern struct task_struct *FASTCALL(pid_task(struct pid *pid, enum pid_type)); extern struct task_struct *FASTCALL(get_pid_task(struct pid *pid, enum pid_type)); +extern struct pid *get_task_pid(struct task_struct *task, enum pid_type type); + /* * attach_pid() and detach_pid() must be called with the tasklist_lock * write-held. @@ -89,33 +91,42 @@ extern struct pid *FASTCALL(find_pid(int nr)); * Lookup a PID in the hash table, and return with it's count elevated. */ extern struct pid *find_get_pid(int nr); +extern struct pid *find_ge_pid(int nr); extern struct pid *alloc_pid(void); extern void FASTCALL(free_pid(struct pid *pid)); -#define pid_next(task, type) \ - ((task)->pids[(type)].node.next) +static inline pid_t pid_nr(struct pid *pid) +{ + pid_t nr = 0; + if (pid) + nr = pid->nr; + return nr; +} + -#define pid_next_task(task, type) \ - hlist_entry(pid_next(task, type), struct task_struct, \ - pids[(type)].node) +#define do_each_task_pid(who, type, task) \ + do { \ + struct hlist_node *pos___; \ + struct pid *pid___ = find_pid(who); \ + if (pid___ != NULL) \ + hlist_for_each_entry_rcu((task), pos___, \ + &pid___->tasks[type], pids[type].node) { +#define while_each_task_pid(who, type, task) \ + } \ + } while (0) -/* We could use hlist_for_each_entry_rcu here but it takes more arguments - * than the do_each_task_pid/while_each_task_pid. So we roll our own - * to preserve the existing interface. - */ -#define do_each_task_pid(who, type, task) \ - if ((task = find_task_by_pid_type(type, who))) { \ - prefetch(pid_next(task, type)); \ - do { - -#define while_each_task_pid(who, type, task) \ - } while (pid_next(task, type) && ({ \ - task = pid_next_task(task, type); \ - rcu_dereference(task); \ - prefetch(pid_next(task, type)); \ - 1; }) ); \ - } + +#define do_each_pid_task(pid, type, task) \ + do { \ + struct hlist_node *pos___; \ + if (pid != NULL) \ + hlist_for_each_entry_rcu((task), pos___, \ + &pid->tasks[type], pids[type].node) { + +#define while_each_pid_task(pid, type, task) \ + } \ + } while (0) #endif /* _LINUX_PID_H */ diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index 57f70bc8b24..87dec8fe6de 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -244,13 +244,15 @@ static inline void kclist_add(struct kcore_list *new, void *addr, size_t size) extern void kclist_add(struct kcore_list *, void *, size_t); #endif +union proc_op { + int (*proc_get_link)(struct inode *, struct dentry **, struct vfsmount **); + int (*proc_read)(struct task_struct *task, char *page); +}; + struct proc_inode { struct pid *pid; int fd; - union { - int (*proc_get_link)(struct inode *, struct dentry **, struct vfsmount **); - int (*proc_read)(struct task_struct *task, char *page); - } op; + union proc_op op; struct proc_dir_entry *pde; struct inode vfs_inode; }; diff --git a/include/linux/pspace.h b/include/linux/pspace.h new file mode 100644 index 00000000000..91d48b8b2d9 --- /dev/null +++ b/include/linux/pspace.h @@ -0,0 +1,23 @@ +#ifndef _LINUX_PSPACE_H +#define _LINUX_PSPACE_H + +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/threads.h> +#include <linux/pid.h> + +struct pidmap { + atomic_t nr_free; + void *page; +}; + +#define PIDMAP_ENTRIES ((PID_MAX_LIMIT + 8*PAGE_SIZE - 1)/PAGE_SIZE/8) + +struct pspace { + struct pidmap pidmap[PIDMAP_ENTRIES]; + int last_pid; +}; + +extern struct pspace init_pspace; + +#endif /* _LINUX_PSPACE_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 7ef899c47c2..38530232d92 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -24,6 +24,8 @@ #define CLONE_UNTRACED 0x00800000 /* set if the tracing process can't force CLONE_PTRACE on this clone */ #define CLONE_CHILD_SETTID 0x01000000 /* set the TID in the child */ #define CLONE_STOPPED 0x02000000 /* Start in stopped state */ +#define CLONE_NEWUTS 0x04000000 /* New utsname group? */ +#define CLONE_NEWIPC 0x08000000 /* New ipcs */ /* * Scheduling policies @@ -118,7 +120,6 @@ extern unsigned long avenrun[]; /* Load averages */ extern unsigned long total_forks; extern int nr_threads; -extern int last_pid; DECLARE_PER_CPU(unsigned long, process_counts); extern int nr_processes(void); extern unsigned long nr_running(void); @@ -239,7 +240,7 @@ extern signed long schedule_timeout_interruptible(signed long timeout); extern signed long schedule_timeout_uninterruptible(signed long timeout); asmlinkage void schedule(void); -struct namespace; +struct nsproxy; /* Maximum number of active map areas.. This is a random (large) number */ #define DEFAULT_MAX_MAP_COUNT 65536 @@ -754,6 +755,7 @@ static inline void prefetch_stack(struct task_struct *t) { } struct audit_context; /* See audit.c */ struct mempolicy; struct pipe_inode_info; +struct uts_namespace; enum sleep_type { SLEEP_NORMAL, @@ -897,8 +899,8 @@ struct task_struct { struct fs_struct *fs; /* open file information */ struct files_struct *files; -/* namespace */ - struct namespace *namespace; +/* namespaces */ + struct nsproxy *nsproxy; /* signal handlers */ struct signal_struct *signal; struct sighand_struct *sighand; @@ -1020,6 +1022,26 @@ static inline pid_t process_group(struct task_struct *tsk) return tsk->signal->pgrp; } +static inline struct pid *task_pid(struct task_struct *task) +{ + return task->pids[PIDTYPE_PID].pid; +} + +static inline struct pid *task_tgid(struct task_struct *task) +{ + return task->group_leader->pids[PIDTYPE_PID].pid; +} + +static inline struct pid *task_pgrp(struct task_struct *task) +{ + return task->group_leader->pids[PIDTYPE_PGID].pid; +} + +static inline struct pid *task_session(struct task_struct *task) +{ + return task->group_leader->pids[PIDTYPE_SID].pid; +} + /** * pid_alive - check that a task structure is not stale * @p: Task structure to be checked. @@ -1043,6 +1065,8 @@ static inline int is_init(struct task_struct *tsk) return tsk->pid == 1; } +extern struct pid *cad_pid; + extern void free_task(struct task_struct *tsk); #define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0) @@ -1247,10 +1271,15 @@ extern int send_sig_info(int, struct siginfo *, struct task_struct *); extern int send_group_sig_info(int, struct siginfo *, struct task_struct *); extern int force_sigsegv(int, struct task_struct *); extern int force_sig_info(int, struct siginfo *, struct task_struct *); +extern int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp); +extern int kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp); +extern int kill_pid_info(int sig, struct siginfo *info, struct pid *pid); +extern int kill_pid_info_as_uid(int, struct siginfo *, struct pid *, uid_t, uid_t, u32); +extern int kill_pgrp(struct pid *pid, int sig, int priv); +extern int kill_pid(struct pid *pid, int sig, int priv); extern int __kill_pg_info(int sig, struct siginfo *info, pid_t pgrp); extern int kill_pg_info(int, struct siginfo *, pid_t); extern int kill_proc_info(int, struct siginfo *, pid_t); -extern int kill_proc_info_as_uid(int, struct siginfo *, pid_t, uid_t, uid_t, u32); extern void do_notify_parent(struct task_struct *, int); extern void force_sig(int, struct task_struct *); extern void force_sig_specific(int, struct task_struct *); @@ -1265,6 +1294,11 @@ extern int send_group_sigqueue(int, struct sigqueue *, struct task_struct *); extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *); extern int do_sigaltstack(const stack_t __user *, stack_t __user *, unsigned long); +static inline int kill_cad_pid(int sig, int priv) +{ + return kill_pid(cad_pid, sig, priv); +} + /* These can be the second arg to send_sig_info/send_group_sig_info. */ #define SEND_SIG_NOINFO ((struct siginfo *) 0) #define SEND_SIG_PRIV ((struct siginfo *) 1) @@ -1358,6 +1392,17 @@ extern void wait_task_inactive(struct task_struct * p); /* de_thread depends on thread_group_leader not being a pid based check */ #define thread_group_leader(p) (p == p->group_leader) +/* Do to the insanities of de_thread it is possible for a process + * to have the pid of the thread group leader without actually being + * the thread group leader. For iteration through the pids in proc + * all we care about is that we have a task with the appropriate + * pid, we don't actually care if we have the right task. + */ +static inline int has_group_leader_pid(struct task_struct *p) +{ + return p->pid == p->tgid; +} + static inline struct task_struct *next_thread(const struct task_struct *p) { return list_entry(rcu_dereference(p->thread_group.next), diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 73140ee5c63..4ebcdf91f3b 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -18,6 +18,30 @@ #include <linux/mm.h> /* + * This is the RPC server thread function prototype + */ +typedef void (*svc_thread_fn)(struct svc_rqst *); + +/* + * + * RPC service thread pool. + * + * Pool of threads and temporary sockets. Generally there is only + * a single one of these per RPC service, but on NUMA machines those + * services that can benefit from it (i.e. nfs but not lockd) will + * have one pool per NUMA node. This optimisation reduces cross- + * node traffic on multi-node NUMA NFS servers. + */ +struct svc_pool { + unsigned int sp_id; /* pool id; also node id on NUMA */ + spinlock_t sp_lock; /* protects all fields */ + struct list_head sp_threads; /* idle server threads */ + struct list_head sp_sockets; /* pending sockets */ + unsigned int sp_nrthreads; /* # of threads in pool */ + struct list_head sp_all_threads; /* all server threads */ +} ____cacheline_aligned_in_smp; + +/* * RPC service. * * An RPC service is a ``daemon,'' possibly multithreaded, which @@ -28,8 +52,6 @@ * We currently do not support more than one RPC program per daemon. */ struct svc_serv { - struct list_head sv_threads; /* idle server threads */ - struct list_head sv_sockets; /* pending sockets */ struct svc_program * sv_program; /* RPC program */ struct svc_stat * sv_stats; /* RPC statistics */ spinlock_t sv_lock; @@ -40,11 +62,36 @@ struct svc_serv { struct list_head sv_permsocks; /* all permanent sockets */ struct list_head sv_tempsocks; /* all temporary sockets */ int sv_tmpcnt; /* count of temporary sockets */ + struct timer_list sv_temptimer; /* timer for aging temporary sockets */ char * sv_name; /* service name */ + + unsigned int sv_nrpools; /* number of thread pools */ + struct svc_pool * sv_pools; /* array of thread pools */ + + void (*sv_shutdown)(struct svc_serv *serv); + /* Callback to use when last thread + * exits. + */ + + struct module * sv_module; /* optional module to count when + * adding threads */ + svc_thread_fn sv_function; /* main function for threads */ + int sv_kill_signal; /* signal to kill threads */ }; /* + * We use sv_nrthreads as a reference count. svc_destroy() drops + * this refcount, so we need to bump it up around operations that + * change the number of threads. Horrible, but there it is. + * Should be called with the BKL held. + */ +static inline void svc_get(struct svc_serv *serv) +{ + serv->sv_nrthreads++; +} + +/* * Maximum payload size supported by a kernel RPC server. * This is use to determine the max number of pages nfsd is * willing to return in a single READ operation. @@ -127,11 +174,13 @@ static inline void svc_putu32(struct kvec *iov, __be32 val) */ struct svc_rqst { struct list_head rq_list; /* idle list */ + struct list_head rq_all; /* all threads list */ struct svc_sock * rq_sock; /* socket */ struct sockaddr_in rq_addr; /* peer address */ int rq_addrlen; struct svc_serv * rq_server; /* RPC service definition */ + struct svc_pool * rq_pool; /* thread pool */ struct svc_procedure * rq_procinfo; /* procedure info */ struct auth_ops * rq_authop; /* authentication flavour */ struct svc_cred rq_cred; /* auth info */ @@ -180,6 +229,7 @@ struct svc_rqst { * to prevent encrypting page * cache pages */ wait_queue_head_t rq_wait; /* synchronization */ + struct task_struct *rq_task; /* service thread */ }; /* @@ -321,20 +371,21 @@ struct svc_procedure { }; /* - * This is the RPC server thread function prototype - */ -typedef void (*svc_thread_fn)(struct svc_rqst *); - -/* * Function prototypes. */ -struct svc_serv * svc_create(struct svc_program *, unsigned int); +struct svc_serv * svc_create(struct svc_program *, unsigned int, + void (*shutdown)(struct svc_serv*)); int svc_create_thread(svc_thread_fn, struct svc_serv *); void svc_exit_thread(struct svc_rqst *); +struct svc_serv * svc_create_pooled(struct svc_program *, unsigned int, + void (*shutdown)(struct svc_serv*), + svc_thread_fn, int sig, struct module *); +int svc_set_num_threads(struct svc_serv *, struct svc_pool *, int); void svc_destroy(struct svc_serv *); -int svc_process(struct svc_serv *, struct svc_rqst *); +int svc_process(struct svc_rqst *); int svc_register(struct svc_serv *, int, unsigned short); void svc_wake_up(struct svc_serv *); void svc_reserve(struct svc_rqst *rqstp, int space); +struct svc_pool * svc_pool_for_cpu(struct svc_serv *serv, int cpu); #endif /* SUNRPC_SVC_H */ diff --git a/include/linux/sunrpc/svcsock.h b/include/linux/sunrpc/svcsock.h index b4acb3d37c3..4c296152cbf 100644 --- a/include/linux/sunrpc/svcsock.h +++ b/include/linux/sunrpc/svcsock.h @@ -20,8 +20,9 @@ struct svc_sock { struct socket * sk_sock; /* berkeley socket layer */ struct sock * sk_sk; /* INET layer */ + struct svc_pool * sk_pool; /* current pool iff queued */ struct svc_serv * sk_server; /* service for this socket */ - unsigned int sk_inuse; /* use count */ + atomic_t sk_inuse; /* use count */ unsigned long sk_flags; #define SK_BUSY 0 /* enqueued/receiving */ #define SK_CONN 1 /* conn pending */ @@ -31,9 +32,12 @@ struct svc_sock { #define SK_DEAD 6 /* socket closed */ #define SK_CHNGBUF 7 /* need to change snd/rcv buffer sizes */ #define SK_DEFERRED 8 /* request on sk_deferred */ +#define SK_OLD 9 /* used for temp socket aging mark+sweep */ +#define SK_DETACHED 10 /* detached from tempsocks list */ - int sk_reserved; /* space on outq that is reserved */ + atomic_t sk_reserved; /* space on outq that is reserved */ + spinlock_t sk_defer_lock; /* protects sk_deferred */ struct list_head sk_deferred; /* deferred requests that need to * be revisted */ struct mutex sk_mutex; /* to serialize sending data */ @@ -57,9 +61,14 @@ struct svc_sock { */ int svc_makesock(struct svc_serv *, int, unsigned short); void svc_delete_socket(struct svc_sock *); -int svc_recv(struct svc_serv *, struct svc_rqst *, long); +int svc_recv(struct svc_rqst *, long); int svc_send(struct svc_rqst *); void svc_drop(struct svc_rqst *); void svc_sock_update_bufs(struct svc_serv *serv); +int svc_sock_names(char *buf, struct svc_serv *serv, char *toclose); +int svc_addsock(struct svc_serv *serv, + int fd, + char *name_return, + int *proto); #endif /* SUNRPC_SVCSOCK_H */ diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 2d1c3d5c83a..3efcfc7e9c6 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -599,4 +599,6 @@ asmlinkage long sys_set_robust_list(struct robust_list_head __user *head, size_t len); asmlinkage long sys_getcpu(unsigned __user *cpu, unsigned __user *node, struct getcpu_cache __user *cache); +int kernel_execve(const char *filename, char *const argv[], char *const envp[]); + #endif diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h index 58c961c9e17..5c8473bb688 100644 --- a/include/linux/tty_driver.h +++ b/include/linux/tty_driver.h @@ -219,7 +219,8 @@ extern struct list_head tty_drivers; struct tty_driver *alloc_tty_driver(int lines); void put_tty_driver(struct tty_driver *driver); -void tty_set_operations(struct tty_driver *driver, struct tty_operations *op); +void tty_set_operations(struct tty_driver *driver, + const struct tty_operations *op); /* tty driver magic number */ #define TTY_DRIVER_MAGIC 0x5402 diff --git a/include/linux/unistd.h b/include/linux/unistd.h index c18c60f3254..aa8d5b5e2e3 100644 --- a/include/linux/unistd.h +++ b/include/linux/unistd.h @@ -1,12 +1,8 @@ #ifndef _LINUX_UNISTD_H_ #define _LINUX_UNISTD_H_ -#ifdef __KERNEL__ -extern int errno; -#endif - /* - * Include machine specific syscallX macros + * Include machine specific syscall numbers */ #include <asm/unistd.h> diff --git a/include/linux/utsname.h b/include/linux/utsname.h index 13e1da0c538..02e4b697206 100644 --- a/include/linux/utsname.h +++ b/include/linux/utsname.h @@ -1,6 +1,11 @@ #ifndef _LINUX_UTSNAME_H #define _LINUX_UTSNAME_H +#include <linux/sched.h> +#include <linux/kref.h> +#include <linux/nsproxy.h> +#include <asm/atomic.h> + #define __OLD_UTS_LEN 8 struct oldold_utsname { @@ -30,7 +35,55 @@ struct new_utsname { char domainname[65]; }; -extern struct new_utsname system_utsname; +struct uts_namespace { + struct kref kref; + struct new_utsname name; +}; +extern struct uts_namespace init_uts_ns; + +static inline void get_uts_ns(struct uts_namespace *ns) +{ + kref_get(&ns->kref); +} + +#ifdef CONFIG_UTS_NS +extern int unshare_utsname(unsigned long unshare_flags, + struct uts_namespace **new_uts); +extern int copy_utsname(int flags, struct task_struct *tsk); +extern void free_uts_ns(struct kref *kref); + +static inline void put_uts_ns(struct uts_namespace *ns) +{ + kref_put(&ns->kref, free_uts_ns); +} +#else +static inline int unshare_utsname(unsigned long unshare_flags, + struct uts_namespace **new_uts) +{ + if (unshare_flags & CLONE_NEWUTS) + return -EINVAL; + + return 0; +} + +static inline int copy_utsname(int flags, struct task_struct *tsk) +{ + return 0; +} +static inline void put_uts_ns(struct uts_namespace *ns) +{ +} +#endif + +static inline struct new_utsname *utsname(void) +{ + return ¤t->nsproxy->uts_ns->name; +} + +static inline struct new_utsname *init_utsname(void) +{ + return &init_uts_ns.name; +} extern struct rw_semaphore uts_sem; #endif diff --git a/include/linux/vt_kern.h b/include/linux/vt_kern.h index 1009d3fe1fc..37a1a41f5b6 100644 --- a/include/linux/vt_kern.h +++ b/include/linux/vt_kern.h @@ -84,4 +84,11 @@ void reset_vc(struct vc_data *vc); extern char con_buf[CON_BUF_SIZE]; extern struct semaphore con_buf_sem; +struct vt_spawn_console { + spinlock_t lock; + struct pid *pid; + int sig; +}; +extern struct vt_spawn_console vt_spawn_con; + #endif /* _VT_KERN_H */ diff --git a/init/Kconfig b/init/Kconfig index f7a04d0daf0..10382931eea 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -115,6 +115,15 @@ config SYSVIPC section 6.4 of the Linux Programmer's Guide, available from <http://www.tldp.org/guides.html>. +config IPC_NS + bool "IPC Namespaces" + depends on SYSVIPC + default n + help + Support ipc namespaces. This allows containers, i.e. virtual + environments, to use ipc namespaces to provide different ipc + objects for different servers. If unsure, say N. + config POSIX_MQUEUE bool "POSIX Message Queues" depends on NET && EXPERIMENTAL @@ -182,6 +191,14 @@ config TASK_DELAY_ACCT Say N if unsure. +config UTS_NS + bool "UTS Namespaces" + default n + help + Support uts namespaces. This allows containers, i.e. + vservers, to use uts namespaces to provide different + uts info for different servers. If unsure, say N. + config AUDIT bool "Auditing support" depends on NET diff --git a/init/do_mounts_initrd.c b/init/do_mounts_initrd.c index a06f037fa00..919a80cb322 100644 --- a/init/do_mounts_initrd.c +++ b/init/do_mounts_initrd.c @@ -1,4 +1,3 @@ -#define __KERNEL_SYSCALLS__ #include <linux/unistd.h> #include <linux/kernel.h> #include <linux/fs.h> @@ -35,7 +34,7 @@ static int __init do_linuxrc(void * shell) (void) sys_open("/dev/console",O_RDWR,0); (void) sys_dup(0); (void) sys_dup(0); - return execve(shell, argv, envp_init); + return kernel_execve(shell, argv, envp_init); } static void __init handle_initrd(void) diff --git a/init/main.c b/init/main.c index 0766e69712b..ee123243fb5 100644 --- a/init/main.c +++ b/init/main.c @@ -9,8 +9,6 @@ * Simplified starting of init: Michael A. Griffith <grif@acm.org> */ -#define __KERNEL_SYSCALLS__ - #include <linux/types.h> #include <linux/module.h> #include <linux/proc_fs.h> @@ -703,7 +701,7 @@ static void do_pre_smp_initcalls(void) static void run_init_process(char *init_filename) { argv_init[0] = init_filename; - execve(init_filename, argv_init, envp_init); + kernel_execve(init_filename, argv_init, envp_init); } static int init(void * unused) @@ -723,6 +721,8 @@ static int init(void * unused) */ child_reaper = current; + cad_pid = task_pid(current); + smp_prepare_cpus(max_cpus); do_pre_smp_initcalls(); diff --git a/init/version.c b/init/version.c index e290802c6bd..8f28344d9c7 100644 --- a/init/version.c +++ b/init/version.c @@ -12,22 +12,27 @@ #include <linux/utsname.h> #include <linux/utsrelease.h> #include <linux/version.h> +#include <linux/sched.h> #define version(a) Version_ ## a #define version_string(a) version(a) int version_string(LINUX_VERSION_CODE); -struct new_utsname system_utsname = { - .sysname = UTS_SYSNAME, - .nodename = UTS_NODENAME, - .release = UTS_RELEASE, - .version = UTS_VERSION, - .machine = UTS_MACHINE, - .domainname = UTS_DOMAINNAME, +struct uts_namespace init_uts_ns = { + .kref = { + .refcount = ATOMIC_INIT(2), + }, + .name = { + .sysname = UTS_SYSNAME, + .nodename = UTS_NODENAME, + .release = UTS_RELEASE, + .version = UTS_VERSION, + .machine = UTS_MACHINE, + .domainname = UTS_DOMAINNAME, + }, }; - -EXPORT_SYMBOL(system_utsname); +EXPORT_SYMBOL_GPL(init_uts_ns); const char linux_banner[] = "Linux version " UTS_RELEASE " (" LINUX_COMPILE_BY "@" diff --git a/ipc/mqueue.c b/ipc/mqueue.c index d75d0ba8336..c45ae86cec3 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -73,7 +73,7 @@ struct mqueue_inode_info { struct mq_attr attr; struct sigevent notify; - pid_t notify_owner; + struct pid* notify_owner; struct user_struct *user; /* user who created, for accounting */ struct sock *notify_sock; struct sk_buff *notify_cookie; @@ -134,7 +134,7 @@ static struct inode *mqueue_get_inode(struct super_block *sb, int mode, INIT_LIST_HEAD(&info->e_wait_q[0].list); INIT_LIST_HEAD(&info->e_wait_q[1].list); info->messages = NULL; - info->notify_owner = 0; + info->notify_owner = NULL; info->qsize = 0; info->user = NULL; /* set when all is ok */ memset(&info->attr, 0, sizeof(info->attr)); @@ -338,7 +338,7 @@ static ssize_t mqueue_read_file(struct file *filp, char __user *u_data, (info->notify_owner && info->notify.sigev_notify == SIGEV_SIGNAL) ? info->notify.sigev_signo : 0, - info->notify_owner); + pid_nr(info->notify_owner)); spin_unlock(&info->lock); buffer[sizeof(buffer)-1] = '\0'; slen = strlen(buffer)+1; @@ -363,7 +363,7 @@ static int mqueue_flush_file(struct file *filp, fl_owner_t id) struct mqueue_inode_info *info = MQUEUE_I(filp->f_dentry->d_inode); spin_lock(&info->lock); - if (current->tgid == info->notify_owner) + if (task_tgid(current) == info->notify_owner) remove_notification(info); spin_unlock(&info->lock); @@ -518,8 +518,8 @@ static void __do_notify(struct mqueue_inode_info *info) sig_i.si_pid = current->tgid; sig_i.si_uid = current->uid; - kill_proc_info(info->notify.sigev_signo, - &sig_i, info->notify_owner); + kill_pid_info(info->notify.sigev_signo, + &sig_i, info->notify_owner); break; case SIGEV_THREAD: set_cookie(info->notify_cookie, NOTIFY_WOKENUP); @@ -528,7 +528,8 @@ static void __do_notify(struct mqueue_inode_info *info) break; } /* after notification unregisters process */ - info->notify_owner = 0; + put_pid(info->notify_owner); + info->notify_owner = NULL; } wake_up(&info->wait_q); } @@ -566,12 +567,13 @@ static long prepare_timeout(const struct timespec __user *u_arg) static void remove_notification(struct mqueue_inode_info *info) { - if (info->notify_owner != 0 && + if (info->notify_owner != NULL && info->notify.sigev_notify == SIGEV_THREAD) { set_cookie(info->notify_cookie, NOTIFY_REMOVED); netlink_sendskb(info->notify_sock, info->notify_cookie, 0); } - info->notify_owner = 0; + put_pid(info->notify_owner); + info->notify_owner = NULL; } static int mq_attr_ok(struct mq_attr *attr) @@ -1062,11 +1064,11 @@ retry: ret = 0; spin_lock(&info->lock); if (u_notification == NULL) { - if (info->notify_owner == current->tgid) { + if (info->notify_owner == task_tgid(current)) { remove_notification(info); inode->i_atime = inode->i_ctime = CURRENT_TIME; } - } else if (info->notify_owner != 0) { + } else if (info->notify_owner != NULL) { ret = -EBUSY; } else { switch (notification.sigev_notify) { @@ -1086,7 +1088,8 @@ retry: info->notify.sigev_notify = SIGEV_SIGNAL; break; } - info->notify_owner = current->tgid; + + info->notify_owner = get_pid(task_tgid(current)); inode->i_atime = inode->i_ctime = CURRENT_TIME; } spin_unlock(&info->lock); diff --git a/ipc/msg.c b/ipc/msg.c index 2b4fccf8ea5..5b213d95254 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -16,6 +16,10 @@ * * support for audit of ipc object properties and permission changes * Dustin Kirkland <dustin.kirkland@us.ibm.com> + * + * namespaces support + * OpenVZ, SWsoft Inc. + * Pavel Emelianov <xemul@openvz.org> */ #include <linux/capability.h> @@ -31,16 +35,12 @@ #include <linux/audit.h> #include <linux/seq_file.h> #include <linux/mutex.h> +#include <linux/nsproxy.h> #include <asm/current.h> #include <asm/uaccess.h> #include "util.h" -/* sysctl: */ -int msg_ctlmax = MSGMAX; -int msg_ctlmnb = MSGMNB; -int msg_ctlmni = MSGMNI; - /* * one msg_receiver structure for each sleeping receiver: */ @@ -69,30 +69,75 @@ struct msg_sender { static atomic_t msg_bytes = ATOMIC_INIT(0); static atomic_t msg_hdrs = ATOMIC_INIT(0); -static struct ipc_ids msg_ids; +static struct ipc_ids init_msg_ids; -#define msg_lock(id) ((struct msg_queue *)ipc_lock(&msg_ids, id)) -#define msg_unlock(msq) ipc_unlock(&(msq)->q_perm) -#define msg_rmid(id) ((struct msg_queue *)ipc_rmid(&msg_ids, id)) -#define msg_checkid(msq, msgid) ipc_checkid(&msg_ids, &msq->q_perm, msgid) -#define msg_buildid(id, seq) ipc_buildid(&msg_ids, id, seq) +#define msg_ids(ns) (*((ns)->ids[IPC_MSG_IDS])) -static void freeque(struct msg_queue *msq, int id); -static int newque(key_t key, int msgflg); +#define msg_lock(ns, id) ((struct msg_queue*)ipc_lock(&msg_ids(ns), id)) +#define msg_unlock(msq) ipc_unlock(&(msq)->q_perm) +#define msg_rmid(ns, id) ((struct msg_queue*)ipc_rmid(&msg_ids(ns), id)) +#define msg_checkid(ns, msq, msgid) \ + ipc_checkid(&msg_ids(ns), &msq->q_perm, msgid) +#define msg_buildid(ns, id, seq) \ + ipc_buildid(&msg_ids(ns), id, seq) + +static void freeque (struct ipc_namespace *ns, struct msg_queue *msq, int id); +static int newque (struct ipc_namespace *ns, key_t key, int msgflg); #ifdef CONFIG_PROC_FS static int sysvipc_msg_proc_show(struct seq_file *s, void *it); #endif +static void __ipc_init __msg_init_ns(struct ipc_namespace *ns, struct ipc_ids *ids) +{ + ns->ids[IPC_MSG_IDS] = ids; + ns->msg_ctlmax = MSGMAX; + ns->msg_ctlmnb = MSGMNB; + ns->msg_ctlmni = MSGMNI; + ipc_init_ids(ids, ns->msg_ctlmni); +} + +#ifdef CONFIG_IPC_NS +int msg_init_ns(struct ipc_namespace *ns) +{ + struct ipc_ids *ids; + + ids = kmalloc(sizeof(struct ipc_ids), GFP_KERNEL); + if (ids == NULL) + return -ENOMEM; + + __msg_init_ns(ns, ids); + return 0; +} + +void msg_exit_ns(struct ipc_namespace *ns) +{ + int i; + struct msg_queue *msq; + + mutex_lock(&msg_ids(ns).mutex); + for (i = 0; i <= msg_ids(ns).max_id; i++) { + msq = msg_lock(ns, i); + if (msq == NULL) + continue; + + freeque(ns, msq, i); + } + mutex_unlock(&msg_ids(ns).mutex); + + kfree(ns->ids[IPC_MSG_IDS]); + ns->ids[IPC_MSG_IDS] = NULL; +} +#endif + void __init msg_init(void) { - ipc_init_ids(&msg_ids, msg_ctlmni); + __msg_init_ns(&init_ipc_ns, &init_msg_ids); ipc_init_proc_interface("sysvipc/msg", " key msqid perms cbytes qnum lspid lrpid uid gid cuid cgid stime rtime ctime\n", - &msg_ids, - sysvipc_msg_proc_show); + IPC_MSG_IDS, sysvipc_msg_proc_show); } -static int newque(key_t key, int msgflg) +static int newque (struct ipc_namespace *ns, key_t key, int msgflg) { struct msg_queue *msq; int id, retval; @@ -111,18 +156,18 @@ static int newque(key_t key, int msgflg) return retval; } - id = ipc_addid(&msg_ids, &msq->q_perm, msg_ctlmni); + id = ipc_addid(&msg_ids(ns), &msq->q_perm, ns->msg_ctlmni); if (id == -1) { security_msg_queue_free(msq); ipc_rcu_putref(msq); return -ENOSPC; } - msq->q_id = msg_buildid(id, msq->q_perm.seq); + msq->q_id = msg_buildid(ns, id, msq->q_perm.seq); msq->q_stime = msq->q_rtime = 0; msq->q_ctime = get_seconds(); msq->q_cbytes = msq->q_qnum = 0; - msq->q_qbytes = msg_ctlmnb; + msq->q_qbytes = ns->msg_ctlmnb; msq->q_lspid = msq->q_lrpid = 0; INIT_LIST_HEAD(&msq->q_messages); INIT_LIST_HEAD(&msq->q_receivers); @@ -186,13 +231,13 @@ static void expunge_all(struct msg_queue *msq, int res) * msg_ids.mutex and the spinlock for this message queue is hold * before freeque() is called. msg_ids.mutex remains locked on exit. */ -static void freeque(struct msg_queue *msq, int id) +static void freeque(struct ipc_namespace *ns, struct msg_queue *msq, int id) { struct list_head *tmp; expunge_all(msq, -EIDRM); ss_wakeup(&msq->q_senders, 1); - msq = msg_rmid(id); + msq = msg_rmid(ns, id); msg_unlock(msq); tmp = msq->q_messages.next; @@ -212,24 +257,27 @@ asmlinkage long sys_msgget(key_t key, int msgflg) { struct msg_queue *msq; int id, ret = -EPERM; + struct ipc_namespace *ns; + + ns = current->nsproxy->ipc_ns; - mutex_lock(&msg_ids.mutex); + mutex_lock(&msg_ids(ns).mutex); if (key == IPC_PRIVATE) - ret = newque(key, msgflg); - else if ((id = ipc_findkey(&msg_ids, key)) == -1) { /* key not used */ + ret = newque(ns, key, msgflg); + else if ((id = ipc_findkey(&msg_ids(ns), key)) == -1) { /* key not used */ if (!(msgflg & IPC_CREAT)) ret = -ENOENT; else - ret = newque(key, msgflg); + ret = newque(ns, key, msgflg); } else if (msgflg & IPC_CREAT && msgflg & IPC_EXCL) { ret = -EEXIST; } else { - msq = msg_lock(id); + msq = msg_lock(ns, id); BUG_ON(msq == NULL); if (ipcperms(&msq->q_perm, msgflg)) ret = -EACCES; else { - int qid = msg_buildid(id, msq->q_perm.seq); + int qid = msg_buildid(ns, id, msq->q_perm.seq); ret = security_msg_queue_associate(msq, msgflg); if (!ret) @@ -237,7 +285,7 @@ asmlinkage long sys_msgget(key_t key, int msgflg) } msg_unlock(msq); } - mutex_unlock(&msg_ids.mutex); + mutex_unlock(&msg_ids(ns).mutex); return ret; } @@ -341,11 +389,13 @@ asmlinkage long sys_msgctl(int msqid, int cmd, struct msqid_ds __user *buf) struct msq_setbuf setbuf; struct msg_queue *msq; int err, version; + struct ipc_namespace *ns; if (msqid < 0 || cmd < 0) return -EINVAL; version = ipc_parse_version(&cmd); + ns = current->nsproxy->ipc_ns; switch (cmd) { case IPC_INFO: @@ -366,14 +416,14 @@ asmlinkage long sys_msgctl(int msqid, int cmd, struct msqid_ds __user *buf) return err; memset(&msginfo, 0, sizeof(msginfo)); - msginfo.msgmni = msg_ctlmni; - msginfo.msgmax = msg_ctlmax; - msginfo.msgmnb = msg_ctlmnb; + msginfo.msgmni = ns->msg_ctlmni; + msginfo.msgmax = ns->msg_ctlmax; + msginfo.msgmnb = ns->msg_ctlmnb; msginfo.msgssz = MSGSSZ; msginfo.msgseg = MSGSEG; - mutex_lock(&msg_ids.mutex); + mutex_lock(&msg_ids(ns).mutex); if (cmd == MSG_INFO) { - msginfo.msgpool = msg_ids.in_use; + msginfo.msgpool = msg_ids(ns).in_use; msginfo.msgmap = atomic_read(&msg_hdrs); msginfo.msgtql = atomic_read(&msg_bytes); } else { @@ -381,8 +431,8 @@ asmlinkage long sys_msgctl(int msqid, int cmd, struct msqid_ds __user *buf) msginfo.msgpool = MSGPOOL; msginfo.msgtql = MSGTQL; } - max_id = msg_ids.max_id; - mutex_unlock(&msg_ids.mutex); + max_id = msg_ids(ns).max_id; + mutex_unlock(&msg_ids(ns).mutex); if (copy_to_user(buf, &msginfo, sizeof(struct msginfo))) return -EFAULT; return (max_id < 0) ? 0 : max_id; @@ -395,20 +445,20 @@ asmlinkage long sys_msgctl(int msqid, int cmd, struct msqid_ds __user *buf) if (!buf) return -EFAULT; - if (cmd == MSG_STAT && msqid >= msg_ids.entries->size) + if (cmd == MSG_STAT && msqid >= msg_ids(ns).entries->size) return -EINVAL; memset(&tbuf, 0, sizeof(tbuf)); - msq = msg_lock(msqid); + msq = msg_lock(ns, msqid); if (msq == NULL) return -EINVAL; if (cmd == MSG_STAT) { - success_return = msg_buildid(msqid, msq->q_perm.seq); + success_return = msg_buildid(ns, msqid, msq->q_perm.seq); } else { err = -EIDRM; - if (msg_checkid(msq, msqid)) + if (msg_checkid(ns, msq, msqid)) goto out_unlock; success_return = 0; } @@ -446,14 +496,14 @@ asmlinkage long sys_msgctl(int msqid, int cmd, struct msqid_ds __user *buf) return -EINVAL; } - mutex_lock(&msg_ids.mutex); - msq = msg_lock(msqid); + mutex_lock(&msg_ids(ns).mutex); + msq = msg_lock(ns, msqid); err = -EINVAL; if (msq == NULL) goto out_up; err = -EIDRM; - if (msg_checkid(msq, msqid)) + if (msg_checkid(ns, msq, msqid)) goto out_unlock_up; ipcp = &msq->q_perm; @@ -481,7 +531,7 @@ asmlinkage long sys_msgctl(int msqid, int cmd, struct msqid_ds __user *buf) case IPC_SET: { err = -EPERM; - if (setbuf.qbytes > msg_ctlmnb && !capable(CAP_SYS_RESOURCE)) + if (setbuf.qbytes > ns->msg_ctlmnb && !capable(CAP_SYS_RESOURCE)) goto out_unlock_up; msq->q_qbytes = setbuf.qbytes; @@ -503,12 +553,12 @@ asmlinkage long sys_msgctl(int msqid, int cmd, struct msqid_ds __user *buf) break; } case IPC_RMID: - freeque(msq, msqid); + freeque(ns, msq, msqid); break; } err = 0; out_up: - mutex_unlock(&msg_ids.mutex); + mutex_unlock(&msg_ids(ns).mutex); return err; out_unlock_up: msg_unlock(msq); @@ -582,8 +632,11 @@ sys_msgsnd(int msqid, struct msgbuf __user *msgp, size_t msgsz, int msgflg) struct msg_msg *msg; long mtype; int err; + struct ipc_namespace *ns; + + ns = current->nsproxy->ipc_ns; - if (msgsz > msg_ctlmax || (long) msgsz < 0 || msqid < 0) + if (msgsz > ns->msg_ctlmax || (long) msgsz < 0 || msqid < 0) return -EINVAL; if (get_user(mtype, &msgp->mtype)) return -EFAULT; @@ -597,13 +650,13 @@ sys_msgsnd(int msqid, struct msgbuf __user *msgp, size_t msgsz, int msgflg) msg->m_type = mtype; msg->m_ts = msgsz; - msq = msg_lock(msqid); + msq = msg_lock(ns, msqid); err = -EINVAL; if (msq == NULL) goto out_free; err= -EIDRM; - if (msg_checkid(msq, msqid)) + if (msg_checkid(ns, msq, msqid)) goto out_unlock_free; for (;;) { @@ -694,17 +747,19 @@ asmlinkage long sys_msgrcv(int msqid, struct msgbuf __user *msgp, size_t msgsz, struct msg_queue *msq; struct msg_msg *msg; int mode; + struct ipc_namespace *ns; if (msqid < 0 || (long) msgsz < 0) return -EINVAL; mode = convert_mode(&msgtyp, msgflg); + ns = current->nsproxy->ipc_ns; - msq = msg_lock(msqid); + msq = msg_lock(ns, msqid); if (msq == NULL) return -EINVAL; msg = ERR_PTR(-EIDRM); - if (msg_checkid(msq, msqid)) + if (msg_checkid(ns, msq, msqid)) goto out_unlock; for (;;) { diff --git a/ipc/sem.c b/ipc/sem.c index 6013c751156..0dafcc455f9 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -64,6 +64,10 @@ * * support for audit of ipc object properties and permission changes * Dustin Kirkland <dustin.kirkland@us.ibm.com> + * + * namespaces support + * OpenVZ, SWsoft Inc. + * Pavel Emelianov <xemul@openvz.org> */ #include <linux/slab.h> @@ -78,22 +82,25 @@ #include <linux/capability.h> #include <linux/seq_file.h> #include <linux/mutex.h> +#include <linux/nsproxy.h> #include <asm/uaccess.h> #include "util.h" +#define sem_ids(ns) (*((ns)->ids[IPC_SEM_IDS])) + +#define sem_lock(ns, id) ((struct sem_array*)ipc_lock(&sem_ids(ns), id)) +#define sem_unlock(sma) ipc_unlock(&(sma)->sem_perm) +#define sem_rmid(ns, id) ((struct sem_array*)ipc_rmid(&sem_ids(ns), id)) +#define sem_checkid(ns, sma, semid) \ + ipc_checkid(&sem_ids(ns),&sma->sem_perm,semid) +#define sem_buildid(ns, id, seq) \ + ipc_buildid(&sem_ids(ns), id, seq) -#define sem_lock(id) ((struct sem_array*)ipc_lock(&sem_ids,id)) -#define sem_unlock(sma) ipc_unlock(&(sma)->sem_perm) -#define sem_rmid(id) ((struct sem_array*)ipc_rmid(&sem_ids,id)) -#define sem_checkid(sma, semid) \ - ipc_checkid(&sem_ids,&sma->sem_perm,semid) -#define sem_buildid(id, seq) \ - ipc_buildid(&sem_ids, id, seq) -static struct ipc_ids sem_ids; +static struct ipc_ids init_sem_ids; -static int newary (key_t, int, int); -static void freeary (struct sem_array *sma, int id); +static int newary(struct ipc_namespace *, key_t, int, int); +static void freeary(struct ipc_namespace *ns, struct sem_array *sma, int id); #ifdef CONFIG_PROC_FS static int sysvipc_sem_proc_show(struct seq_file *s, void *it); #endif @@ -110,22 +117,61 @@ static int sysvipc_sem_proc_show(struct seq_file *s, void *it); * */ -int sem_ctls[4] = {SEMMSL, SEMMNS, SEMOPM, SEMMNI}; -#define sc_semmsl (sem_ctls[0]) -#define sc_semmns (sem_ctls[1]) -#define sc_semopm (sem_ctls[2]) -#define sc_semmni (sem_ctls[3]) +#define sc_semmsl sem_ctls[0] +#define sc_semmns sem_ctls[1] +#define sc_semopm sem_ctls[2] +#define sc_semmni sem_ctls[3] -static int used_sems; +static void __ipc_init __sem_init_ns(struct ipc_namespace *ns, struct ipc_ids *ids) +{ + ns->ids[IPC_SEM_IDS] = ids; + ns->sc_semmsl = SEMMSL; + ns->sc_semmns = SEMMNS; + ns->sc_semopm = SEMOPM; + ns->sc_semmni = SEMMNI; + ns->used_sems = 0; + ipc_init_ids(ids, ns->sc_semmni); +} + +#ifdef CONFIG_IPC_NS +int sem_init_ns(struct ipc_namespace *ns) +{ + struct ipc_ids *ids; + + ids = kmalloc(sizeof(struct ipc_ids), GFP_KERNEL); + if (ids == NULL) + return -ENOMEM; + + __sem_init_ns(ns, ids); + return 0; +} + +void sem_exit_ns(struct ipc_namespace *ns) +{ + int i; + struct sem_array *sma; + + mutex_lock(&sem_ids(ns).mutex); + for (i = 0; i <= sem_ids(ns).max_id; i++) { + sma = sem_lock(ns, i); + if (sma == NULL) + continue; + + freeary(ns, sma, i); + } + mutex_unlock(&sem_ids(ns).mutex); + + kfree(ns->ids[IPC_SEM_IDS]); + ns->ids[IPC_SEM_IDS] = NULL; +} +#endif void __init sem_init (void) { - used_sems = 0; - ipc_init_ids(&sem_ids,sc_semmni); + __sem_init_ns(&init_ipc_ns, &init_sem_ids); ipc_init_proc_interface("sysvipc/sem", " key semid perms nsems uid gid cuid cgid otime ctime\n", - &sem_ids, - sysvipc_sem_proc_show); + IPC_SEM_IDS, sysvipc_sem_proc_show); } /* @@ -162,7 +208,7 @@ void __init sem_init (void) */ #define IN_WAKEUP 1 -static int newary (key_t key, int nsems, int semflg) +static int newary (struct ipc_namespace *ns, key_t key, int nsems, int semflg) { int id; int retval; @@ -171,7 +217,7 @@ static int newary (key_t key, int nsems, int semflg) if (!nsems) return -EINVAL; - if (used_sems + nsems > sc_semmns) + if (ns->used_sems + nsems > ns->sc_semmns) return -ENOSPC; size = sizeof (*sma) + nsems * sizeof (struct sem); @@ -191,15 +237,15 @@ static int newary (key_t key, int nsems, int semflg) return retval; } - id = ipc_addid(&sem_ids, &sma->sem_perm, sc_semmni); + id = ipc_addid(&sem_ids(ns), &sma->sem_perm, ns->sc_semmni); if(id == -1) { security_sem_free(sma); ipc_rcu_putref(sma); return -ENOSPC; } - used_sems += nsems; + ns->used_sems += nsems; - sma->sem_id = sem_buildid(id, sma->sem_perm.seq); + sma->sem_id = sem_buildid(ns, id, sma->sem_perm.seq); sma->sem_base = (struct sem *) &sma[1]; /* sma->sem_pending = NULL; */ sma->sem_pending_last = &sma->sem_pending; @@ -215,29 +261,32 @@ asmlinkage long sys_semget (key_t key, int nsems, int semflg) { int id, err = -EINVAL; struct sem_array *sma; + struct ipc_namespace *ns; - if (nsems < 0 || nsems > sc_semmsl) + ns = current->nsproxy->ipc_ns; + + if (nsems < 0 || nsems > ns->sc_semmsl) return -EINVAL; - mutex_lock(&sem_ids.mutex); + mutex_lock(&sem_ids(ns).mutex); if (key == IPC_PRIVATE) { - err = newary(key, nsems, semflg); - } else if ((id = ipc_findkey(&sem_ids, key)) == -1) { /* key not used */ + err = newary(ns, key, nsems, semflg); + } else if ((id = ipc_findkey(&sem_ids(ns), key)) == -1) { /* key not used */ if (!(semflg & IPC_CREAT)) err = -ENOENT; else - err = newary(key, nsems, semflg); + err = newary(ns, key, nsems, semflg); } else if (semflg & IPC_CREAT && semflg & IPC_EXCL) { err = -EEXIST; } else { - sma = sem_lock(id); + sma = sem_lock(ns, id); BUG_ON(sma==NULL); if (nsems > sma->sem_nsems) err = -EINVAL; else if (ipcperms(&sma->sem_perm, semflg)) err = -EACCES; else { - int semid = sem_buildid(id, sma->sem_perm.seq); + int semid = sem_buildid(ns, id, sma->sem_perm.seq); err = security_sem_associate(sma, semflg); if (!err) err = semid; @@ -245,7 +294,7 @@ asmlinkage long sys_semget (key_t key, int nsems, int semflg) sem_unlock(sma); } - mutex_unlock(&sem_ids.mutex); + mutex_unlock(&sem_ids(ns).mutex); return err; } @@ -444,7 +493,7 @@ static int count_semzcnt (struct sem_array * sma, ushort semnum) * the spinlock for this semaphore set hold. sem_ids.mutex remains locked * on exit. */ -static void freeary (struct sem_array *sma, int id) +static void freeary (struct ipc_namespace *ns, struct sem_array *sma, int id) { struct sem_undo *un; struct sem_queue *q; @@ -472,10 +521,10 @@ static void freeary (struct sem_array *sma, int id) } /* Remove the semaphore set from the ID array*/ - sma = sem_rmid(id); + sma = sem_rmid(ns, id); sem_unlock(sma); - used_sems -= sma->sem_nsems; + ns->used_sems -= sma->sem_nsems; size = sizeof (*sma) + sma->sem_nsems * sizeof (struct sem); security_sem_free(sma); ipc_rcu_putref(sma); @@ -503,7 +552,8 @@ static unsigned long copy_semid_to_user(void __user *buf, struct semid64_ds *in, } } -static int semctl_nolock(int semid, int semnum, int cmd, int version, union semun arg) +static int semctl_nolock(struct ipc_namespace *ns, int semid, int semnum, + int cmd, int version, union semun arg) { int err = -EINVAL; struct sem_array *sma; @@ -520,24 +570,24 @@ static int semctl_nolock(int semid, int semnum, int cmd, int version, union semu return err; memset(&seminfo,0,sizeof(seminfo)); - seminfo.semmni = sc_semmni; - seminfo.semmns = sc_semmns; - seminfo.semmsl = sc_semmsl; - seminfo.semopm = sc_semopm; + seminfo.semmni = ns->sc_semmni; + seminfo.semmns = ns->sc_semmns; + seminfo.semmsl = ns->sc_semmsl; + seminfo.semopm = ns->sc_semopm; seminfo.semvmx = SEMVMX; seminfo.semmnu = SEMMNU; seminfo.semmap = SEMMAP; seminfo.semume = SEMUME; - mutex_lock(&sem_ids.mutex); + mutex_lock(&sem_ids(ns).mutex); if (cmd == SEM_INFO) { - seminfo.semusz = sem_ids.in_use; - seminfo.semaem = used_sems; + seminfo.semusz = sem_ids(ns).in_use; + seminfo.semaem = ns->used_sems; } else { seminfo.semusz = SEMUSZ; seminfo.semaem = SEMAEM; } - max_id = sem_ids.max_id; - mutex_unlock(&sem_ids.mutex); + max_id = sem_ids(ns).max_id; + mutex_unlock(&sem_ids(ns).mutex); if (copy_to_user (arg.__buf, &seminfo, sizeof(struct seminfo))) return -EFAULT; return (max_id < 0) ? 0: max_id; @@ -547,12 +597,12 @@ static int semctl_nolock(int semid, int semnum, int cmd, int version, union semu struct semid64_ds tbuf; int id; - if(semid >= sem_ids.entries->size) + if(semid >= sem_ids(ns).entries->size) return -EINVAL; memset(&tbuf,0,sizeof(tbuf)); - sma = sem_lock(semid); + sma = sem_lock(ns, semid); if(sma == NULL) return -EINVAL; @@ -564,7 +614,7 @@ static int semctl_nolock(int semid, int semnum, int cmd, int version, union semu if (err) goto out_unlock; - id = sem_buildid(semid, sma->sem_perm.seq); + id = sem_buildid(ns, semid, sma->sem_perm.seq); kernel_to_ipc64_perm(&sma->sem_perm, &tbuf.sem_perm); tbuf.sem_otime = sma->sem_otime; @@ -584,7 +634,8 @@ out_unlock: return err; } -static int semctl_main(int semid, int semnum, int cmd, int version, union semun arg) +static int semctl_main(struct ipc_namespace *ns, int semid, int semnum, + int cmd, int version, union semun arg) { struct sem_array *sma; struct sem* curr; @@ -593,14 +644,14 @@ static int semctl_main(int semid, int semnum, int cmd, int version, union semun ushort* sem_io = fast_sem_io; int nsems; - sma = sem_lock(semid); + sma = sem_lock(ns, semid); if(sma==NULL) return -EINVAL; nsems = sma->sem_nsems; err=-EIDRM; - if (sem_checkid(sma,semid)) + if (sem_checkid(ns,sma,semid)) goto out_unlock; err = -EACCES; @@ -802,7 +853,8 @@ static inline unsigned long copy_semid_from_user(struct sem_setbuf *out, void __ } } -static int semctl_down(int semid, int semnum, int cmd, int version, union semun arg) +static int semctl_down(struct ipc_namespace *ns, int semid, int semnum, + int cmd, int version, union semun arg) { struct sem_array *sma; int err; @@ -813,11 +865,11 @@ static int semctl_down(int semid, int semnum, int cmd, int version, union semun if(copy_semid_from_user (&setbuf, arg.buf, version)) return -EFAULT; } - sma = sem_lock(semid); + sma = sem_lock(ns, semid); if(sma==NULL) return -EINVAL; - if (sem_checkid(sma,semid)) { + if (sem_checkid(ns,sma,semid)) { err=-EIDRM; goto out_unlock; } @@ -844,7 +896,7 @@ static int semctl_down(int semid, int semnum, int cmd, int version, union semun switch(cmd){ case IPC_RMID: - freeary(sma, semid); + freeary(ns, sma, semid); err = 0; break; case IPC_SET: @@ -872,17 +924,19 @@ asmlinkage long sys_semctl (int semid, int semnum, int cmd, union semun arg) { int err = -EINVAL; int version; + struct ipc_namespace *ns; if (semid < 0) return -EINVAL; version = ipc_parse_version(&cmd); + ns = current->nsproxy->ipc_ns; switch(cmd) { case IPC_INFO: case SEM_INFO: case SEM_STAT: - err = semctl_nolock(semid,semnum,cmd,version,arg); + err = semctl_nolock(ns,semid,semnum,cmd,version,arg); return err; case GETALL: case GETVAL: @@ -892,13 +946,13 @@ asmlinkage long sys_semctl (int semid, int semnum, int cmd, union semun arg) case IPC_STAT: case SETVAL: case SETALL: - err = semctl_main(semid,semnum,cmd,version,arg); + err = semctl_main(ns,semid,semnum,cmd,version,arg); return err; case IPC_RMID: case IPC_SET: - mutex_lock(&sem_ids.mutex); - err = semctl_down(semid,semnum,cmd,version,arg); - mutex_unlock(&sem_ids.mutex); + mutex_lock(&sem_ids(ns).mutex); + err = semctl_down(ns,semid,semnum,cmd,version,arg); + mutex_unlock(&sem_ids(ns).mutex); return err; default: return -EINVAL; @@ -949,15 +1003,12 @@ static inline void unlock_semundo(void) static inline int get_undo_list(struct sem_undo_list **undo_listp) { struct sem_undo_list *undo_list; - int size; undo_list = current->sysvsem.undo_list; if (!undo_list) { - size = sizeof(struct sem_undo_list); - undo_list = (struct sem_undo_list *) kmalloc(size, GFP_KERNEL); + undo_list = kzalloc(sizeof(*undo_list), GFP_KERNEL); if (undo_list == NULL) return -ENOMEM; - memset(undo_list, 0, size); spin_lock_init(&undo_list->lock); atomic_set(&undo_list->refcnt, 1); current->sysvsem.undo_list = undo_list; @@ -986,7 +1037,7 @@ static struct sem_undo *lookup_undo(struct sem_undo_list *ulp, int semid) return un; } -static struct sem_undo *find_undo(int semid) +static struct sem_undo *find_undo(struct ipc_namespace *ns, int semid) { struct sem_array *sma; struct sem_undo_list *ulp; @@ -1005,12 +1056,12 @@ static struct sem_undo *find_undo(int semid) goto out; /* no undo structure around - allocate one. */ - sma = sem_lock(semid); + sma = sem_lock(ns, semid); un = ERR_PTR(-EINVAL); if(sma==NULL) goto out; un = ERR_PTR(-EIDRM); - if (sem_checkid(sma,semid)) { + if (sem_checkid(ns,sma,semid)) { sem_unlock(sma); goto out; } @@ -1070,10 +1121,13 @@ asmlinkage long sys_semtimedop(int semid, struct sembuf __user *tsops, int undos = 0, alter = 0, max; struct sem_queue queue; unsigned long jiffies_left = 0; + struct ipc_namespace *ns; + + ns = current->nsproxy->ipc_ns; if (nsops < 1 || semid < 0) return -EINVAL; - if (nsops > sc_semopm) + if (nsops > ns->sc_semopm) return -E2BIG; if(nsops > SEMOPM_FAST) { sops = kmalloc(sizeof(*sops)*nsops,GFP_KERNEL); @@ -1109,7 +1163,7 @@ asmlinkage long sys_semtimedop(int semid, struct sembuf __user *tsops, retry_undos: if (undos) { - un = find_undo(semid); + un = find_undo(ns, semid); if (IS_ERR(un)) { error = PTR_ERR(un); goto out_free; @@ -1117,12 +1171,12 @@ retry_undos: } else un = NULL; - sma = sem_lock(semid); + sma = sem_lock(ns, semid); error=-EINVAL; if(sma==NULL) goto out_free; error = -EIDRM; - if (sem_checkid(sma,semid)) + if (sem_checkid(ns,sma,semid)) goto out_unlock_free; /* * semid identifies are not unique - find_undo may have @@ -1190,7 +1244,7 @@ retry_undos: goto out_free; } - sma = sem_lock(semid); + sma = sem_lock(ns, semid); if(sma==NULL) { BUG_ON(queue.prev != NULL); error = -EIDRM; @@ -1267,6 +1321,7 @@ void exit_sem(struct task_struct *tsk) { struct sem_undo_list *undo_list; struct sem_undo *u, **up; + struct ipc_namespace *ns; undo_list = tsk->sysvsem.undo_list; if (!undo_list) @@ -1275,6 +1330,7 @@ void exit_sem(struct task_struct *tsk) if (!atomic_dec_and_test(&undo_list->refcnt)) return; + ns = tsk->nsproxy->ipc_ns; /* There's no need to hold the semundo list lock, as current * is the last task exiting for this undo list. */ @@ -1288,14 +1344,14 @@ void exit_sem(struct task_struct *tsk) if(semid == -1) continue; - sma = sem_lock(semid); + sma = sem_lock(ns, semid); if (sma == NULL) continue; if (u->semid == -1) goto next_entry; - BUG_ON(sem_checkid(sma,u->semid)); + BUG_ON(sem_checkid(ns,sma,u->semid)); /* remove u from the sma->undo list */ for (unp = &sma->undo; (un = *unp); unp = &un->id_next) { diff --git a/ipc/shm.c b/ipc/shm.c index 940b0c9b13a..bfbd317ec11 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -15,6 +15,10 @@ * * support for audit of ipc object properties and permission changes * Dustin Kirkland <dustin.kirkland@us.ibm.com> + * + * namespaces support + * OpenVZ, SWsoft Inc. + * Pavel Emelianov <xemul@openvz.org> */ #include <linux/slab.h> @@ -32,6 +36,7 @@ #include <linux/ptrace.h> #include <linux/seq_file.h> #include <linux/mutex.h> +#include <linux/nsproxy.h> #include <asm/uaccess.h> @@ -40,59 +45,115 @@ static struct file_operations shm_file_operations; static struct vm_operations_struct shm_vm_ops; -static struct ipc_ids shm_ids; +static struct ipc_ids init_shm_ids; + +#define shm_ids(ns) (*((ns)->ids[IPC_SHM_IDS])) -#define shm_lock(id) ((struct shmid_kernel*)ipc_lock(&shm_ids,id)) -#define shm_unlock(shp) ipc_unlock(&(shp)->shm_perm) -#define shm_get(id) ((struct shmid_kernel*)ipc_get(&shm_ids,id)) -#define shm_buildid(id, seq) \ - ipc_buildid(&shm_ids, id, seq) +#define shm_lock(ns, id) \ + ((struct shmid_kernel*)ipc_lock(&shm_ids(ns),id)) +#define shm_unlock(shp) \ + ipc_unlock(&(shp)->shm_perm) +#define shm_get(ns, id) \ + ((struct shmid_kernel*)ipc_get(&shm_ids(ns),id)) +#define shm_buildid(ns, id, seq) \ + ipc_buildid(&shm_ids(ns), id, seq) -static int newseg (key_t key, int shmflg, size_t size); +static int newseg (struct ipc_namespace *ns, key_t key, + int shmflg, size_t size); static void shm_open (struct vm_area_struct *shmd); static void shm_close (struct vm_area_struct *shmd); +static void shm_destroy (struct ipc_namespace *ns, struct shmid_kernel *shp); #ifdef CONFIG_PROC_FS static int sysvipc_shm_proc_show(struct seq_file *s, void *it); #endif -size_t shm_ctlmax = SHMMAX; -size_t shm_ctlall = SHMALL; -int shm_ctlmni = SHMMNI; +static void __ipc_init __shm_init_ns(struct ipc_namespace *ns, struct ipc_ids *ids) +{ + ns->ids[IPC_SHM_IDS] = ids; + ns->shm_ctlmax = SHMMAX; + ns->shm_ctlall = SHMALL; + ns->shm_ctlmni = SHMMNI; + ns->shm_tot = 0; + ipc_init_ids(ids, 1); +} + +static void do_shm_rmid(struct ipc_namespace *ns, struct shmid_kernel *shp) +{ + if (shp->shm_nattch){ + shp->shm_perm.mode |= SHM_DEST; + /* Do not find it any more */ + shp->shm_perm.key = IPC_PRIVATE; + shm_unlock(shp); + } else + shm_destroy(ns, shp); +} + +#ifdef CONFIG_IPC_NS +int shm_init_ns(struct ipc_namespace *ns) +{ + struct ipc_ids *ids; + + ids = kmalloc(sizeof(struct ipc_ids), GFP_KERNEL); + if (ids == NULL) + return -ENOMEM; -static int shm_tot; /* total number of shared memory pages */ + __shm_init_ns(ns, ids); + return 0; +} + +void shm_exit_ns(struct ipc_namespace *ns) +{ + int i; + struct shmid_kernel *shp; + + mutex_lock(&shm_ids(ns).mutex); + for (i = 0; i <= shm_ids(ns).max_id; i++) { + shp = shm_lock(ns, i); + if (shp == NULL) + continue; + + do_shm_rmid(ns, shp); + } + mutex_unlock(&shm_ids(ns).mutex); + + kfree(ns->ids[IPC_SHM_IDS]); + ns->ids[IPC_SHM_IDS] = NULL; +} +#endif void __init shm_init (void) { - ipc_init_ids(&shm_ids, 1); + __shm_init_ns(&init_ipc_ns, &init_shm_ids); ipc_init_proc_interface("sysvipc/shm", " key shmid perms size cpid lpid nattch uid gid cuid cgid atime dtime ctime\n", - &shm_ids, - sysvipc_shm_proc_show); + IPC_SHM_IDS, sysvipc_shm_proc_show); } -static inline int shm_checkid(struct shmid_kernel *s, int id) +static inline int shm_checkid(struct ipc_namespace *ns, + struct shmid_kernel *s, int id) { - if (ipc_checkid(&shm_ids,&s->shm_perm,id)) + if (ipc_checkid(&shm_ids(ns), &s->shm_perm, id)) return -EIDRM; return 0; } -static inline struct shmid_kernel *shm_rmid(int id) +static inline struct shmid_kernel *shm_rmid(struct ipc_namespace *ns, int id) { - return (struct shmid_kernel *)ipc_rmid(&shm_ids,id); + return (struct shmid_kernel *)ipc_rmid(&shm_ids(ns), id); } -static inline int shm_addid(struct shmid_kernel *shp) +static inline int shm_addid(struct ipc_namespace *ns, struct shmid_kernel *shp) { - return ipc_addid(&shm_ids, &shp->shm_perm, shm_ctlmni); + return ipc_addid(&shm_ids(ns), &shp->shm_perm, ns->shm_ctlmni); } -static inline void shm_inc (int id) { +static inline void shm_inc(struct ipc_namespace *ns, int id) +{ struct shmid_kernel *shp; - shp = shm_lock(id); + shp = shm_lock(ns, id); BUG_ON(!shp); shp->shm_atim = get_seconds(); shp->shm_lprid = current->tgid; @@ -100,10 +161,13 @@ static inline void shm_inc (int id) { shm_unlock(shp); } +#define shm_file_ns(file) (*((struct ipc_namespace **)&(file)->private_data)) + /* This is called by fork, once for every shm attach. */ -static void shm_open (struct vm_area_struct *shmd) +static void shm_open(struct vm_area_struct *shmd) { - shm_inc (shmd->vm_file->f_dentry->d_inode->i_ino); + shm_inc(shm_file_ns(shmd->vm_file), + shmd->vm_file->f_dentry->d_inode->i_ino); } /* @@ -114,10 +178,10 @@ static void shm_open (struct vm_area_struct *shmd) * It has to be called with shp and shm_ids.mutex locked, * but returns with shp unlocked and freed. */ -static void shm_destroy (struct shmid_kernel *shp) +static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp) { - shm_tot -= (shp->shm_segsz + PAGE_SIZE - 1) >> PAGE_SHIFT; - shm_rmid (shp->id); + ns->shm_tot -= (shp->shm_segsz + PAGE_SIZE - 1) >> PAGE_SHIFT; + shm_rmid(ns, shp->id); shm_unlock(shp); if (!is_file_hugepages(shp->shm_file)) shmem_lock(shp->shm_file, 0, shp->mlock_user); @@ -140,20 +204,23 @@ static void shm_close (struct vm_area_struct *shmd) struct file * file = shmd->vm_file; int id = file->f_dentry->d_inode->i_ino; struct shmid_kernel *shp; + struct ipc_namespace *ns; - mutex_lock(&shm_ids.mutex); + ns = shm_file_ns(file); + + mutex_lock(&shm_ids(ns).mutex); /* remove from the list of attaches of the shm segment */ - shp = shm_lock(id); + shp = shm_lock(ns, id); BUG_ON(!shp); shp->shm_lprid = current->tgid; shp->shm_dtim = get_seconds(); shp->shm_nattch--; if(shp->shm_nattch == 0 && shp->shm_perm.mode & SHM_DEST) - shm_destroy (shp); + shm_destroy(ns, shp); else shm_unlock(shp); - mutex_unlock(&shm_ids.mutex); + mutex_unlock(&shm_ids(ns).mutex); } static int shm_mmap(struct file * file, struct vm_area_struct * vma) @@ -165,14 +232,25 @@ static int shm_mmap(struct file * file, struct vm_area_struct * vma) vma->vm_ops = &shm_vm_ops; if (!(vma->vm_flags & VM_WRITE)) vma->vm_flags &= ~VM_MAYWRITE; - shm_inc(file->f_dentry->d_inode->i_ino); + shm_inc(shm_file_ns(file), file->f_dentry->d_inode->i_ino); } return ret; } +static int shm_release(struct inode *ino, struct file *file) +{ + struct ipc_namespace *ns; + + ns = shm_file_ns(file); + put_ipc_ns(ns); + shm_file_ns(file) = NULL; + return 0; +} + static struct file_operations shm_file_operations = { - .mmap = shm_mmap, + .mmap = shm_mmap, + .release = shm_release, #ifndef CONFIG_MMU .get_unmapped_area = shmem_get_unmapped_area, #endif @@ -188,7 +266,7 @@ static struct vm_operations_struct shm_vm_ops = { #endif }; -static int newseg (key_t key, int shmflg, size_t size) +static int newseg (struct ipc_namespace *ns, key_t key, int shmflg, size_t size) { int error; struct shmid_kernel *shp; @@ -197,10 +275,10 @@ static int newseg (key_t key, int shmflg, size_t size) char name[13]; int id; - if (size < SHMMIN || size > shm_ctlmax) + if (size < SHMMIN || size > ns->shm_ctlmax) return -EINVAL; - if (shm_tot + numpages >= shm_ctlall) + if (ns->shm_tot + numpages >= ns->shm_ctlall) return -ENOSPC; shp = ipc_rcu_alloc(sizeof(*shp)); @@ -239,7 +317,7 @@ static int newseg (key_t key, int shmflg, size_t size) goto no_file; error = -ENOSPC; - id = shm_addid(shp); + id = shm_addid(ns, shp); if(id == -1) goto no_id; @@ -249,15 +327,17 @@ static int newseg (key_t key, int shmflg, size_t size) shp->shm_ctim = get_seconds(); shp->shm_segsz = size; shp->shm_nattch = 0; - shp->id = shm_buildid(id,shp->shm_perm.seq); + shp->id = shm_buildid(ns, id, shp->shm_perm.seq); shp->shm_file = file; file->f_dentry->d_inode->i_ino = shp->id; + shm_file_ns(file) = get_ipc_ns(ns); + /* Hugetlb ops would have already been assigned. */ if (!(shmflg & SHM_HUGETLB)) file->f_op = &shm_file_operations; - shm_tot += numpages; + ns->shm_tot += numpages; shm_unlock(shp); return shp->id; @@ -273,33 +353,36 @@ asmlinkage long sys_shmget (key_t key, size_t size, int shmflg) { struct shmid_kernel *shp; int err, id = 0; + struct ipc_namespace *ns; + + ns = current->nsproxy->ipc_ns; - mutex_lock(&shm_ids.mutex); + mutex_lock(&shm_ids(ns).mutex); if (key == IPC_PRIVATE) { - err = newseg(key, shmflg, size); - } else if ((id = ipc_findkey(&shm_ids, key)) == -1) { + err = newseg(ns, key, shmflg, size); + } else if ((id = ipc_findkey(&shm_ids(ns), key)) == -1) { if (!(shmflg & IPC_CREAT)) err = -ENOENT; else - err = newseg(key, shmflg, size); + err = newseg(ns, key, shmflg, size); } else if ((shmflg & IPC_CREAT) && (shmflg & IPC_EXCL)) { err = -EEXIST; } else { - shp = shm_lock(id); + shp = shm_lock(ns, id); BUG_ON(shp==NULL); if (shp->shm_segsz < size) err = -EINVAL; else if (ipcperms(&shp->shm_perm, shmflg)) err = -EACCES; else { - int shmid = shm_buildid(id, shp->shm_perm.seq); + int shmid = shm_buildid(ns, id, shp->shm_perm.seq); err = security_shm_associate(shp, shmflg); if (!err) err = shmid; } shm_unlock(shp); } - mutex_unlock(&shm_ids.mutex); + mutex_unlock(&shm_ids(ns).mutex); return err; } @@ -395,18 +478,19 @@ static inline unsigned long copy_shminfo_to_user(void __user *buf, struct shminf } } -static void shm_get_stat(unsigned long *rss, unsigned long *swp) +static void shm_get_stat(struct ipc_namespace *ns, unsigned long *rss, + unsigned long *swp) { int i; *rss = 0; *swp = 0; - for (i = 0; i <= shm_ids.max_id; i++) { + for (i = 0; i <= shm_ids(ns).max_id; i++) { struct shmid_kernel *shp; struct inode *inode; - shp = shm_get(i); + shp = shm_get(ns, i); if(!shp) continue; @@ -430,6 +514,7 @@ asmlinkage long sys_shmctl (int shmid, int cmd, struct shmid_ds __user *buf) struct shm_setbuf setbuf; struct shmid_kernel *shp; int err, version; + struct ipc_namespace *ns; if (cmd < 0 || shmid < 0) { err = -EINVAL; @@ -437,6 +522,7 @@ asmlinkage long sys_shmctl (int shmid, int cmd, struct shmid_ds __user *buf) } version = ipc_parse_version(&cmd); + ns = current->nsproxy->ipc_ns; switch (cmd) { /* replace with proc interface ? */ case IPC_INFO: @@ -448,15 +534,15 @@ asmlinkage long sys_shmctl (int shmid, int cmd, struct shmid_ds __user *buf) return err; memset(&shminfo,0,sizeof(shminfo)); - shminfo.shmmni = shminfo.shmseg = shm_ctlmni; - shminfo.shmmax = shm_ctlmax; - shminfo.shmall = shm_ctlall; + shminfo.shmmni = shminfo.shmseg = ns->shm_ctlmni; + shminfo.shmmax = ns->shm_ctlmax; + shminfo.shmall = ns->shm_ctlall; shminfo.shmmin = SHMMIN; if(copy_shminfo_to_user (buf, &shminfo, version)) return -EFAULT; /* reading a integer is always atomic */ - err= shm_ids.max_id; + err= shm_ids(ns).max_id; if(err<0) err = 0; goto out; @@ -470,14 +556,14 @@ asmlinkage long sys_shmctl (int shmid, int cmd, struct shmid_ds __user *buf) return err; memset(&shm_info,0,sizeof(shm_info)); - mutex_lock(&shm_ids.mutex); - shm_info.used_ids = shm_ids.in_use; - shm_get_stat (&shm_info.shm_rss, &shm_info.shm_swp); - shm_info.shm_tot = shm_tot; + mutex_lock(&shm_ids(ns).mutex); + shm_info.used_ids = shm_ids(ns).in_use; + shm_get_stat (ns, &shm_info.shm_rss, &shm_info.shm_swp); + shm_info.shm_tot = ns->shm_tot; shm_info.swap_attempts = 0; shm_info.swap_successes = 0; - err = shm_ids.max_id; - mutex_unlock(&shm_ids.mutex); + err = shm_ids(ns).max_id; + mutex_unlock(&shm_ids(ns).mutex); if(copy_to_user (buf, &shm_info, sizeof(shm_info))) { err = -EFAULT; goto out; @@ -492,17 +578,17 @@ asmlinkage long sys_shmctl (int shmid, int cmd, struct shmid_ds __user *buf) struct shmid64_ds tbuf; int result; memset(&tbuf, 0, sizeof(tbuf)); - shp = shm_lock(shmid); + shp = shm_lock(ns, shmid); if(shp==NULL) { err = -EINVAL; goto out; } else if(cmd==SHM_STAT) { err = -EINVAL; - if (shmid > shm_ids.max_id) + if (shmid > shm_ids(ns).max_id) goto out_unlock; - result = shm_buildid(shmid, shp->shm_perm.seq); + result = shm_buildid(ns, shmid, shp->shm_perm.seq); } else { - err = shm_checkid(shp,shmid); + err = shm_checkid(ns, shp,shmid); if(err) goto out_unlock; result = 0; @@ -534,12 +620,12 @@ asmlinkage long sys_shmctl (int shmid, int cmd, struct shmid_ds __user *buf) case SHM_LOCK: case SHM_UNLOCK: { - shp = shm_lock(shmid); + shp = shm_lock(ns, shmid); if(shp==NULL) { err = -EINVAL; goto out; } - err = shm_checkid(shp,shmid); + err = shm_checkid(ns, shp,shmid); if(err) goto out_unlock; @@ -590,12 +676,12 @@ asmlinkage long sys_shmctl (int shmid, int cmd, struct shmid_ds __user *buf) * Instead we set a destroyed flag, and then blow * the name away when the usage hits zero. */ - mutex_lock(&shm_ids.mutex); - shp = shm_lock(shmid); + mutex_lock(&shm_ids(ns).mutex); + shp = shm_lock(ns, shmid); err = -EINVAL; if (shp == NULL) goto out_up; - err = shm_checkid(shp, shmid); + err = shm_checkid(ns, shp, shmid); if(err) goto out_unlock_up; @@ -614,14 +700,8 @@ asmlinkage long sys_shmctl (int shmid, int cmd, struct shmid_ds __user *buf) if (err) goto out_unlock_up; - if (shp->shm_nattch){ - shp->shm_perm.mode |= SHM_DEST; - /* Do not find it any more */ - shp->shm_perm.key = IPC_PRIVATE; - shm_unlock(shp); - } else - shm_destroy (shp); - mutex_unlock(&shm_ids.mutex); + do_shm_rmid(ns, shp); + mutex_unlock(&shm_ids(ns).mutex); goto out; } @@ -631,12 +711,12 @@ asmlinkage long sys_shmctl (int shmid, int cmd, struct shmid_ds __user *buf) err = -EFAULT; goto out; } - mutex_lock(&shm_ids.mutex); - shp = shm_lock(shmid); + mutex_lock(&shm_ids(ns).mutex); + shp = shm_lock(ns, shmid); err=-EINVAL; if(shp==NULL) goto out_up; - err = shm_checkid(shp,shmid); + err = shm_checkid(ns, shp,shmid); if(err) goto out_unlock_up; err = audit_ipc_obj(&(shp->shm_perm)); @@ -673,7 +753,7 @@ asmlinkage long sys_shmctl (int shmid, int cmd, struct shmid_ds __user *buf) out_unlock_up: shm_unlock(shp); out_up: - mutex_unlock(&shm_ids.mutex); + mutex_unlock(&shm_ids(ns).mutex); goto out; out_unlock: shm_unlock(shp); @@ -699,6 +779,7 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr) unsigned long prot; int acc_mode; void *user_addr; + struct ipc_namespace *ns; if (shmid < 0) { err = -EINVAL; @@ -737,12 +818,13 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr) * We cannot rely on the fs check since SYSV IPC does have an * additional creator id... */ - shp = shm_lock(shmid); + ns = current->nsproxy->ipc_ns; + shp = shm_lock(ns, shmid); if(shp == NULL) { err = -EINVAL; goto out; } - err = shm_checkid(shp,shmid); + err = shm_checkid(ns, shp,shmid); if (err) { shm_unlock(shp); goto out; @@ -783,16 +865,16 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr) invalid: up_write(¤t->mm->mmap_sem); - mutex_lock(&shm_ids.mutex); - shp = shm_lock(shmid); + mutex_lock(&shm_ids(ns).mutex); + shp = shm_lock(ns, shmid); BUG_ON(!shp); shp->shm_nattch--; if(shp->shm_nattch == 0 && shp->shm_perm.mode & SHM_DEST) - shm_destroy (shp); + shm_destroy(ns, shp); else shm_unlock(shp); - mutex_unlock(&shm_ids.mutex); + mutex_unlock(&shm_ids(ns).mutex); *raddr = (unsigned long) user_addr; err = 0; diff --git a/ipc/util.c b/ipc/util.c index 67b6d178db6..42479e4eec5 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -12,6 +12,9 @@ * Mingming Cao <cmm@us.ibm.com> * Mar 2006 - support for audit of ipc object properties * Dustin Kirkland <dustin.kirkland@us.ibm.com> + * Jun 2006 - namespaces ssupport + * OpenVZ, SWsoft Inc. + * Pavel Emelianov <xemul@openvz.org> */ #include <linux/mm.h> @@ -29,6 +32,7 @@ #include <linux/seq_file.h> #include <linux/proc_fs.h> #include <linux/audit.h> +#include <linux/nsproxy.h> #include <asm/unistd.h> @@ -37,10 +41,111 @@ struct ipc_proc_iface { const char *path; const char *header; - struct ipc_ids *ids; + int ids; int (*show)(struct seq_file *, void *); }; +struct ipc_namespace init_ipc_ns = { + .kref = { + .refcount = ATOMIC_INIT(2), + }, +}; + +#ifdef CONFIG_IPC_NS +static struct ipc_namespace *clone_ipc_ns(struct ipc_namespace *old_ns) +{ + int err; + struct ipc_namespace *ns; + + err = -ENOMEM; + ns = kmalloc(sizeof(struct ipc_namespace), GFP_KERNEL); + if (ns == NULL) + goto err_mem; + + err = sem_init_ns(ns); + if (err) + goto err_sem; + err = msg_init_ns(ns); + if (err) + goto err_msg; + err = shm_init_ns(ns); + if (err) + goto err_shm; + + kref_init(&ns->kref); + return ns; + +err_shm: + msg_exit_ns(ns); +err_msg: + sem_exit_ns(ns); +err_sem: + kfree(ns); +err_mem: + return ERR_PTR(err); +} + +int unshare_ipcs(unsigned long unshare_flags, struct ipc_namespace **new_ipc) +{ + struct ipc_namespace *new; + + if (unshare_flags & CLONE_NEWIPC) { + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + new = clone_ipc_ns(current->nsproxy->ipc_ns); + if (IS_ERR(new)) + return PTR_ERR(new); + + *new_ipc = new; + } + + return 0; +} + +int copy_ipcs(unsigned long flags, struct task_struct *tsk) +{ + struct ipc_namespace *old_ns = tsk->nsproxy->ipc_ns; + struct ipc_namespace *new_ns; + int err = 0; + + if (!old_ns) + return 0; + + get_ipc_ns(old_ns); + + if (!(flags & CLONE_NEWIPC)) + return 0; + + if (!capable(CAP_SYS_ADMIN)) { + err = -EPERM; + goto out; + } + + new_ns = clone_ipc_ns(old_ns); + if (!new_ns) { + err = -ENOMEM; + goto out; + } + + tsk->nsproxy->ipc_ns = new_ns; +out: + put_ipc_ns(old_ns); + return err; +} + +void free_ipc_ns(struct kref *kref) +{ + struct ipc_namespace *ns; + + ns = container_of(kref, struct ipc_namespace, kref); + sem_exit_ns(ns); + msg_exit_ns(ns); + shm_exit_ns(ns); + kfree(ns); +} +#endif + /** * ipc_init - initialise IPC subsystem * @@ -67,7 +172,7 @@ __initcall(ipc_init); * array itself. */ -void __init ipc_init_ids(struct ipc_ids* ids, int size) +void __ipc_init ipc_init_ids(struct ipc_ids* ids, int size) { int i; @@ -110,8 +215,7 @@ static struct file_operations sysvipc_proc_fops; * @show: show routine. */ void __init ipc_init_proc_interface(const char *path, const char *header, - struct ipc_ids *ids, - int (*show)(struct seq_file *, void *)) + int ids, int (*show)(struct seq_file *, void *)) { struct proc_dir_entry *pde; struct ipc_proc_iface *iface; @@ -635,6 +739,9 @@ static void *sysvipc_proc_next(struct seq_file *s, void *it, loff_t *pos) struct ipc_proc_iface *iface = s->private; struct kern_ipc_perm *ipc = it; loff_t p; + struct ipc_ids *ids; + + ids = current->nsproxy->ipc_ns->ids[iface->ids]; /* If we had an ipc id locked before, unlock it */ if (ipc && ipc != SEQ_START_TOKEN) @@ -644,8 +751,8 @@ static void *sysvipc_proc_next(struct seq_file *s, void *it, loff_t *pos) * p = *pos - 1 (because id 0 starts at position 1) * + 1 (because we increment the position by one) */ - for (p = *pos; p <= iface->ids->max_id; p++) { - if ((ipc = ipc_lock(iface->ids, p)) != NULL) { + for (p = *pos; p <= ids->max_id; p++) { + if ((ipc = ipc_lock(ids, p)) != NULL) { *pos = p + 1; return ipc; } @@ -664,12 +771,15 @@ static void *sysvipc_proc_start(struct seq_file *s, loff_t *pos) struct ipc_proc_iface *iface = s->private; struct kern_ipc_perm *ipc; loff_t p; + struct ipc_ids *ids; + + ids = current->nsproxy->ipc_ns->ids[iface->ids]; /* * Take the lock - this will be released by the corresponding * call to stop(). */ - mutex_lock(&iface->ids->mutex); + mutex_lock(&ids->mutex); /* pos < 0 is invalid */ if (*pos < 0) @@ -680,8 +790,8 @@ static void *sysvipc_proc_start(struct seq_file *s, loff_t *pos) return SEQ_START_TOKEN; /* Find the (pos-1)th ipc */ - for (p = *pos - 1; p <= iface->ids->max_id; p++) { - if ((ipc = ipc_lock(iface->ids, p)) != NULL) { + for (p = *pos - 1; p <= ids->max_id; p++) { + if ((ipc = ipc_lock(ids, p)) != NULL) { *pos = p + 1; return ipc; } @@ -693,13 +803,15 @@ static void sysvipc_proc_stop(struct seq_file *s, void *it) { struct kern_ipc_perm *ipc = it; struct ipc_proc_iface *iface = s->private; + struct ipc_ids *ids; /* If we had a locked segment, release it */ if (ipc && ipc != SEQ_START_TOKEN) ipc_unlock(ipc); + ids = current->nsproxy->ipc_ns->ids[iface->ids]; /* Release the lock we took in start() */ - mutex_unlock(&iface->ids->mutex); + mutex_unlock(&ids->mutex); } static int sysvipc_proc_show(struct seq_file *s, void *it) diff --git a/ipc/util.h b/ipc/util.h index 0181553d31d..c8fd6b9d77b 100644 --- a/ipc/util.h +++ b/ipc/util.h @@ -3,6 +3,8 @@ * Copyright (C) 1999 Christoph Rohland * * ipc helper functions (c) 1999 Manfred Spraul <manfred@colorfullife.com> + * namespaces support. 2006 OpenVZ, SWsoft Inc. + * Pavel Emelianov <xemul@openvz.org> */ #ifndef _IPC_UTIL_H @@ -15,6 +17,14 @@ void sem_init (void); void msg_init (void); void shm_init (void); +int sem_init_ns(struct ipc_namespace *ns); +int msg_init_ns(struct ipc_namespace *ns); +int shm_init_ns(struct ipc_namespace *ns); + +void sem_exit_ns(struct ipc_namespace *ns); +void msg_exit_ns(struct ipc_namespace *ns); +void shm_exit_ns(struct ipc_namespace *ns); + struct ipc_id_ary { int size; struct kern_ipc_perm *p[0]; @@ -31,15 +41,23 @@ struct ipc_ids { }; struct seq_file; -void __init ipc_init_ids(struct ipc_ids* ids, int size); +#ifdef CONFIG_IPC_NS +#define __ipc_init +#else +#define __ipc_init __init +#endif +void __ipc_init ipc_init_ids(struct ipc_ids *ids, int size); #ifdef CONFIG_PROC_FS void __init ipc_init_proc_interface(const char *path, const char *header, - struct ipc_ids *ids, - int (*show)(struct seq_file *, void *)); + int ids, int (*show)(struct seq_file *, void *)); #else #define ipc_init_proc_interface(path, header, ids, show) do {} while (0) #endif +#define IPC_SEM_IDS 0 +#define IPC_MSG_IDS 1 +#define IPC_SHM_IDS 2 + /* must be called with ids->mutex acquired.*/ int ipc_findkey(struct ipc_ids* ids, key_t key); int ipc_addid(struct ipc_ids* ids, struct kern_ipc_perm* new, int size); diff --git a/kernel/Makefile b/kernel/Makefile index aacaafb28b9..d948ca12acf 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -8,7 +8,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ signal.o sys.o kmod.o workqueue.o pid.o \ rcupdate.o extable.o params.o posix-timers.o \ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ - hrtimer.o rwsem.o latency.o + hrtimer.o rwsem.o latency.o nsproxy.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += time/ @@ -48,6 +48,7 @@ obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ obj-$(CONFIG_SECCOMP) += seccomp.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o obj-$(CONFIG_RELAY) += relay.o +obj-$(CONFIG_UTS_NS) += utsname.o obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o diff --git a/kernel/compat.c b/kernel/compat.c index b4fbd838cd7..75573e5d27b 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -26,8 +26,6 @@ #include <asm/uaccess.h> -extern void sigset_from_compat(sigset_t *set, compat_sigset_t *compat); - int get_compat_timespec(struct timespec *ts, const struct compat_timespec __user *cts) { return (!access_ok(VERIFY_READ, cts, sizeof(*cts)) || diff --git a/kernel/exit.c b/kernel/exit.c index 3b47f26985f..f250a5e3e28 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -21,6 +21,7 @@ #include <linux/tsacct_kern.h> #include <linux/file.h> #include <linux/binfmts.h> +#include <linux/nsproxy.h> #include <linux/ptrace.h> #include <linux/profile.h> #include <linux/mount.h> @@ -397,9 +398,11 @@ void daemonize(const char *name, ...) fs = init_task.fs; current->fs = fs; atomic_inc(&fs->count); - exit_namespace(current); - current->namespace = init_task.namespace; - get_namespace(current->namespace); + + exit_task_namespaces(current); + current->nsproxy = init_task.nsproxy; + get_task_namespaces(current); + exit_files(current); current->files = init_task.files; atomic_inc(¤t->files->count); @@ -917,7 +920,6 @@ fastcall NORET_TYPE void do_exit(long code) exit_sem(tsk); __exit_files(tsk); __exit_fs(tsk); - exit_namespace(tsk); exit_thread(); cpuset_exit(tsk); exit_keys(tsk); @@ -932,6 +934,7 @@ fastcall NORET_TYPE void do_exit(long code) tsk->exit_code = code; proc_exit_connector(tsk); exit_notify(tsk); + exit_task_namespaces(tsk); #ifdef CONFIG_NUMA mpol_free(tsk->mempolicy); tsk->mempolicy = NULL; diff --git a/kernel/fork.c b/kernel/fork.c index 89f666491d1..7dc6140baac 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -27,6 +27,7 @@ #include <linux/binfmts.h> #include <linux/mman.h> #include <linux/fs.h> +#include <linux/nsproxy.h> #include <linux/capability.h> #include <linux/cpu.h> #include <linux/cpuset.h> @@ -1116,11 +1117,11 @@ static struct task_struct *copy_process(unsigned long clone_flags, goto bad_fork_cleanup_signal; if ((retval = copy_keys(clone_flags, p))) goto bad_fork_cleanup_mm; - if ((retval = copy_namespace(clone_flags, p))) + if ((retval = copy_namespaces(clone_flags, p))) goto bad_fork_cleanup_keys; retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs); if (retval) - goto bad_fork_cleanup_namespace; + goto bad_fork_cleanup_namespaces; p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; /* @@ -1212,7 +1213,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, spin_unlock(¤t->sighand->siglock); write_unlock_irq(&tasklist_lock); retval = -ERESTARTNOINTR; - goto bad_fork_cleanup_namespace; + goto bad_fork_cleanup_namespaces; } if (clone_flags & CLONE_THREAD) { @@ -1260,8 +1261,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, proc_fork_connector(p); return p; -bad_fork_cleanup_namespace: - exit_namespace(p); +bad_fork_cleanup_namespaces: + exit_task_namespaces(p); bad_fork_cleanup_keys: exit_keys(p); bad_fork_cleanup_mm: @@ -1514,10 +1515,9 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp) */ static int unshare_namespace(unsigned long unshare_flags, struct namespace **new_nsp, struct fs_struct *new_fs) { - struct namespace *ns = current->namespace; + struct namespace *ns = current->nsproxy->namespace; - if ((unshare_flags & CLONE_NEWNS) && - (ns && atomic_read(&ns->count) > 1)) { + if ((unshare_flags & CLONE_NEWNS) && ns) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -1589,6 +1589,16 @@ static int unshare_semundo(unsigned long unshare_flags, struct sem_undo_list **n return 0; } +#ifndef CONFIG_IPC_NS +static inline int unshare_ipcs(unsigned long flags, struct ipc_namespace **ns) +{ + if (flags & CLONE_NEWIPC) + return -EINVAL; + + return 0; +} +#endif + /* * unshare allows a process to 'unshare' part of the process * context which was originally shared using clone. copy_* @@ -1606,13 +1616,17 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL; struct files_struct *fd, *new_fd = NULL; struct sem_undo_list *new_ulist = NULL; + struct nsproxy *new_nsproxy = NULL, *old_nsproxy = NULL; + struct uts_namespace *uts, *new_uts = NULL; + struct ipc_namespace *ipc, *new_ipc = NULL; check_unshare_flags(&unshare_flags); /* Return -EINVAL for all unsupported flags */ err = -EINVAL; if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| - CLONE_VM|CLONE_FILES|CLONE_SYSVSEM)) + CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| + CLONE_NEWUTS|CLONE_NEWIPC)) goto bad_unshare_out; if ((err = unshare_thread(unshare_flags))) @@ -1629,11 +1643,30 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) goto bad_unshare_cleanup_vm; if ((err = unshare_semundo(unshare_flags, &new_ulist))) goto bad_unshare_cleanup_fd; + if ((err = unshare_utsname(unshare_flags, &new_uts))) + goto bad_unshare_cleanup_semundo; + if ((err = unshare_ipcs(unshare_flags, &new_ipc))) + goto bad_unshare_cleanup_uts; + + if (new_ns || new_uts || new_ipc) { + old_nsproxy = current->nsproxy; + new_nsproxy = dup_namespaces(old_nsproxy); + if (!new_nsproxy) { + err = -ENOMEM; + goto bad_unshare_cleanup_ipc; + } + } - if (new_fs || new_ns || new_sigh || new_mm || new_fd || new_ulist) { + if (new_fs || new_ns || new_sigh || new_mm || new_fd || new_ulist || + new_uts || new_ipc) { task_lock(current); + if (new_nsproxy) { + current->nsproxy = new_nsproxy; + new_nsproxy = old_nsproxy; + } + if (new_fs) { fs = current->fs; current->fs = new_fs; @@ -1641,8 +1674,8 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) } if (new_ns) { - ns = current->namespace; - current->namespace = new_ns; + ns = current->nsproxy->namespace; + current->nsproxy->namespace = new_ns; new_ns = ns; } @@ -1667,9 +1700,33 @@ asmlinkage long sys_unshare(unsigned long unshare_flags) new_fd = fd; } + if (new_uts) { + uts = current->nsproxy->uts_ns; + current->nsproxy->uts_ns = new_uts; + new_uts = uts; + } + + if (new_ipc) { + ipc = current->nsproxy->ipc_ns; + current->nsproxy->ipc_ns = new_ipc; + new_ipc = ipc; + } + task_unlock(current); } + if (new_nsproxy) + put_nsproxy(new_nsproxy); + +bad_unshare_cleanup_ipc: + if (new_ipc) + put_ipc_ns(new_ipc); + +bad_unshare_cleanup_uts: + if (new_uts) + put_uts_ns(new_uts); + +bad_unshare_cleanup_semundo: bad_unshare_cleanup_fd: if (new_fd) put_files_struct(new_fd); diff --git a/kernel/futex.c b/kernel/futex.c index 4b6770e9806..4aaf91951a4 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1527,7 +1527,7 @@ static int futex_fd(u32 __user *uaddr, int signal) filp->f_mapping = filp->f_dentry->d_inode->i_mapping; if (signal) { - err = f_setown(filp, current->pid, 1); + err = __f_setown(filp, task_pid(current), PIDTYPE_PID, 1); if (err < 0) { goto error; } diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index ab16a5a4cfe..342bca62c49 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -154,7 +154,6 @@ unsigned long kallsyms_lookup_name(const char *name) } return module_kallsyms_lookup_name(name); } -EXPORT_SYMBOL_GPL(kallsyms_lookup_name); /* * Lookup an address diff --git a/kernel/kmod.c b/kernel/kmod.c index f8121b95183..bb4e29d924e 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -18,8 +18,6 @@ call_usermodehelper wait flag, and remove exec_usermodehelper. Rusty Russell <rusty@rustcorp.com.au> Jan 2003 */ -#define __KERNEL_SYSCALLS__ - #include <linux/module.h> #include <linux/sched.h> #include <linux/syscalls.h> @@ -169,7 +167,8 @@ static int ____call_usermodehelper(void *data) retval = -EPERM; if (current->fs->root) - retval = execve(sub_info->path, sub_info->argv, sub_info->envp); + retval = kernel_execve(sub_info->path, + sub_info->argv, sub_info->envp); /* Exec failed? */ sub_info->retval = retval; diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 3f57dfdc8f9..610c837ad9e 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -37,6 +37,7 @@ #include <linux/slab.h> #include <linux/module.h> #include <linux/moduleloader.h> +#include <linux/kallsyms.h> #include <asm-generic/sections.h> #include <asm/cacheflush.h> #include <asm/errno.h> @@ -45,6 +46,16 @@ #define KPROBE_HASH_BITS 6 #define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS) + +/* + * Some oddball architectures like 64bit powerpc have function descriptors + * so this must be overridable. + */ +#ifndef kprobe_lookup_name +#define kprobe_lookup_name(name, addr) \ + addr = ((kprobe_opcode_t *)(kallsyms_lookup_name(name))) +#endif + static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; static atomic_t kprobe_count; @@ -308,7 +319,8 @@ void __kprobes add_rp_inst(struct kretprobe_instance *ri) } /* Called with kretprobe_lock held */ -void __kprobes recycle_rp_inst(struct kretprobe_instance *ri) +void __kprobes recycle_rp_inst(struct kretprobe_instance *ri, + struct hlist_head *head) { /* remove rp inst off the rprobe_inst_table */ hlist_del(&ri->hlist); @@ -320,7 +332,7 @@ void __kprobes recycle_rp_inst(struct kretprobe_instance *ri) hlist_add_head(&ri->uflist, &ri->rp->free_instances); } else /* Unregistering */ - kfree(ri); + hlist_add_head(&ri->hlist, head); } struct hlist_head __kprobes *kretprobe_inst_table_head(struct task_struct *tsk) @@ -336,18 +348,24 @@ struct hlist_head __kprobes *kretprobe_inst_table_head(struct task_struct *tsk) */ void __kprobes kprobe_flush_task(struct task_struct *tk) { - struct kretprobe_instance *ri; - struct hlist_head *head; + struct kretprobe_instance *ri; + struct hlist_head *head, empty_rp; struct hlist_node *node, *tmp; unsigned long flags = 0; + INIT_HLIST_HEAD(&empty_rp); spin_lock_irqsave(&kretprobe_lock, flags); - head = kretprobe_inst_table_head(tk); - hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { - if (ri->task == tk) - recycle_rp_inst(ri); - } + head = kretprobe_inst_table_head(tk); + hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { + if (ri->task == tk) + recycle_rp_inst(ri, &empty_rp); + } spin_unlock_irqrestore(&kretprobe_lock, flags); + + hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) { + hlist_del(&ri->hlist); + kfree(ri); + } } static inline void free_rp_inst(struct kretprobe *rp) @@ -447,6 +465,21 @@ static int __kprobes __register_kprobe(struct kprobe *p, struct kprobe *old_p; struct module *probed_mod; + /* + * If we have a symbol_name argument look it up, + * and add it to the address. That way the addr + * field can either be global or relative to a symbol. + */ + if (p->symbol_name) { + if (p->addr) + return -EINVAL; + kprobe_lookup_name(p->symbol_name, p->addr); + } + + if (!p->addr) + return -EINVAL; + p->addr = (kprobe_opcode_t *)(((char *)p->addr)+ p->offset); + if ((!kernel_text_address((unsigned long) p->addr)) || in_kprobes_functions((unsigned long) p->addr)) return -EINVAL; @@ -488,7 +521,7 @@ static int __kprobes __register_kprobe(struct kprobe *p, (ARCH_INACTIVE_KPROBE_COUNT + 1)) register_page_fault_notifier(&kprobe_page_fault_nb); - arch_arm_kprobe(p); + arch_arm_kprobe(p); out: mutex_unlock(&kprobe_mutex); diff --git a/kernel/lockdep.c b/kernel/lockdep.c index e596525669e..4c055346100 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -518,9 +518,9 @@ print_circular_bug_entry(struct lock_list *target, unsigned int depth) static void print_kernel_version(void) { - printk("%s %.*s\n", system_utsname.release, - (int)strcspn(system_utsname.version, " "), - system_utsname.version); + printk("%s %.*s\n", init_utsname()->release, + (int)strcspn(init_utsname()->version, " "), + init_utsname()->version); } /* diff --git a/kernel/module.c b/kernel/module.c index 05625d5dc75..7c77a0a9275 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -851,6 +851,7 @@ static int check_version(Elf_Shdr *sechdrs, printk("%s: no version for \"%s\" found: kernel tainted.\n", mod->name, symname); add_taint(TAINT_FORCED_MODULE); + mod->taints |= TAINT_FORCED_MODULE; } return 1; } @@ -1339,6 +1340,7 @@ static void set_license(struct module *mod, const char *license) printk(KERN_WARNING "%s: module license '%s' taints kernel.\n", mod->name, license); add_taint(TAINT_PROPRIETARY_MODULE); + mod->taints |= TAINT_PROPRIETARY_MODULE; } } @@ -1618,6 +1620,7 @@ static struct module *load_module(void __user *umod, /* This is allowed: modprobe --force will invalidate it. */ if (!modmagic) { add_taint(TAINT_FORCED_MODULE); + mod->taints |= TAINT_FORCED_MODULE; printk(KERN_WARNING "%s: no version magic, tainting kernel.\n", mod->name); } else if (!same_magic(modmagic, vermagic)) { @@ -1711,10 +1714,14 @@ static struct module *load_module(void __user *umod, /* Set up license info based on the info section */ set_license(mod, get_modinfo(sechdrs, infoindex, "license")); - if (strcmp(mod->name, "ndiswrapper") == 0) + if (strcmp(mod->name, "ndiswrapper") == 0) { add_taint(TAINT_PROPRIETARY_MODULE); - if (strcmp(mod->name, "driverloader") == 0) + mod->taints |= TAINT_PROPRIETARY_MODULE; + } + if (strcmp(mod->name, "driverloader") == 0) { add_taint(TAINT_PROPRIETARY_MODULE); + mod->taints |= TAINT_PROPRIETARY_MODULE; + } /* Set up MODINFO_ATTR fields */ setup_modinfo(mod, sechdrs, infoindex); @@ -1760,6 +1767,7 @@ static struct module *load_module(void __user *umod, printk(KERN_WARNING "%s: No versions for exported symbols." " Tainting kernel.\n", mod->name); add_taint(TAINT_FORCED_MODULE); + mod->taints |= TAINT_FORCED_MODULE; } #endif @@ -2226,14 +2234,37 @@ struct module *module_text_address(unsigned long addr) return mod; } +static char *taint_flags(unsigned int taints, char *buf) +{ + *buf = '\0'; + if (taints) { + int bx; + + buf[0] = '('; + bx = 1; + if (taints & TAINT_PROPRIETARY_MODULE) + buf[bx++] = 'P'; + if (taints & TAINT_FORCED_MODULE) + buf[bx++] = 'F'; + /* + * TAINT_FORCED_RMMOD: could be added. + * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't + * apply to modules. + */ + buf[bx] = ')'; + } + return buf; +} + /* Don't grab lock, we're oopsing. */ void print_modules(void) { struct module *mod; + char buf[8]; printk("Modules linked in:"); list_for_each_entry(mod, &modules, list) - printk(" %s", mod->name); + printk(" %s%s", mod->name, taint_flags(mod->taints, buf)); printk("\n"); } diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c new file mode 100644 index 00000000000..6ebdb82a0ce --- /dev/null +++ b/kernel/nsproxy.c @@ -0,0 +1,139 @@ +/* + * Copyright (C) 2006 IBM Corporation + * + * Author: Serge Hallyn <serue@us.ibm.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation, version 2 of the + * License. + * + * Jun 2006 - namespaces support + * OpenVZ, SWsoft Inc. + * Pavel Emelianov <xemul@openvz.org> + */ + +#include <linux/module.h> +#include <linux/version.h> +#include <linux/nsproxy.h> +#include <linux/init_task.h> +#include <linux/namespace.h> +#include <linux/utsname.h> + +struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy); + +static inline void get_nsproxy(struct nsproxy *ns) +{ + atomic_inc(&ns->count); +} + +void get_task_namespaces(struct task_struct *tsk) +{ + struct nsproxy *ns = tsk->nsproxy; + if (ns) { + get_nsproxy(ns); + } +} + +/* + * creates a copy of "orig" with refcount 1. + * This does not grab references to the contained namespaces, + * so that needs to be done by dup_namespaces. + */ +static inline struct nsproxy *clone_namespaces(struct nsproxy *orig) +{ + struct nsproxy *ns; + + ns = kmalloc(sizeof(struct nsproxy), GFP_KERNEL); + if (ns) { + memcpy(ns, orig, sizeof(struct nsproxy)); + atomic_set(&ns->count, 1); + } + return ns; +} + +/* + * copies the nsproxy, setting refcount to 1, and grabbing a + * reference to all contained namespaces. Called from + * sys_unshare() + */ +struct nsproxy *dup_namespaces(struct nsproxy *orig) +{ + struct nsproxy *ns = clone_namespaces(orig); + + if (ns) { + if (ns->namespace) + get_namespace(ns->namespace); + if (ns->uts_ns) + get_uts_ns(ns->uts_ns); + if (ns->ipc_ns) + get_ipc_ns(ns->ipc_ns); + } + + return ns; +} + +/* + * called from clone. This now handles copy for nsproxy and all + * namespaces therein. + */ +int copy_namespaces(int flags, struct task_struct *tsk) +{ + struct nsproxy *old_ns = tsk->nsproxy; + struct nsproxy *new_ns; + int err = 0; + + if (!old_ns) + return 0; + + get_nsproxy(old_ns); + + if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC))) + return 0; + + new_ns = clone_namespaces(old_ns); + if (!new_ns) { + err = -ENOMEM; + goto out; + } + + tsk->nsproxy = new_ns; + + err = copy_namespace(flags, tsk); + if (err) + goto out_ns; + + err = copy_utsname(flags, tsk); + if (err) + goto out_uts; + + err = copy_ipcs(flags, tsk); + if (err) + goto out_ipc; + +out: + put_nsproxy(old_ns); + return err; + +out_ipc: + if (new_ns->uts_ns) + put_uts_ns(new_ns->uts_ns); +out_uts: + if (new_ns->namespace) + put_namespace(new_ns->namespace); +out_ns: + tsk->nsproxy = old_ns; + kfree(new_ns); + goto out; +} + +void free_nsproxy(struct nsproxy *ns) +{ + if (ns->namespace) + put_namespace(ns->namespace); + if (ns->uts_ns) + put_uts_ns(ns->uts_ns); + if (ns->ipc_ns) + put_ipc_ns(ns->ipc_ns); + kfree(ns); +} diff --git a/kernel/pid.c b/kernel/pid.c index 8387e8c6819..b914392085f 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -26,6 +26,7 @@ #include <linux/init.h> #include <linux/bootmem.h> #include <linux/hash.h> +#include <linux/pspace.h> #define pid_hashfn(nr) hash_long((unsigned long)nr, pidhash_shift) static struct hlist_head *pid_hash; @@ -33,17 +34,20 @@ static int pidhash_shift; static kmem_cache_t *pid_cachep; int pid_max = PID_MAX_DEFAULT; -int last_pid; #define RESERVED_PIDS 300 int pid_max_min = RESERVED_PIDS + 1; int pid_max_max = PID_MAX_LIMIT; -#define PIDMAP_ENTRIES ((PID_MAX_LIMIT + 8*PAGE_SIZE - 1)/PAGE_SIZE/8) #define BITS_PER_PAGE (PAGE_SIZE*8) #define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1) -#define mk_pid(map, off) (((map) - pidmap_array)*BITS_PER_PAGE + (off)) + +static inline int mk_pid(struct pspace *pspace, struct pidmap *map, int off) +{ + return (map - pspace->pidmap)*BITS_PER_PAGE + off; +} + #define find_next_offset(map, off) \ find_next_zero_bit((map)->page, BITS_PER_PAGE, off) @@ -53,13 +57,12 @@ int pid_max_max = PID_MAX_LIMIT; * value does not cause lots of bitmaps to be allocated, but * the scheme scales to up to 4 million PIDs, runtime. */ -typedef struct pidmap { - atomic_t nr_free; - void *page; -} pidmap_t; - -static pidmap_t pidmap_array[PIDMAP_ENTRIES] = - { [ 0 ... PIDMAP_ENTRIES-1 ] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } }; +struct pspace init_pspace = { + .pidmap = { + [ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } + }, + .last_pid = 0 +}; /* * Note: disable interrupts while the pidmap_lock is held as an @@ -74,40 +77,41 @@ static pidmap_t pidmap_array[PIDMAP_ENTRIES] = * irq handlers that take it we can leave the interrupts enabled. * For now it is easier to be safe than to prove it can't happen. */ + static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); -static fastcall void free_pidmap(int pid) +static fastcall void free_pidmap(struct pspace *pspace, int pid) { - pidmap_t *map = pidmap_array + pid / BITS_PER_PAGE; + struct pidmap *map = pspace->pidmap + pid / BITS_PER_PAGE; int offset = pid & BITS_PER_PAGE_MASK; clear_bit(offset, map->page); atomic_inc(&map->nr_free); } -static int alloc_pidmap(void) +static int alloc_pidmap(struct pspace *pspace) { - int i, offset, max_scan, pid, last = last_pid; - pidmap_t *map; + int i, offset, max_scan, pid, last = pspace->last_pid; + struct pidmap *map; pid = last + 1; if (pid >= pid_max) pid = RESERVED_PIDS; offset = pid & BITS_PER_PAGE_MASK; - map = &pidmap_array[pid/BITS_PER_PAGE]; + map = &pspace->pidmap[pid/BITS_PER_PAGE]; max_scan = (pid_max + BITS_PER_PAGE - 1)/BITS_PER_PAGE - !offset; for (i = 0; i <= max_scan; ++i) { if (unlikely(!map->page)) { - unsigned long page = get_zeroed_page(GFP_KERNEL); + void *page = kzalloc(PAGE_SIZE, GFP_KERNEL); /* * Free the page if someone raced with us * installing it: */ spin_lock_irq(&pidmap_lock); if (map->page) - free_page(page); + kfree(page); else - map->page = (void *)page; + map->page = page; spin_unlock_irq(&pidmap_lock); if (unlikely(!map->page)) break; @@ -116,11 +120,11 @@ static int alloc_pidmap(void) do { if (!test_and_set_bit(offset, map->page)) { atomic_dec(&map->nr_free); - last_pid = pid; + pspace->last_pid = pid; return pid; } offset = find_next_offset(map, offset); - pid = mk_pid(map, offset); + pid = mk_pid(pspace, map, offset); /* * find_next_offset() found a bit, the pid from it * is in-bounds, and if we fell back to the last @@ -131,16 +135,34 @@ static int alloc_pidmap(void) (i != max_scan || pid < last || !((last+1) & BITS_PER_PAGE_MASK))); } - if (map < &pidmap_array[(pid_max-1)/BITS_PER_PAGE]) { + if (map < &pspace->pidmap[(pid_max-1)/BITS_PER_PAGE]) { ++map; offset = 0; } else { - map = &pidmap_array[0]; + map = &pspace->pidmap[0]; offset = RESERVED_PIDS; if (unlikely(last == offset)) break; } - pid = mk_pid(map, offset); + pid = mk_pid(pspace, map, offset); + } + return -1; +} + +static int next_pidmap(struct pspace *pspace, int last) +{ + int offset; + struct pidmap *map, *end; + + offset = (last + 1) & BITS_PER_PAGE_MASK; + map = &pspace->pidmap[(last + 1)/BITS_PER_PAGE]; + end = &pspace->pidmap[PIDMAP_ENTRIES]; + for (; map < end; map++, offset = 0) { + if (unlikely(!map->page)) + continue; + offset = find_next_bit((map)->page, BITS_PER_PAGE, offset); + if (offset < BITS_PER_PAGE) + return mk_pid(pspace, map, offset); } return -1; } @@ -153,6 +175,7 @@ fastcall void put_pid(struct pid *pid) atomic_dec_and_test(&pid->count)) kmem_cache_free(pid_cachep, pid); } +EXPORT_SYMBOL_GPL(put_pid); static void delayed_put_pid(struct rcu_head *rhp) { @@ -169,7 +192,7 @@ fastcall void free_pid(struct pid *pid) hlist_del_rcu(&pid->pid_chain); spin_unlock_irqrestore(&pidmap_lock, flags); - free_pidmap(pid->nr); + free_pidmap(&init_pspace, pid->nr); call_rcu(&pid->rcu, delayed_put_pid); } @@ -183,7 +206,7 @@ struct pid *alloc_pid(void) if (!pid) goto out; - nr = alloc_pidmap(); + nr = alloc_pidmap(&init_pspace); if (nr < 0) goto out_free; @@ -217,6 +240,7 @@ struct pid * fastcall find_pid(int nr) } return NULL; } +EXPORT_SYMBOL_GPL(find_pid); int fastcall attach_pid(struct task_struct *task, enum pid_type type, int nr) { @@ -280,6 +304,15 @@ struct task_struct *find_task_by_pid_type(int type, int nr) EXPORT_SYMBOL(find_task_by_pid_type); +struct pid *get_task_pid(struct task_struct *task, enum pid_type type) +{ + struct pid *pid; + rcu_read_lock(); + pid = get_pid(task->pids[type].pid); + rcu_read_unlock(); + return pid; +} + struct task_struct *fastcall get_pid_task(struct pid *pid, enum pid_type type) { struct task_struct *result; @@ -303,6 +336,26 @@ struct pid *find_get_pid(pid_t nr) } /* + * Used by proc to find the first pid that is greater then or equal to nr. + * + * If there is a pid at nr this function is exactly the same as find_pid. + */ +struct pid *find_ge_pid(int nr) +{ + struct pid *pid; + + do { + pid = find_pid(nr); + if (pid) + break; + nr = next_pidmap(&init_pspace, nr); + } while (nr > 0); + + return pid; +} +EXPORT_SYMBOL_GPL(find_get_pid); + +/* * The pid hash table is scaled according to the amount of memory in the * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or * more. @@ -329,10 +382,10 @@ void __init pidhash_init(void) void __init pidmap_init(void) { - pidmap_array->page = (void *)get_zeroed_page(GFP_KERNEL); + init_pspace.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL); /* Reserve PID 0. We never call free_pidmap(0) */ - set_bit(0, pidmap_array->page); - atomic_dec(&pidmap_array->nr_free); + set_bit(0, init_pspace.pidmap[0].page); + atomic_dec(&init_pspace.pidmap[0].nr_free); pid_cachep = kmem_cache_create("pid", sizeof(struct pid), __alignof__(struct pid), diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 1b84313cbab..99f9b7d177d 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -906,7 +906,7 @@ static void init_header(struct swsusp_info *info) memset(info, 0, sizeof(struct swsusp_info)); info->version_code = LINUX_VERSION_CODE; info->num_physpages = num_physpages; - memcpy(&info->uts, &system_utsname, sizeof(system_utsname)); + memcpy(&info->uts, init_utsname(), sizeof(struct new_utsname)); info->cpus = num_online_cpus(); info->image_pages = nr_copy_pages; info->pages = nr_copy_pages + nr_meta_pages + 1; @@ -1050,13 +1050,13 @@ static inline int check_header(struct swsusp_info *info) reason = "kernel version"; if (info->num_physpages != num_physpages) reason = "memory size"; - if (strcmp(info->uts.sysname,system_utsname.sysname)) + if (strcmp(info->uts.sysname,init_utsname()->sysname)) reason = "system type"; - if (strcmp(info->uts.release,system_utsname.release)) + if (strcmp(info->uts.release,init_utsname()->release)) reason = "kernel release"; - if (strcmp(info->uts.version,system_utsname.version)) + if (strcmp(info->uts.version,init_utsname()->version)) reason = "version"; - if (strcmp(info->uts.machine,system_utsname.machine)) + if (strcmp(info->uts.machine,init_utsname()->machine)) reason = "machine"; if (reason) { printk(KERN_ERR "swsusp: Resume mismatch: %s\n", reason); diff --git a/kernel/sched.c b/kernel/sched.c index 2bbd948f016..e4e54e86f4a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4384,7 +4384,10 @@ EXPORT_SYMBOL(cpu_present_map); #ifndef CONFIG_SMP cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL; +EXPORT_SYMBOL(cpu_online_map); + cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL; +EXPORT_SYMBOL(cpu_possible_map); #endif long sched_getaffinity(pid_t pid, cpumask_t *mask) diff --git a/kernel/signal.c b/kernel/signal.c index fb5da6d19f1..7ed8d5304be 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1055,28 +1055,44 @@ int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) } /* - * kill_pg_info() sends a signal to a process group: this is what the tty + * kill_pgrp_info() sends a signal to a process group: this is what the tty * control characters do (^C, ^Z etc) */ -int __kill_pg_info(int sig, struct siginfo *info, pid_t pgrp) +int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp) { struct task_struct *p = NULL; int retval, success; - if (pgrp <= 0) - return -EINVAL; - success = 0; retval = -ESRCH; - do_each_task_pid(pgrp, PIDTYPE_PGID, p) { + do_each_pid_task(pgrp, PIDTYPE_PGID, p) { int err = group_send_sig_info(sig, info, p); success |= !err; retval = err; - } while_each_task_pid(pgrp, PIDTYPE_PGID, p); + } while_each_pid_task(pgrp, PIDTYPE_PGID, p); return success ? 0 : retval; } +int kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp) +{ + int retval; + + read_lock(&tasklist_lock); + retval = __kill_pgrp_info(sig, info, pgrp); + read_unlock(&tasklist_lock); + + return retval; +} + +int __kill_pg_info(int sig, struct siginfo *info, pid_t pgrp) +{ + if (pgrp <= 0) + return -EINVAL; + + return __kill_pgrp_info(sig, info, find_pid(pgrp)); +} + int kill_pg_info(int sig, struct siginfo *info, pid_t pgrp) { @@ -1089,8 +1105,7 @@ kill_pg_info(int sig, struct siginfo *info, pid_t pgrp) return retval; } -int -kill_proc_info(int sig, struct siginfo *info, pid_t pid) +int kill_pid_info(int sig, struct siginfo *info, struct pid *pid) { int error; int acquired_tasklist_lock = 0; @@ -1101,7 +1116,7 @@ kill_proc_info(int sig, struct siginfo *info, pid_t pid) read_lock(&tasklist_lock); acquired_tasklist_lock = 1; } - p = find_task_by_pid(pid); + p = pid_task(pid, PIDTYPE_PID); error = -ESRCH; if (p) error = group_send_sig_info(sig, info, p); @@ -1111,8 +1126,18 @@ kill_proc_info(int sig, struct siginfo *info, pid_t pid) return error; } -/* like kill_proc_info(), but doesn't use uid/euid of "current" */ -int kill_proc_info_as_uid(int sig, struct siginfo *info, pid_t pid, +int +kill_proc_info(int sig, struct siginfo *info, pid_t pid) +{ + int error; + rcu_read_lock(); + error = kill_pid_info(sig, info, find_pid(pid)); + rcu_read_unlock(); + return error; +} + +/* like kill_pid_info(), but doesn't use uid/euid of "current" */ +int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid, uid_t uid, uid_t euid, u32 secid) { int ret = -EINVAL; @@ -1122,7 +1147,7 @@ int kill_proc_info_as_uid(int sig, struct siginfo *info, pid_t pid, return ret; read_lock(&tasklist_lock); - p = find_task_by_pid(pid); + p = pid_task(pid, PIDTYPE_PID); if (!p) { ret = -ESRCH; goto out_unlock; @@ -1146,7 +1171,7 @@ out_unlock: read_unlock(&tasklist_lock); return ret; } -EXPORT_SYMBOL_GPL(kill_proc_info_as_uid); +EXPORT_SYMBOL_GPL(kill_pid_info_as_uid); /* * kill_something_info() interprets pid in interesting ways just like kill(2). @@ -1264,6 +1289,18 @@ force_sigsegv(int sig, struct task_struct *p) return 0; } +int kill_pgrp(struct pid *pid, int sig, int priv) +{ + return kill_pgrp_info(sig, __si_special(priv), pid); +} +EXPORT_SYMBOL(kill_pgrp); + +int kill_pid(struct pid *pid, int sig, int priv) +{ + return kill_pid_info(sig, __si_special(priv), pid); +} +EXPORT_SYMBOL(kill_pid); + int kill_pg(pid_t pgrp, int sig, int priv) { diff --git a/kernel/sys.c b/kernel/sys.c index 2460581c928..2314867ae34 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -92,7 +92,8 @@ EXPORT_SYMBOL(fs_overflowgid); */ int C_A_D = 1; -int cad_pid = 1; +struct pid *cad_pid; +EXPORT_SYMBOL(cad_pid); /* * Notifier list for kernel code which wants to be called @@ -221,7 +222,7 @@ EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister); * of the last notifier function called. */ -int atomic_notifier_call_chain(struct atomic_notifier_head *nh, +int __kprobes atomic_notifier_call_chain(struct atomic_notifier_head *nh, unsigned long val, void *v) { int ret; @@ -773,10 +774,9 @@ void ctrl_alt_del(void) if (C_A_D) schedule_work(&cad_work); else - kill_proc(cad_pid, SIGINT, 1); + kill_cad_pid(SIGINT, 1); } - /* * Unprivileged users may change the real gid to the effective gid * or vice versa. (BSD-style) @@ -1655,7 +1655,7 @@ asmlinkage long sys_newuname(struct new_utsname __user * name) int errno = 0; down_read(&uts_sem); - if (copy_to_user(name,&system_utsname,sizeof *name)) + if (copy_to_user(name, utsname(), sizeof *name)) errno = -EFAULT; up_read(&uts_sem); return errno; @@ -1673,8 +1673,8 @@ asmlinkage long sys_sethostname(char __user *name, int len) down_write(&uts_sem); errno = -EFAULT; if (!copy_from_user(tmp, name, len)) { - memcpy(system_utsname.nodename, tmp, len); - system_utsname.nodename[len] = 0; + memcpy(utsname()->nodename, tmp, len); + utsname()->nodename[len] = 0; errno = 0; } up_write(&uts_sem); @@ -1690,11 +1690,11 @@ asmlinkage long sys_gethostname(char __user *name, int len) if (len < 0) return -EINVAL; down_read(&uts_sem); - i = 1 + strlen(system_utsname.nodename); + i = 1 + strlen(utsname()->nodename); if (i > len) i = len; errno = 0; - if (copy_to_user(name, system_utsname.nodename, i)) + if (copy_to_user(name, utsname()->nodename, i)) errno = -EFAULT; up_read(&uts_sem); return errno; @@ -1719,8 +1719,8 @@ asmlinkage long sys_setdomainname(char __user *name, int len) down_write(&uts_sem); errno = -EFAULT; if (!copy_from_user(tmp, name, len)) { - memcpy(system_utsname.domainname, tmp, len); - system_utsname.domainname[len] = 0; + memcpy(utsname()->domainname, tmp, len); + utsname()->domainname[len] = 0; errno = 0; } up_write(&uts_sem); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ba42694f045..8020fb273c4 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -68,7 +68,6 @@ extern int sysrq_enabled; extern int core_uses_pid; extern int suid_dumpable; extern char core_pattern[]; -extern int cad_pid; extern int pid_max; extern int min_free_kbytes; extern int printk_ratelimit_jiffies; @@ -92,13 +91,8 @@ extern char modprobe_path[]; extern int sg_big_buff; #endif #ifdef CONFIG_SYSVIPC -extern size_t shm_ctlmax; -extern size_t shm_ctlall; -extern int shm_ctlmni; -extern int msg_ctlmax; -extern int msg_ctlmnb; -extern int msg_ctlmni; -extern int sem_ctls[]; +static int proc_do_ipc_string(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos); #endif #ifdef __sparc__ @@ -139,7 +133,10 @@ static int parse_table(int __user *, int, void __user *, size_t __user *, void __user *, size_t, ctl_table *, void **); #endif -static int proc_doutsstring(ctl_table *table, int write, struct file *filp, +static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos); + +static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos); static ctl_table root_table[]; @@ -229,51 +226,100 @@ static ctl_table root_table[] = { }; static ctl_table kern_table[] = { +#ifndef CONFIG_UTS_NS + { + .ctl_name = KERN_OSTYPE, + .procname = "ostype", + .data = init_uts_ns.name.sysname, + .maxlen = sizeof(init_uts_ns.name.sysname), + .mode = 0444, + .proc_handler = &proc_do_uts_string, + .strategy = &sysctl_string, + }, + { + .ctl_name = KERN_OSRELEASE, + .procname = "osrelease", + .data = init_uts_ns.name.release, + .maxlen = sizeof(init_uts_ns.name.release), + .mode = 0444, + .proc_handler = &proc_do_uts_string, + .strategy = &sysctl_string, + }, + { + .ctl_name = KERN_VERSION, + .procname = "version", + .data = init_uts_ns.name.version, + .maxlen = sizeof(init_uts_ns.name.version), + .mode = 0444, + .proc_handler = &proc_do_uts_string, + .strategy = &sysctl_string, + }, + { + .ctl_name = KERN_NODENAME, + .procname = "hostname", + .data = init_uts_ns.name.nodename, + .maxlen = sizeof(init_uts_ns.name.nodename), + .mode = 0644, + .proc_handler = &proc_do_uts_string, + .strategy = &sysctl_string, + }, + { + .ctl_name = KERN_DOMAINNAME, + .procname = "domainname", + .data = init_uts_ns.name.domainname, + .maxlen = sizeof(init_uts_ns.name.domainname), + .mode = 0644, + .proc_handler = &proc_do_uts_string, + .strategy = &sysctl_string, + }, +#else /* !CONFIG_UTS_NS */ { .ctl_name = KERN_OSTYPE, .procname = "ostype", - .data = system_utsname.sysname, - .maxlen = sizeof(system_utsname.sysname), + .data = NULL, + /* could maybe use __NEW_UTS_LEN here? */ + .maxlen = FIELD_SIZEOF(struct new_utsname, sysname), .mode = 0444, - .proc_handler = &proc_doutsstring, + .proc_handler = &proc_do_uts_string, .strategy = &sysctl_string, }, { .ctl_name = KERN_OSRELEASE, .procname = "osrelease", - .data = system_utsname.release, - .maxlen = sizeof(system_utsname.release), + .data = NULL, + .maxlen = FIELD_SIZEOF(struct new_utsname, release), .mode = 0444, - .proc_handler = &proc_doutsstring, + .proc_handler = &proc_do_uts_string, .strategy = &sysctl_string, }, { .ctl_name = KERN_VERSION, .procname = "version", - .data = system_utsname.version, - .maxlen = sizeof(system_utsname.version), + .data = NULL, + .maxlen = FIELD_SIZEOF(struct new_utsname, version), .mode = 0444, - .proc_handler = &proc_doutsstring, + .proc_handler = &proc_do_uts_string, .strategy = &sysctl_string, }, { .ctl_name = KERN_NODENAME, .procname = "hostname", - .data = system_utsname.nodename, - .maxlen = sizeof(system_utsname.nodename), + .data = NULL, + .maxlen = FIELD_SIZEOF(struct new_utsname, nodename), .mode = 0644, - .proc_handler = &proc_doutsstring, + .proc_handler = &proc_do_uts_string, .strategy = &sysctl_string, }, { .ctl_name = KERN_DOMAINNAME, .procname = "domainname", - .data = system_utsname.domainname, - .maxlen = sizeof(system_utsname.domainname), + .data = NULL, + .maxlen = FIELD_SIZEOF(struct new_utsname, domainname), .mode = 0644, - .proc_handler = &proc_doutsstring, + .proc_handler = &proc_do_uts_string, .strategy = &sysctl_string, }, +#endif /* !CONFIG_UTS_NS */ { .ctl_name = KERN_PANIC, .procname = "panic", @@ -432,58 +478,58 @@ static ctl_table kern_table[] = { { .ctl_name = KERN_SHMMAX, .procname = "shmmax", - .data = &shm_ctlmax, + .data = NULL, .maxlen = sizeof (size_t), .mode = 0644, - .proc_handler = &proc_doulongvec_minmax, + .proc_handler = &proc_do_ipc_string, }, { .ctl_name = KERN_SHMALL, .procname = "shmall", - .data = &shm_ctlall, + .data = NULL, .maxlen = sizeof (size_t), .mode = 0644, - .proc_handler = &proc_doulongvec_minmax, + .proc_handler = &proc_do_ipc_string, }, { .ctl_name = KERN_SHMMNI, .procname = "shmmni", - .data = &shm_ctlmni, + .data = NULL, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = &proc_do_ipc_string, }, { .ctl_name = KERN_MSGMAX, .procname = "msgmax", - .data = &msg_ctlmax, + .data = NULL, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = &proc_do_ipc_string, }, { .ctl_name = KERN_MSGMNI, .procname = "msgmni", - .data = &msg_ctlmni, + .data = NULL, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = &proc_do_ipc_string, }, { .ctl_name = KERN_MSGMNB, .procname = "msgmnb", - .data = &msg_ctlmnb, + .data = NULL, .maxlen = sizeof (int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = &proc_do_ipc_string, }, { .ctl_name = KERN_SEM, .procname = "sem", - .data = &sem_ctls, + .data = NULL, .maxlen = 4*sizeof (int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = &proc_do_ipc_string, }, #endif #ifdef CONFIG_MAGIC_SYSRQ @@ -499,10 +545,10 @@ static ctl_table kern_table[] = { { .ctl_name = KERN_CADPID, .procname = "cad_pid", - .data = &cad_pid, + .data = NULL, .maxlen = sizeof (int), .mode = 0600, - .proc_handler = &proc_dointvec, + .proc_handler = &proc_do_cad_pid, }, { .ctl_name = KERN_MAX_THREADS, @@ -1624,32 +1670,15 @@ static ssize_t proc_writesys(struct file * file, const char __user * buf, return do_rw_proc(1, file, (char __user *) buf, count, ppos); } -/** - * proc_dostring - read a string sysctl - * @table: the sysctl table - * @write: %TRUE if this is a write to the sysctl file - * @filp: the file structure - * @buffer: the user buffer - * @lenp: the size of the user buffer - * @ppos: file position - * - * Reads/writes a string from/to the user buffer. If the kernel - * buffer provided is not large enough to hold the string, the - * string is truncated. The copied string is %NULL-terminated. - * If the string is being read by the user process, it is copied - * and a newline '\n' is added. It is truncated if the buffer is - * not large enough. - * - * Returns 0 on success. - */ -int proc_dostring(ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos) +static int _proc_do_string(void* data, int maxlen, int write, + struct file *filp, void __user *buffer, + size_t *lenp, loff_t *ppos) { size_t len; char __user *p; char c; - if (!table->data || !table->maxlen || !*lenp || + if (!data || !maxlen || !*lenp || (*ppos && !write)) { *lenp = 0; return 0; @@ -1665,20 +1694,20 @@ int proc_dostring(ctl_table *table, int write, struct file *filp, break; len++; } - if (len >= table->maxlen) - len = table->maxlen-1; - if(copy_from_user(table->data, buffer, len)) + if (len >= maxlen) + len = maxlen-1; + if(copy_from_user(data, buffer, len)) return -EFAULT; - ((char *) table->data)[len] = 0; + ((char *) data)[len] = 0; *ppos += *lenp; } else { - len = strlen(table->data); - if (len > table->maxlen) - len = table->maxlen; + len = strlen(data); + if (len > maxlen) + len = maxlen; if (len > *lenp) len = *lenp; if (len) - if(copy_to_user(buffer, table->data, len)) + if(copy_to_user(buffer, data, len)) return -EFAULT; if (len < *lenp) { if(put_user('\n', ((char __user *) buffer) + len)) @@ -1691,12 +1720,38 @@ int proc_dostring(ctl_table *table, int write, struct file *filp, return 0; } +/** + * proc_dostring - read a string sysctl + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @filp: the file structure + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * Reads/writes a string from/to the user buffer. If the kernel + * buffer provided is not large enough to hold the string, the + * string is truncated. The copied string is %NULL-terminated. + * If the string is being read by the user process, it is copied + * and a newline '\n' is added. It is truncated if the buffer is + * not large enough. + * + * Returns 0 on success. + */ +int proc_dostring(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return _proc_do_string(table->data, table->maxlen, write, filp, + buffer, lenp, ppos); +} + /* * Special case of dostring for the UTS structure. This has locks * to observe. Should this be in kernel/sys.c ???? */ -static int proc_doutsstring(ctl_table *table, int write, struct file *filp, +#ifndef CONFIG_UTS_NS +static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { int r; @@ -1712,6 +1767,48 @@ static int proc_doutsstring(ctl_table *table, int write, struct file *filp, } return r; } +#else /* !CONFIG_UTS_NS */ +static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int r; + struct uts_namespace* uts_ns = current->nsproxy->uts_ns; + char* which; + + switch (table->ctl_name) { + case KERN_OSTYPE: + which = uts_ns->name.sysname; + break; + case KERN_NODENAME: + which = uts_ns->name.nodename; + break; + case KERN_OSRELEASE: + which = uts_ns->name.release; + break; + case KERN_VERSION: + which = uts_ns->name.version; + break; + case KERN_DOMAINNAME: + which = uts_ns->name.domainname; + break; + default: + r = -EINVAL; + goto out; + } + + if (!write) { + down_read(&uts_sem); + r=_proc_do_string(which,table->maxlen,0,filp,buffer,lenp, ppos); + up_read(&uts_sem); + } else { + down_write(&uts_sem); + r=_proc_do_string(which,table->maxlen,1,filp,buffer,lenp, ppos); + up_write(&uts_sem); + } + out: + return r; +} +#endif /* !CONFIG_UTS_NS */ static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp, int *valp, @@ -1732,8 +1829,9 @@ static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp, return 0; } -static int do_proc_dointvec(ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos, +static int __do_proc_dointvec(void *tbl_data, ctl_table *table, + int write, struct file *filp, void __user *buffer, + size_t *lenp, loff_t *ppos, int (*conv)(int *negp, unsigned long *lvalp, int *valp, int write, void *data), void *data) @@ -1746,13 +1844,13 @@ static int do_proc_dointvec(ctl_table *table, int write, struct file *filp, char buf[TMPBUFLEN], *p; char __user *s = buffer; - if (!table->data || !table->maxlen || !*lenp || + if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) { *lenp = 0; return 0; } - i = (int *) table->data; + i = (int *) tbl_data; vleft = table->maxlen / sizeof(*i); left = *lenp; @@ -1841,6 +1939,16 @@ static int do_proc_dointvec(ctl_table *table, int write, struct file *filp, #undef TMPBUFLEN } +static int do_proc_dointvec(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos, + int (*conv)(int *negp, unsigned long *lvalp, int *valp, + int write, void *data), + void *data) +{ + return __do_proc_dointvec(table->data, table, write, filp, + buffer, lenp, ppos, conv, data); +} + /** * proc_dointvec - read a vector of integers * @table: the sysctl table @@ -1974,7 +2082,7 @@ int proc_dointvec_minmax(ctl_table *table, int write, struct file *filp, do_proc_dointvec_minmax_conv, ¶m); } -static int do_proc_doulongvec_minmax(ctl_table *table, int write, +static int __do_proc_doulongvec_minmax(void *data, ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos, @@ -1988,13 +2096,13 @@ static int do_proc_doulongvec_minmax(ctl_table *table, int write, char buf[TMPBUFLEN], *p; char __user *s = buffer; - if (!table->data || !table->maxlen || !*lenp || + if (!data || !table->maxlen || !*lenp || (*ppos && !write)) { *lenp = 0; return 0; } - i = (unsigned long *) table->data; + i = (unsigned long *) data; min = (unsigned long *) table->extra1; max = (unsigned long *) table->extra2; vleft = table->maxlen / sizeof(unsigned long); @@ -2079,6 +2187,17 @@ static int do_proc_doulongvec_minmax(ctl_table *table, int write, #undef TMPBUFLEN } +static int do_proc_doulongvec_minmax(ctl_table *table, int write, + struct file *filp, + void __user *buffer, + size_t *lenp, loff_t *ppos, + unsigned long convmul, + unsigned long convdiv) +{ + return __do_proc_doulongvec_minmax(table->data, table, write, + filp, buffer, lenp, ppos, convmul, convdiv); +} + /** * proc_doulongvec_minmax - read a vector of long integers with min/max values * @table: the sysctl table @@ -2267,6 +2386,71 @@ int proc_dointvec_ms_jiffies(ctl_table *table, int write, struct file *filp, do_proc_dointvec_ms_jiffies_conv, NULL); } +#ifdef CONFIG_SYSVIPC +static int proc_do_ipc_string(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + void *data; + struct ipc_namespace *ns; + + ns = current->nsproxy->ipc_ns; + + switch (table->ctl_name) { + case KERN_SHMMAX: + data = &ns->shm_ctlmax; + goto proc_minmax; + case KERN_SHMALL: + data = &ns->shm_ctlall; + goto proc_minmax; + case KERN_SHMMNI: + data = &ns->shm_ctlmni; + break; + case KERN_MSGMAX: + data = &ns->msg_ctlmax; + break; + case KERN_MSGMNI: + data = &ns->msg_ctlmni; + break; + case KERN_MSGMNB: + data = &ns->msg_ctlmnb; + break; + case KERN_SEM: + data = &ns->sem_ctls; + break; + default: + return -EINVAL; + } + + return __do_proc_dointvec(data, table, write, filp, buffer, + lenp, ppos, NULL, NULL); +proc_minmax: + return __do_proc_doulongvec_minmax(data, table, write, filp, buffer, + lenp, ppos, 1l, 1l); +} +#endif + +static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct pid *new_pid; + pid_t tmp; + int r; + + tmp = pid_nr(cad_pid); + + r = __do_proc_dointvec(&tmp, table, write, filp, buffer, + lenp, ppos, NULL, NULL); + if (r || !write) + return r; + + new_pid = find_get_pid(tmp); + if (!new_pid) + return -ESRCH; + + put_pid(xchg(&cad_pid, new_pid)); + return 0; +} + #else /* CONFIG_PROC_FS */ int proc_dostring(ctl_table *table, int write, struct file *filp, @@ -2275,12 +2459,20 @@ int proc_dostring(ctl_table *table, int write, struct file *filp, return -ENOSYS; } -static int proc_doutsstring(ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos) +static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } +#ifdef CONFIG_SYSVIPC +static int proc_do_ipc_string(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} +#endif + int proc_dointvec(ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { diff --git a/kernel/utsname.c b/kernel/utsname.c new file mode 100644 index 00000000000..c859164a699 --- /dev/null +++ b/kernel/utsname.c @@ -0,0 +1,95 @@ +/* + * Copyright (C) 2004 IBM Corporation + * + * Author: Serge Hallyn <serue@us.ibm.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation, version 2 of the + * License. + */ + +#include <linux/module.h> +#include <linux/uts.h> +#include <linux/utsname.h> +#include <linux/version.h> + +/* + * Clone a new ns copying an original utsname, setting refcount to 1 + * @old_ns: namespace to clone + * Return NULL on error (failure to kmalloc), new ns otherwise + */ +static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns) +{ + struct uts_namespace *ns; + + ns = kmalloc(sizeof(struct uts_namespace), GFP_KERNEL); + if (ns) { + memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); + kref_init(&ns->kref); + } + return ns; +} + +/* + * unshare the current process' utsname namespace. + * called only in sys_unshare() + */ +int unshare_utsname(unsigned long unshare_flags, struct uts_namespace **new_uts) +{ + if (unshare_flags & CLONE_NEWUTS) { + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + *new_uts = clone_uts_ns(current->nsproxy->uts_ns); + if (!*new_uts) + return -ENOMEM; + } + + return 0; +} + +/* + * Copy task tsk's utsname namespace, or clone it if flags + * specifies CLONE_NEWUTS. In latter case, changes to the + * utsname of this process won't be seen by parent, and vice + * versa. + */ +int copy_utsname(int flags, struct task_struct *tsk) +{ + struct uts_namespace *old_ns = tsk->nsproxy->uts_ns; + struct uts_namespace *new_ns; + int err = 0; + + if (!old_ns) + return 0; + + get_uts_ns(old_ns); + + if (!(flags & CLONE_NEWUTS)) + return 0; + + if (!capable(CAP_SYS_ADMIN)) { + err = -EPERM; + goto out; + } + + new_ns = clone_uts_ns(old_ns); + if (!new_ns) { + err = -ENOMEM; + goto out; + } + tsk->nsproxy->uts_ns = new_ns; + +out: + put_uts_ns(old_ns); + return err; +} + +void free_uts_ns(struct kref *kref) +{ + struct uts_namespace *ns; + + ns = container_of(kref, struct uts_namespace, kref); + kfree(ns); +} diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index f9ae75cc014..756a908c441 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -384,3 +384,17 @@ config RCU_TORTURE_TEST at boot time (you probably don't). Say M if you want the RCU torture tests to build as a module. Say N if you are unsure. + +config LKDTM + tristate "Linux Kernel Dump Test Tool Module" + depends on KPROBES + default n + help + This module enables testing of the different dumping mechanisms by + inducing system failures at predefined crash points. + If you don't need it: say N + Choose M here to compile this code as a module. The module will be + called lkdtm. + + Documentation on how to use the module can be found in + drivers/misc/lkdtm.c diff --git a/lib/Makefile b/lib/Makefile index ddf3e676e1f..b0361756e22 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -2,7 +2,7 @@ # Makefile for some libs needed in the kernel. # -lib-y := errno.o ctype.o string.o vsprintf.o cmdline.o \ +lib-y := ctype.o string.o vsprintf.o cmdline.o \ bust_spinlocks.o rbtree.o radix-tree.o dump_stack.o \ idr.o div64.o int_sqrt.o bitmap.o extable.o prio_tree.o \ sha1.o diff --git a/lib/cpumask.c b/lib/cpumask.c index 3a67dc5ada7..7a2a73f88d5 100644 --- a/lib/cpumask.c +++ b/lib/cpumask.c @@ -43,3 +43,19 @@ int __any_online_cpu(const cpumask_t *mask) return cpu; } EXPORT_SYMBOL(__any_online_cpu); + +#if MAX_NUMNODES > 1 +/* + * Find the highest possible node id. + */ +int highest_possible_node_id(void) +{ + unsigned int node; + unsigned int highest = 0; + + for_each_node_mask(node, node_possible_map) + highest = node; + return highest; +} +EXPORT_SYMBOL(highest_possible_node_id); +#endif diff --git a/lib/errno.c b/lib/errno.c deleted file mode 100644 index 41cb9d76c05..00000000000 --- a/lib/errno.c +++ /dev/null @@ -1,7 +0,0 @@ -/* - * linux/lib/errno.c - * - * Copyright (C) 1991, 1992 Linus Torvalds - */ - -int errno; diff --git a/lib/genalloc.c b/lib/genalloc.c index 71338b48e88..75ae68ce03e 100644 --- a/lib/genalloc.c +++ b/lib/genalloc.c @@ -14,11 +14,13 @@ #include <linux/genalloc.h> -/* - * Create a new special memory pool. - * +/** + * gen_pool_create - create a new special memory pool * @min_alloc_order: log base 2 of number of bytes each bitmap bit represents * @nid: node id of the node the pool structure should be allocated on, or -1 + * + * Create a new special memory pool that can be used to manage special purpose + * memory not managed by the regular kmalloc/kfree interface. */ struct gen_pool *gen_pool_create(int min_alloc_order, int nid) { @@ -34,15 +36,15 @@ struct gen_pool *gen_pool_create(int min_alloc_order, int nid) } EXPORT_SYMBOL(gen_pool_create); - -/* - * Add a new chunk of memory to the specified pool. - * +/** + * gen_pool_add - add a new chunk of special memory to the pool * @pool: pool to add new memory chunk to * @addr: starting address of memory chunk to add to pool * @size: size in bytes of the memory chunk to add to pool * @nid: node id of the node the chunk structure and bitmap should be * allocated on, or -1 + * + * Add a new chunk of special memory to the specified pool. */ int gen_pool_add(struct gen_pool *pool, unsigned long addr, size_t size, int nid) @@ -69,13 +71,44 @@ int gen_pool_add(struct gen_pool *pool, unsigned long addr, size_t size, } EXPORT_SYMBOL(gen_pool_add); - -/* - * Allocate the requested number of bytes from the specified pool. - * Uses a first-fit algorithm. +/** + * gen_pool_destroy - destroy a special memory pool + * @pool: pool to destroy * + * Destroy the specified special memory pool. Verifies that there are no + * outstanding allocations. + */ +void gen_pool_destroy(struct gen_pool *pool) +{ + struct list_head *_chunk, *_next_chunk; + struct gen_pool_chunk *chunk; + int order = pool->min_alloc_order; + int bit, end_bit; + + + write_lock(&pool->lock); + list_for_each_safe(_chunk, _next_chunk, &pool->chunks) { + chunk = list_entry(_chunk, struct gen_pool_chunk, next_chunk); + list_del(&chunk->next_chunk); + + end_bit = (chunk->end_addr - chunk->start_addr) >> order; + bit = find_next_bit(chunk->bits, end_bit, 0); + BUG_ON(bit < end_bit); + + kfree(chunk); + } + kfree(pool); + return; +} +EXPORT_SYMBOL(gen_pool_destroy); + +/** + * gen_pool_alloc - allocate special memory from the pool * @pool: pool to allocate from * @size: number of bytes to allocate from the pool + * + * Allocate the requested number of bytes from the specified pool. + * Uses a first-fit algorithm. */ unsigned long gen_pool_alloc(struct gen_pool *pool, size_t size) { @@ -127,13 +160,13 @@ unsigned long gen_pool_alloc(struct gen_pool *pool, size_t size) } EXPORT_SYMBOL(gen_pool_alloc); - -/* - * Free the specified memory back to the specified pool. - * +/** + * gen_pool_free - free allocated special memory back to the pool * @pool: pool to free to * @addr: starting address of memory to free back to pool * @size: size in bytes of memory to free + * + * Free previously allocated special memory back to the specified pool. */ void gen_pool_free(struct gen_pool *pool, unsigned long addr, size_t size) { diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c index 26f322737db..1958ad1b854 100644 --- a/net/bluetooth/rfcomm/tty.c +++ b/net/bluetooth/rfcomm/tty.c @@ -1011,7 +1011,7 @@ static int rfcomm_tty_tiocmset(struct tty_struct *tty, struct file *filp, unsign /* ---- TTY structure ---- */ -static struct tty_operations rfcomm_ops = { +static const struct tty_operations rfcomm_ops = { .open = rfcomm_tty_open, .close = rfcomm_tty_close, .write = rfcomm_tty_write, diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index 1fbb38415b1..f8ce8475915 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -366,7 +366,7 @@ static int __init ic_defaults(void) */ if (!ic_host_name_set) - sprintf(system_utsname.nodename, "%u.%u.%u.%u", NIPQUAD(ic_myaddr)); + sprintf(init_utsname()->nodename, "%u.%u.%u.%u", NIPQUAD(ic_myaddr)); if (root_server_addr == INADDR_NONE) root_server_addr = ic_servaddr; @@ -805,7 +805,7 @@ static void __init ic_do_bootp_ext(u8 *ext) } break; case 12: /* Host name */ - ic_bootp_string(system_utsname.nodename, ext+1, *ext, __NEW_UTS_LEN); + ic_bootp_string(utsname()->nodename, ext+1, *ext, __NEW_UTS_LEN); ic_host_name_set = 1; break; case 15: /* Domain name (DNS) */ @@ -816,7 +816,7 @@ static void __init ic_do_bootp_ext(u8 *ext) ic_bootp_string(root_server_path, ext+1, *ext, sizeof(root_server_path)); break; case 40: /* NIS Domain name (_not_ DNS) */ - ic_bootp_string(system_utsname.domainname, ext+1, *ext, __NEW_UTS_LEN); + ic_bootp_string(utsname()->domainname, ext+1, *ext, __NEW_UTS_LEN); break; } } @@ -1368,7 +1368,7 @@ static int __init ip_auto_config(void) printk(", mask=%u.%u.%u.%u", NIPQUAD(ic_netmask)); printk(", gw=%u.%u.%u.%u", NIPQUAD(ic_gateway)); printk(",\n host=%s, domain=%s, nis-domain=%s", - system_utsname.nodename, ic_domain, system_utsname.domainname); + utsname()->nodename, ic_domain, utsname()->domainname); printk(",\n bootserver=%u.%u.%u.%u", NIPQUAD(ic_servaddr)); printk(", rootserver=%u.%u.%u.%u", NIPQUAD(root_server_addr)); printk(", rootpath=%s", root_server_path); @@ -1478,11 +1478,11 @@ static int __init ip_auto_config_setup(char *addrs) case 4: if ((dp = strchr(ip, '.'))) { *dp++ = '\0'; - strlcpy(system_utsname.domainname, dp, - sizeof(system_utsname.domainname)); + strlcpy(utsname()->domainname, dp, + sizeof(utsname()->domainname)); } - strlcpy(system_utsname.nodename, ip, - sizeof(system_utsname.nodename)); + strlcpy(utsname()->nodename, ip, + sizeof(utsname()->nodename)); ic_host_name_set = 1; break; case 5: diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c index dab37d2f65f..4be336f1788 100644 --- a/net/ipv4/tcp_probe.c +++ b/net/ipv4/tcp_probe.c @@ -99,8 +99,10 @@ static int jtcp_sendmsg(struct kiocb *iocb, struct sock *sk, } static struct jprobe tcp_send_probe = { - .kp = { .addr = (kprobe_opcode_t *) &tcp_sendmsg, }, - .entry = (kprobe_opcode_t *) &jtcp_sendmsg, + .kp = { + .symbol_name = "tcp_sendmsg", + }, + .entry = JPROBE_ENTRY(jtcp_sendmsg), }; diff --git a/net/irda/ircomm/ircomm_tty.c b/net/irda/ircomm/ircomm_tty.c index 3bcdb467efc..d50a02030ad 100644 --- a/net/irda/ircomm/ircomm_tty.c +++ b/net/irda/ircomm/ircomm_tty.c @@ -79,7 +79,7 @@ static struct tty_driver *driver; hashbin_t *ircomm_tty = NULL; -static struct tty_operations ops = { +static const struct tty_operations ops = { .open = ircomm_tty_open, .close = ircomm_tty_close, .write = ircomm_tty_write, diff --git a/net/socket.c b/net/socket.c index 01918f7a301..6c9b9b326d7 100644 --- a/net/socket.c +++ b/net/socket.c @@ -825,7 +825,7 @@ static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg) break; case FIOGETOWN: case SIOCGPGRP: - err = put_user(sock->file->f_owner.pid, + err = put_user(f_getown(sock->file), (int __user *)argp); break; case SIOCGIFBR: diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 124ff0ceb55..78696f2dc7d 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -161,10 +161,10 @@ static struct rpc_clnt * rpc_new_client(struct rpc_xprt *xprt, char *servname, s } /* save the nodename */ - clnt->cl_nodelen = strlen(system_utsname.nodename); + clnt->cl_nodelen = strlen(utsname()->nodename); if (clnt->cl_nodelen > UNX_MAXNODENAME) clnt->cl_nodelen = UNX_MAXNODENAME; - memcpy(clnt->cl_nodename, system_utsname.nodename, clnt->cl_nodelen); + memcpy(clnt->cl_nodename, utsname()->nodename, clnt->cl_nodelen); return clnt; out_no_auth: diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c index 26c0531d7e2..192dff5dabc 100644 --- a/net/sunrpc/sunrpc_syms.c +++ b/net/sunrpc/sunrpc_syms.c @@ -70,6 +70,8 @@ EXPORT_SYMBOL(put_rpccred); /* RPC server stuff */ EXPORT_SYMBOL(svc_create); EXPORT_SYMBOL(svc_create_thread); +EXPORT_SYMBOL(svc_create_pooled); +EXPORT_SYMBOL(svc_set_num_threads); EXPORT_SYMBOL(svc_exit_thread); EXPORT_SYMBOL(svc_destroy); EXPORT_SYMBOL(svc_drop); diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 44b8d9d4c18..a99e67b164c 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -4,6 +4,10 @@ * High-level RPC service routines * * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de> + * + * Multiple threads pools and NUMAisation + * Copyright (c) 2006 Silicon Graphics, Inc. + * by Greg Banks <gnb@melbourne.sgi.com> */ #include <linux/linkage.h> @@ -12,6 +16,8 @@ #include <linux/net.h> #include <linux/in.h> #include <linux/mm.h> +#include <linux/interrupt.h> +#include <linux/module.h> #include <linux/sunrpc/types.h> #include <linux/sunrpc/xdr.h> @@ -23,14 +29,252 @@ #define RPC_PARANOIA 1 /* + * Mode for mapping cpus to pools. + */ +enum { + SVC_POOL_NONE = -1, /* uninitialised, choose one of the others */ + SVC_POOL_GLOBAL, /* no mapping, just a single global pool + * (legacy & UP mode) */ + SVC_POOL_PERCPU, /* one pool per cpu */ + SVC_POOL_PERNODE /* one pool per numa node */ +}; + +/* + * Structure for mapping cpus to pools and vice versa. + * Setup once during sunrpc initialisation. + */ +static struct svc_pool_map { + int mode; /* Note: int not enum to avoid + * warnings about "enumeration value + * not handled in switch" */ + unsigned int npools; + unsigned int *pool_to; /* maps pool id to cpu or node */ + unsigned int *to_pool; /* maps cpu or node to pool id */ +} svc_pool_map = { + .mode = SVC_POOL_NONE +}; + + +/* + * Detect best pool mapping mode heuristically, + * according to the machine's topology. + */ +static int +svc_pool_map_choose_mode(void) +{ + unsigned int node; + + if (num_online_nodes() > 1) { + /* + * Actually have multiple NUMA nodes, + * so split pools on NUMA node boundaries + */ + return SVC_POOL_PERNODE; + } + + node = any_online_node(node_online_map); + if (nr_cpus_node(node) > 2) { + /* + * Non-trivial SMP, or CONFIG_NUMA on + * non-NUMA hardware, e.g. with a generic + * x86_64 kernel on Xeons. In this case we + * want to divide the pools on cpu boundaries. + */ + return SVC_POOL_PERCPU; + } + + /* default: one global pool */ + return SVC_POOL_GLOBAL; +} + +/* + * Allocate the to_pool[] and pool_to[] arrays. + * Returns 0 on success or an errno. + */ +static int +svc_pool_map_alloc_arrays(struct svc_pool_map *m, unsigned int maxpools) +{ + m->to_pool = kcalloc(maxpools, sizeof(unsigned int), GFP_KERNEL); + if (!m->to_pool) + goto fail; + m->pool_to = kcalloc(maxpools, sizeof(unsigned int), GFP_KERNEL); + if (!m->pool_to) + goto fail_free; + + return 0; + +fail_free: + kfree(m->to_pool); +fail: + return -ENOMEM; +} + +/* + * Initialise the pool map for SVC_POOL_PERCPU mode. + * Returns number of pools or <0 on error. + */ +static int +svc_pool_map_init_percpu(struct svc_pool_map *m) +{ + unsigned int maxpools = highest_possible_processor_id()+1; + unsigned int pidx = 0; + unsigned int cpu; + int err; + + err = svc_pool_map_alloc_arrays(m, maxpools); + if (err) + return err; + + for_each_online_cpu(cpu) { + BUG_ON(pidx > maxpools); + m->to_pool[cpu] = pidx; + m->pool_to[pidx] = cpu; + pidx++; + } + /* cpus brought online later all get mapped to pool0, sorry */ + + return pidx; +}; + + +/* + * Initialise the pool map for SVC_POOL_PERNODE mode. + * Returns number of pools or <0 on error. + */ +static int +svc_pool_map_init_pernode(struct svc_pool_map *m) +{ + unsigned int maxpools = highest_possible_node_id()+1; + unsigned int pidx = 0; + unsigned int node; + int err; + + err = svc_pool_map_alloc_arrays(m, maxpools); + if (err) + return err; + + for_each_node_with_cpus(node) { + /* some architectures (e.g. SN2) have cpuless nodes */ + BUG_ON(pidx > maxpools); + m->to_pool[node] = pidx; + m->pool_to[pidx] = node; + pidx++; + } + /* nodes brought online later all get mapped to pool0, sorry */ + + return pidx; +} + + +/* + * Build the global map of cpus to pools and vice versa. + */ +static unsigned int +svc_pool_map_init(void) +{ + struct svc_pool_map *m = &svc_pool_map; + int npools = -1; + + if (m->mode != SVC_POOL_NONE) + return m->npools; + + m->mode = svc_pool_map_choose_mode(); + + switch (m->mode) { + case SVC_POOL_PERCPU: + npools = svc_pool_map_init_percpu(m); + break; + case SVC_POOL_PERNODE: + npools = svc_pool_map_init_pernode(m); + break; + } + + if (npools < 0) { + /* default, or memory allocation failure */ + npools = 1; + m->mode = SVC_POOL_GLOBAL; + } + m->npools = npools; + + return m->npools; +} + +/* + * Set the current thread's cpus_allowed mask so that it + * will only run on cpus in the given pool. + * + * Returns 1 and fills in oldmask iff a cpumask was applied. + */ +static inline int +svc_pool_map_set_cpumask(unsigned int pidx, cpumask_t *oldmask) +{ + struct svc_pool_map *m = &svc_pool_map; + unsigned int node; /* or cpu */ + + /* + * The caller checks for sv_nrpools > 1, which + * implies that we've been initialized and the + * map mode is not NONE. + */ + BUG_ON(m->mode == SVC_POOL_NONE); + + switch (m->mode) + { + default: + return 0; + case SVC_POOL_PERCPU: + node = m->pool_to[pidx]; + *oldmask = current->cpus_allowed; + set_cpus_allowed(current, cpumask_of_cpu(node)); + return 1; + case SVC_POOL_PERNODE: + node = m->pool_to[pidx]; + *oldmask = current->cpus_allowed; + set_cpus_allowed(current, node_to_cpumask(node)); + return 1; + } +} + +/* + * Use the mapping mode to choose a pool for a given CPU. + * Used when enqueueing an incoming RPC. Always returns + * a non-NULL pool pointer. + */ +struct svc_pool * +svc_pool_for_cpu(struct svc_serv *serv, int cpu) +{ + struct svc_pool_map *m = &svc_pool_map; + unsigned int pidx = 0; + + /* + * SVC_POOL_NONE happens in a pure client when + * lockd is brought up, so silently treat it the + * same as SVC_POOL_GLOBAL. + */ + + switch (m->mode) { + case SVC_POOL_PERCPU: + pidx = m->to_pool[cpu]; + break; + case SVC_POOL_PERNODE: + pidx = m->to_pool[cpu_to_node(cpu)]; + break; + } + return &serv->sv_pools[pidx % serv->sv_nrpools]; +} + + +/* * Create an RPC service */ -struct svc_serv * -svc_create(struct svc_program *prog, unsigned int bufsize) +static struct svc_serv * +__svc_create(struct svc_program *prog, unsigned int bufsize, int npools, + void (*shutdown)(struct svc_serv *serv)) { struct svc_serv *serv; int vers; unsigned int xdrsize; + unsigned int i; if (!(serv = kzalloc(sizeof(*serv), GFP_KERNEL))) return NULL; @@ -39,6 +283,7 @@ svc_create(struct svc_program *prog, unsigned int bufsize) serv->sv_nrthreads = 1; serv->sv_stats = prog->pg_stats; serv->sv_bufsz = bufsize? bufsize : 4096; + serv->sv_shutdown = shutdown; xdrsize = 0; while (prog) { prog->pg_lovers = prog->pg_nvers-1; @@ -53,20 +298,68 @@ svc_create(struct svc_program *prog, unsigned int bufsize) prog = prog->pg_next; } serv->sv_xdrsize = xdrsize; - INIT_LIST_HEAD(&serv->sv_threads); - INIT_LIST_HEAD(&serv->sv_sockets); INIT_LIST_HEAD(&serv->sv_tempsocks); INIT_LIST_HEAD(&serv->sv_permsocks); + init_timer(&serv->sv_temptimer); spin_lock_init(&serv->sv_lock); + serv->sv_nrpools = npools; + serv->sv_pools = + kcalloc(sizeof(struct svc_pool), serv->sv_nrpools, + GFP_KERNEL); + if (!serv->sv_pools) { + kfree(serv); + return NULL; + } + + for (i = 0; i < serv->sv_nrpools; i++) { + struct svc_pool *pool = &serv->sv_pools[i]; + + dprintk("initialising pool %u for %s\n", + i, serv->sv_name); + + pool->sp_id = i; + INIT_LIST_HEAD(&pool->sp_threads); + INIT_LIST_HEAD(&pool->sp_sockets); + INIT_LIST_HEAD(&pool->sp_all_threads); + spin_lock_init(&pool->sp_lock); + } + + /* Remove any stale portmap registrations */ svc_register(serv, 0, 0); return serv; } +struct svc_serv * +svc_create(struct svc_program *prog, unsigned int bufsize, + void (*shutdown)(struct svc_serv *serv)) +{ + return __svc_create(prog, bufsize, /*npools*/1, shutdown); +} + +struct svc_serv * +svc_create_pooled(struct svc_program *prog, unsigned int bufsize, + void (*shutdown)(struct svc_serv *serv), + svc_thread_fn func, int sig, struct module *mod) +{ + struct svc_serv *serv; + unsigned int npools = svc_pool_map_init(); + + serv = __svc_create(prog, bufsize, npools, shutdown); + + if (serv != NULL) { + serv->sv_function = func; + serv->sv_kill_signal = sig; + serv->sv_module = mod; + } + + return serv; +} + /* - * Destroy an RPC service + * Destroy an RPC service. Should be called with the BKL held */ void svc_destroy(struct svc_serv *serv) @@ -85,12 +378,17 @@ svc_destroy(struct svc_serv *serv) } else printk("svc_destroy: no threads for serv=%p!\n", serv); + del_timer_sync(&serv->sv_temptimer); + while (!list_empty(&serv->sv_tempsocks)) { svsk = list_entry(serv->sv_tempsocks.next, struct svc_sock, sk_list); svc_delete_socket(svsk); } + if (serv->sv_shutdown) + serv->sv_shutdown(serv); + while (!list_empty(&serv->sv_permsocks)) { svsk = list_entry(serv->sv_permsocks.next, struct svc_sock, @@ -102,6 +400,7 @@ svc_destroy(struct svc_serv *serv) /* Unregister service with the portmapper */ svc_register(serv, 0, 0); + kfree(serv->sv_pools); kfree(serv); } @@ -150,13 +449,18 @@ svc_release_buffer(struct svc_rqst *rqstp) } /* - * Create a server thread + * Create a thread in the given pool. Caller must hold BKL. + * On a NUMA or SMP machine, with a multi-pool serv, the thread + * will be restricted to run on the cpus belonging to the pool. */ -int -svc_create_thread(svc_thread_fn func, struct svc_serv *serv) +static int +__svc_create_thread(svc_thread_fn func, struct svc_serv *serv, + struct svc_pool *pool) { struct svc_rqst *rqstp; int error = -ENOMEM; + int have_oldmask = 0; + cpumask_t oldmask; rqstp = kzalloc(sizeof(*rqstp), GFP_KERNEL); if (!rqstp) @@ -170,8 +474,21 @@ svc_create_thread(svc_thread_fn func, struct svc_serv *serv) goto out_thread; serv->sv_nrthreads++; + spin_lock_bh(&pool->sp_lock); + pool->sp_nrthreads++; + list_add(&rqstp->rq_all, &pool->sp_all_threads); + spin_unlock_bh(&pool->sp_lock); rqstp->rq_server = serv; + rqstp->rq_pool = pool; + + if (serv->sv_nrpools > 1) + have_oldmask = svc_pool_map_set_cpumask(pool->sp_id, &oldmask); + error = kernel_thread((int (*)(void *)) func, rqstp, 0); + + if (have_oldmask) + set_cpus_allowed(current, oldmask); + if (error < 0) goto out_thread; svc_sock_update_bufs(serv); @@ -185,17 +502,136 @@ out_thread: } /* - * Destroy an RPC server thread + * Create a thread in the default pool. Caller must hold BKL. + */ +int +svc_create_thread(svc_thread_fn func, struct svc_serv *serv) +{ + return __svc_create_thread(func, serv, &serv->sv_pools[0]); +} + +/* + * Choose a pool in which to create a new thread, for svc_set_num_threads + */ +static inline struct svc_pool * +choose_pool(struct svc_serv *serv, struct svc_pool *pool, unsigned int *state) +{ + if (pool != NULL) + return pool; + + return &serv->sv_pools[(*state)++ % serv->sv_nrpools]; +} + +/* + * Choose a thread to kill, for svc_set_num_threads + */ +static inline struct task_struct * +choose_victim(struct svc_serv *serv, struct svc_pool *pool, unsigned int *state) +{ + unsigned int i; + struct task_struct *task = NULL; + + if (pool != NULL) { + spin_lock_bh(&pool->sp_lock); + } else { + /* choose a pool in round-robin fashion */ + for (i = 0; i < serv->sv_nrpools; i++) { + pool = &serv->sv_pools[--(*state) % serv->sv_nrpools]; + spin_lock_bh(&pool->sp_lock); + if (!list_empty(&pool->sp_all_threads)) + goto found_pool; + spin_unlock_bh(&pool->sp_lock); + } + return NULL; + } + +found_pool: + if (!list_empty(&pool->sp_all_threads)) { + struct svc_rqst *rqstp; + + /* + * Remove from the pool->sp_all_threads list + * so we don't try to kill it again. + */ + rqstp = list_entry(pool->sp_all_threads.next, struct svc_rqst, rq_all); + list_del_init(&rqstp->rq_all); + task = rqstp->rq_task; + } + spin_unlock_bh(&pool->sp_lock); + + return task; +} + +/* + * Create or destroy enough new threads to make the number + * of threads the given number. If `pool' is non-NULL, applies + * only to threads in that pool, otherwise round-robins between + * all pools. Must be called with a svc_get() reference and + * the BKL held. + * + * Destroying threads relies on the service threads filling in + * rqstp->rq_task, which only the nfs ones do. Assumes the serv + * has been created using svc_create_pooled(). + * + * Based on code that used to be in nfsd_svc() but tweaked + * to be pool-aware. + */ +int +svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) +{ + struct task_struct *victim; + int error = 0; + unsigned int state = serv->sv_nrthreads-1; + + if (pool == NULL) { + /* The -1 assumes caller has done a svc_get() */ + nrservs -= (serv->sv_nrthreads-1); + } else { + spin_lock_bh(&pool->sp_lock); + nrservs -= pool->sp_nrthreads; + spin_unlock_bh(&pool->sp_lock); + } + + /* create new threads */ + while (nrservs > 0) { + nrservs--; + __module_get(serv->sv_module); + error = __svc_create_thread(serv->sv_function, serv, + choose_pool(serv, pool, &state)); + if (error < 0) { + module_put(serv->sv_module); + break; + } + } + /* destroy old threads */ + while (nrservs < 0 && + (victim = choose_victim(serv, pool, &state)) != NULL) { + send_sig(serv->sv_kill_signal, victim, 1); + nrservs++; + } + + return error; +} + +/* + * Called from a server thread as it's exiting. Caller must hold BKL. */ void svc_exit_thread(struct svc_rqst *rqstp) { struct svc_serv *serv = rqstp->rq_server; + struct svc_pool *pool = rqstp->rq_pool; svc_release_buffer(rqstp); kfree(rqstp->rq_resp); kfree(rqstp->rq_argp); kfree(rqstp->rq_auth_data); + + spin_lock_bh(&pool->sp_lock); + pool->sp_nrthreads--; + list_del(&rqstp->rq_all); + spin_unlock_bh(&pool->sp_lock); + kfree(rqstp); /* Release the server */ @@ -248,13 +684,14 @@ svc_register(struct svc_serv *serv, int proto, unsigned short port) * Process the RPC request. */ int -svc_process(struct svc_serv *serv, struct svc_rqst *rqstp) +svc_process(struct svc_rqst *rqstp) { struct svc_program *progp; struct svc_version *versp = NULL; /* compiler food */ struct svc_procedure *procp = NULL; struct kvec * argv = &rqstp->rq_arg.head[0]; struct kvec * resv = &rqstp->rq_res.head[0]; + struct svc_serv *serv = rqstp->rq_server; kxdrproc_t xdr; __be32 *statp; u32 dir, prog, vers, proc; diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c index 1020d54b01d..40d41a2831d 100644 --- a/net/sunrpc/svcauth_unix.c +++ b/net/sunrpc/svcauth_unix.c @@ -348,12 +348,9 @@ int auth_unix_forget_old(struct auth_domain *dom) struct auth_domain *auth_unix_lookup(struct in_addr addr) { - struct ip_map key, *ipm; + struct ip_map *ipm; struct auth_domain *rv; - strcpy(key.m_class, "nfsd"); - key.m_addr = addr; - ipm = ip_map_lookup("nfsd", addr); if (!ipm) diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 5b0fe1b66a2..cba85d19522 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -31,6 +31,7 @@ #include <linux/slab.h> #include <linux/netdevice.h> #include <linux/skbuff.h> +#include <linux/file.h> #include <net/sock.h> #include <net/checksum.h> #include <net/ip.h> @@ -45,13 +46,16 @@ /* SMP locking strategy: * - * svc_serv->sv_lock protects most stuff for that service. + * svc_pool->sp_lock protects most of the fields of that pool. + * svc_serv->sv_lock protects sv_tempsocks, sv_permsocks, sv_tmpcnt. + * when both need to be taken (rare), svc_serv->sv_lock is first. + * BKL protects svc_serv->sv_nrthread. + * svc_sock->sk_defer_lock protects the svc_sock->sk_deferred list + * svc_sock->sk_flags.SK_BUSY prevents a svc_sock being enqueued multiply. * * Some flags can be set to certain values at any time * providing that certain rules are followed: * - * SK_BUSY can be set to 0 at any time. - * svc_sock_enqueue must be called afterwards * SK_CONN, SK_DATA, can be set or cleared at any time. * after a set, svc_sock_enqueue must be called. * after a clear, the socket must be read/accepted @@ -73,23 +77,30 @@ static struct svc_deferred_req *svc_deferred_dequeue(struct svc_sock *svsk); static int svc_deferred_recv(struct svc_rqst *rqstp); static struct cache_deferred_req *svc_defer(struct cache_req *req); +/* apparently the "standard" is that clients close + * idle connections after 5 minutes, servers after + * 6 minutes + * http://www.connectathon.org/talks96/nfstcp.pdf + */ +static int svc_conn_age_period = 6*60; + /* - * Queue up an idle server thread. Must have serv->sv_lock held. + * Queue up an idle server thread. Must have pool->sp_lock held. * Note: this is really a stack rather than a queue, so that we only - * use as many different threads as we need, and the rest don't polute + * use as many different threads as we need, and the rest don't pollute * the cache. */ static inline void -svc_serv_enqueue(struct svc_serv *serv, struct svc_rqst *rqstp) +svc_thread_enqueue(struct svc_pool *pool, struct svc_rqst *rqstp) { - list_add(&rqstp->rq_list, &serv->sv_threads); + list_add(&rqstp->rq_list, &pool->sp_threads); } /* - * Dequeue an nfsd thread. Must have serv->sv_lock held. + * Dequeue an nfsd thread. Must have pool->sp_lock held. */ static inline void -svc_serv_dequeue(struct svc_serv *serv, struct svc_rqst *rqstp) +svc_thread_dequeue(struct svc_pool *pool, struct svc_rqst *rqstp) { list_del(&rqstp->rq_list); } @@ -140,7 +151,9 @@ static void svc_sock_enqueue(struct svc_sock *svsk) { struct svc_serv *serv = svsk->sk_server; + struct svc_pool *pool; struct svc_rqst *rqstp; + int cpu; if (!(svsk->sk_flags & ( (1<<SK_CONN)|(1<<SK_DATA)|(1<<SK_CLOSE)|(1<<SK_DEFERRED)) )) @@ -148,10 +161,14 @@ svc_sock_enqueue(struct svc_sock *svsk) if (test_bit(SK_DEAD, &svsk->sk_flags)) return; - spin_lock_bh(&serv->sv_lock); + cpu = get_cpu(); + pool = svc_pool_for_cpu(svsk->sk_server, cpu); + put_cpu(); - if (!list_empty(&serv->sv_threads) && - !list_empty(&serv->sv_sockets)) + spin_lock_bh(&pool->sp_lock); + + if (!list_empty(&pool->sp_threads) && + !list_empty(&pool->sp_sockets)) printk(KERN_ERR "svc_sock_enqueue: threads and sockets both waiting??\n"); @@ -161,73 +178,79 @@ svc_sock_enqueue(struct svc_sock *svsk) goto out_unlock; } - if (test_bit(SK_BUSY, &svsk->sk_flags)) { - /* Don't enqueue socket while daemon is receiving */ + /* Mark socket as busy. It will remain in this state until the + * server has processed all pending data and put the socket back + * on the idle list. We update SK_BUSY atomically because + * it also guards against trying to enqueue the svc_sock twice. + */ + if (test_and_set_bit(SK_BUSY, &svsk->sk_flags)) { + /* Don't enqueue socket while already enqueued */ dprintk("svc: socket %p busy, not enqueued\n", svsk->sk_sk); goto out_unlock; } + BUG_ON(svsk->sk_pool != NULL); + svsk->sk_pool = pool; set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags); - if (((svsk->sk_reserved + serv->sv_bufsz)*2 + if (((atomic_read(&svsk->sk_reserved) + serv->sv_bufsz)*2 > svc_sock_wspace(svsk)) && !test_bit(SK_CLOSE, &svsk->sk_flags) && !test_bit(SK_CONN, &svsk->sk_flags)) { /* Don't enqueue while not enough space for reply */ dprintk("svc: socket %p no space, %d*2 > %ld, not enqueued\n", - svsk->sk_sk, svsk->sk_reserved+serv->sv_bufsz, + svsk->sk_sk, atomic_read(&svsk->sk_reserved)+serv->sv_bufsz, svc_sock_wspace(svsk)); + svsk->sk_pool = NULL; + clear_bit(SK_BUSY, &svsk->sk_flags); goto out_unlock; } clear_bit(SOCK_NOSPACE, &svsk->sk_sock->flags); - /* Mark socket as busy. It will remain in this state until the - * server has processed all pending data and put the socket back - * on the idle list. - */ - set_bit(SK_BUSY, &svsk->sk_flags); - if (!list_empty(&serv->sv_threads)) { - rqstp = list_entry(serv->sv_threads.next, + if (!list_empty(&pool->sp_threads)) { + rqstp = list_entry(pool->sp_threads.next, struct svc_rqst, rq_list); dprintk("svc: socket %p served by daemon %p\n", svsk->sk_sk, rqstp); - svc_serv_dequeue(serv, rqstp); + svc_thread_dequeue(pool, rqstp); if (rqstp->rq_sock) printk(KERN_ERR "svc_sock_enqueue: server %p, rq_sock=%p!\n", rqstp, rqstp->rq_sock); rqstp->rq_sock = svsk; - svsk->sk_inuse++; + atomic_inc(&svsk->sk_inuse); rqstp->rq_reserved = serv->sv_bufsz; - svsk->sk_reserved += rqstp->rq_reserved; + atomic_add(rqstp->rq_reserved, &svsk->sk_reserved); + BUG_ON(svsk->sk_pool != pool); wake_up(&rqstp->rq_wait); } else { dprintk("svc: socket %p put into queue\n", svsk->sk_sk); - list_add_tail(&svsk->sk_ready, &serv->sv_sockets); + list_add_tail(&svsk->sk_ready, &pool->sp_sockets); + BUG_ON(svsk->sk_pool != pool); } out_unlock: - spin_unlock_bh(&serv->sv_lock); + spin_unlock_bh(&pool->sp_lock); } /* - * Dequeue the first socket. Must be called with the serv->sv_lock held. + * Dequeue the first socket. Must be called with the pool->sp_lock held. */ static inline struct svc_sock * -svc_sock_dequeue(struct svc_serv *serv) +svc_sock_dequeue(struct svc_pool *pool) { struct svc_sock *svsk; - if (list_empty(&serv->sv_sockets)) + if (list_empty(&pool->sp_sockets)) return NULL; - svsk = list_entry(serv->sv_sockets.next, + svsk = list_entry(pool->sp_sockets.next, struct svc_sock, sk_ready); list_del_init(&svsk->sk_ready); dprintk("svc: socket %p dequeued, inuse=%d\n", - svsk->sk_sk, svsk->sk_inuse); + svsk->sk_sk, atomic_read(&svsk->sk_inuse)); return svsk; } @@ -241,6 +264,7 @@ svc_sock_dequeue(struct svc_serv *serv) static inline void svc_sock_received(struct svc_sock *svsk) { + svsk->sk_pool = NULL; clear_bit(SK_BUSY, &svsk->sk_flags); svc_sock_enqueue(svsk); } @@ -262,10 +286,8 @@ void svc_reserve(struct svc_rqst *rqstp, int space) if (space < rqstp->rq_reserved) { struct svc_sock *svsk = rqstp->rq_sock; - spin_lock_bh(&svsk->sk_server->sv_lock); - svsk->sk_reserved -= (rqstp->rq_reserved - space); + atomic_sub((rqstp->rq_reserved - space), &svsk->sk_reserved); rqstp->rq_reserved = space; - spin_unlock_bh(&svsk->sk_server->sv_lock); svc_sock_enqueue(svsk); } @@ -277,17 +299,11 @@ void svc_reserve(struct svc_rqst *rqstp, int space) static inline void svc_sock_put(struct svc_sock *svsk) { - struct svc_serv *serv = svsk->sk_server; - - spin_lock_bh(&serv->sv_lock); - if (!--(svsk->sk_inuse) && test_bit(SK_DEAD, &svsk->sk_flags)) { - spin_unlock_bh(&serv->sv_lock); + if (atomic_dec_and_test(&svsk->sk_inuse) && test_bit(SK_DEAD, &svsk->sk_flags)) { dprintk("svc: releasing dead socket\n"); sock_release(svsk->sk_sock); kfree(svsk); } - else - spin_unlock_bh(&serv->sv_lock); } static void @@ -321,25 +337,33 @@ svc_sock_release(struct svc_rqst *rqstp) /* * External function to wake up a server waiting for data + * This really only makes sense for services like lockd + * which have exactly one thread anyway. */ void svc_wake_up(struct svc_serv *serv) { struct svc_rqst *rqstp; - - spin_lock_bh(&serv->sv_lock); - if (!list_empty(&serv->sv_threads)) { - rqstp = list_entry(serv->sv_threads.next, - struct svc_rqst, - rq_list); - dprintk("svc: daemon %p woken up.\n", rqstp); - /* - svc_serv_dequeue(serv, rqstp); - rqstp->rq_sock = NULL; - */ - wake_up(&rqstp->rq_wait); + unsigned int i; + struct svc_pool *pool; + + for (i = 0; i < serv->sv_nrpools; i++) { + pool = &serv->sv_pools[i]; + + spin_lock_bh(&pool->sp_lock); + if (!list_empty(&pool->sp_threads)) { + rqstp = list_entry(pool->sp_threads.next, + struct svc_rqst, + rq_list); + dprintk("svc: daemon %p woken up.\n", rqstp); + /* + svc_thread_dequeue(pool, rqstp); + rqstp->rq_sock = NULL; + */ + wake_up(&rqstp->rq_wait); + } + spin_unlock_bh(&pool->sp_lock); } - spin_unlock_bh(&serv->sv_lock); } /* @@ -429,6 +453,51 @@ out: } /* + * Report socket names for nfsdfs + */ +static int one_sock_name(char *buf, struct svc_sock *svsk) +{ + int len; + + switch(svsk->sk_sk->sk_family) { + case AF_INET: + len = sprintf(buf, "ipv4 %s %u.%u.%u.%u %d\n", + svsk->sk_sk->sk_protocol==IPPROTO_UDP? + "udp" : "tcp", + NIPQUAD(inet_sk(svsk->sk_sk)->rcv_saddr), + inet_sk(svsk->sk_sk)->num); + break; + default: + len = sprintf(buf, "*unknown-%d*\n", + svsk->sk_sk->sk_family); + } + return len; +} + +int +svc_sock_names(char *buf, struct svc_serv *serv, char *toclose) +{ + struct svc_sock *svsk, *closesk = NULL; + int len = 0; + + if (!serv) + return 0; + spin_lock(&serv->sv_lock); + list_for_each_entry(svsk, &serv->sv_permsocks, sk_list) { + int onelen = one_sock_name(buf+len, svsk); + if (toclose && strcmp(toclose, buf+len) == 0) + closesk = svsk; + else + len += onelen; + } + spin_unlock(&serv->sv_lock); + if (closesk) + svc_delete_socket(closesk); + return len; +} +EXPORT_SYMBOL(svc_sock_names); + +/* * Check input queue length */ static int @@ -557,7 +626,10 @@ svc_udp_recvfrom(struct svc_rqst *rqstp) /* udp sockets need large rcvbuf as all pending * requests are still in that buffer. sndbuf must * also be large enough that there is enough space - * for one reply per thread. + * for one reply per thread. We count all threads + * rather than threads in a particular pool, which + * provides an upper bound on the number of threads + * which will access the socket. */ svc_sock_setbufsize(svsk->sk_sock, (serv->sv_nrthreads+3) * serv->sv_bufsz, @@ -844,7 +916,7 @@ svc_tcp_accept(struct svc_sock *svsk) struct svc_sock, sk_list); set_bit(SK_CLOSE, &svsk->sk_flags); - svsk->sk_inuse ++; + atomic_inc(&svsk->sk_inuse); } spin_unlock_bh(&serv->sv_lock); @@ -902,6 +974,11 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp) /* sndbuf needs to have room for one request * per thread, otherwise we can stall even when the * network isn't a bottleneck. + * + * We count all threads rather than threads in a + * particular pool, which provides an upper bound + * on the number of threads which will access the socket. + * * rcvbuf just needs to be able to hold a few requests. * Normally they will be removed from the queue * as soon a a complete request arrives. @@ -1117,12 +1194,16 @@ svc_sock_update_bufs(struct svc_serv *serv) } /* - * Receive the next request on any socket. + * Receive the next request on any socket. This code is carefully + * organised not to touch any cachelines in the shared svc_serv + * structure, only cachelines in the local svc_pool. */ int -svc_recv(struct svc_serv *serv, struct svc_rqst *rqstp, long timeout) +svc_recv(struct svc_rqst *rqstp, long timeout) { struct svc_sock *svsk =NULL; + struct svc_serv *serv = rqstp->rq_server; + struct svc_pool *pool = rqstp->rq_pool; int len; int pages; struct xdr_buf *arg; @@ -1172,32 +1253,15 @@ svc_recv(struct svc_serv *serv, struct svc_rqst *rqstp, long timeout) if (signalled()) return -EINTR; - spin_lock_bh(&serv->sv_lock); - if (!list_empty(&serv->sv_tempsocks)) { - svsk = list_entry(serv->sv_tempsocks.next, - struct svc_sock, sk_list); - /* apparently the "standard" is that clients close - * idle connections after 5 minutes, servers after - * 6 minutes - * http://www.connectathon.org/talks96/nfstcp.pdf - */ - if (get_seconds() - svsk->sk_lastrecv < 6*60 - || test_bit(SK_BUSY, &svsk->sk_flags)) - svsk = NULL; - } - if (svsk) { - set_bit(SK_BUSY, &svsk->sk_flags); - set_bit(SK_CLOSE, &svsk->sk_flags); - rqstp->rq_sock = svsk; - svsk->sk_inuse++; - } else if ((svsk = svc_sock_dequeue(serv)) != NULL) { + spin_lock_bh(&pool->sp_lock); + if ((svsk = svc_sock_dequeue(pool)) != NULL) { rqstp->rq_sock = svsk; - svsk->sk_inuse++; + atomic_inc(&svsk->sk_inuse); rqstp->rq_reserved = serv->sv_bufsz; - svsk->sk_reserved += rqstp->rq_reserved; + atomic_add(rqstp->rq_reserved, &svsk->sk_reserved); } else { /* No data pending. Go to sleep */ - svc_serv_enqueue(serv, rqstp); + svc_thread_enqueue(pool, rqstp); /* * We have to be able to interrupt this wait @@ -1205,26 +1269,26 @@ svc_recv(struct svc_serv *serv, struct svc_rqst *rqstp, long timeout) */ set_current_state(TASK_INTERRUPTIBLE); add_wait_queue(&rqstp->rq_wait, &wait); - spin_unlock_bh(&serv->sv_lock); + spin_unlock_bh(&pool->sp_lock); schedule_timeout(timeout); try_to_freeze(); - spin_lock_bh(&serv->sv_lock); + spin_lock_bh(&pool->sp_lock); remove_wait_queue(&rqstp->rq_wait, &wait); if (!(svsk = rqstp->rq_sock)) { - svc_serv_dequeue(serv, rqstp); - spin_unlock_bh(&serv->sv_lock); + svc_thread_dequeue(pool, rqstp); + spin_unlock_bh(&pool->sp_lock); dprintk("svc: server %p, no data yet\n", rqstp); return signalled()? -EINTR : -EAGAIN; } } - spin_unlock_bh(&serv->sv_lock); + spin_unlock_bh(&pool->sp_lock); - dprintk("svc: server %p, socket %p, inuse=%d\n", - rqstp, svsk, svsk->sk_inuse); + dprintk("svc: server %p, pool %u, socket %p, inuse=%d\n", + rqstp, pool->sp_id, svsk, atomic_read(&svsk->sk_inuse)); len = svsk->sk_recvfrom(rqstp); dprintk("svc: got len=%d\n", len); @@ -1235,13 +1299,7 @@ svc_recv(struct svc_serv *serv, struct svc_rqst *rqstp, long timeout) return -EAGAIN; } svsk->sk_lastrecv = get_seconds(); - if (test_bit(SK_TEMP, &svsk->sk_flags)) { - /* push active sockets to end of list */ - spin_lock_bh(&serv->sv_lock); - if (!list_empty(&svsk->sk_list)) - list_move_tail(&svsk->sk_list, &serv->sv_tempsocks); - spin_unlock_bh(&serv->sv_lock); - } + clear_bit(SK_OLD, &svsk->sk_flags); rqstp->rq_secure = ntohs(rqstp->rq_addr.sin_port) < 1024; rqstp->rq_chandle.defer = svc_defer; @@ -1301,6 +1359,58 @@ svc_send(struct svc_rqst *rqstp) } /* + * Timer function to close old temporary sockets, using + * a mark-and-sweep algorithm. + */ +static void +svc_age_temp_sockets(unsigned long closure) +{ + struct svc_serv *serv = (struct svc_serv *)closure; + struct svc_sock *svsk; + struct list_head *le, *next; + LIST_HEAD(to_be_aged); + + dprintk("svc_age_temp_sockets\n"); + + if (!spin_trylock_bh(&serv->sv_lock)) { + /* busy, try again 1 sec later */ + dprintk("svc_age_temp_sockets: busy\n"); + mod_timer(&serv->sv_temptimer, jiffies + HZ); + return; + } + + list_for_each_safe(le, next, &serv->sv_tempsocks) { + svsk = list_entry(le, struct svc_sock, sk_list); + + if (!test_and_set_bit(SK_OLD, &svsk->sk_flags)) + continue; + if (atomic_read(&svsk->sk_inuse) || test_bit(SK_BUSY, &svsk->sk_flags)) + continue; + atomic_inc(&svsk->sk_inuse); + list_move(le, &to_be_aged); + set_bit(SK_CLOSE, &svsk->sk_flags); + set_bit(SK_DETACHED, &svsk->sk_flags); + } + spin_unlock_bh(&serv->sv_lock); + + while (!list_empty(&to_be_aged)) { + le = to_be_aged.next; + /* fiddling the sk_list node is safe 'cos we're SK_DETACHED */ + list_del_init(le); + svsk = list_entry(le, struct svc_sock, sk_list); + + dprintk("queuing svsk %p for closing, %lu seconds old\n", + svsk, get_seconds() - svsk->sk_lastrecv); + + /* a thread will dequeue and close it soon */ + svc_sock_enqueue(svsk); + svc_sock_put(svsk); + } + + mod_timer(&serv->sv_temptimer, jiffies + svc_conn_age_period * HZ); +} + +/* * Initialize socket for RPC use and create svc_sock struct * XXX: May want to setsockopt SO_SNDBUF and SO_RCVBUF. */ @@ -1337,7 +1447,9 @@ svc_setup_socket(struct svc_serv *serv, struct socket *sock, svsk->sk_odata = inet->sk_data_ready; svsk->sk_owspace = inet->sk_write_space; svsk->sk_server = serv; + atomic_set(&svsk->sk_inuse, 0); svsk->sk_lastrecv = get_seconds(); + spin_lock_init(&svsk->sk_defer_lock); INIT_LIST_HEAD(&svsk->sk_deferred); INIT_LIST_HEAD(&svsk->sk_ready); mutex_init(&svsk->sk_mutex); @@ -1353,6 +1465,13 @@ svc_setup_socket(struct svc_serv *serv, struct socket *sock, set_bit(SK_TEMP, &svsk->sk_flags); list_add(&svsk->sk_list, &serv->sv_tempsocks); serv->sv_tmpcnt++; + if (serv->sv_temptimer.function == NULL) { + /* setup timer to age temp sockets */ + setup_timer(&serv->sv_temptimer, svc_age_temp_sockets, + (unsigned long)serv); + mod_timer(&serv->sv_temptimer, + jiffies + svc_conn_age_period * HZ); + } } else { clear_bit(SK_TEMP, &svsk->sk_flags); list_add(&svsk->sk_list, &serv->sv_permsocks); @@ -1367,6 +1486,38 @@ svc_setup_socket(struct svc_serv *serv, struct socket *sock, return svsk; } +int svc_addsock(struct svc_serv *serv, + int fd, + char *name_return, + int *proto) +{ + int err = 0; + struct socket *so = sockfd_lookup(fd, &err); + struct svc_sock *svsk = NULL; + + if (!so) + return err; + if (so->sk->sk_family != AF_INET) + err = -EAFNOSUPPORT; + else if (so->sk->sk_protocol != IPPROTO_TCP && + so->sk->sk_protocol != IPPROTO_UDP) + err = -EPROTONOSUPPORT; + else if (so->state > SS_UNCONNECTED) + err = -EISCONN; + else { + svsk = svc_setup_socket(serv, so, &err, 1); + if (svsk) + err = 0; + } + if (err) { + sockfd_put(so); + return err; + } + if (proto) *proto = so->sk->sk_protocol; + return one_sock_name(name_return, svsk); +} +EXPORT_SYMBOL_GPL(svc_addsock); + /* * Create socket for RPC service. */ @@ -1434,15 +1585,25 @@ svc_delete_socket(struct svc_sock *svsk) spin_lock_bh(&serv->sv_lock); - list_del_init(&svsk->sk_list); - list_del_init(&svsk->sk_ready); + if (!test_and_set_bit(SK_DETACHED, &svsk->sk_flags)) + list_del_init(&svsk->sk_list); + /* + * We used to delete the svc_sock from whichever list + * it's sk_ready node was on, but we don't actually + * need to. This is because the only time we're called + * while still attached to a queue, the queue itself + * is about to be destroyed (in svc_destroy). + */ if (!test_and_set_bit(SK_DEAD, &svsk->sk_flags)) if (test_bit(SK_TEMP, &svsk->sk_flags)) serv->sv_tmpcnt--; - if (!svsk->sk_inuse) { + if (!atomic_read(&svsk->sk_inuse)) { spin_unlock_bh(&serv->sv_lock); - sock_release(svsk->sk_sock); + if (svsk->sk_sock->file) + sockfd_put(svsk->sk_sock); + else + sock_release(svsk->sk_sock); kfree(svsk); } else { spin_unlock_bh(&serv->sv_lock); @@ -1473,7 +1634,6 @@ svc_makesock(struct svc_serv *serv, int protocol, unsigned short port) static void svc_revisit(struct cache_deferred_req *dreq, int too_many) { struct svc_deferred_req *dr = container_of(dreq, struct svc_deferred_req, handle); - struct svc_serv *serv = dreq->owner; struct svc_sock *svsk; if (too_many) { @@ -1484,9 +1644,9 @@ static void svc_revisit(struct cache_deferred_req *dreq, int too_many) dprintk("revisit queued\n"); svsk = dr->svsk; dr->svsk = NULL; - spin_lock_bh(&serv->sv_lock); + spin_lock_bh(&svsk->sk_defer_lock); list_add(&dr->handle.recent, &svsk->sk_deferred); - spin_unlock_bh(&serv->sv_lock); + spin_unlock_bh(&svsk->sk_defer_lock); set_bit(SK_DEFERRED, &svsk->sk_flags); svc_sock_enqueue(svsk); svc_sock_put(svsk); @@ -1518,10 +1678,8 @@ svc_defer(struct cache_req *req) dr->argslen = rqstp->rq_arg.len >> 2; memcpy(dr->args, rqstp->rq_arg.head[0].iov_base-skip, dr->argslen<<2); } - spin_lock_bh(&rqstp->rq_server->sv_lock); - rqstp->rq_sock->sk_inuse++; + atomic_inc(&rqstp->rq_sock->sk_inuse); dr->svsk = rqstp->rq_sock; - spin_unlock_bh(&rqstp->rq_server->sv_lock); dr->handle.revisit = svc_revisit; return &dr->handle; @@ -1548,11 +1706,10 @@ static int svc_deferred_recv(struct svc_rqst *rqstp) static struct svc_deferred_req *svc_deferred_dequeue(struct svc_sock *svsk) { struct svc_deferred_req *dr = NULL; - struct svc_serv *serv = svsk->sk_server; if (!test_bit(SK_DEFERRED, &svsk->sk_flags)) return NULL; - spin_lock_bh(&serv->sv_lock); + spin_lock_bh(&svsk->sk_defer_lock); clear_bit(SK_DEFERRED, &svsk->sk_flags); if (!list_empty(&svsk->sk_deferred)) { dr = list_entry(svsk->sk_deferred.next, @@ -1561,6 +1718,6 @@ static struct svc_deferred_req *svc_deferred_dequeue(struct svc_sock *svsk) list_del_init(&dr->handle.recent); set_bit(SK_DEFERRED, &svsk->sk_flags); } - spin_unlock_bh(&serv->sv_lock); + spin_unlock_bh(&svsk->sk_defer_lock); return dr; } diff --git a/sound/core/info_oss.c b/sound/core/info_oss.c index 3ebc34919c7..a444bfe2cf7 100644 --- a/sound/core/info_oss.c +++ b/sound/core/info_oss.c @@ -96,11 +96,11 @@ static void snd_sndstat_proc_read(struct snd_info_entry *entry, { snd_iprintf(buffer, "Sound Driver:3.8.1a-980706 (ALSA v" CONFIG_SND_VERSION " emulation code)\n"); snd_iprintf(buffer, "Kernel: %s %s %s %s %s\n", - system_utsname.sysname, - system_utsname.nodename, - system_utsname.release, - system_utsname.version, - system_utsname.machine); + init_utsname()->sysname, + init_utsname()->nodename, + init_utsname()->release, + init_utsname()->version, + init_utsname()->machine); snd_iprintf(buffer, "Config options: 0\n"); snd_iprintf(buffer, "\nInstalled drivers: \n"); snd_iprintf(buffer, "Type 10: ALSA emulation\n"); |