From 250cf776f74b5932a1977d0489cae9206e2351dd Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Tue, 28 Oct 2008 11:10:15 +0100 Subject: [S390] pgtables: Fix race in enable_sie vs. page table ops The current enable_sie code sets the mm->context.pgstes bit to tell dup_mm that the new mm should have extended page tables. This bit is also used by the s390 specific page table primitives to decide about the page table layout - which means context.pgstes has two meanings. This can cause any kind of bugs. For example - e.g. shrink_zone can call ptep_clear_flush_young while enable_sie is running. ptep_clear_flush_young will test for context.pgstes. Since enable_sie changed that value of the old struct mm without changing the page table layout ptep_clear_flush_young will do the wrong thing. The solution is to split pgstes into two bits - one for the allocation - one for the current state Signed-off-by: Christian Borntraeger Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/mmu.h | 3 ++- arch/s390/include/asm/mmu_context.h | 19 ++++++++++++++++--- arch/s390/include/asm/pgtable.h | 8 ++++---- 3 files changed, 22 insertions(+), 8 deletions(-) (limited to 'arch/s390/include/asm') diff --git a/arch/s390/include/asm/mmu.h b/arch/s390/include/asm/mmu.h index 5dd5e7b3476..d2b4ff83147 100644 --- a/arch/s390/include/asm/mmu.h +++ b/arch/s390/include/asm/mmu.h @@ -7,7 +7,8 @@ typedef struct { unsigned long asce_bits; unsigned long asce_limit; int noexec; - int pgstes; + int has_pgste; /* The mmu context has extended page tables */ + int alloc_pgste; /* cloned contexts will have extended page tables */ } mm_context_t; #endif diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h index 4c2fbf48c9c..28ec870655a 100644 --- a/arch/s390/include/asm/mmu_context.h +++ b/arch/s390/include/asm/mmu_context.h @@ -20,12 +20,25 @@ static inline int init_new_context(struct task_struct *tsk, #ifdef CONFIG_64BIT mm->context.asce_bits |= _ASCE_TYPE_REGION3; #endif - if (current->mm->context.pgstes) { + if (current->mm->context.alloc_pgste) { + /* + * alloc_pgste indicates, that any NEW context will be created + * with extended page tables. The old context is unchanged. The + * page table allocation and the page table operations will + * look at has_pgste to distinguish normal and extended page + * tables. The only way to create extended page tables is to + * set alloc_pgste and then create a new context (e.g. dup_mm). + * The page table allocation is called after init_new_context + * and if has_pgste is set, it will create extended page + * tables. + */ mm->context.noexec = 0; - mm->context.pgstes = 1; + mm->context.has_pgste = 1; + mm->context.alloc_pgste = 1; } else { mm->context.noexec = s390_noexec; - mm->context.pgstes = 0; + mm->context.has_pgste = 0; + mm->context.alloc_pgste = 0; } mm->context.asce_limit = STACK_TOP_MAX; crst_table_init((unsigned long *) mm->pgd, pgd_entry_type(mm)); diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 1a928f84afd..7fc76133b3e 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -679,7 +679,7 @@ static inline void pmd_clear(pmd_t *pmd) static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - if (mm->context.pgstes) + if (mm->context.has_pgste) ptep_rcp_copy(ptep); pte_val(*ptep) = _PAGE_TYPE_EMPTY; if (mm->context.noexec) @@ -763,7 +763,7 @@ static inline int kvm_s390_test_and_clear_page_dirty(struct mm_struct *mm, struct page *page; unsigned int skey; - if (!mm->context.pgstes) + if (!mm->context.has_pgste) return -EINVAL; rcp_lock(ptep); pgste = (unsigned long *) (ptep + PTRS_PER_PTE); @@ -794,7 +794,7 @@ static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, int young; unsigned long *pgste; - if (!vma->vm_mm->context.pgstes) + if (!vma->vm_mm->context.has_pgste) return 0; physpage = pte_val(*ptep) & PAGE_MASK; pgste = (unsigned long *) (ptep + PTRS_PER_PTE); @@ -844,7 +844,7 @@ static inline void __ptep_ipte(unsigned long address, pte_t *ptep) static inline void ptep_invalidate(struct mm_struct *mm, unsigned long address, pte_t *ptep) { - if (mm->context.pgstes) { + if (mm->context.has_pgste) { rcp_lock(ptep); __ptep_ipte(address, ptep); ptep_rcp_copy(ptep); -- cgit v1.2.3 From 7f5a8ba6b0297ca941f43f8f5cbf0e5c8c4dd916 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 28 Oct 2008 11:10:21 +0100 Subject: [S390] No more 4kb stacks. We got a stack overflow with a small stack configuration on a 32 bit system. It just looks like as 4kb isn't enough and too dangerous. So lets get rid of 4kb stacks on 32 bit. But one thing I completely dislike about the call trace below is that just for debugging or tracing purposes sprintf gets called (cio_start_key): /* process condition code */ sprintf(dbf_txt, "ccode:%d", ccode); CIO_TRACE_EVENT(4, dbf_txt); But maybe its just me who thinks that this could be done better. <4>Kernel stack overflow. <4>Modules linked in: dm_multipath sunrpc bonding qeth_l2 dm_mod qeth ccwgroup vmur <4>CPU: 1 Not tainted 2.6.27-30.x.20081015-s390default #1 <4>Process httpd (pid: 3807, task: 20ae2df8, ksp: 1666fb78) <4>Krnl PSW : 040c0000 8027098a (number+0xe/0x348) <4> R:0 T:1 IO:0 EX:0 Key:0 M:1 W:0 P:0 AS:0 CC:0 PM:0 <4>Krnl GPRS: 00d43318 0027097c 1666f277 9666f270 <4> 00000000 00000000 0000000a ffffffff <4> 9666f270 1666f228 1666f277 1666f098 <4> 00000002 80270982 80271016 1666f098 <4>Krnl Code: 8027097e: f0340dd0a7f1 srp 3536(4,%r0),2033(%r10),4 <4> 80270984: 0f00 clcl %r0,%r0 <4> 80270986: a7840001 brc 8,80270988 <4> >8027098a: 18ef lr %r14,%r15 <4> 8027098c: a7faff68 ahi %r15,-152 <4> 80270990: 18bf lr %r11,%r15 <4> 80270992: 18a2 lr %r10,%r2 <4> 80270994: 1893 lr %r9,%r3 Modified calltrace with annotated stackframe size of each function: stackframe size | 0 304 vsnprintf+850 [0x271016] 1 72 sprintf+74 [0x271522] 2 56 cio_start_key+262 [0x2d4c16] 3 56 ccw_device_start_key+222 [0x2dfe92] 4 56 ccw_device_start+40 [0x2dff28] 5 48 raw3215_start_io+104 [0x30b0f8] 6 56 raw3215_write+494 [0x30ba0a] 7 40 con3215_write+68 [0x30bafc] 8 40 __call_console_drivers+146 [0x12b0fa] 9 32 _call_console_drivers+102 [0x12b192] 10 64 release_console_sem+268 [0x12b614] 11 168 vprintk+462 [0x12bca6] 12 72 printk+68 [0x12bfd0] 13 256 __print_symbol+50 [0x15a882] 14 56 __show_trace+162 [0x103d06] 15 32 show_trace+224 [0x103e70] 16 48 show_stack+152 [0x103f20] 17 56 dump_stack+126 [0x104612] 18 96 __alloc_pages_internal+592 [0x175004] 19 80 cache_alloc_refill+776 [0x196f3c] 20 40 __kmalloc+258 [0x1972ae] 21 40 __alloc_skb+94 [0x328086] 22 32 pskb_copy+50 [0x328252] 23 32 skb_realloc_headroom+110 [0x328a72] 24 104 qeth_l2_hard_start_xmit+378 [0x7803bfde] 25 56 dev_hard_start_xmit+450 [0x32ef6e] 26 56 __qdisc_run+390 [0x3425d6] 27 48 dev_queue_xmit+410 [0x331e06] 28 40 ip_finish_output+308 [0x354ac8] 29 56 ip_output+218 [0x355b6e] 30 24 ip_local_out+56 [0x354584] 31 120 ip_queue_xmit+300 [0x355cec] 32 96 tcp_transmit_skb+812 [0x367da8] 33 40 tcp_push_one+158 [0x369fda] 34 112 tcp_sendmsg+852 [0x35d5a0] 35 240 sock_sendmsg+164 [0x32035c] 36 56 kernel_sendmsg+86 [0x32064a] 37 88 sock_no_sendpage+98 [0x322b22] 38 104 tcp_sendpage+70 [0x35cc1e] 39 48 sock_sendpage+74 [0x31eb66] 40 64 pipe_to_sendpage+102 [0x1c4b2e] 41 64 __splice_from_pipe+120 [0x1c5340] 42 72 splice_from_pipe+90 [0x1c57e6] 43 56 generic_splice_sendpage+38 [0x1c5832] 44 48 do_splice_from+104 [0x1c4c38] 45 48 direct_splice_actor+52 [0x1c4c88] 46 80 splice_direct_to_actor+180 [0x1c4f80] 47 72 do_splice_direct+70 [0x1c5112] 48 64 do_sendfile+360 [0x19de18] 49 72 sys_sendfile64+126 [0x19df32] 50 336 sysc_do_restart+18 [0x111a1a] Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/thread_info.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'arch/s390/include/asm') diff --git a/arch/s390/include/asm/thread_info.h b/arch/s390/include/asm/thread_info.h index de3fad60c68..c1eaf9604da 100644 --- a/arch/s390/include/asm/thread_info.h +++ b/arch/s390/include/asm/thread_info.h @@ -15,13 +15,8 @@ * Size of kernel stack for each process */ #ifndef __s390x__ -#ifndef __SMALL_STACK #define THREAD_ORDER 1 #define ASYNC_ORDER 1 -#else -#define THREAD_ORDER 0 -#define ASYNC_ORDER 0 -#endif #else /* __s390x__ */ #ifndef __SMALL_STACK #define THREAD_ORDER 2 -- cgit v1.2.3 From ea4bfdf52a5a84492cce881baadc5fab36adeade Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Tue, 28 Oct 2008 11:10:22 +0100 Subject: [S390] s390: Fix build for !CONFIG_S390_GUEST + CONFIG_VIRTIO_CONSOLE The s390 kernel does not compile if virtio console is enabled, but guest support is disabled: LD .tmp_vmlinux1 arch/s390/kernel/built-in.o: In function `setup_arch': /space/linux-2.5/arch/s390/kernel/setup.c:773: undefined reference to `s390_virtio_console_init' The fix is related to commit 99e65c92f2bbf84f43766a8bf701e36817d62822 Author: Christian Borntraeger Date: Fri Jul 25 15:50:04 2008 +0200 KVM: s390: Fix guest kconfig Which changed the build process to build kvm_virtio.c only if CONFIG_S390_GUEST is set. We must ifdef the prototype in the header file accordingly. Reported-by: Heiko Carstens Signed-off-by: Christian Borntraeger Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/kvm_virtio.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/s390/include/asm') diff --git a/arch/s390/include/asm/kvm_virtio.h b/arch/s390/include/asm/kvm_virtio.h index 146100224de..c13568b9351 100644 --- a/arch/s390/include/asm/kvm_virtio.h +++ b/arch/s390/include/asm/kvm_virtio.h @@ -52,7 +52,7 @@ struct kvm_vqconfig { #ifdef __KERNEL__ /* early virtio console setup */ -#ifdef CONFIG_VIRTIO_CONSOLE +#ifdef CONFIG_S390_GUEST extern void s390_virtio_console_init(void); #else static inline void s390_virtio_console_init(void) -- cgit v1.2.3 From 59da21398e680e8100625d689c8bebee6a139e93 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Thu, 27 Nov 2008 11:05:55 +0100 Subject: [S390] fix system call parameter functions. syscall_get_nr() currently returns a valid result only if the call chain of the traced process includes do_syscall_trace_enter(). But collect_syscall() can be called for any sleeping task, the result of syscall_get_nr() in general is completely bogus. To make syscall_get_nr() work for any sleeping task the traps field in pt_regs is replace with svcnr - the system call number the process is executing. If svcnr == 0 the process is not on a system call path. The syscall_get_arguments and syscall_set_arguments use regs->gprs[2] for the first system call parameter. This is incorrect since gprs[2] may have been overwritten with the system call number if the call chain includes do_syscall_trace_enter. Use regs->orig_gprs2 instead. Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/ptrace.h | 2 +- arch/s390/include/asm/syscall.h | 28 ++++++++++++++++------------ 2 files changed, 17 insertions(+), 13 deletions(-) (limited to 'arch/s390/include/asm') diff --git a/arch/s390/include/asm/ptrace.h b/arch/s390/include/asm/ptrace.h index a7226f8143f..fb0ca4796d3 100644 --- a/arch/s390/include/asm/ptrace.h +++ b/arch/s390/include/asm/ptrace.h @@ -321,8 +321,8 @@ struct pt_regs psw_t psw; unsigned long gprs[NUM_GPRS]; unsigned long orig_gpr2; + unsigned short svcnr; unsigned short ilc; - unsigned short trap; }; #endif diff --git a/arch/s390/include/asm/syscall.h b/arch/s390/include/asm/syscall.h index 6e623971fbb..2429b87eb28 100644 --- a/arch/s390/include/asm/syscall.h +++ b/arch/s390/include/asm/syscall.h @@ -17,9 +17,7 @@ static inline long syscall_get_nr(struct task_struct *task, struct pt_regs *regs) { - if (regs->trap != __LC_SVC_OLD_PSW) - return -1; - return regs->gprs[2]; + return regs->svcnr ? regs->svcnr : -1; } static inline void syscall_rollback(struct task_struct *task, @@ -52,18 +50,20 @@ static inline void syscall_get_arguments(struct task_struct *task, unsigned int i, unsigned int n, unsigned long *args) { + unsigned long mask = -1UL; + BUG_ON(i + n > 6); #ifdef CONFIG_COMPAT - if (test_tsk_thread_flag(task, TIF_31BIT)) { - if (i + n == 6) - args[--n] = (u32) regs->args[0]; - while (n-- > 0) - args[n] = (u32) regs->gprs[2 + i + n]; - } + if (test_tsk_thread_flag(task, TIF_31BIT)) + mask = 0xffffffff; #endif if (i + n == 6) - args[--n] = regs->args[0]; - memcpy(args, ®s->gprs[2 + i], n * sizeof(args[0])); + args[--n] = regs->args[0] & mask; + while (n-- > 0) + if (i + n > 0) + args[n] = regs->gprs[2 + i + n] & mask; + if (i == 0) + args[0] = regs->orig_gpr2 & mask; } static inline void syscall_set_arguments(struct task_struct *task, @@ -74,7 +74,11 @@ static inline void syscall_set_arguments(struct task_struct *task, BUG_ON(i + n > 6); if (i + n == 6) regs->args[0] = args[--n]; - memcpy(®s->gprs[2 + i], args, n * sizeof(args[0])); + while (n-- > 0) + if (i + n > 0) + regs->gprs[2 + i + n] = args[n]; + if (i == 0) + regs->orig_gpr2 = args[0]; } #endif /* _ASM_SYSCALL_H */ -- cgit v1.2.3 From 2944a5c971c12766e2438ea407ec3567949c32b7 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Thu, 27 Nov 2008 11:05:57 +0100 Subject: [S390] pgtable.h: Fix oops in unmap_vmas for KVM processes When running several kvm processes with lots of memory overcommitment, we have seen an oops during process shutdown: ------------[ cut here ]------------ Kernel BUG at 0000000000193434 [verbose debug info unavailable] addressing exception: 0005 [#1] PREEMPT SMP Modules linked in: kvm sunrpc qeth_l2 dm_mod qeth ccwgroup CPU: 10 Not tainted 2.6.28-rc4-kvm-bigiron-00521-g0ccca08-dirty #8 Process kuli (pid: 14460, task: 0000000149822338, ksp: 0000000024f57650) Krnl PSW : 0704e00180000000 0000000000193434 (unmap_vmas+0x884/0xf10) R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:3 CC:2 PM:0 EA:3 Krnl GPRS: 0000000000000002 0000000000000000 000000051008d000 000003e05e6034e0 00000000001933f6 00000000000001e9 0000000407259e0a 00000002be88c400 00000200001c1000 0000000407259608 0000000407259e08 0000000024f577f0 0000000407259e09 0000000000445fa8 00000000001933f6 0000000024f577f0 Krnl Code: 0000000000193426: eb22000c000d sllg %r2,%r2,12 000000000019342c: a7180000 lhi %r1,0 0000000000193430: b2290012 iske %r1,%r2 >0000000000193434: a7110002 tmll %r1,2 0000000000193438: a7840006 brc 8,193444 000000000019343c: 9602c000 oi 0(%r12),2 0000000000193440: 96806000 oi 0(%r6),128 0000000000193444: a7110004 tmll %r1,4 Call Trace: ([<00000000001933f6>] unmap_vmas+0x846/0xf10) [<0000000000199680>] exit_mmap+0x210/0x458 [<000000000012a8f8>] mmput+0x54/0xfc [<000000000012f714>] exit_mm+0x134/0x144 [<000000000013120c>] do_exit+0x240/0x878 [<00000000001318dc>] do_group_exit+0x98/0xc8 [<000000000013e6b0>] get_signal_to_deliver+0x30c/0x358 [<000000000010bee0>] do_signal+0xec/0x860 [<0000000000112e30>] sysc_sigpending+0xe/0x22 [<000002000013198a>] 0x2000013198a INFO: lockdep is turned off. Last Breaking-Event-Address: [<00000000001a68d0>] free_swap_and_cache+0x1a0/0x1a4 <4>---[ end trace bc19f1d51ac9db7c ]--- The faulting instruction is the storage key operation (iske) in ptep_rcp_copy (called by pte_clear, called by unmap_vmas). iske reads dirty and reference bit information for a physical page and requires a valid physical address. Since we are in pte_clear, we cannot rely on the pte containing a valid address. Fortunately we dont need these information in pte_clear - after all there is no mapping. The best fix is to remove the needless call to ptep_rcp_copy that contains the iske. Signed-off-by: Christian Borntraeger Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/pgtable.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/s390/include/asm') diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 7fc76133b3e..5caddd4f7be 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -679,8 +679,6 @@ static inline void pmd_clear(pmd_t *pmd) static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - if (mm->context.has_pgste) - ptep_rcp_copy(ptep); pte_val(*ptep) = _PAGE_TYPE_EMPTY; if (mm->context.noexec) pte_val(ptep[PTRS_PER_PTE]) = _PAGE_TYPE_EMPTY; -- cgit v1.2.3 From 96b8936a9ed08746e47081458a5eb9e43a751e24 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 25 Nov 2008 08:10:03 +0100 Subject: remove __ARCH_WANT_COMPAT_SYS_PTRACE All architectures now use the generic compat_sys_ptrace, as should every new architecture that needs 32bit compat (if we'll ever get another). Remove the now superflous __ARCH_WANT_COMPAT_SYS_PTRACE define, and also kill a comment about __ARCH_SYS_PTRACE that was added after __ARCH_SYS_PTRACE was already gone. Signed-off-by: Christoph Hellwig Acked-by: David S. Miller Signed-off-by: Linus Torvalds --- arch/s390/include/asm/ptrace.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/s390/include/asm') diff --git a/arch/s390/include/asm/ptrace.h b/arch/s390/include/asm/ptrace.h index a7226f8143f..560ce8561df 100644 --- a/arch/s390/include/asm/ptrace.h +++ b/arch/s390/include/asm/ptrace.h @@ -486,8 +486,6 @@ struct task_struct; extern void user_enable_single_step(struct task_struct *); extern void user_disable_single_step(struct task_struct *); -#define __ARCH_WANT_COMPAT_SYS_PTRACE - #define user_mode(regs) (((regs)->psw.mask & PSW_MASK_PSTATE) != 0) #define instruction_pointer(regs) ((regs)->psw.addr & PSW_ADDR_INSN) #define user_stack_pointer(regs)((regs)->gprs[15]) -- cgit v1.2.3