Merge branch 'x86-stage-3-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip

* 'x86-stage-3-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (190 commits) Revert "cpuacct: reduce one NULL check in fast-path" Revert "x86: don't compile vsmp_64 for 32bit" x86: Correct behaviour of irq affinity x86: early_ioremap_init(), use __fix_to_virt(), because we are sure it's safe x86: use default_cpu_mask_to_apicid for 64bit x86: fix set_extra_move_desc calling x86, PAT, PCI: Change vma prot in pci_mmap to reflect inherited prot x86/dmi: fix dmi_alloc() section mismatches x86: e820 fix various signedness issues in setup.c and e820.c x86: apic/io_apic.c define msi_ir_chip and ir_ioapic_chip all the time x86: irq.c keep CONFIG_X86_LOCAL_APIC interrupts together x86: irq.c use same path for show_interrupts x86: cpu/cpu.h cleanup x86: Fix a couple of sparse warnings in arch/x86/kernel/apic/io_apic.c Revert "x86: create a non-zero sized bm_pte only when needed" x86: pci-nommu.c cleanup x86: io_delay.c cleanup x86: rtc.c cleanup x86: i8253 cleanup x86: kdebugfs.c cleanup ...
author: Linus Torvalds <torvalds@linux-foundation.org> 2009-03-30 11:38:31 -0700
committer: Linus Torvalds <torvalds@linux-foundation.org> 2009-03-30 11:38:31 -0700
commit: 019abbc87025a030fd25008612afd4eff8a375f7 (patch)
tree: 6d745dedcf90ceff8f5b7b996a17f666b7c574e3 /arch/x86/lib/memcpy_64.S
parent: 2d25ee36c84d5b2d6be8bfaf80256ecad69a06ca (diff)
parent: 5a3c8fe7353f78b73b9636353c6f7b881f19ebea (diff)
1 files changed, 81 insertions, 62 deletions
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index c22981fa2f3..ad5441ed1b5 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -1,30 +1,38 @@
 /* Copyright 2002 Andi Kleen */
 
 #include <linux/linkage.h>
-#include <asm/dwarf2.h>
+
 #include <asm/cpufeature.h>
+#include <asm/dwarf2.h>
 
 /*
  * memcpy - Copy a memory block.
  *
- * Input:	
- * rdi destination
- * rsi source
- * rdx count
- * 
+ * Input:
+ *  rdi destination
+ *  rsi source
+ *  rdx count
+ *
  * Output:
  * rax original destination
- */	
+ */
 
+/*
+ * memcpy_c() - fast string ops (REP MOVSQ) based variant.
+ *
+ * Calls to this get patched into the kernel image via the
+ * alternative instructions framework:
+ */
 	ALIGN
 memcpy_c:
 	CFI_STARTPROC
-	movq %rdi,%rax
-	movl %edx,%ecx
-	shrl $3,%ecx
-	andl $7,%edx
+	movq %rdi, %rax
+
+	movl %edx, %ecx
+	shrl $3, %ecx
+	andl $7, %edx
 	rep movsq
-	movl %edx,%ecx
+	movl %edx, %ecx
 	rep movsb
 	ret
 	CFI_ENDPROC
@@ -33,99 +41,110 @@ ENDPROC(memcpy_c)
 ENTRY(__memcpy)
 ENTRY(memcpy)
 	CFI_STARTPROC
-	pushq %rbx
-	CFI_ADJUST_CFA_OFFSET 8
-	CFI_REL_OFFSET rbx, 0
-	movq %rdi,%rax
 
-	movl %edx,%ecx
-	shrl $6,%ecx
+	/*
+	 * Put the number of full 64-byte blocks into %ecx.
+	 * Tail portion is handled at the end:
+	 */
+	movq %rdi, %rax
+	movl %edx, %ecx
+	shrl   $6, %ecx
 	jz .Lhandle_tail
 
 	.p2align 4
 .Lloop_64:
+	/*
+	 * We decrement the loop index here - and the zero-flag is
+	 * checked at the end of the loop (instructions inbetween do
+	 * not change the zero flag):
+	 */
 	decl %ecx
 
-	movq (%rsi),%r11
-	movq 8(%rsi),%r8
+	/*
+	 * Move in blocks of 4x16 bytes:
+	 */
+	movq 0*8(%rsi),		%r11
+	movq 1*8(%rsi),		%r8
+	movq %r11,		0*8(%rdi)
+	movq %r8,		1*8(%rdi)
 
-	movq %r11,(%rdi)
-	movq %r8,1*8(%rdi)
+	movq 2*8(%rsi),		%r9
+	movq 3*8(%rsi),		%r10
+	movq %r9,		2*8(%rdi)
+	movq %r10,		3*8(%rdi)
 
-	movq 2*8(%rsi),%r9
-	movq 3*8(%rsi),%r10
+	movq 4*8(%rsi),		%r11
+	movq 5*8(%rsi),		%r8
+	movq %r11,		4*8(%rdi)
+	movq %r8,		5*8(%rdi)
 
-	movq %r9,2*8(%rdi)
-	movq %r10,3*8(%rdi)
+	movq 6*8(%rsi),		%r9
+	movq 7*8(%rsi),		%r10
+	movq %r9,		6*8(%rdi)
+	movq %r10,		7*8(%rdi)
 
-	movq 4*8(%rsi),%r11
-	movq 5*8(%rsi),%r8
+	leaq 64(%rsi), %rsi
+	leaq 64(%rdi), %rdi
 
-	movq %r11,4*8(%rdi)
-	movq %r8,5*8(%rdi)
-
-	movq 6*8(%rsi),%r9
-	movq 7*8(%rsi),%r10
-
-	movq %r9,6*8(%rdi)
-	movq %r10,7*8(%rdi)
-
-	leaq 64(%rsi),%rsi
-	leaq 64(%rdi),%rdi
 	jnz  .Lloop_64
 
 .Lhandle_tail:
-	movl %edx,%ecx
-	andl $63,%ecx
-	shrl $3,%ecx
+	movl %edx, %ecx
+	andl  $63, %ecx
+	shrl   $3, %ecx
 	jz   .Lhandle_7
+
 	.p2align 4
 .Lloop_8:
 	decl %ecx
-	movq (%rsi),%r8
-	movq %r8,(%rdi)
-	leaq 8(%rdi),%rdi
-	leaq 8(%rsi),%rsi
+	movq (%rsi),		%r8
+	movq %r8,		(%rdi)
+	leaq 8(%rdi),		%rdi
+	leaq 8(%rsi),		%rsi
 	jnz  .Lloop_8
 
 .Lhandle_7:
-	movl %edx,%ecx
-	andl $7,%ecx
-	jz .Lende
+	movl %edx, %ecx
+	andl $7, %ecx
+	jz .Lend
+
 	.p2align 4
 .Lloop_1:
-	movb (%rsi),%r8b
-	movb %r8b,(%rdi)
+	movb (%rsi), %r8b
+	movb %r8b, (%rdi)
 	incq %rdi
 	incq %rsi
 	decl %ecx
 	jnz .Lloop_1
 
-.Lende:
-	popq %rbx
-	CFI_ADJUST_CFA_OFFSET -8
-	CFI_RESTORE rbx
+.Lend:
 	ret
-.Lfinal:
 	CFI_ENDPROC
 ENDPROC(memcpy)
 ENDPROC(__memcpy)
 
-	/* Some CPUs run faster using the string copy instructions.
-	   It is also a lot simpler. Use this when possible */
+	/*
+	 * Some CPUs run faster using the string copy instructions.
+	 * It is also a lot simpler. Use this when possible:
+	 */
 
-	.section .altinstr_replacement,"ax"
+	.section .altinstr_replacement, "ax"
 1:	.byte 0xeb				/* jmp <disp8> */
 	.byte (memcpy_c - memcpy) - (2f - 1b)	/* offset */
 2:
 	.previous
-	.section .altinstructions,"a"
+
+	.section .altinstructions, "a"
 	.align 8
 	.quad memcpy
 	.quad 1b
 	.byte X86_FEATURE_REP_GOOD
-	/* Replace only beginning, memcpy is used to apply alternatives, so it
-	 * is silly to overwrite itself with nops - reboot is only outcome... */
+
+	/*
+	 * Replace only beginning, memcpy is used to apply alternatives,
+	 * so it is silly to overwrite itself with nops - reboot is the
+	 * only outcome...
+	 */
 	.byte 2b - 1b
 	.byte 2b - 1b
 	.previous
author	Linus Torvalds <torvalds@linux-foundation.org>	2009-03-30 11:38:31 -0700
committer	Linus Torvalds <torvalds@linux-foundation.org>	2009-03-30 11:38:31 -0700
commit	019abbc87025a030fd25008612afd4eff8a375f7 (patch)
tree	6d745dedcf90ceff8f5b7b996a17f666b7c574e3 /arch/x86/lib/memcpy_64.S
parent	2d25ee36c84d5b2d6be8bfaf80256ecad69a06ca (diff)
parent	5a3c8fe7353f78b73b9636353c6f7b881f19ebea (diff)