about | summary | refs | log | tree | commit | diff
path: root/arch/x86/lib/memcpy_64.S
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@linux-foundation.org> 2009-03-30 11:38:31 -0700
committer Linus Torvalds <torvalds@linux-foundation.org> 2009-03-30 11:38:31 -0700
commit 019abbc87025a030fd25008612afd4eff8a375f7 (patch)
tree 6d745dedcf90ceff8f5b7b996a17f666b7c574e3 /arch/x86/lib/memcpy_64.S
parent 2d25ee36c84d5b2d6be8bfaf80256ecad69a06ca (diff)
parent 5a3c8fe7353f78b73b9636353c6f7b881f19ebea (diff)
Merge branch 'x86-stage-3-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'x86-stage-3-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (190 commits) Revert "cpuacct: reduce one NULL check in fast-path" Revert "x86: don't compile vsmp_64 for 32bit" x86: Correct behaviour of irq affinity x86: early_ioremap_init(), use __fix_to_virt(), because we are sure it's safe x86: use default_cpu_mask_to_apicid for 64bit x86: fix set_extra_move_desc calling x86, PAT, PCI: Change vma prot in pci_mmap to reflect inherited prot x86/dmi: fix dmi_alloc() section mismatches x86: e820 fix various signedness issues in setup.c and e820.c x86: apic/io_apic.c define msi_ir_chip and ir_ioapic_chip all the time x86: irq.c keep CONFIG_X86_LOCAL_APIC interrupts together x86: irq.c use same path for show_interrupts x86: cpu/cpu.h cleanup x86: Fix a couple of sparse warnings in arch/x86/kernel/apic/io_apic.c Revert "x86: create a non-zero sized bm_pte only when needed" x86: pci-nommu.c cleanup x86: io_delay.c cleanup x86: rtc.c cleanup x86: i8253 cleanup x86: kdebugfs.c cleanup ...
Diffstat (limited to 'arch/x86/lib/memcpy_64.S')
-rw-r--r-- arch/x86/lib/memcpy_64.S 143
1 files changed, 81 insertions, 62 deletions
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index c22981fa2f3..ad5441ed1b5 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -1,30 +1,38 @@
/* Copyright 2002 Andi Kleen */
#include <linux/linkage.h>
-#include <asm/dwarf2.h>
+
#include <asm/cpufeature.h>
+#include <asm/dwarf2.h>
/*
* memcpy - Copy a memory block.
*
- * Input:
- * rdi destination
- * rsi source
- * rdx count
- *
+ * Input:
+ * rdi destination
+ * rsi source
+ * rdx count
+ *
* Output:
* rax original destination
- */
+ */
+/*
+ * memcpy_c() - fast string ops (REP MOVSQ) based variant.
+ *
+ * Calls to this get patched into the kernel image via the
+ * alternative instructions framework:
+ */
ALIGN
memcpy_c:
CFI_STARTPROC
- movq %rdi,%rax
- movl %edx,%ecx
- shrl $3,%ecx
- andl $7,%edx
+ movq %rdi, %rax
+
+ movl %edx, %ecx
+ shrl $3, %ecx
+ andl $7, %edx
rep movsq
- movl %edx,%ecx
+ movl %edx, %ecx
rep movsb
ret
CFI_ENDPROC
@@ -33,99 +41,110 @@ ENDPROC(memcpy_c)
ENTRY(__memcpy)
ENTRY(memcpy)
CFI_STARTPROC
- pushq %rbx
- CFI_ADJUST_CFA_OFFSET 8
- CFI_REL_OFFSET rbx, 0
- movq %rdi,%rax
- movl %edx,%ecx
- shrl $6,%ecx
+ /*
+ * Put the number of full 64-byte blocks into %ecx.
+ * Tail portion is handled at the end:
+ */
+ movq %rdi, %rax
+ movl %edx, %ecx
+ shrl $6, %ecx
jz .Lhandle_tail
.p2align 4
.Lloop_64:
+ /*
+ * We decrement the loop index here - and the zero-flag is
+ * checked at the end of the loop (instructions inbetween do
+ * not change the zero flag):
+ */
decl %ecx
- movq (%rsi),%r11
- movq 8(%rsi),%r8
+ /*
+ * Move in blocks of 4x16 bytes:
+ */
+ movq 0*8(%rsi), %r11
+ movq 1*8(%rsi), %r8
+ movq %r11, 0*8(%rdi)
+ movq %r8, 1*8(%rdi)
- movq %r11,(%rdi)
- movq %r8,1*8(%rdi)
+ movq 2*8(%rsi), %r9
+ movq 3*8(%rsi), %r10
+ movq %r9, 2*8(%rdi)
+ movq %r10, 3*8(%rdi)
- movq 2*8(%rsi),%r9
- movq 3*8(%rsi),%r10
+ movq 4*8(%rsi), %r11
+ movq 5*8(%rsi), %r8
+ movq %r11, 4*8(%rdi)
+ movq %r8, 5*8(%rdi)
- movq %r9,2*8(%rdi)
- movq %r10,3*8(%rdi)
+ movq 6*8(%rsi), %r9
+ movq 7*8(%rsi), %r10
+ movq %r9, 6*8(%rdi)
+ movq %r10, 7*8(%rdi)
- movq 4*8(%rsi),%r11
- movq 5*8(%rsi),%r8
+ leaq 64(%rsi), %rsi
+ leaq 64(%rdi), %rdi
- movq %r11,4*8(%rdi)
- movq %r8,5*8(%rdi)
-
- movq 6*8(%rsi),%r9
- movq 7*8(%rsi),%r10
-
- movq %r9,6*8(%rdi)
- movq %r10,7*8(%rdi)
-
- leaq 64(%rsi),%rsi
- leaq 64(%rdi),%rdi
jnz .Lloop_64
.Lhandle_tail:
- movl %edx,%ecx
- andl $63,%ecx
- shrl $3,%ecx
+ movl %edx, %ecx
+ andl $63, %ecx
+ shrl $3, %ecx
jz .Lhandle_7
+
.p2align 4
.Lloop_8:
decl %ecx
- movq (%rsi),%r8
- movq %r8,(%rdi)
- leaq 8(%rdi),%rdi
- leaq 8(%rsi),%rsi
+ movq (%rsi), %r8
+ movq %r8, (%rdi)
+ leaq 8(%rdi), %rdi
+ leaq 8(%rsi), %rsi
jnz .Lloop_8
.Lhandle_7:
- movl %edx,%ecx
- andl $7,%ecx
- jz .Lende
+ movl %edx, %ecx
+ andl $7, %ecx
+ jz .Lend
+
.p2align 4
.Lloop_1:
- movb (%rsi),%r8b
- movb %r8b,(%rdi)
+ movb (%rsi), %r8b
+ movb %r8b, (%rdi)
incq %rdi
incq %rsi
decl %ecx
jnz .Lloop_1
-.Lende:
- popq %rbx
- CFI_ADJUST_CFA_OFFSET -8
- CFI_RESTORE rbx
+.Lend:
ret
-.Lfinal:
CFI_ENDPROC
ENDPROC(memcpy)
ENDPROC(__memcpy)
- /* Some CPUs run faster using the string copy instructions.
- It is also a lot simpler. Use this when possible */
+ /*
+ * Some CPUs run faster using the string copy instructions.
+ * It is also a lot simpler. Use this when possible:
+ */
- .section .altinstr_replacement,"ax"
+ .section .altinstr_replacement, "ax"
1: .byte 0xeb /* jmp <disp8> */
.byte (memcpy_c - memcpy) - (2f - 1b) /* offset */
2:
.previous
- .section .altinstructions,"a"
+
+ .section .altinstructions, "a"
.align 8
.quad memcpy
.quad 1b
.byte X86_FEATURE_REP_GOOD
- /* Replace only beginning, memcpy is used to apply alternatives, so it
- * is silly to overwrite itself with nops - reboot is only outcome... */
+
+ /*
+ * Replace only beginning, memcpy is used to apply alternatives,
+ * so it is silly to overwrite itself with nops - reboot is the
+ * only outcome...
+ */
.byte 2b - 1b
.byte 2b - 1b
.previous