aboutsummaryrefslogtreecommitdiff
path: root/arch/sh/lib64
diff options
context:
space:
mode:
Diffstat (limited to 'arch/sh/lib64')
-rw-r--r--arch/sh/lib64/Makefile3
-rw-r--r--arch/sh/lib64/sdivsi3.S131
-rw-r--r--arch/sh/lib64/udivdi3.S120
-rw-r--r--arch/sh/lib64/udivsi3.S59
4 files changed, 313 insertions, 0 deletions
diff --git a/arch/sh/lib64/Makefile b/arch/sh/lib64/Makefile
index 1d932e7d0ca..4bacb9e8347 100644
--- a/arch/sh/lib64/Makefile
+++ b/arch/sh/lib64/Makefile
@@ -12,3 +12,6 @@
# Panic should really be compiled as PIC
lib-y := udelay.o c-checksum.o dbg.o panic.o memcpy.o memset.o \
copy_user_memcpy.o copy_page.o clear_page.o strcpy.o strlen.o
+
+# Extracted from libgcc
+lib-y += udivsi3.o udivdi3.o sdivsi3.o
diff --git a/arch/sh/lib64/sdivsi3.S b/arch/sh/lib64/sdivsi3.S
new file mode 100644
index 00000000000..6a800c6a490
--- /dev/null
+++ b/arch/sh/lib64/sdivsi3.S
@@ -0,0 +1,131 @@
+ .global __sdivsi3
+ .section .text..SHmedia32,"ax"
+ .align 2
+
+ /* inputs: r4,r5 */
+ /* clobbered: r1,r18,r19,r20,r21,r25,tr0 */
+ /* result in r0 */
+__sdivsi3:
+ ptb __div_table,tr0
+
+ nsb r5, r1
+ shlld r5, r1, r25 /* normalize; [-2 ..1, 1..2) in s2.62 */
+ shari r25, 58, r21 /* extract 5(6) bit index (s2.4 with hole -1..1) */
+ /* bubble */
+ gettr tr0,r20
+ ldx.ub r20, r21, r19 /* u0.8 */
+ shari r25, 32, r25 /* normalize to s2.30 */
+ shlli r21, 1, r21
+ muls.l r25, r19, r19 /* s2.38 */
+ ldx.w r20, r21, r21 /* s2.14 */
+ ptabs r18, tr0
+ shari r19, 24, r19 /* truncate to s2.14 */
+ sub r21, r19, r19 /* some 11 bit inverse in s1.14 */
+ muls.l r19, r19, r21 /* u0.28 */
+ sub r63, r1, r1
+ addi r1, 92, r1
+ muls.l r25, r21, r18 /* s2.58 */
+ shlli r19, 45, r19 /* multiply by two and convert to s2.58 */
+ /* bubble */
+ sub r19, r18, r18
+ shari r18, 28, r18 /* some 22 bit inverse in s1.30 */
+ muls.l r18, r25, r0 /* s2.60 */
+ muls.l r18, r4, r25 /* s32.30 */
+ /* bubble */
+ shari r0, 16, r19 /* s-16.44 */
+ muls.l r19, r18, r19 /* s-16.74 */
+ shari r25, 63, r0
+ shari r4, 14, r18 /* s19.-14 */
+ shari r19, 30, r19 /* s-16.44 */
+ muls.l r19, r18, r19 /* s15.30 */
+ xor r21, r0, r21 /* You could also use the constant 1 << 27. */
+ add r21, r25, r21
+ sub r21, r19, r21
+ shard r21, r1, r21
+ sub r21, r0, r0
+ blink tr0, r63
+
+/* This table has been generated by divtab.c .
+Defects for bias -330:
+ Max defect: 6.081536e-07 at -1.000000e+00
+ Min defect: 2.849516e-08 at 1.030651e+00
+ Max 2nd step defect: 9.606539e-12 at -1.000000e+00
+ Min 2nd step defect: 0.000000e+00 at 0.000000e+00
+ Defect at 1: 1.238659e-07
+ Defect at -2: 1.061708e-07 */
+
+ .balign 2
+ .type __div_table,@object
+ .size __div_table,128
+/* negative division constants */
+ .word -16638
+ .word -17135
+ .word -17737
+ .word -18433
+ .word -19103
+ .word -19751
+ .word -20583
+ .word -21383
+ .word -22343
+ .word -23353
+ .word -24407
+ .word -25582
+ .word -26863
+ .word -28382
+ .word -29965
+ .word -31800
+/* negative division factors */
+ .byte 66
+ .byte 70
+ .byte 75
+ .byte 81
+ .byte 87
+ .byte 93
+ .byte 101
+ .byte 109
+ .byte 119
+ .byte 130
+ .byte 142
+ .byte 156
+ .byte 172
+ .byte 192
+ .byte 214
+ .byte 241
+ .skip 16
+ .global __div_table
+__div_table:
+ .skip 16
+/* positive division factors */
+ .byte 241
+ .byte 214
+ .byte 192
+ .byte 172
+ .byte 156
+ .byte 142
+ .byte 130
+ .byte 119
+ .byte 109
+ .byte 101
+ .byte 93
+ .byte 87
+ .byte 81
+ .byte 75
+ .byte 70
+ .byte 66
+/* positive division constants */
+ .word 31801
+ .word 29966
+ .word 28383
+ .word 26864
+ .word 25583
+ .word 24408
+ .word 23354
+ .word 22344
+ .word 21384
+ .word 20584
+ .word 19752
+ .word 19104
+ .word 18434
+ .word 17738
+ .word 17136
+ .word 16639
diff --git a/arch/sh/lib64/udivdi3.S b/arch/sh/lib64/udivdi3.S
new file mode 100644
index 00000000000..6895c0225b8
--- /dev/null
+++ b/arch/sh/lib64/udivdi3.S
@@ -0,0 +1,120 @@
+ .section .text..SHmedia32,"ax"
+ .align 2
+ .global __udivdi3
+__udivdi3:
+ shlri r3,1,r4
+ nsb r4,r22
+ shlld r3,r22,r6
+ shlri r6,49,r5
+ movi 0xffffffffffffbaf1,r21 /* .l shift count 17. */
+ sub r21,r5,r1
+ mmulfx.w r1,r1,r4
+ mshflo.w r1,r63,r1
+ sub r63,r22,r20 // r63 == 64 % 64
+ mmulfx.w r5,r4,r4
+ pta large_divisor,tr0
+ addi r20,32,r9
+ msub.w r1,r4,r1
+ madd.w r1,r1,r1
+ mmulfx.w r1,r1,r4
+ shlri r6,32,r7
+ bgt/u r9,r63,tr0 // large_divisor
+ mmulfx.w r5,r4,r4
+ shlri r2,32+14,r19
+ addi r22,-31,r0
+ msub.w r1,r4,r1
+
+ mulu.l r1,r7,r4
+ addi r1,-3,r5
+ mulu.l r5,r19,r5
+ sub r63,r4,r4 // Negate to make sure r1 ends up <= 1/r2
+ shlri r4,2,r4 /* chop off leading %0000000000000000 001.00000000000 - or, as
+ the case may be, %0000000000000000 000.11111111111, still */
+ muls.l r1,r4,r4 /* leaving at least one sign bit. */
+ mulu.l r5,r3,r8
+ mshalds.l r1,r21,r1
+ shari r4,26,r4
+ shlld r8,r0,r8
+ add r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5)
+ sub r2,r8,r2
+ /* Can do second step of 64 : 32 div now, using r1 and the rest in r2. */
+
+ shlri r2,22,r21
+ mulu.l r21,r1,r21
+ shlld r5,r0,r8
+ addi r20,30-22,r0
+ shlrd r21,r0,r21
+ mulu.l r21,r3,r5
+ add r8,r21,r8
+ mcmpgt.l r21,r63,r21 // See Note 1
+ addi r20,30,r0
+ mshfhi.l r63,r21,r21
+ sub r2,r5,r2
+ andc r2,r21,r2
+
+ /* small divisor: need a third divide step */
+ mulu.l r2,r1,r7
+ ptabs r18,tr0
+ addi r2,1,r2
+ shlrd r7,r0,r7
+ mulu.l r7,r3,r5
+ add r8,r7,r8
+ sub r2,r3,r2
+ cmpgt r2,r5,r5
+ add r8,r5,r2
+ /* could test r3 here to check for divide by zero. */
+ blink tr0,r63
+
+large_divisor:
+ mmulfx.w r5,r4,r4
+ shlrd r2,r9,r25
+ shlri r25,32,r8
+ msub.w r1,r4,r1
+
+ mulu.l r1,r7,r4
+ addi r1,-3,r5
+ mulu.l r5,r8,r5
+ sub r63,r4,r4 // Negate to make sure r1 ends up <= 1/r2
+ shlri r4,2,r4 /* chop off leading %0000000000000000 001.00000000000 - or, as
+ the case may be, %0000000000000000 000.11111111111, still */
+ muls.l r1,r4,r4 /* leaving at least one sign bit. */
+ shlri r5,14-1,r8
+ mulu.l r8,r7,r5
+ mshalds.l r1,r21,r1
+ shari r4,26,r4
+ add r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5)
+ sub r25,r5,r25
+ /* Can do second step of 64 : 32 div now, using r1 and the rest in r25. */
+
+ shlri r25,22,r21
+ mulu.l r21,r1,r21
+ pta no_lo_adj,tr0
+ addi r22,32,r0
+ shlri r21,40,r21
+ mulu.l r21,r7,r5
+ add r8,r21,r8
+ shlld r2,r0,r2
+ sub r25,r5,r25
+ bgtu/u r7,r25,tr0 // no_lo_adj
+ addi r8,1,r8
+ sub r25,r7,r25
+no_lo_adj:
+ mextr4 r2,r25,r2
+
+ /* large_divisor: only needs a few adjustments. */
+ mulu.l r8,r6,r5
+ ptabs r18,tr0
+ /* bubble */
+ cmpgtu r5,r2,r5
+ sub r8,r5,r2
+ blink tr0,r63
+
+/* Note 1: To shift the result of the second divide stage so that the result
+ always fits into 32 bits, yet we still reduce the rest sufficiently
+ would require a lot of instructions to do the shifts just right. Using
+ the full 64 bit shift result to multiply with the divisor would require
+ four extra instructions for the upper 32 bits (shift / mulu / shift / sub).
+ Fortunately, if the upper 32 bits of the shift result are nonzero, we
+ know that the rest after taking this partial result into account will
+ fit into 32 bits. So we just clear the upper 32 bits of the rest if the
+ upper 32 bits of the partial result are nonzero. */
diff --git a/arch/sh/lib64/udivsi3.S b/arch/sh/lib64/udivsi3.S
new file mode 100644
index 00000000000..e68120e4b84
--- /dev/null
+++ b/arch/sh/lib64/udivsi3.S
@@ -0,0 +1,59 @@
+ .global __udivsi3
+ .section .text..SHmedia32,"ax"
+ .align 2
+
+/*
+ inputs: r4,r5
+ clobbered: r18,r19,r20,r21,r22,r25,tr0
+ result in r0.
+ */
+__udivsi3:
+ addz.l r5,r63,r22
+ nsb r22,r0
+ shlld r22,r0,r25
+ shlri r25,48,r25
+ movi 0xffffffffffffbb0c,r20 /* shift count eqiv 76 */
+ sub r20,r25,r21
+ mmulfx.w r21,r21,r19
+ mshflo.w r21,r63,r21
+ ptabs r18,tr0
+ mmulfx.w r25,r19,r19
+ sub r20,r0,r0
+ /* bubble */
+ msub.w r21,r19,r19
+
+ /*
+ * It would be nice for scheduling to do this add to r21 before
+ * the msub.w, but we need a different value for r19 to keep
+ * errors under control.
+ */
+ addi r19,-2,r21
+ mulu.l r4,r21,r18
+ mmulfx.w r19,r19,r19
+ shlli r21,15,r21
+ shlrd r18,r0,r18
+ mulu.l r18,r22,r20
+ mmacnfx.wl r25,r19,r21
+ /* bubble */
+ sub r4,r20,r25
+
+ mulu.l r25,r21,r19
+ addi r0,14,r0
+ /* bubble */
+ shlrd r19,r0,r19
+ mulu.l r19,r22,r20
+ add r18,r19,r18
+ /* bubble */
+ sub.l r25,r20,r25
+
+ mulu.l r25,r21,r19
+ addz.l r25,r63,r25
+ sub r25,r22,r25
+ shlrd r19,r0,r19
+ mulu.l r19,r22,r20
+ addi r25,1,r25
+ add r18,r19,r18
+
+ cmpgt r25,r20,r25
+ add.l r18,r25,r0
+ blink tr0,r63