[Gmp-commit] /var/hg/gmp: 2 new changesets
mercurial at gmplib.org
mercurial at gmplib.org
Fri Apr 19 00:35:44 CEST 2013
details: /var/hg/gmp/rev/c935e67658f1
changeset: 15740:c935e67658f1
user: Torbjorn Granlund <tege at gmplib.org>
date: Fri Apr 19 00:35:19 2013 +0200
description:
Rewrite.
details: /var/hg/gmp/rev/b81108588266
changeset: 15741:b81108588266
user: Torbjorn Granlund <tege at gmplib.org>
date: Fri Apr 19 00:35:37 2013 +0200
description:
ChangeLog
diffstat:
ChangeLog | 4 +
mpn/arm/v7a/cora15/addmul_1.asm | 140 ++++++++++++++++++++++++---------------
2 files changed, 90 insertions(+), 54 deletions(-)
diffs (179 lines):
diff -r 1dde616d03b2 -r b81108588266 ChangeLog
--- a/ChangeLog Thu Apr 18 18:45:02 2013 +0200
+++ b/ChangeLog Fri Apr 19 00:35:37 2013 +0200
@@ -1,3 +1,7 @@
+2013-04-19 Torbjorn Granlund <tege at gmplib.org>
+
+ * mpn/arm/v7a/cora15/addmul_1.asm: Rewrite.
+
2013-04-18 Torbjorn Granlund <tege at gmplib.org>
* mpn/alpha/tabselect.asm: New file.
diff -r 1dde616d03b2 -r b81108588266 mpn/arm/v7a/cora15/addmul_1.asm
--- a/mpn/arm/v7a/cora15/addmul_1.asm Thu Apr 18 18:45:02 2013 +0200
+++ b/mpn/arm/v7a/cora15/addmul_1.asm Fri Apr 19 00:35:37 2013 +0200
@@ -20,78 +20,110 @@
include(`../config.m4')
C cycles/limb best
-C StrongARM: -
+C StrongARM: ?
C XScale ?
C Cortex-A7 ?
C Cortex-A8 ?
-C Cortex-A9 6.5 3.25
-C Cortex-A15 3 this
+C Cortex-A9 6 3.25
+C Cortex-A15 2 this
-
-C This runs well on A15 but very poorly on A9. We have made no effort at
-C improving its A9 performance, as doubling the speed seems hard.
-
-C This is armv5 code, optimized for the armv7a cpu A15. Its location in the
+C This code uses umlal for adding in the rp[] data, keeping the recurrency path
+C separate from any multiply instructions. It performs well on A15, at umlal's
+C bandwidth.
+C
+C An A9 variant should perhaps stick to 3-way unrolling, and use ldm and stm
+C for all loads and stores. Alternatively, it could do 2-way or 4-way, but
+C then alignment aware code will be necessary (adding O(1) bookkeeping
+C overhead).
+C
+C We don't use r12 due to ldrd and strd limitations.
+C
+C This is armv5 code, optimised for the armv7a cpu A15. Its location in the
C GMP file structure might be misleading.
-
define(`rp', `r0')
define(`up', `r1')
define(`n', `r2')
define(`v0', `r3')
+define(`w0', `r10') define(`w1', `r11')
+define(`u0', `r8') define(`u1', `r9')
+
ASM_START()
PROLOGUE(mpn_addmul_1)
- push {r4-r8}
+ push { r4-r11 }
- adds r0, r0, #0 C clear carry
+ ands r6, n, #3
+ sub n, n, #3
+ beq L(b00)
+ cmp r6, #2
+ bcc L(b01)
+ beq L(b10)
- tst n, #1
- beq L(bx0)
+L(b11): mov r6, #0
+ cmn r13, #0 C carry clear
+ ldr u1, [up], #-4
+ ldr w1, [rp], #-4
+ mov r7, #0
+ b L(mid)
-L(bx1): mov r5, #0
- ldr r8, [up], #4
- tst n, #2
- beq L(lo1)
- b L(lo3)
+L(b00): ldrd u0, u1, [up]
+ ldrd w0, w1, [rp]
+ mov r6, #0
+ umlal w0, r6, u0, v0
+ cmn r13, #0 C carry clear
+ mov r7, #0
+ str w0, [rp]
+ b L(mid)
-L(bx0): mov r7, #0
- ldr r8, [up], #4
- adds r0, r0, #0
- tst n, #2
- beq L(lo0)
- b L(lo2)
+L(b10): ldrd u0, u1, [up], #8
+ ldrd w0, w1, [rp]
+ mov r4, #0
+ umlal w0, r4, u0, v0
+ cmn r13, #0 C carry clear
+ mov r5, #0
+ str w0, [rp], #8
+ umlal w1, r5, u1, v0
+ tst n, n
+ bmi L(end)
+ b L(top)
-L(top): ldr r8, [up], #4
- str r6, [rp, #-4]
-L(lo0): ldr r4, [rp], #4
+L(b01): mov r4, #0
+ ldr u1, [up], #4
+ ldr w1, [rp], #4
mov r5, #0
- umlal r4, r5, r8, v0
- adds r4, r4, r7
- ldr r8, [up], #4
+ umlal w1, r5, u1, v0
+ tst n, n
+ bmi L(end)
+
+ ALIGN(16)
+L(top): ldrd u0, u1, [up, #0]
+ adcs r4, r4, w1
+ ldrd w0, w1, [rp, #0]
+ mov r6, #0
+ umlal w0, r6, u0, v0 C 1 2
+ adcs r5, r5, w0
+ mov r7, #0
+ strd r4, r5, [rp, #-4]
+L(mid): umlal w1, r7, u1, v0 C 2 3
+ ldrd u0, u1, [up, #8]
+ adcs r6, r6, w1
+ ldrd w0, w1, [rp, #8]
+ mov r4, #0
+ umlal w0, r4, u0, v0 C 3 4
+ adcs r7, r7, w0
+ mov r5, #0
+ strd r6, r7, [rp, #4]
+ umlal w1, r5, u1, v0 C 0 1
+ sub n, n, #4
+ add up, up, #16
+ add rp, rp, #16
+ tst n, n
+ bpl L(top)
+
+L(end): adcs r4, r4, w1
str r4, [rp, #-4]
-L(lo3): ldr r6, [rp], #4
- mov r7, #0
- umlal r6, r7, r8, v0
- adcs r6, r6, r5
- ldr r8, [up], #4
- str r6, [rp, #-4]
-L(lo2): ldr r4, [rp], #4
- mov r5, #0
- umlal r4, r5, r8, v0
- adcs r4, r4, r7
- ldr r8, [up], #4
- str r4, [rp, #-4]
-L(lo1): ldr r6, [rp], #4
- mov r7, #0
- umlal r6, r7, r8, v0
- adcs r6, r6, r5
- adc r7, r7, #0
- subs n, n, #4
- bgt L(top)
-
- str r6, [rp, #-4]
- mov r0, r7
- pop {r4-r8}
- bx lr
+ adc r0, r5, #0
+ pop { r4-r11 }
+ bx r14
EPILOGUE()
More information about the gmp-commit mailing list