[Gmp-commit] /var/hg/gmp: Rewrite arm64 copyi/copyd to use scalar regs.
mercurial at gmplib.org
mercurial at gmplib.org
Sun Dec 20 01:37:20 UTC 2020
details: /var/hg/gmp/rev/e5f487a21419
changeset: 18183:e5f487a21419
user: Torbjorn Granlund <tg at gmplib.org>
date: Sun Dec 20 02:36:18 2020 +0100
description:
Rewrite arm64 copyi/copyd to use scalar regs.
diffstat:
mpn/arm64/copyd.asm | 46 +++++++++++++++++++---------------------------
mpn/arm64/copyi.asm | 34 +++++++++++++++++++---------------
2 files changed, 38 insertions(+), 42 deletions(-)
diffs (140 lines):
diff -r 006e55d697e2 -r e5f487a21419 mpn/arm64/copyd.asm
--- a/mpn/arm64/copyd.asm Sat Dec 19 05:33:15 2020 +0100
+++ b/mpn/arm64/copyd.asm Sun Dec 20 02:36:18 2020 +0100
@@ -31,8 +31,13 @@
include(`../config.m4')
C cycles/limb
-C Cortex-A53 ?
-C Cortex-A57 ?
+C Cortex-A53 1.8
+C Cortex-A55 1.28
+C Cortex-A57
+C Cortex-A72 1
+C Cortex-A73 1.1-1.35 (alignment dependent)
+C X-Gene 1
+C Apple M1 0.31
changecom(blah)
@@ -50,44 +55,31 @@
C Copy until rp is 128-bit aligned
tbz rp, #3, L(al2)
- sub up, up, #8
- ld1 {v22.1d}, [up]
+ ldr x4, [up,#-8]!
sub n, n, #1
- sub rp, rp, #8
- st1 {v22.1d}, [rp]
+ str x4, [rp,#-8]!
-L(al2): sub up, up, #16
- ld1 {v26.2d}, [up]
+L(al2): ldp x4,x5, [up,#-16]!
sub n, n, #6
- sub rp, rp, #16 C offset rp for loop
tbnz n, #63, L(end)
- sub up, up, #16 C offset up for loop
- mov x12, #-16
-
ALIGN(16)
-L(top): ld1 {v22.2d}, [up], x12
- st1 {v26.2d}, [rp], x12
- ld1 {v26.2d}, [up], x12
- st1 {v22.2d}, [rp], x12
+L(top): ldp x6,x7, [up,#-16]
+ stp x4,x5, [rp,#-16]
+ ldp x4,x5, [up,#-32]!
+ stp x6,x7, [rp,#-32]!
sub n, n, #4
tbz n, #63, L(top)
- add up, up, #16 C undo up offset
-
-L(end): st1 {v26.2d}, [rp]
+L(end): stp x4,x5, [rp,#-16]!
C Copy last 0-3 limbs. Note that rp is aligned after loop, but not when we
C arrive here via L(bc)
L(bc): tbz n, #1, L(tl1)
- sub up, up, #16
- ld1 {v22.2d}, [up]
- sub rp, rp, #16
- st1 {v22.2d}, [rp]
+ ldp x4,x5, [up,#-16]!
+ stp x4,x5, [rp,#-16]!
L(tl1): tbz n, #0, L(tl2)
- sub up, up, #8
- ld1 {v22.1d}, [up]
- sub rp, rp, #8
- st1 {v22.1d}, [rp]
+ ldr x4, [up,#-8]
+ str x4, [rp,#-8]
L(tl2): ret
EPILOGUE()
diff -r 006e55d697e2 -r e5f487a21419 mpn/arm64/copyi.asm
--- a/mpn/arm64/copyi.asm Sat Dec 19 05:33:15 2020 +0100
+++ b/mpn/arm64/copyi.asm Sun Dec 20 02:36:18 2020 +0100
@@ -31,9 +31,13 @@
include(`../config.m4')
C cycles/limb
-C Cortex-A53 2
-C Cortex-A57 1
-C X-Gene 1.25
+C Cortex-A53 1.8
+C Cortex-A55 1.28
+C Cortex-A57
+C Cortex-A72 1
+C Cortex-A73 1.1-1.35 (alignment dependent)
+C X-Gene 1
+C Apple M1 0.31
changecom(blah)
@@ -48,31 +52,31 @@
C Copy until rp is 128-bit aligned
tbz rp, #3, L(al2)
- ld1 {v22.1d}, [up], #8
+ ldr x4, [up],#8
sub n, n, #1
- st1 {v22.1d}, [rp], #8
+ str x4, [rp],#8
-L(al2): ld1 {v26.2d}, [up], #16
+L(al2): ldp x4,x5, [up],#16
sub n, n, #6
tbnz n, #63, L(end)
ALIGN(16)
-L(top): ld1 {v22.2d}, [up], #16
- st1 {v26.2d}, [rp], #16
- ld1 {v26.2d}, [up], #16
- st1 {v22.2d}, [rp], #16
+L(top): ldp x6,x7, [up],#32
+ stp x4,x5, [rp],#32
+ ldp x4,x5, [up,#-16]
+ stp x6,x7, [rp,#-16]
sub n, n, #4
tbz n, #63, L(top)
-L(end): st1 {v26.2d}, [rp], #16
+L(end): stp x4,x5, [rp],#16
C Copy last 0-3 limbs. Note that rp is aligned after loop, but not when we
C arrive here via L(bc)
L(bc): tbz n, #1, L(tl1)
- ld1 {v22.2d}, [up], #16
- st1 {v22.2d}, [rp], #16
+ ldp x4,x5, [up],#16
+ stp x4,x5, [rp],#16
L(tl1): tbz n, #0, L(tl2)
- ld1 {v22.1d}, [up]
- st1 {v22.1d}, [rp]
+ ldr x4, [up]
+ str x4, [rp]
L(tl2): ret
EPILOGUE()
More information about the gmp-commit mailing list