[Gmp-commit] /var/hg/gmp: Rewrite arm64 copyi/copyd to use scalar regs.
mercurial at gmplib.org
mercurial at gmplib.org
Sun Dec 20 01:37:20 UTC 2020
details: /var/hg/gmp/rev/e5f487a21419
changeset: 18183:e5f487a21419
user: Torbjorn Granlund <tg at gmplib.org>
date: Sun Dec 20 02:36:18 2020 +0100
description:
Rewrite arm64 copyi/copyd to use scalar regs.
diffstat:
mpn/arm64/copyd.asm | 46 +++++++++++++++++++---------------------------
mpn/arm64/copyi.asm | 34 +++++++++++++++++++---------------
2 files changed, 38 insertions(+), 42 deletions(-)
diffs (140 lines):
diff -r 006e55d697e2 -r e5f487a21419 mpn/arm64/copyd.asm
--- a/mpn/arm64/copyd.asm Sat Dec 19 05:33:15 2020 +0100
+++ b/mpn/arm64/copyd.asm Sun Dec 20 02:36:18 2020 +0100
@@ -31,8 +31,13 @@
include(`../config.m4')
C cycles/limb
-C Cortex-A53 ?
-C Cortex-A57 ?
+C Cortex-A53 1.8
+C Cortex-A55 1.28
+C Cortex-A57
+C Cortex-A72 1
+C Cortex-A73 1.1-1.35 (alignment dependent)
+C X-Gene 1
+C Apple M1 0.31
changecom(blah)
@@ -50,44 +55,31 @@
C Copy until rp is 128-bit aligned
tbz rp, #3, L(al2)
- sub up, up, #8
- ld1 {v22.1d}, [up]
+ ldr x4, [up,#-8]!
sub n, n, #1
- sub rp, rp, #8
- st1 {v22.1d}, [rp]
+ str x4, [rp,#-8]!
-L(al2): sub up, up, #16
- ld1 {v26.2d}, [up]
+L(al2): ldp x4,x5, [up,#-16]!
sub n, n, #6
- sub rp, rp, #16 C offset rp for loop
tbnz n, #63, L(end)
- sub up, up, #16 C offset up for loop
- mov x12, #-16
-
ALIGN(16)
-L(top): ld1 {v22.2d}, [up], x12
- st1 {v26.2d}, [rp], x12
- ld1 {v26.2d}, [up], x12
- st1 {v22.2d}, [rp], x12
+L(top): ldp x6,x7, [up,#-16]
+ stp x4,x5, [rp,#-16]
+ ldp x4,x5, [up,#-32]!
+ stp x6,x7, [rp,#-32]!
sub n, n, #4
tbz n, #63, L(top)
- add up, up, #16 C undo up offset
-
-L(end): st1 {v26.2d}, [rp]
+L(end): stp x4,x5, [rp,#-16]!
C Copy last 0-3 limbs. Note that rp is aligned after loop, but not when we
C arrive here via L(bc)
L(bc): tbz n, #1, L(tl1)
- sub up, up, #16
- ld1 {v22.2d}, [up]
- sub rp, rp, #16
- st1 {v22.2d}, [rp]
+ ldp x4,x5, [up,#-16]!
+ stp x4,x5, [rp,#-16]!
L(tl1): tbz n, #0, L(tl2)
- sub up, up, #8
- ld1 {v22.1d}, [up]
- sub rp, rp, #8
- st1 {v22.1d}, [rp]
+ ldr x4, [up,#-8]
+ str x4, [rp,#-8]
L(tl2): ret
EPILOGUE()
diff -r 006e55d697e2 -r e5f487a21419 mpn/arm64/copyi.asm
--- a/mpn/arm64/copyi.asm Sat Dec 19 05:33:15 2020 +0100
+++ b/mpn/arm64/copyi.asm Sun Dec 20 02:36:18 2020 +0100
@@ -31,9 +31,13 @@
include(`../config.m4')
C cycles/limb
-C Cortex-A53 2
-C Cortex-A57 1
-C X-Gene 1.25
+C Cortex-A53 1.8
+C Cortex-A55 1.28
+C Cortex-A57
+C Cortex-A72 1
+C Cortex-A73 1.1-1.35 (alignment dependent)
+C X-Gene 1
+C Apple M1 0.31
changecom(blah)
@@ -48,31 +52,31 @@
C Copy until rp is 128-bit aligned
tbz rp, #3, L(al2)
- ld1 {v22.1d}, [up], #8
+ ldr x4, [up],#8
sub n, n, #1
- st1 {v22.1d}, [rp], #8
+ str x4, [rp],#8
-L(al2): ld1 {v26.2d}, [up], #16
+L(al2): ldp x4,x5, [up],#16
sub n, n, #6
tbnz n, #63, L(end)
ALIGN(16)
-L(top): ld1 {v22.2d}, [up], #16
- st1 {v26.2d}, [rp], #16
- ld1 {v26.2d}, [up], #16
- st1 {v22.2d}, [rp], #16
+L(top): ldp x6,x7, [up],#32
+ stp x4,x5, [rp],#32
+ ldp x4,x5, [up,#-16]
+ stp x6,x7, [rp,#-16]
sub n, n, #4
tbz n, #63, L(top)
-L(end): st1 {v26.2d}, [rp], #16
+L(end): stp x4,x5, [rp],#16
C Copy last 0-3 limbs. Note that rp is aligned after loop, but not when we
C arrive here via L(bc)
L(bc): tbz n, #1, L(tl1)
- ld1 {v22.2d}, [up], #16
- st1 {v22.2d}, [rp], #16
+ ldp x4,x5, [up],#16
+ stp x4,x5, [rp],#16
L(tl1): tbz n, #0, L(tl2)
- ld1 {v22.1d}, [up]
- st1 {v22.1d}, [rp]
+ ldr x4, [up]
+ str x4, [rp]
L(tl2): ret
EPILOGUE()
More information about the gmp-commit mailing list