[Gmp-commit] /var/hg/gmp: 6 new changesets

Wed Feb 22 01:36:52 UTC 2017

details:   /var/hg/gmp/rev/eb589b75b792
changeset: 17285:eb589b75b792
user:      Torbjorn Granlund <tg at gmplib.org>
date:      Tue Feb 21 02:10:41 2017 +0100
description:
Provide ARM64 rsh1add_n and rsh1sub_n.

details:   /var/hg/gmp/rev/c1bd17edc977
changeset: 17286:c1bd17edc977
user:      Torbjorn Granlund <tg at gmplib.org>
date:      Tue Feb 21 02:20:20 2017 +0100
description:
Reduce number of unique registers used.

details:   /var/hg/gmp/rev/75f977e3f338
changeset: 17287:75f977e3f338
user:      Torbjorn Granlund <tg at gmplib.org>
date:      Tue Feb 21 16:20:58 2017 +0100
description:
Rewrite ARM64 shifting.

details:   /var/hg/gmp/rev/9383eb33865f
changeset: 17288:9383eb33865f
user:      Torbjorn Granlund <tg at gmplib.org>
date:      Wed Feb 22 02:25:38 2017 +0100
description:
Add a comment.

details:   /var/hg/gmp/rev/a3e73a3444ce
changeset: 17289:a3e73a3444ce
user:      Torbjorn Granlund <tg at gmplib.org>
date:      Wed Feb 22 02:26:21 2017 +0100
description:
Add a comment.

details:   /var/hg/gmp/rev/2567c1119e14
changeset: 17290:2567c1119e14
user:      Torbjorn Granlund <tg at gmplib.org>
date:      Wed Feb 22 02:27:55 2017 +0100
description:
Provide arm64 mpn_lshiftc.

diffstat:

 mpn/arm64/aors_n.asm     |   16 ++--
 mpn/arm64/lshift.asm     |  107 +++++++++++++++--------------
 mpn/arm64/lshiftc.asm    |  130 ++++++++++++++++++++++++++++++++++++
 mpn/arm64/rsh1aors_n.asm |  168 +++++++++++++++++++++++++++++++++++++++++++++++
 mpn/arm64/rshift.asm     |  120 +++++++++++++++++----------------
 5 files changed, 424 insertions(+), 117 deletions(-)

diffs (truncated from 682 to 300 lines):

diff -r 814cfb09af29 -r 2567c1119e14 mpn/arm64/aors_n.asm

--- a/mpn/arm64/aors_n.asm	Sun Feb 19 07:00:08 2017 +0100
+++ b/mpn/arm64/aors_n.asm	Wed Feb 22 02:27:55 2017 +0100
@@ -73,8 +73,8 @@
 
 L(bx1):	ldr	x7, [up]
 	ldr	x11, [vp]
-	ADDSUBC	x15, x7, x11
-	str	x15, [rp],#8
+	ADDSUBC	x13, x7, x11
+	str	x13, [rp],#8
 	tbnz	n, #1, L(b11)
 
 L(b01):	cbz	x18, L(ret)
@@ -106,9 +106,9 @@
 	ALIGN(16)
 L(top):	ldp	x4, x5, [up,#16]
 	ldp	x8, x9, [vp,#16]
-	ADDSUBC	x14, x6, x10
-	ADDSUBC	x15, x7, x11
-	stp	x14, x15, [rp],#16
+	ADDSUBC	x12, x6, x10
+	ADDSUBC	x13, x7, x11
+	stp	x12, x13, [rp],#16
 L(mid):	ldp	x6, x7, [up,#32]!
 	ldp	x10, x11, [vp,#32]!
 	ADDSUBC	x12, x4, x8
@@ -117,9 +117,9 @@
 	sub	x18, x18, #1
 	cbnz	x18, L(top)
 
-L(end):	ADDSUBC	x14, x6, x10
-	ADDSUBC	x15, x7, x11
-	stp	x14, x15, [rp]
+L(end):	ADDSUBC	x12, x6, x10
+	ADDSUBC	x13, x7, x11
+	stp	x12, x13, [rp]
 L(ret):	RETVAL
 	ret
 EPILOGUE()
diff -r 814cfb09af29 -r 2567c1119e14 mpn/arm64/lshift.asm
--- a/mpn/arm64/lshift.asm	Sun Feb 19 07:00:08 2017 +0100
+++ b/mpn/arm64/lshift.asm	Wed Feb 22 02:27:55 2017 +0100
@@ -1,6 +1,6 @@
 dnl  ARM64 mpn_lshift.
 
-dnl  Copyright 2013, 2014 Free Software Foundation, Inc.
+dnl  Copyright 2013, 2014, 2017 Free Software Foundation, Inc.
 
 dnl  This file is part of the GNU MP Library.
 
@@ -19,9 +19,17 @@
 
 include(`../config.m4')
 
-C	     cycles/limb
-C Cortex-A53	 ?
-C Cortex-A57	 ?
+C	     cycles/limb   assumed optimal c/l
+C Cortex-A53	3.5-4.0		 3.25
+C Cortex-A57	 2.0		 2.0
+C X-Gene	 2.67		 2.5
+
+C TODO
+C  * The feed-in code used 1 ldr for odd sizes and 2 ldr for even sizes.  These
+C    numbers should be 1 and 0, respectively.  The str in wind-down should also
+C    go.
+C  * Using extr and with 63 separate loops we might reach 1.25 c/l on A57.
+C  * A53's speed depends on alignment, tune/speed -w1 gives 3.5, -w0 gives 4.0.
 
 changecom(blah)
 
@@ -34,46 +42,45 @@
 
 define(`tnc',`x8')
 
+define(`PSHIFT', lsl)
+define(`NSHIFT', lsr)
+
 ASM_START()
 PROLOGUE(mpn_lshift)
 	add	rp, rp_arg, n, lsl #3
 	add	up, up, n, lsl #3
 	sub	tnc, xzr, cnt
+	lsr	x18, n, #2
 	tbz	n, #0, L(bx0)
 
 L(bx1):	ldr	x4, [up,#-8]
 	tbnz	n, #1, L(b11)
 
-L(b01):	lsr	x0, x4, tnc
-	lsl	x18, x4, cnt
-	sub	n, n, #1
-	cbnz	n, L(gt1)
-	str	x18, [rp,#-8]
+L(b01):	NSHIFT	x0, x4, tnc
+	PSHIFT	x2, x4, cnt
+	cbnz	x18, L(gt1)
+	str	x2, [rp,#-8]
 	ret
 L(gt1):	ldp	x4, x5, [up,#-24]
 	sub	up, up, #8
 	add	rp, rp, #16
 	b	L(lo2)
 
-L(b11):	lsr	x0, x4, tnc
-	lsl	x9, x4, cnt
-	ldp	x6, x7, [up,#-24]
-	add	n, n, #1
-	add	up, up, #8
-	add	rp, rp, #32
-	b	L(lo0)
+L(b11):	NSHIFT	x0, x4, tnc
+	PSHIFT	x2, x4, cnt
+	ldp	x6, x7, [up,#-24]!
+	b	L(lo3)
 
 L(bx0):	ldp	x4, x5, [up,#-16]
 	tbz	n, #1, L(b00)
 
-L(b10):	lsr	x0, x5, tnc
-	lsl	x13, x5, cnt
-	lsr	x10, x4, tnc
-	lsl	x18, x4, cnt
-	sub	n, n, #2
-	cbnz	n, L(gt2)
+L(b10):	NSHIFT	x0, x5, tnc
+	PSHIFT	x13, x5, cnt
+	NSHIFT	x10, x4, tnc
+	PSHIFT	x2, x4, cnt
+	cbnz	x18, L(gt2)
 	orr	x10, x10, x13
-	stp	x18, x10, [rp,#-16]
+	stp	x2, x10, [rp,#-16]
 	ret
 L(gt2):	ldp	x4, x5, [up,#-32]
 	orr	x10, x10, x13
@@ -82,41 +89,39 @@
 	add	rp, rp, #8
 	b	L(lo2)
 
-L(b00):	lsr	x0, x5, tnc
-	lsl	x13, x5, cnt
-	lsr	x10, x4, tnc
-	lsl	x9, x4, cnt
-	ldp	x6, x7, [up,#-32]
+L(b00):	NSHIFT	x0, x5, tnc
+	PSHIFT	x13, x5, cnt
+	NSHIFT	x10, x4, tnc
+	PSHIFT	x2, x4, cnt
+	ldp	x6, x7, [up,#-32]!
 	orr	x10, x10, x13
-	str	x10, [rp,#-8]
-	add	rp, rp, #24
+	str	x10, [rp,#-8]!
 	b	L(lo0)
 
 	ALIGN(16)
-L(top):	ldp	x4, x5, [up,#-48]
-	sub	rp, rp, #32		C integrate with stp?
-	sub	up, up, #32		C integrate with ldp?
-	orr	x11, x11, x9
+L(top):	ldp	x4, x5, [up,#-16]
 	orr	x10, x10, x13
+	orr	x11, x12, x2
 	stp	x10, x11, [rp,#-16]
-L(lo2):	lsr	x11, x5, tnc
-	lsl	x13, x5, cnt
-	lsr	x10, x4, tnc
-	lsl	x9, x4, cnt
-	ldp	x6, x7, [up,#-32]
-	orr	x11, x11, x18
+	PSHIFT	x2, x6, cnt
+L(lo2):	NSHIFT	x10, x4, tnc
+	PSHIFT	x13, x5, cnt
+	NSHIFT	x12, x5, tnc
+	ldp	x6, x7, [up,#-32]!
 	orr	x10, x10, x13
-	stp	x10, x11, [rp,#-32]
-L(lo0):	sub	n, n, #4
-	lsr	x11, x7, tnc
-	lsl	x13, x7, cnt
-	lsr	x10, x6, tnc
-	lsl	x18, x6, cnt
-	cbnz	n, L(top)
+	orr	x11, x12, x2
+	stp	x10, x11, [rp,#-32]!
+	PSHIFT	x2, x4, cnt
+L(lo0):	sub	x18, x18, #1
+L(lo3):	NSHIFT	x10, x6, tnc
+	PSHIFT	x13, x7, cnt
+	NSHIFT	x12, x7, tnc
+	cbnz	x18, L(top)
 
-L(end):	orr	x11, x11, x9
-	orr	x10, x10, x13
-	stp	x10, x11, [rp,#-48]
-	str	x18, [rp,#-56]
+L(end):	orr	x10, x10, x13
+	orr	x11, x12, x2
+	PSHIFT	x2, x6, cnt
+	stp	x10, x11, [rp,#-16]
+	str	x2, [rp,#-24]
 	ret
 EPILOGUE()
diff -r 814cfb09af29 -r 2567c1119e14 mpn/arm64/lshiftc.asm
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/arm64/lshiftc.asm	Wed Feb 22 02:27:55 2017 +0100
@@ -0,0 +1,130 @@
+dnl  ARM64 mpn_lshiftc.
+
+dnl  Copyright 2013, 2014, 2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of the GNU Lesser General Public License as published
+dnl  by the Free Software Foundation; either version 3 of the License, or (at
+dnl  your option) any later version.
+
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+dnl  License for more details.
+
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb   assumed optimal c/l
+C Cortex-A53	3.5-4.0		 3.25
+C Cortex-A57	 2.0		 2.0
+C X-Gene	 2.67		 2.5
+
+C TODO
+C  * The feed-in code used 1 ldr for odd sizes and 2 ldr for even sizes.  These
+C    numbers should be 1 and 0, respectively.  The str in wind-down should also
+C    go.
+C  * Using extr and with 63 separate loops we might reach 1.5 c/l on A57.
+C  * A53's speed depends on alignment, tune/speed -w1 gives 3.5, -w0 gives 4.0.
+
+changecom(blah)
+
+define(`rp_arg', `x0')
+define(`up',     `x1')
+define(`n',      `x2')
+define(`cnt',    `x3')
+
+define(`rp',     `x16')
+
+define(`tnc',`x8')
+
+define(`PSHIFT', lsl)
+define(`NSHIFT', lsr)
+
+ASM_START()
+PROLOGUE(mpn_lshiftc)
+	add	rp, rp_arg, n, lsl #3
+	add	up, up, n, lsl #3
+	sub	tnc, xzr, cnt
+	lsr	x18, n, #2
+	tbz	n, #0, L(bx0)
+
+L(bx1):	ldr	x4, [up,#-8]
+	tbnz	n, #1, L(b11)
+
+L(b01):	NSHIFT	x0, x4, tnc
+	PSHIFT	x2, x4, cnt
+	cbnz	x18, L(gt1)
+	mvn	x2, x2
+	str	x2, [rp,#-8]
+	ret
+L(gt1):	ldp	x4, x5, [up,#-24]
+	sub	up, up, #8
+	add	rp, rp, #16
+	b	L(lo2)
+
+L(b11):	NSHIFT	x0, x4, tnc
+	PSHIFT	x2, x4, cnt
+	ldp	x6, x7, [up,#-24]!
+	b	L(lo3)
+
+L(bx0):	ldp	x4, x5, [up,#-16]
+	tbz	n, #1, L(b00)
+
+L(b10):	NSHIFT	x0, x5, tnc
+	PSHIFT	x13, x5, cnt
+	NSHIFT	x10, x4, tnc
+	PSHIFT	x2, x4, cnt
+	cbnz	x18, L(gt2)
+	eon	x10, x10, x13
+	mvn	x2, x2
+	stp	x2, x10, [rp,#-16]
+	ret
+L(gt2):	ldp	x4, x5, [up,#-32]
+	eon	x10, x10, x13
+	str	x10, [rp,#-8]
+	sub	up, up, #16
+	add	rp, rp, #8