[Gmp-commit] /var/hg/gmp: 6 new changesets

mercurial at gmplib.org mercurial at gmplib.org
Fri Apr 26 00:13:55 CEST 2013


details:   /var/hg/gmp/rev/66d488db0ee2
changeset: 15747:66d488db0ee2
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Thu Apr 25 22:08:07 2013 +0200
description:
ARM A15 submul_1.

details:   /var/hg/gmp/rev/229974a3a698
changeset: 15748:229974a3a698
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Thu Apr 25 22:08:45 2013 +0200
description:
ARM A15 com.

details:   /var/hg/gmp/rev/f36214a935f6
changeset: 15749:f36214a935f6
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Thu Apr 25 22:13:22 2013 +0200
description:
Conditionally suppress conditionally used code.

details:   /var/hg/gmp/rev/b79fabc0d7eb
changeset: 15750:b79fabc0d7eb
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Thu Apr 25 22:15:58 2013 +0200
description:
Clear carry smarter.

details:   /var/hg/gmp/rev/f8b08b239a9a
changeset: 15751:f8b08b239a9a
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Thu Apr 25 22:16:44 2013 +0200
description:
Collect header comments.

details:   /var/hg/gmp/rev/fe5fa317ad04
changeset: 15752:fe5fa317ad04
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Fri Apr 26 00:13:51 2013 +0200
description:
ChangeLog

diffstat:

 ChangeLog                       |   11 ++
 mpn/arm/mod_34lsub1.asm         |    4 +-
 mpn/arm/v6/submul_1.asm         |   10 +-
 mpn/arm/v7a/cora15/com.asm      |  169 ++++++++++++++++++++++++++++++++++++++++
 mpn/arm/v7a/cora15/logops_n.asm |    2 +
 mpn/arm/v7a/cora15/submul_1.asm |  148 +++++++++++++++++++++++++++++++++++
 6 files changed, 337 insertions(+), 7 deletions(-)

diffs (truncated from 407 to 300 lines):

diff -r 46bfe0a1bb40 -r fe5fa317ad04 ChangeLog
--- a/ChangeLog	Wed Apr 24 01:13:44 2013 +0200
+++ b/ChangeLog	Fri Apr 26 00:13:51 2013 +0200
@@ -1,5 +1,16 @@
+2013-04-25  Torbjorn Granlund  <tege at gmplib.org>
+
+	* mpn/arm/mod_34lsub1.asm: Clear carry smarter.
+
+	* mpn/arm/v7a/cora15/logops_n.asm: Conditionally suppress conditionally
+	used code.
+
+	* mpn/arm/v7a/cora15/submul_1.asm: New file.
+
 2013-04-24  Torbjorn Granlund  <tege at gmplib.org>
 
+	* mpn/arm/v7a/cora15/com.asm: New file.
+
 	* mpn/arm/v7a/cora15/logops_n.asm: New file.
 
 2013-04-19  Torbjorn Granlund  <tege at gmplib.org>
diff -r 46bfe0a1bb40 -r fe5fa317ad04 mpn/arm/mod_34lsub1.asm
--- a/mpn/arm/mod_34lsub1.asm	Wed Apr 24 01:13:44 2013 +0200
+++ b/mpn/arm/mod_34lsub1.asm	Fri Apr 26 00:13:51 2013 +0200
@@ -1,6 +1,6 @@
 dnl  ARM mpn_mod_34lsub1 -- remainder modulo 2^24-1.
 
-dnl  Copyright 2012 Free Software Foundation, Inc.
+dnl  Copyright 2012, 2013 Free Software Foundation, Inc.
 
 dnl  This file is part of the GNU MP Library.
 
@@ -49,7 +49,7 @@
 	ldmia	ap!, { r2, r3, r12 }
 	subs	n, n, #3
 	blt	L(sum)			C n <= 5
-	adds	r0, r0, #0		C clear carry
+	cmn	r0, #0			C clear carry
 	sub	n, n, #3
 	b	L(mid)
 
diff -r 46bfe0a1bb40 -r fe5fa317ad04 mpn/arm/v6/submul_1.asm
--- a/mpn/arm/v6/submul_1.asm	Wed Apr 24 01:13:44 2013 +0200
+++ b/mpn/arm/v6/submul_1.asm	Fri Apr 26 00:13:51 2013 +0200
@@ -27,6 +27,11 @@
 C Cortex-A9	 3.75
 C Cortex-A15	 4.0
 
+C This loop complements U on the fly,
+C   U' = B^n - 1 - U
+C and then uses that
+C   R - U*v = R + U'*v + v - B^n v
+
 C TODO
 C  * Micro-optimise feed-in code.
 C  * Optimise for n=1,2 by delaying register saving.
@@ -37,11 +42,6 @@
 define(`n', `r2')
 define(`v0',`r3')
 
-C This loop complements U on the fly,
-C   U' = B^n - 1 - U
-C and then uses that 
-C   R - U*v = R + U'*v + v - B^n v
-
 ASM_START()
 PROLOGUE(mpn_submul_1)
 	stmfd	sp!, { r4, r5, r6, r7 }
diff -r 46bfe0a1bb40 -r fe5fa317ad04 mpn/arm/v7a/cora15/com.asm
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/arm/v7a/cora15/com.asm	Fri Apr 26 00:13:51 2013 +0200
@@ -0,0 +1,169 @@
+dnl  ARM mpn_com optimised for A15.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of the GNU Lesser General Public License as published
+dnl  by the Free Software Foundation; either version 3 of the License, or (at
+dnl  your option) any later version.
+
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+dnl  License for more details.
+
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C            cycles/limb
+C StrongARM	 ?
+C XScale	 ?
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	2.5
+C Cortex-A15	1.0
+
+C This is great A15 core register code, but it is a bit large.
+C We use FEEDIN_VARIANT 1 to save some space, but use 8-way unrolling.
+
+C Architecture requirements:
+C v5	-
+C v5t	-
+C v5te	ldrd strd
+C v6	-
+C v6t2	-
+C v7a	-
+
+define(`FEEDIN_VARIANT', 1)	C alternatives: 0 1 2
+define(`UNROLL', 4x2)	 	C alternatives: 4 4x2
+
+define(`rp', `r0')
+define(`up', `r1')
+define(`n',  `r2')
+
+ASM_START()
+PROLOGUE(mpn_com)
+	push	{ r4-r5,r8-r9 }
+
+ifelse(FEEDIN_VARIANT,0,`
+	ands	r12, n, #3
+	mov	n, n, lsr #2
+	beq	L(b00a)
+	tst	r12, #1
+	beq	L(bx0)
+	ldr	r5, [up], #4
+	mvn	r9, r5
+	str	r9, [rp], #4
+	tst	r12, #2
+	beq	L(b00)
+L(bx0):	ldrd	r4, r5, [up, #0]
+	sub	rp, rp, #8
+	b	L(lo)
+L(b00):	tst	n, n
+	beq	L(wd1)
+L(b00a):ldrd	r4, r5, [up], #-8
+	sub	rp, rp, #16
+	b	L(mid)
+')
+ifelse(FEEDIN_VARIANT,1,`
+	and	r12, n, #3
+	mov	n, n, lsr #2
+	tst	r12, #1
+	beq	L(bx0)
+	ldr	r5, [up], #4
+	mvn	r9, r5
+	str	r9, [rp], #4
+L(bx0):	tst	r12, #2
+	beq	L(b00)
+	ldrd	r4, r5, [up, #0]
+	sub	rp, rp, #8
+	b	L(lo)
+L(b00):	tst	n, n
+	beq	L(wd1)
+	ldrd	r4, r5, [up], #-8
+	sub	rp, rp, #16
+	b	L(mid)
+')
+ifelse(FEEDIN_VARIANT,2,`
+	ands	r12, n, #3
+	mov	n, n, lsr #2
+	beq	L(b00)
+	cmp	r12, #2
+	bcc	L(b01)
+	beq	L(b10)
+
+L(b11):	ldr	r5, [up], #4
+	mvn	r9, r5
+	ldrd	r4, r5, [up, #0]
+	str	r9, [rp], #-4
+	b	L(lo)
+
+L(b00):	ldrd	r4, r5, [up], #-8
+	sub	rp, rp, #16
+	b	L(mid)
+
+L(b01):	ldr	r5, [up], #-4
+	mvn	r9, r5
+	str	r9, [rp], #-12
+	tst	n, n
+	beq	L(wd1)
+L(gt1):	ldrd	r4, r5, [up, #8]
+	b	L(mid)
+
+L(b10):	ldrd	r4, r5, [up]
+	sub	rp, rp, #8
+	b	L(lo)
+')
+	ALIGN(16)
+ifelse(UNROLL,4,`
+L(top):	ldrd	r4, r5, [up, #8]
+	strd	r8, r9, [rp, #8]
+L(mid):	mvn	r8, r4
+	mvn	r9, r5
+	ldrd	r4, r5, [up, #16]!
+	strd	r8, r9, [rp, #16]!
+	sub	n, n, #1
+L(lo):	mvn	r8, r4
+	mvn	r9, r5
+	tst	n, n
+	bne	L(top)
+')
+ifelse(UNROLL,4x2,`
+L(top):	ldrd	r4, r5, [up, #8]
+	strd	r8, r9, [rp, #8]
+L(mid):	mvn	r8, r4
+	mvn	r9, r5
+	ldrd	r4, r5, [up, #16]
+	strd	r8, r9, [rp, #16]
+	mvn	r8, r4
+	mvn	r9, r5
+	sub	n, n, #2
+	tst	n, n
+	bmi	L(dne)
+	ldrd	r4, r5, [up, #24]
+	strd	r8, r9, [rp, #24]
+	mvn	r8, r4
+	mvn	r9, r5
+	ldrd	r4, r5, [up, #32]!
+	strd	r8, r9, [rp, #32]!
+L(lo):	mvn	r8, r4
+	mvn	r9, r5
+	tst	n, n
+	bne	L(top)
+')
+
+L(end):	strd	r8, r9, [rp, #8]
+L(wd1):	pop	{ r4-r5,r8-r9 }
+	bx	r14
+ifelse(UNROLL,4x2,`
+L(dne):	strd	r8, r9, [rp, #24]
+	pop	{ r4-r5,r8-r9 }
+	bx	r14
+')
+EPILOGUE()
diff -r 46bfe0a1bb40 -r fe5fa317ad04 mpn/arm/v7a/cora15/logops_n.asm
--- a/mpn/arm/v7a/cora15/logops_n.asm	Wed Apr 24 01:13:44 2013 +0200
+++ b/mpn/arm/v7a/cora15/logops_n.asm	Fri Apr 26 00:13:51 2013 +0200
@@ -232,9 +232,11 @@
 	strd	r8, r9, [rp, #8]
 L(wd1):	pop	{ r4-r9 }
 	bx	r14
+ifelse(UNROLL,4x2,`
 L(dne):	POSTOP(	r8)
 	POSTOP(	r9)
 	strd	r8, r9, [rp, #24]
 	pop	{ r4-r9 }
 	bx	r14
+')
 EPILOGUE()
diff -r 46bfe0a1bb40 -r fe5fa317ad04 mpn/arm/v7a/cora15/submul_1.asm
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/arm/v7a/cora15/submul_1.asm	Fri Apr 26 00:13:51 2013 +0200
@@ -0,0 +1,148 @@
+dnl  ARM mpn_submul_1 optimised for A15.
+
+dnl  Copyright 2012, 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of the GNU Lesser General Public License as published
+dnl  by the Free Software Foundation; either version 3 of the License, or (at
+dnl  your option) any later version.
+
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+dnl  License for more details.
+
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	     cycles/limb		best
+C StrongARM:     -
+C XScale	 ?
+C Cortex-A7	 ?
+C Cortex-A8	 ?
+C Cortex-A9	 5.75			3.75
+C Cortex-A15	 2.32			this
+
+C This code uses umlal and umaal for adding in the rp[] data, keeping the
+C recurrence path separate from any multiply instructions.  It performs well on
+C A15, but does not quite reach the multiply bandwidth the way the corresponding
+C addmul_1 code does.
+C
+C We don't use r12 due to ldrd and strd limitations.
+C
+C This loop complements U on the fly,
+C   U' = B^n - 1 - U
+C and then uses that
+C   R - U*v = R + U'*v + v - B^n v
+


More information about the gmp-commit mailing list