[Gmp-commit] /var/hg/gmp: 5 new changesets

mercurial at gmplib.org mercurial at gmplib.org
Wed May 1 20:54:19 CEST 2013


details:   /var/hg/gmp/rev/36af66a56e21
changeset: 15779:36af66a56e21
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Wed May 01 20:44:17 2013 +0200
description:
(addxccc): Allow g2 as input.
(umulxhi): Save and restore o7 to allow it as in/out parameter.

details:   /var/hg/gmp/rev/6513190b6ca7
changeset: 15780:6513190b6ca7
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Wed May 01 20:46:34 2013 +0200
description:
Optimise lead-in code.

details:   /var/hg/gmp/rev/114c58ec1557
changeset: 15781:114c58ec1557
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Wed May 01 20:49:38 2013 +0200
description:
(GMP_MULFUNC_CHOICES): Support mul_3 + addmul_3 and mul_4 + addmul_4.

details:   /var/hg/gmp/rev/bf54de40fc47
changeset: 15782:bf54de40fc47
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Wed May 01 20:53:21 2013 +0200
description:
Add SPARC T3 mpn_mul_4 and mpn_addmul_4.

details:   /var/hg/gmp/rev/b5fac5fb3c21
changeset: 15783:b5fac5fb3c21
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Wed May 01 20:54:14 2013 +0200
description:
ChangeLog

diffstat:

 ChangeLog                             |   12 +
 configure.ac                          |    2 +
 mpn/sparc64/ultrasparct3/aormul_2.asm |   16 +-
 mpn/sparc64/ultrasparct3/aormul_4.asm |  208 ++++++++++++++++++++++++++++++++++
 mpn/sparc64/ultrasparct3/missing.m4   |    9 +-
 5 files changed, 236 insertions(+), 11 deletions(-)

diffs (truncated from 330 to 300 lines):

diff -r f1c123c3a516 -r b5fac5fb3c21 ChangeLog
--- a/ChangeLog	Tue Apr 30 01:02:02 2013 +0200
+++ b/ChangeLog	Wed May 01 20:54:14 2013 +0200
@@ -1,3 +1,15 @@
+2013-05-01  Torbjorn Granlund  <tege at gmplib.org>
+
+	* mpn/sparc64/ultrasparct3/aormul_4.asm: New file.
+
+	* configure.ac (GMP_MULFUNC_CHOICES): Support mul_3 + addmul_3 and
+	mul_4 + addmul_4.
+
+	* mpn/sparc64/ultrasparct3/aormul_2.asm: Optimise lead-in code.
+
+	* mpn/sparc64/ultrasparct3/missing.m4 (addxccc): Allow g2 as input.
+	(umulxhi): Save and restore o7 to allow it as in/out parameter.
+
 2013-04-29  Torbjorn Granlund  <tege at gmplib.org>
 
 	* mpn/arm/v7a/cora15/cnd_aors_n.asm: New file, was mis-named.
diff -r f1c123c3a516 -r b5fac5fb3c21 configure.ac
--- a/configure.ac	Tue Apr 30 01:02:02 2013 +0200
+++ b/configure.ac	Wed May 01 20:54:14 2013 +0200
@@ -2808,6 +2808,8 @@
   cnd_add_n|cnd_sub_n) tmp_mulfunc="cnd_aors_n"   ;;
   addmul_1|submul_1) tmp_mulfunc="aorsmul_1" ;;
   mul_2|addmul_2)    tmp_mulfunc="aormul_2" ;;
+  mul_3|addmul_3)    tmp_mulfunc="aormul_3" ;;
+  mul_4|addmul_4)    tmp_mulfunc="aormul_4" ;;
   popcount|hamdist)  tmp_mulfunc="popham"    ;;
   and_n|andn_n|nand_n | ior_n|iorn_n|nior_n | xor_n|xnor_n)
                      tmp_mulfunc="logops_n"  ;;
diff -r f1c123c3a516 -r b5fac5fb3c21 mpn/sparc64/ultrasparct3/aormul_2.asm
--- a/mpn/sparc64/ultrasparct3/aormul_2.asm	Tue Apr 30 01:02:02 2013 +0200
+++ b/mpn/sparc64/ultrasparct3/aormul_2.asm	Wed May 01 20:54:14 2013 +0200
@@ -1,5 +1,7 @@
 dnl  SPARC v9 mpn_mul_2 and mpn_addmul_2 for T3/T4/T5.
 
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
 dnl  Copyright 2013 Free Software Foundation, Inc.
 
 dnl  This file is part of the GNU MP Library.
@@ -26,9 +28,8 @@
 C UltraSPARC T4:	 3.25		 3.75
 
 
-C The code is reasonably scheduled but also relies on OoO.  Micro-scheduling
-C remains to be done.  There is hope that this could run at around 3.0 and 3.5
-C c/l respectively, on T4 if an optimal schedule is found.  Two cycles per
+C The code is reasonably scheduled but also relies on OoO.  There was hope that
+C this could run at around 3.0 and 3.5 c/l respectively, on T4.  Two cycles per
 C iteration needs to be removed.
 C
 C We could almost use 2-way unrolling, but currently the wN registers live too
@@ -47,14 +48,12 @@
 
 define(`v0', `%o0')
 define(`v1', `%o1')
+
 define(`w0', `%o2')
 define(`w1', `%o3')
 define(`w2', `%o4')
 define(`w3', `%o5')
 
-C Free or little used registers: o7, g4, g5.  We use g2 for addxccc emulation.
-C l0,l6, l1,l3, l5,l7 and l2,l4 could be coalesced.
-
 ifdef(`OPERATION_mul_2',`
       define(`AM2',      `')
       define(`ADDX',	 `addcc`'$1')
@@ -76,11 +75,10 @@
 	save	%sp, -176, %sp
 
 	ldx	[vp+0], v0		C load v0
+	and	n, 3, %g5
 	ldx	[vp+8], v1		C load v1
+	add	n, -6, n
 	ldx	[up+0], %g4
-
-	and	n, 3, %g5
-	add	n, -6, n
 	brz	%g5, L(b0)
 	 cmp	%g5, 2
 	bcs	L(b1)
diff -r f1c123c3a516 -r b5fac5fb3c21 mpn/sparc64/ultrasparct3/aormul_4.asm
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/sparc64/ultrasparct3/aormul_4.asm	Wed May 01 20:54:14 2013 +0200
@@ -0,0 +1,208 @@
+dnl  SPARC v9 mpn_mul_4 and mpn_addmul_4 for T3/T4/T5.
+
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of the GNU Lesser General Public License as published
+dnl  by the Free Software Foundation; either version 3 of the License, or (at
+dnl  your option) any later version.
+
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+dnl  License for more details.
+
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C		    cycles/limb      cycles/limb
+C		       mul_4           addmul_4
+C UltraSPARC T3:	 ?		 ?
+C UltraSPARC T4:	 2.5?		 2.75?
+
+
+C The code is well-scheduled and relies on OoO very little.  There is hope that
+C this will run at around 2.5 and 2.75 c/l respectively, on T4.
+
+define(`rp', `%i0')	C base for the stx stores; %i0 also receives the final addxc result
+define(`up', `%i1')	C base for the ldx loads into u0/u1
+define(`n',  `%i2')	C count on entry; scaled by 8 and negated to become the loop index
+define(`vp', `%i3')	C four words are loaded from here into v0..v3
+
+define(`v0', `%g1')	C v0..v3: multiplier words, loaded once before the loop
+define(`v1', `%o7')
+define(`v2', `%g2')
+define(`v3', `%i3')	C same register as vp, so the v3 load must be the last use of vp
+
+define(`w0', `%o0')	C w0..w3: running sum words carried between loop halves
+define(`w1', `%o1')
+define(`w2', `%o2')
+define(`w3', `%o3')
+define(`w4', `%o4')	C receives the carry out of each addxccc chain
+
+define(`r0', `%o5')	C word loaded from rp; only used on AM4 lines
+
+define(`u0', `%i4')	C u0/u1: alternating words loaded from up
+define(`u1', `%i5')
+
+define(`rp0', `rp')	C rp0 is rp itself; rp1 and rp2 are set to rp0+8 and rp0+16
+define(`rp1', `%g3')
+define(`rp2', `%g4')
+define(`up0', `up')	C up0 is up itself; up1 is set to up0+8
+define(`up1', `%g5')
+
+ifdef(`OPERATION_mul_4',`
+      define(`AM4',      `')
+      define(`ADDX',	 `addcc`'$1')
+      define(`func',     `mpn_mul_4')
+')
+ifdef(`OPERATION_addmul_4',`
+      define(`AM4',      `$1')
+      define(`ADDX',	 `addxccc($1,$2,$3)')
+      define(`func',     `mpn_addmul_4')
+')
+
+
+MULFUNC_PROLOGUE(mpn_mul_4 mpn_addmul_4)
+
+ASM_START()
+	REGISTER(%g2,#scratch)
+	REGISTER(%g3,#scratch)
+PROLOGUE(func)	C func is mpn_mul_4 or mpn_addmul_4, chosen by the OPERATION_ ifdefs
+	save	%sp, -176, %sp		C new register window with a 176-byte frame
+
+	ldx	[up + 0], u1		C load up[0] early
+	andcc	n, 1, %g0		C is n odd?
+	ldx	[vp + 0], v0
+	sllx	n, 3, n		C n *= 8, the step between consecutive ldx/stx words
+	ldx	[vp + 8], v1
+	add	n, -28, n		C bias folded into up0/rp0 below; cancels once n is negated
+	ldx	[vp + 16], v2
+	add	rp, -16, rp
+	ldx	[vp + 24], v3		C v3 is %i3, same as vp: this is the last load through vp
+	add	up, n, up0		C up0 is up itself, now advanced by the biased n
+	add	rp, n, rp0		C rp0 is rp itself, now advanced by the biased n
+	add	up0, 8, up1
+	add	rp0, 8, rp1
+	add	rp0, 16, rp2
+	mulx	u1, v0, %l0		C l0..l3 = mulx results of up[0] with v0..v3
+	mov	0, w0		C clear the running sum words w0..w3
+	mulx	u1, v1, %l1
+	mov	0, w1
+	mulx	u1, v2, %l2
+	mov	0, w2
+	mulx	u1, v3, %l3
+	mov	0, w3
+
+	be	L(evn)		C taken when the andcc above found n even
+	 neg	n, n		C delay slot, runs on both paths: n = -n
+
+L(odd):	mov	u1, u0
+	ldx	[up1 + n], u1
+AM4(`	ldx	[rp2 + n], r0')
+	umulxhi(u0, v0, %l4)		C l4..l7 = umulxhi results for the same operands as l0..l3
+	umulxhi(u0, v1, %l5)
+	umulxhi(u0, v2, %l6)
+	umulxhi(u0, v3, %l7)
+	b	L(mid)		C odd n enters the loop at its second half
+	 add	n, 8, n		C delay slot
+
+L(evn):	ldx	[up1 + n], u0
+AM4(`	ldx	[rp2 + n], r0')
+	umulxhi(u1, v0, %l4)		C l4..l7 = umulxhi results for the same operands as l0..l3
+	umulxhi(u1, v1, %l5)
+	umulxhi(u1, v2, %l6)
+	umulxhi(u1, v3, %l7)
+	add	n, 16, n
+
+	ALIGN(16)
+L(top):	addcc	%l0, w0, w0		C first loop half: add the pending mulx words into w0..w3
+	mulx	u0, v0, %l0	C w 0
+	addxccc(%l1, w1, w1)
+	mulx	u0, v1, %l1	C w 1
+	addxccc(%l2, w2, w2)
+	mulx	u0, v2, %l2	C w 2
+	addxccc(%l3, w3, w3)
+	mulx	u0, v3, %l3	C w 3
+	ldx	[up0 + n], u1
+	addxc(	%g0, %g0, w4)		C w4 = carry out of the chain above
+AM4(`	addcc	r0, w0, w0')		C AM4 lines are emitted only when OPERATION_addmul_4 is defined
+	stx	w0, [rp0 + n]		C lowest word is final; store it
+	ADDX(`	%l4, w1, w0')		C ADDX is addcc under OPERATION_mul_4, addxccc under OPERATION_addmul_4
+	umulxhi(u0, v0, %l4)	C w 1
+AM4(`	ldx	[rp1 + n], r0')
+	addxccc(%l5, w2, w1)		C sums move down one register: w1 to w0, w2 to w1, and so on
+	umulxhi(u0, v1, %l5)	C w 2
+	addxccc(%l6, w3, w2)
+	umulxhi(u0, v2, %l6)	C w 3
+	addxc(	%l7, w4, w3)
+	umulxhi(u0, v3, %l7)	C w 4
+L(mid):	addcc	%l0, w0, w0		C second loop half: same steps with u0/u1 and rp0/rp1 swapped
+	mulx	u1, v0, %l0	C w 1
+	addxccc(%l1, w1, w1)
+	mulx	u1, v1, %l1	C w 2
+	addxccc(%l2, w2, w2)
+	mulx	u1, v2, %l2	C w 3
+	addxccc(%l3, w3, w3)
+	mulx	u1, v3, %l3	C w 4
+	ldx	[up1 + n], u0
+	addxc(	%g0, %g0, w4)
+AM4(`	addcc	r0, w0, w0')
+	stx	w0, [rp1 + n]
+	ADDX(`	%l4, w1, w0')
+	umulxhi(u1, v0, %l4)	C w 2
+AM4(`	ldx	[rp2 + n], r0')
+	addxccc(%l5, w2, w1)
+	umulxhi(u1, v1, %l5)	C w 3
+	addxccc(%l6, w3, w2)
+	umulxhi(u1, v2, %l6)	C w 4
+	addxc(	%l7, w4, w3)
+	umulxhi(u1, v3, %l7)	C w 5
+	brlz	n, L(top)		C repeat while the index register n is negative
+	 add	n, 16, n		C delay slot: advance the index by two words
+
+L(end):	addcc	%l0, w0, w0		C wind-down: same steps as the loop, no further loads from up
+	mulx	u0, v0, %l0
+	addxccc(%l1, w1, w1)
+	mulx	u0, v1, %l1
+	addxccc(%l2, w2, w2)
+	mulx	u0, v2, %l2
+	addxccc(%l3, w3, w3)
+	mulx	u0, v3, %l3
+	addxc(	%g0, %g0, w4)
+AM4(`	addcc	r0, w0, w0')
+	stx	w0, [rp0 + n]
+	ADDX(`	%l4, w1, w0')
+	umulxhi(u0, v0, %l4)
+AM4(`	ldx	[rp1 + n], r0')
+	addxccc(%l5, w2, w1)
+	umulxhi(u0, v1, %l5)
+	addxccc(%l6, w3, w2)
+	umulxhi(u0, v2, %l6)
+	addxc(	%l7, w4, w3)
+	umulxhi(u0, v3, %l7)
+	addcc	%l0, w0, w0		C last set of products: additions only, no new multiplies
+	addxccc(%l1, w1, w1)
+	addxccc(%l2, w2, w2)
+	addxccc(%l3, w3, w3)
+	addxc(	%g0, %g0, w4)
+AM4(`	addcc	r0, w0, w0')
+	stx	w0, [rp1 + n]
+	ADDX(`	%l4, w1, w0')
+	addxccc(%l5, w2, w1)
+	addxccc(%l6, w3, w2)
+	stx	w0, [rp2 + n]
+	add	n, 16, n		C step the index so rp1/rp2 address the next two words
+	stx	w1, [rp1 + n]
+	stx	w2, [rp2 + n]
+	addxc(	%l7, w4, %i0)		C last sum goes to %i0 rather than memory
+	ret
+	 restore		C delay slot of ret
+EPILOGUE()
diff -r f1c123c3a516 -r b5fac5fb3c21 mpn/sparc64/ultrasparct3/missing.m4
--- a/mpn/sparc64/ultrasparct3/missing.m4	Tue Apr 30 01:02:02 2013 +0200
+++ b/mpn/sparc64/ultrasparct3/missing.m4	Wed May 01 20:54:14 2013 +0200
@@ -22,13 +22,17 @@


More information about the gmp-commit mailing list