[Gmp-commit] /var/hg/gmp: 5 new changesets
mercurial at gmplib.org
mercurial at gmplib.org
Wed May 1 20:54:19 CEST 2013
details: /var/hg/gmp/rev/36af66a56e21
changeset: 15779:36af66a56e21
user: Torbjorn Granlund <tege at gmplib.org>
date: Wed May 01 20:44:17 2013 +0200
description:
(addxccc): Allow g2 as input.
(umulxhi): Save and restore o7 to allow it as in/out parameter.
details: /var/hg/gmp/rev/6513190b6ca7
changeset: 15780:6513190b6ca7
user: Torbjorn Granlund <tege at gmplib.org>
date: Wed May 01 20:46:34 2013 +0200
description:
Optimise lead-in code.
details: /var/hg/gmp/rev/114c58ec1557
changeset: 15781:114c58ec1557
user: Torbjorn Granlund <tege at gmplib.org>
date: Wed May 01 20:49:38 2013 +0200
description:
(GMP_MULFUNC_CHOICES): Support mul_3 + addmul_3 and mul_4 + addmul_4.
details: /var/hg/gmp/rev/bf54de40fc47
changeset: 15782:bf54de40fc47
user: Torbjorn Granlund <tege at gmplib.org>
date: Wed May 01 20:53:21 2013 +0200
description:
Add SPARC T3 mpn_mul_4 and mpn_addmul_4.
details: /var/hg/gmp/rev/b5fac5fb3c21
changeset: 15783:b5fac5fb3c21
user: Torbjorn Granlund <tege at gmplib.org>
date: Wed May 01 20:54:14 2013 +0200
description:
ChangeLog
diffstat:
ChangeLog | 12 +
configure.ac | 2 +
mpn/sparc64/ultrasparct3/aormul_2.asm | 16 +-
mpn/sparc64/ultrasparct3/aormul_4.asm | 208 ++++++++++++++++++++++++++++++++++
mpn/sparc64/ultrasparct3/missing.m4 | 9 +-
5 files changed, 236 insertions(+), 11 deletions(-)
diffs (truncated from 330 to 300 lines):
diff -r f1c123c3a516 -r b5fac5fb3c21 ChangeLog
--- a/ChangeLog Tue Apr 30 01:02:02 2013 +0200
+++ b/ChangeLog Wed May 01 20:54:14 2013 +0200
@@ -1,3 +1,15 @@
+2013-05-01 Torbjorn Granlund <tege at gmplib.org>
+
+ * mpn/sparc64/ultrasparct3/aormul_4.asm: New file.
+
+ * configure.ac (GMP_MULFUNC_CHOICES): Support mul_3 + addmul_3 and
+ mul_4 + addmul_4.
+
+ * mpn/sparc64/ultrasparct3/aormul_2.asm: Optimise lead-in code.
+
+ * mpn/sparc64/ultrasparct3/missing.m4 (addxccc): Allow g2 as input.
+ (umulxhi): Save and restore o7 to allow it as in/out parameter.
+
2013-04-29 Torbjorn Granlund <tege at gmplib.org>
* mpn/arm/v7a/cora15/cnd_aors_n.asm: New file, was mis-named.
diff -r f1c123c3a516 -r b5fac5fb3c21 configure.ac
--- a/configure.ac Tue Apr 30 01:02:02 2013 +0200
+++ b/configure.ac Wed May 01 20:54:14 2013 +0200
@@ -2808,6 +2808,8 @@
cnd_add_n|cnd_sub_n) tmp_mulfunc="cnd_aors_n" ;;
addmul_1|submul_1) tmp_mulfunc="aorsmul_1" ;;
mul_2|addmul_2) tmp_mulfunc="aormul_2" ;;
+ mul_3|addmul_3) tmp_mulfunc="aormul_3" ;;
+ mul_4|addmul_4) tmp_mulfunc="aormul_4" ;;
popcount|hamdist) tmp_mulfunc="popham" ;;
and_n|andn_n|nand_n | ior_n|iorn_n|nior_n | xor_n|xnor_n)
tmp_mulfunc="logops_n" ;;
diff -r f1c123c3a516 -r b5fac5fb3c21 mpn/sparc64/ultrasparct3/aormul_2.asm
--- a/mpn/sparc64/ultrasparct3/aormul_2.asm Tue Apr 30 01:02:02 2013 +0200
+++ b/mpn/sparc64/ultrasparct3/aormul_2.asm Wed May 01 20:54:14 2013 +0200
@@ -1,5 +1,7 @@
dnl SPARC v9 mpn_mul_2 and mpn_addmul_2 for T3/T4/T5.
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
dnl Copyright 2013 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
@@ -26,9 +28,8 @@
C UltraSPARC T4: 3.25 3.75
-C The code is reasonably scheduled but also relies on OoO. Micro-scheduling
-C remains to be done. There is hope that this could run at around 3.0 and 3.5
-C c/l respectively, on T4 if an optimal schedule is found. Two cycles per
+C The code is reasonably scheduled but also relies on OoO. There was hope that
+C this could run at around 3.0 and 3.5 c/l respectively, on T4. Two cycles per
C iteration needs to be removed.
C
C We could almost use 2-way unrolling, but currently the wN registers live too
@@ -47,14 +48,12 @@
define(`v0', `%o0')
define(`v1', `%o1')
+
define(`w0', `%o2')
define(`w1', `%o3')
define(`w2', `%o4')
define(`w3', `%o5')
-C Free or little used registers: o7, g4, g5. We use g2 for addxccc emulation.
-C l0,l6, l1,l3, l5,l7 and l2,l4 could be coalesced.
-
ifdef(`OPERATION_mul_2',`
define(`AM2', `')
define(`ADDX', `addcc`'$1')
@@ -76,11 +75,10 @@
save %sp, -176, %sp
ldx [vp+0], v0 C load v0
+ and n, 3, %g5
ldx [vp+8], v1 C load v1
+ add n, -6, n
ldx [up+0], %g4
-
- and n, 3, %g5
- add n, -6, n
brz %g5, L(b0)
cmp %g5, 2
bcs L(b1)
diff -r f1c123c3a516 -r b5fac5fb3c21 mpn/sparc64/ultrasparct3/aormul_4.asm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/sparc64/ultrasparct3/aormul_4.asm Wed May 01 20:54:14 2013 +0200
@@ -0,0 +1,208 @@
+dnl SPARC v9 mpn_mul_4 and mpn_addmul_4 for T3/T4/T5.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb cycles/limb
+C mul_4 addmul_4
+C UltraSPARC T3: ? ?
+C UltraSPARC T4: 2.5? 2.75?
+
+
+C The code is well-scheduled and relies on OoO very little. There is hope that
+C this will run at around 2.5 and 2.75 c/l respectively, on T4.
+
+define(`rp', `%i0')
+define(`up', `%i1')
+define(`n', `%i2')
+define(`vp', `%i3')
+
+define(`v0', `%g1')
+define(`v1', `%o7')
+define(`v2', `%g2')
+define(`v3', `%i3')
+
+define(`w0', `%o0')
+define(`w1', `%o1')
+define(`w2', `%o2')
+define(`w3', `%o3')
+define(`w4', `%o4')
+
+define(`r0', `%o5')
+
+define(`u0', `%i4')
+define(`u1', `%i5')
+
+define(`rp0', `rp')
+define(`rp1', `%g3')
+define(`rp2', `%g4')
+define(`up0', `up')
+define(`up1', `%g5')
+
+ifdef(`OPERATION_mul_4',`
+ define(`AM4', `')
+ define(`ADDX', `addcc`'$1')
+ define(`func', `mpn_mul_4')
+')
+ifdef(`OPERATION_addmul_4',`
+ define(`AM4', `$1')
+ define(`ADDX', `addxccc($1,$2,$3)')
+ define(`func', `mpn_addmul_4')
+')
+
+
+MULFUNC_PROLOGUE(mpn_mul_4 mpn_addmul_4)
+
+ASM_START()
+ REGISTER(%g2,#scratch)
+ REGISTER(%g3,#scratch)
+PROLOGUE(func)
+ save %sp, -176, %sp
+
+ ldx [up + 0], u1 C load up[0] early
+ andcc n, 1, %g0 C is n odd?
+ ldx [vp + 0], v0
+ sllx n, 3, n
+ ldx [vp + 8], v1
+ add n, -28, n
+ ldx [vp + 16], v2
+ add rp, -16, rp
+ ldx [vp + 24], v3
+ add up, n, up0
+ add rp, n, rp0
+ add up0, 8, up1
+ add rp0, 8, rp1
+ add rp0, 16, rp2
+ mulx u1, v0, %l0
+ mov 0, w0
+ mulx u1, v1, %l1
+ mov 0, w1
+ mulx u1, v2, %l2
+ mov 0, w2
+ mulx u1, v3, %l3
+ mov 0, w3
+
+ be L(evn)
+ neg n, n
+
+L(odd): mov u1, u0
+ ldx [up1 + n], u1
+AM4(` ldx [rp2 + n], r0')
+ umulxhi(u0, v0, %l4)
+ umulxhi(u0, v1, %l5)
+ umulxhi(u0, v2, %l6)
+ umulxhi(u0, v3, %l7)
+ b L(mid)
+ add n, 8, n
+
+L(evn): ldx [up1 + n], u0
+AM4(` ldx [rp2 + n], r0')
+ umulxhi(u1, v0, %l4)
+ umulxhi(u1, v1, %l5)
+ umulxhi(u1, v2, %l6)
+ umulxhi(u1, v3, %l7)
+ add n, 16, n
+
+ ALIGN(16)
+L(top): addcc %l0, w0, w0
+ mulx u0, v0, %l0 C w 0
+ addxccc(%l1, w1, w1)
+ mulx u0, v1, %l1 C w 1
+ addxccc(%l2, w2, w2)
+ mulx u0, v2, %l2 C w 2
+ addxccc(%l3, w3, w3)
+ mulx u0, v3, %l3 C w 3
+ ldx [up0 + n], u1
+ addxc( %g0, %g0, w4)
+AM4(` addcc r0, w0, w0')
+ stx w0, [rp0 + n]
+ ADDX(` %l4, w1, w0')
+ umulxhi(u0, v0, %l4) C w 1
+AM4(` ldx [rp1 + n], r0')
+ addxccc(%l5, w2, w1)
+ umulxhi(u0, v1, %l5) C w 2
+ addxccc(%l6, w3, w2)
+ umulxhi(u0, v2, %l6) C w 3
+ addxc( %l7, w4, w3)
+ umulxhi(u0, v3, %l7) C w 4
+L(mid): addcc %l0, w0, w0
+ mulx u1, v0, %l0 C w 1
+ addxccc(%l1, w1, w1)
+ mulx u1, v1, %l1 C w 2
+ addxccc(%l2, w2, w2)
+ mulx u1, v2, %l2 C w 3
+ addxccc(%l3, w3, w3)
+ mulx u1, v3, %l3 C w 4
+ ldx [up1 + n], u0
+ addxc( %g0, %g0, w4)
+AM4(` addcc r0, w0, w0')
+ stx w0, [rp1 + n]
+ ADDX(` %l4, w1, w0')
+ umulxhi(u1, v0, %l4) C w 2
+AM4(` ldx [rp2 + n], r0')
+ addxccc(%l5, w2, w1)
+ umulxhi(u1, v1, %l5) C w 3
+ addxccc(%l6, w3, w2)
+ umulxhi(u1, v2, %l6) C w 4
+ addxc( %l7, w4, w3)
+ umulxhi(u1, v3, %l7) C w 5
+ brlz n, L(top)
+ add n, 16, n
+
+L(end): addcc %l0, w0, w0
+ mulx u0, v0, %l0
+ addxccc(%l1, w1, w1)
+ mulx u0, v1, %l1
+ addxccc(%l2, w2, w2)
+ mulx u0, v2, %l2
+ addxccc(%l3, w3, w3)
+ mulx u0, v3, %l3
+ addxc( %g0, %g0, w4)
+AM4(` addcc r0, w0, w0')
+ stx w0, [rp0 + n]
+ ADDX(` %l4, w1, w0')
+ umulxhi(u0, v0, %l4)
+AM4(` ldx [rp1 + n], r0')
+ addxccc(%l5, w2, w1)
+ umulxhi(u0, v1, %l5)
+ addxccc(%l6, w3, w2)
+ umulxhi(u0, v2, %l6)
+ addxc( %l7, w4, w3)
+ umulxhi(u0, v3, %l7)
+ addcc %l0, w0, w0
+ addxccc(%l1, w1, w1)
+ addxccc(%l2, w2, w2)
+ addxccc(%l3, w3, w3)
+ addxc( %g0, %g0, w4)
+AM4(` addcc r0, w0, w0')
+ stx w0, [rp1 + n]
+ ADDX(` %l4, w1, w0')
+ addxccc(%l5, w2, w1)
+ addxccc(%l6, w3, w2)
+ stx w0, [rp2 + n]
+ add n, 16, n
+ stx w1, [rp1 + n]
+ stx w2, [rp2 + n]
+ addxc( %l7, w4, %i0)
+ ret
+ restore
+EPILOGUE()
diff -r f1c123c3a516 -r b5fac5fb3c21 mpn/sparc64/ultrasparct3/missing.m4
--- a/mpn/sparc64/ultrasparct3/missing.m4 Tue Apr 30 01:02:02 2013 +0200
+++ b/mpn/sparc64/ultrasparct3/missing.m4 Wed May 01 20:54:14 2013 +0200
@@ -22,13 +22,17 @@
More information about the gmp-commit
mailing list