[Gmp-commit] /var/hg/gmp: 4 new changesets
mercurial at gmplib.org
mercurial at gmplib.org
Wed Nov 28 20:32:40 UTC 2018
details: /var/hg/gmp/rev/426728008c40
changeset: 17712:426728008c40
user: Torbjorn Granlund <tg at gmplib.org>
date: Wed Nov 28 21:30:42 2018 +0100
description:
Add a TODO item.
details: /var/hg/gmp/rev/45d7e26707c3
changeset: 17713:45d7e26707c3
user: Torbjorn Granlund <tg at gmplib.org>
date: Wed Nov 28 21:31:26 2018 +0100
description:
Streamline a branch.
details: /var/hg/gmp/rev/d3ca14fe79e8
changeset: 17714:d3ca14fe79e8
user: Torbjorn Granlund <tg at gmplib.org>
date: Wed Nov 28 21:31:38 2018 +0100
description:
New file.
details: /var/hg/gmp/rev/a42aebaf893d
changeset: 17715:a42aebaf893d
user: Torbjorn Granlund <tg at gmplib.org>
date: Wed Nov 28 21:31:44 2018 +0100
description:
New file.
diffstat:
mpn/powerpc64/mode64/p9/addmul_2.asm | 1 +
mpn/powerpc64/mode64/p9/mul_1.asm | 126 +++++++
mpn/powerpc64/mode64/p9/mul_2.asm | 1 +
mpn/powerpc64/mode64/p9/mul_basecase.asm | 3 +-
mpn/powerpc64/mode64/p9/sqr_basecase.asm | 555 +++++++++++++++++++++++++++++++
5 files changed, 684 insertions(+), 2 deletions(-)
diffs (truncated from 731 to 300 lines):
diff -r d11fcaf1a0cd -r a42aebaf893d mpn/powerpc64/mode64/p9/addmul_2.asm
--- a/mpn/powerpc64/mode64/p9/addmul_2.asm Wed Nov 21 08:22:18 2018 +0100
+++ b/mpn/powerpc64/mode64/p9/addmul_2.asm Wed Nov 28 21:31:44 2018 +0100
@@ -28,6 +28,7 @@
C * Not written with any power9 pipeline understanding.
C * The 4x unrolling was not motivated by any timing tests.
C * No local scheduling for performance tweaking has been done.
+C * Decrease load scheduling!
define(`rp', `r3')
define(`up', `r4')
diff -r d11fcaf1a0cd -r a42aebaf893d mpn/powerpc64/mode64/p9/mul_1.asm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/powerpc64/mode64/p9/mul_1.asm Wed Nov 28 21:31:44 2018 +0100
@@ -0,0 +1,126 @@
+dnl Power9 mpn_mul_1.
+
+dnl Copyright 2017, 2018 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C POWER3/PPC630 ?
+C POWER4/PPC970 ?
+C POWER5 ?
+C POWER6 ?
+C POWER7 ?
+C POWER8 ?
+C POWER9 2.47
+
+C TODO
+C * Schedule for Power9 pipeline.
+C * Unroll 4x if that proves beneficial.
+C * This is marginally faster (but much smaller) than ../mul_1.asm.
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`n', `r5')
+define(`v0', `r6')
+
+ASM_START()
+PROLOGUE(mpn_mul_1c)
+ b L(ent) C join mpn_mul_1 after its li, so r7 is left as the caller set it
+EPILOGUE()
+PROLOGUE(mpn_mul_1)
+ li r7, 0 C r7 = 0; r7 is the addend of the first maddld/maddhdu
+L(ent): ld r11, 0(up) C r11 = limb at 0(up)
+ cmpdi cr6, n, 2 C cr6 = n compared with 2, tested by the ble cr6 below
+ addi r0, n, -1 C FIXME: postpone
+ srdi r0, r0, 1 C FIXME: postpone
+ mtctr r0 C FIXME: postpone; ctr = (n-1) >> 1
+ rldicl. r12, n, 0,63 C r12 = n & 1, set cr0
+ bne cr0, L(b1) C taken when n is odd
+
+L(b0): ld r0, 8(up) C n even: r0 = second limb
+ maddld( r9, r11, v0, r7)
+ maddhdu(r7, r11, v0, r7)
+ ble cr6, L(2) C taken when n <= 2
+ ld r12, 16(up)
+ mulld r8, r0, v0
+ mulhdu r5, r0, v0
+ addic up, up, 16 C addic also writes CA; NOTE(review): presumably relied on to clear CA for the adde chain, confirm
+ addi rp, rp, -8 C bias rp, since L(mid) stores at 8(rp)
+ b L(mid)
+
+L(b1): ld r0, 0(up) C NOTE(review): same address as the r11 load at L(ent)
+ ble cr6, L(1) C taken when n <= 2
+ ld r12, 8(up)
+ maddld( r8, r11, v0, r7)
+ maddhdu(r5, r11, v0, r7)
+ ld r0, 16(up)
+ mulld r9, r12, v0
+ mulhdu r7, r12, v0
+ addic up, up, 24 C addic also writes CA; NOTE(review): presumably relied on to clear CA for the adde chain, confirm
+ bdz L(end) C decrement ctr, skip the loop if it reached 0
+
+ ALIGN(16)
+L(top): ld r12, 0(up)
+ std r8, 0(rp)
+ adde r9, r5, r9 C low product + previous high product + CA
+ mulld r8, r0, v0
+ mulhdu r5, r0, v0
+L(mid): ld r0, 8(up)
+ std r9, 8(rp)
+ adde r8, r7, r8 C low product + previous high product + CA
+ mulld r9, r12, v0
+ mulhdu r7, r12, v0
+ addi rp, rp, 16 C two limbs stored per iteration
+ addi up, up, 16 C two limbs loaded per iteration
+ bdnz L(top) C decrement ctr, loop while it is nonzero
+
+L(end): std r8, 0(rp)
+ mulld r8, r0, v0
+ adde r9, r5, r9
+ mulhdu r5, r0, v0
+ std r9, 8(rp)
+ adde r8, r7, r8
+ std r8, 16(rp)
+ addze r3, r5 C return value in r3 = last high product + CA
+ blr
+
+L(2): mulld r8, r0, v0 C reached from L(b0) when n <= 2
+ mulhdu r5, r0, v0
+ std r9, 0(rp)
+ addc r8, r7, r8 C addc ignores incoming CA and sets it afresh
+ std r8, 8(rp)
+ addze r3, r5 C return value in r3 = high product + CA
+ blr
+
+L(1): maddld( r8, r0, v0, r7)
+ std r8, 0(rp) C reached from L(b1) when n <= 2; single result limb
+ maddhdu(r3, r0, v0, r7)
+ blr
+EPILOGUE()
diff -r d11fcaf1a0cd -r a42aebaf893d mpn/powerpc64/mode64/p9/mul_2.asm
--- a/mpn/powerpc64/mode64/p9/mul_2.asm Wed Nov 21 08:22:18 2018 +0100
+++ b/mpn/powerpc64/mode64/p9/mul_2.asm Wed Nov 28 21:31:44 2018 +0100
@@ -28,6 +28,7 @@
C * Not written with any power9 pipeline understanding.
C * The 4x unrolling was not motivated by any timing tests.
C * No local scheduling for performance tweaking has been done.
+C * Decrease load scheduling!
define(`rp', `r3')
define(`up', `r4')
diff -r d11fcaf1a0cd -r a42aebaf893d mpn/powerpc64/mode64/p9/mul_basecase.asm
--- a/mpn/powerpc64/mode64/p9/mul_basecase.asm Wed Nov 21 08:22:18 2018 +0100
+++ b/mpn/powerpc64/mode64/p9/mul_basecase.asm Wed Nov 28 21:31:44 2018 +0100
@@ -282,8 +282,6 @@
mulld r5, r8, v1
mulhdu r10, r8, v1
b L(cj)
-L(do_outer):
- beq cr0, L(ret) C taken means vn = 1. We're done.
L(outer):
ld v0, 0(vp)
@@ -399,6 +397,7 @@
cmpdi cr0, r23, 0
addi rp2, rp2, 16
addi vp, vp, 16
+L(do_outer):
bne cr0, L(outer)
L(ret):
ld r22, -80(r1)
diff -r d11fcaf1a0cd -r a42aebaf893d mpn/powerpc64/mode64/p9/sqr_basecase.asm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/powerpc64/mode64/p9/sqr_basecase.asm Wed Nov 28 21:31:44 2018 +0100
@@ -0,0 +1,555 @@
+dnl Power9 mpn_sqr_basecase.
+
+dnl Copyright 1999-2001, 2003-2006, 2008, 2017-2018 Free Software Foundation,
+dnl Inc.
+
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of either:
+dnl
+dnl * the GNU Lesser General Public License as published by the Free
+dnl Software Foundation; either version 3 of the License, or (at your
+dnl option) any later version.
+dnl
+dnl or
+dnl
+dnl * the GNU General Public License as published by the Free Software
+dnl Foundation; either version 2 of the License, or (at your option) any
+dnl later version.
+dnl
+dnl or both in parallel, as here.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+dnl for more details.
+dnl
+dnl You should have received copies of the GNU General Public License and the
+dnl GNU Lesser General Public License along with the GNU MP Library. If not,
+dnl see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C POWER3/PPC630 -
+C POWER4/PPC970 -
+C POWER5 -
+C POWER6 -
+C POWER7 -
+C POWER8 -
+C POWER9 1.62
+
+C TODO
+C * Completely separate evn and odd code into two outer loops. Also consider
+C unrolling these two outer loops and thereby eliminate all branches.
+C * Avoid the reloading of u1 before every loop start.
+C * Reduce register usage.
+C * Consider getting rid of cy and instead load 3 u limbs, use addc+adde+adde.
+C * Consider skewing conditional adjustments to allow mask creation with subfe
+C like in the un=3 code. It might streamline the adjustments (or not).
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`un', `r5')
+
+define(`u0', `r0')
+define(`u1', `r7')
+define(`rp2', `r24')
+define(`up2', `r25')
+define(`cy', `r6')
+
+C LSHU1U0: double the two-limb value u1:u0 in place and leave the carry out
+C of u1 (0 or 1) in cy.  No incoming cy is consumed.
+define(`LSHU1U0',`
+ addc u0, u0, u0
+ adde u1, u1, u1
+ li cy, 0
+ addze cy, cy
+')
+C LSHU1U: as LSHU1U0, but the incoming cy is first added into the doubled u0.
+C The plain add does not write CA, so the adde still sees the carry produced
+C by doubling u0.
+define(`LSHU1U',`
+ addc u0, u0, u0
+ add u0, u0, cy
+ adde u1, u1, u1
+ li cy, 0
+ addze cy, cy
+')
+C LSHU1UF: as LSHU1U, but cy is not rewritten afterwards.
+define(`LSHU1UF',`
+ addc u0, u0, u0
+ add u0, u0, cy
+ adde u1, u1, u1
+')
+C LSHU1UHF: u0 = 2*u0 + cy only.  Neither u1, cy nor CA is written.
+define(`LSHU1UHF',`
+ add u0, u0, u0
+ add u0, u0, cy
+')
+C These are cleverer replacements, but they tend to leave CA set, disturbing
+C the main accumulation code! Breaking that false dependency might have a
+C positive performance impact. Note that the subfe here results in a mask for
+C our adjustments.
+C With both source operands equal, subfe yields CA - 1, so cy ends up as 0
+C when the shift carried out and as all ones when it did not.
+define(`xLSHU1U0',`
+ addc u0, u0, u0
+ adde u1, u1, u1
+ subfe cy, cy, cy
+')
+C NOTE(review): the next two definitions both use the name xLSHU1U, so the
+C later one replaces the earlier one.  The later body updates only u0, like
+C LSHU1UHF, and was presumably meant to have a name of its own.  Check that
+C no use depends on the current expansion before renaming it.
+define(`xLSHU1U',`
+ subfic cy, cy, 0
+ adde u0, u0, u0
+ adde u1, u1, u1
+ subfe cy, cy, cy
+')
+define(`xLSHU1U',`
+ subfic cy, cy, 0
+ adde u0, u0, u0
+')
+
+ASM_START()
+PROLOGUE(mpn_sqr_basecase)
+ ld r0, 0(up) C n = 1
+ mulld r8, r0, r0 C weight 0
+ mulhdu r9, r0, r0 C weight 1
+ std r8, 0(rp)
+ cmpdi cr0, un, 2
+ bge cr0, L(ge2)
+ std r9, 8(rp)
+ blr
+
+L(ge2): bgt cr0, L(gt2)
+ ld r6, 8(up)
+ mulld r10, r6, r6 C u1 * u1
+ mulhdu r11, r6, r6 C u1 * u1
+ mulld r4, r6, r0 C u1 * u0
+ mulhdu r5, r6, r0 C u1 * u0
+ addc r4, r4, r4
+ adde r5, r5, r5
+ addze r11, r11
More information about the gmp-commit
mailing list