[Gmp-commit] /var/hg/gmp: 4 new changesets

Wed Nov 28 20:32:40 UTC 2018

details:   /var/hg/gmp/rev/426728008c40
changeset: 17712:426728008c40
user:      Torbjorn Granlund <tg at gmplib.org>
date:      Wed Nov 28 21:30:42 2018 +0100
description:
Add a TODO item.

details:   /var/hg/gmp/rev/45d7e26707c3
changeset: 17713:45d7e26707c3
user:      Torbjorn Granlund <tg at gmplib.org>
date:      Wed Nov 28 21:31:26 2018 +0100
description:
Streamline a branch.

details:   /var/hg/gmp/rev/d3ca14fe79e8
changeset: 17714:d3ca14fe79e8
user:      Torbjorn Granlund <tg at gmplib.org>
date:      Wed Nov 28 21:31:38 2018 +0100
description:
New file.

details:   /var/hg/gmp/rev/a42aebaf893d
changeset: 17715:a42aebaf893d
user:      Torbjorn Granlund <tg at gmplib.org>
date:      Wed Nov 28 21:31:44 2018 +0100
description:
New file.

diffstat:

 mpn/powerpc64/mode64/p9/addmul_2.asm     |    1 +
 mpn/powerpc64/mode64/p9/mul_1.asm        |  126 +++++++
 mpn/powerpc64/mode64/p9/mul_2.asm        |    1 +
 mpn/powerpc64/mode64/p9/mul_basecase.asm |    3 +-
 mpn/powerpc64/mode64/p9/sqr_basecase.asm |  555 +++++++++++++++++++++++++++++++
 5 files changed, 684 insertions(+), 2 deletions(-)

diffs (truncated from 731 to 300 lines):

diff -r d11fcaf1a0cd -r a42aebaf893d mpn/powerpc64/mode64/p9/addmul_2.asm

--- a/mpn/powerpc64/mode64/p9/addmul_2.asm	Wed Nov 21 08:22:18 2018 +0100
+++ b/mpn/powerpc64/mode64/p9/addmul_2.asm	Wed Nov 28 21:31:44 2018 +0100
@@ -28,6 +28,7 @@
 C  * Not written with any power9 pipeline understanding.
 C  * The 4x unrolling was not motivated by any timing tests.
 C  * No local scheduling for performance tweaking has been done.
+C  * Decrease load scheduling!
 
 define(`rp', `r3')
 define(`up', `r4')
diff -r d11fcaf1a0cd -r a42aebaf893d mpn/powerpc64/mode64/p9/mul_1.asm
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/powerpc64/mode64/p9/mul_1.asm	Wed Nov 28 21:31:44 2018 +0100
@@ -0,0 +1,126 @@
+dnl  Power9 mpn_mul_1.
+
+dnl  Copyright 2017, 2018 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                   cycles/limb
+C POWER3/PPC630		 ?
+C POWER4/PPC970		 ?
+C POWER5		 ?
+C POWER6		 ?
+C POWER7		 ?
+C POWER8		 ?
+C POWER9		 2.47
+
+C TODO
+C  * Schedule for Power9 pipeline.
+C  * Unroll 4x if that proves beneficial.
+C  * This is marginally faster (but much smaller) than ../mul_1.asm.
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`n',  `r5')
+define(`v0', `r6')
+
+ASM_START()
+PROLOGUE(mpn_mul_1c)
+	b	L(ent)		C mpn_mul_1c: keep the caller value of r7, skip the li below
+EPILOGUE()
+PROLOGUE(mpn_mul_1)
+	li	r7, 0		C plain mpn_mul_1: r7 = 0 is added into the first product
+L(ent):	ld	r11, 0(up)	C r11 = up[0]
+	cmpdi	cr6, n, 2	C cr6 = n vs 2, used by the ble tests below
+	addi	r0, n, -1	C FIXME: postpone
+	srdi	r0, r0, 1	C FIXME: postpone
+	mtctr	r0		C ctr = (n-1) >> 1. FIXME: postpone
+	rldicl.	r12, n, 0,63	C r12 = n & 1, set cr0
+	bne	cr0, L(b1)	C taken when n is odd
+
+L(b0):	ld	r0, 8(up)	C n even: r0 = up[1]
+	maddld(	r9, r11, v0, r7)	C presumably low limb of r11*v0 + r7 (macro defined elsewhere)
+	maddhdu(r7, r11, v0, r7)	C presumably high limb of the same sum
+	ble	cr6, L(2)	C n <= 2: finish in the short tail
+	ld	r12, 16(up)	C r12 = up[2]
+	mulld	r8, r0, v0
+	mulhdu	r5, r0, v0
+	addic	up, up, 16	C up += 16; NOTE: addic writes CA, presumably clearing it for the adde chain
+	addi	rp, rp, -8	C bias rp so that 8(rp) at L(mid) is the original rp[0]
+	b	L(mid)		C enter the loop at its second half
+
+L(b1):	ld	r0, 0(up)	C n odd: r0 = up[0], consumed only at L(1)
+	ble	cr6, L(1)	C n <= 2: finish in the short tail
+	ld	r12, 8(up)	C r12 = up[1]
+	maddld(	r8, r11, v0, r7)
+	maddhdu(r5, r11, v0, r7)
+	ld	r0, 16(up)	C r0 = up[2]
+	mulld	r9, r12, v0
+	mulhdu	r7, r12, v0
+	addic	up, up, 24	C up += 24; addic also writes CA
+	bdz	L(end)		C decrement ctr; skip the loop when it reaches 0
+
+	ALIGN(16)
+L(top):	ld	r12, 0(up)	C each iteration stores two limbs and advances up, rp by 16
+	std	r8, 0(rp)
+	adde	r9, r5, r9
+	mulld	r8, r0, v0
+	mulhdu	r5, r0, v0
+L(mid):	ld	r0, 8(up)	C loop entry point for the n even path
+	std	r9, 8(rp)
+	adde	r8, r7, r8
+	mulld	r9, r12, v0
+	mulhdu	r7, r12, v0
+	addi	rp, rp, 16
+	addi	up, up, 16
+	bdnz	L(top)		C decrement ctr; loop while it is nonzero
+
+L(end):	std	r8, 0(rp)	C wind down: three more limbs are stored
+	mulld	r8, r0, v0
+	adde	r9, r5, r9
+	mulhdu	r5, r0, v0
+	std	r9, 8(rp)
+	adde	r8, r7, r8
+	std	r8, 16(rp)
+	addze	r3, r5		C r3 = r5 + CA
+	blr
+
+L(2):	mulld	r8, r0, v0	C tail reached from L(b0) when n <= 2
+	mulhdu	r5, r0, v0
+	std	r9, 0(rp)
+	addc	r8, r7, r8
+	std	r8, 8(rp)
+	addze	r3, r5		C r3 = r5 + CA
+	blr
+
+L(1):	maddld(	r8,  r0, v0, r7)	C tail reached from L(b1) when n <= 2
+	std	r8, 0(rp)
+	maddhdu(r3, r0, v0, r7)
+	blr
+EPILOGUE()
diff -r d11fcaf1a0cd -r a42aebaf893d mpn/powerpc64/mode64/p9/mul_2.asm
--- a/mpn/powerpc64/mode64/p9/mul_2.asm	Wed Nov 21 08:22:18 2018 +0100
+++ b/mpn/powerpc64/mode64/p9/mul_2.asm	Wed Nov 28 21:31:44 2018 +0100
@@ -28,6 +28,7 @@
 C  * Not written with any power9 pipeline understanding.
 C  * The 4x unrolling was not motivated by any timing tests.
 C  * No local scheduling for performance tweaking has been done.
+C  * Decrease load scheduling!
 
 define(`rp', `r3')
 define(`up', `r4')
diff -r d11fcaf1a0cd -r a42aebaf893d mpn/powerpc64/mode64/p9/mul_basecase.asm
--- a/mpn/powerpc64/mode64/p9/mul_basecase.asm	Wed Nov 21 08:22:18 2018 +0100
+++ b/mpn/powerpc64/mode64/p9/mul_basecase.asm	Wed Nov 28 21:31:44 2018 +0100
@@ -282,8 +282,6 @@
 	mulld	r5, r8, v1
 	mulhdu	r10, r8, v1
 	b	L(cj)
-L(do_outer):
-	beq	cr0, L(ret)		C taken means vn = 1. We're done.
 
 L(outer):
 	ld	v0, 0(vp)
@@ -399,6 +397,7 @@
 	cmpdi	cr0, r23, 0
 	addi	rp2, rp2, 16
 	addi	vp, vp, 16
+L(do_outer):
 	bne	cr0, L(outer)
 L(ret):
 	ld	r22, -80(r1)
diff -r d11fcaf1a0cd -r a42aebaf893d mpn/powerpc64/mode64/p9/sqr_basecase.asm
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/powerpc64/mode64/p9/sqr_basecase.asm	Wed Nov 28 21:31:44 2018 +0100
@@ -0,0 +1,555 @@
+dnl  Power9 mpn_sqr_basecase.
+
+dnl  Copyright 1999-2001, 2003-2006, 2008, 2017-2018 Free Software Foundation,
+dnl  Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                  cycles/limb
+C POWER3/PPC630          -
+C POWER4/PPC970          -
+C POWER5                 -
+C POWER6                 -
+C POWER7                 -
+C POWER8                 -
+C POWER9                 1.62
+
+C TODO
+C  * Completely separate evn and odd code into two outer loops. Also consider
+C    unrolling these two outer loops and thereby eliminate all branches.
+C  * Avoid the reloading of u1 before every loop start.
+C  * Reduce register usage.
+C  * Consider getting rid of cy and instead load 3 u limbs, use addc+adde+adde.
+C  * Consider skewing conditional adjustments to allow mask creation with subfe
+C    like in the un=3 code. It might streamline the adjustments (or not).
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`un', `r5')
+
+define(`u0', `r0')
+define(`u1', `r7')
+define(`rp2', `r24')
+define(`up2', `r25')
+define(`cy',  `r6')
+
+define(`LSHU1U0',`
+	addc	u0, u0, u0
+	adde	u1, u1, u1
+	li	cy, 0
+	addze	cy, cy
+')
+define(`LSHU1U',`
+	addc	u0, u0, u0
+	add	u0, u0, cy
+	adde	u1, u1, u1
+	li	cy, 0
+	addze	cy, cy
+')
+define(`LSHU1UF',`
+	addc	u0, u0, u0
+	add	u0, u0, cy
+	adde	u1, u1, u1
+')
+define(`LSHU1UHF',`
+	add	u0, u0, u0
+	add	u0, u0, cy
+')
+C These are cleverer replacements, but they tend to leave CA set, disturbing
+C the main accumulation code! Breaking that false dependency might have a
+C positive performance impact. Note that the subfe here results in a mask for
+C our adjustments. FIXME: xLSHU1U is defined twice below.
+define(`xLSHU1U0',`
+	addc	u0, u0, u0
+	adde	u1, u1, u1
+	subfe	cy, cy, cy
+')
+define(`xLSHU1U',`
+	subfic	cy, cy, 0
+	adde	u0, u0, u0
+	adde	u1, u1, u1
+	subfe	cy, cy, cy
+')
+define(`xLSHU1U',`
+	subfic	cy, cy, 0
+	adde	u0, u0, u0
+')
+ASM_START()
+PROLOGUE(mpn_sqr_basecase)
+	ld	r0, 0(up)	C n = 1
+	mulld	r8, r0, r0	C weight 0
+	mulhdu	r9, r0, r0	C weight 1
+	std	r8, 0(rp)
+	cmpdi	cr0, un, 2
+	bge	cr0, L(ge2)
+	std	r9, 8(rp)
+	blr
+
+L(ge2):	bgt	cr0, L(gt2)
+	ld	r6, 8(up)
+	mulld	r10, r6, r6	C u1 * u1
+	mulhdu	r11, r6, r6	C u1 * u1
+	mulld	r4, r6, r0	C u1 * u0
+	mulhdu	r5, r6, r0	C u1 * u0
+	addc	r4, r4, r4
+	adde	r5, r5, r5
+	addze	r11, r11