[Gmp-commit] /home/hgfiles/gmp: 4 new changesets

Sat Mar 13 23:10:09 CET 2010

details:   /home/hgfiles/gmp/rev/595322ff7a3c
changeset: 13491:595322ff7a3c
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Sat Mar 13 17:59:07 2010 +0100
description:
Combine and improve ppc64 mpn_addmul_1 and mpn_submul_1 code.

details:   /home/hgfiles/gmp/rev/b646f1322679
changeset: 13492:b646f1322679
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Sat Mar 13 18:00:38 2010 +0100
description:
Add more cycle counts.

details:   /home/hgfiles/gmp/rev/c68829e0fa2a
changeset: 13493:c68829e0fa2a
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Sat Mar 13 18:00:50 2010 +0100
description:
Retune.

details:   /home/hgfiles/gmp/rev/252d1c121103
changeset: 13494:252d1c121103
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Sat Mar 13 23:10:05 2010 +0100
description:
Rewrite Pentium 4/32 mpn_submul_1.

diffstat:

 ChangeLog                          |    9 +
 mpn/ia64/gmp-mparam.h              |    2 +-
 mpn/powerpc64/mode64/addmul_1.asm  |  185 --------------------------------
 mpn/powerpc64/mode64/aorsmul_1.asm |  209 +++++++++++++++++++++++++++++++++++++
 mpn/powerpc64/mode64/submul_1.asm  |   62 ----------
 mpn/x86/k6/gmp-mparam.h            |    8 +-
 mpn/x86/pentium4/sse2/submul_1.asm |  145 ++++++++++++++++---------
 mpn/x86_64/addmul_2.asm            |    2 +-
 mpn/x86_64/atom/gmp-mparam.h       |   10 +-
 mpn/x86_64/core2/gmp-mparam.h      |    6 +-
 mpn/x86_64/corei/gmp-mparam.h      |    8 +-
 mpn/x86_64/gmp-mparam.h            |   12 +-
 mpn/x86_64/mul_basecase.asm        |    6 +-
 mpn/x86_64/pentium4/gmp-mparam.h   |    6 +-
 14 files changed, 342 insertions(+), 328 deletions(-)

diffs (truncated from 839 to 300 lines):

diff -r c10243298cf0 -r 252d1c121103 ChangeLog

--- a/ChangeLog	Thu Mar 11 22:32:33 2010 +0100
+++ b/ChangeLog	Sat Mar 13 23:10:05 2010 +0100
@@ -1,3 +1,12 @@
+2010-03-13  Torbjorn Granlund  <tege at gmplib.org>
+
+	* mpn/x86/pentium4/sse2/submul_1.asm: Rewrite.
+
+	* mpn/powerpc64/mode64/aorsmul_1.asm: New file, faster than old code
+	for both mpn_addmul_1 and mpn_submul_1.
+	* mpn/powerpc64/mode64/addmul_1.asm: Remove.
+	* mpn/powerpc64/mode64/submul_1.asm: Remove.
+
 2010-03-11  Niels Möller  <nisse at lysator.liu.se>
 
 	* mpn/generic/gcd_lehmer.c (gcd_2): Use sub_ddmmss.
diff -r c10243298cf0 -r 252d1c121103 mpn/ia64/gmp-mparam.h
--- a/mpn/ia64/gmp-mparam.h	Thu Mar 11 22:32:33 2010 +0100
+++ b/mpn/ia64/gmp-mparam.h	Sat Mar 13 23:10:05 2010 +0100
@@ -29,7 +29,7 @@
 #define MOD_1N_TO_MOD_1_1_THRESHOLD         14
 #define MOD_1U_TO_MOD_1_1_THRESHOLD          8
 #define MOD_1_1_TO_MOD_1_2_THRESHOLD         0
-#define MOD_1_2_TO_MOD_1_4_THRESHOLD        14
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD        22
 #define PREINV_MOD_1_TO_MOD_1_THRESHOLD     22
 #define USE_PREINV_DIVREM_1                  1  /* native */
 #define DIVEXACT_1_THRESHOLD                 0  /* always (native) */
diff -r c10243298cf0 -r 252d1c121103 mpn/powerpc64/mode64/addmul_1.asm
--- a/mpn/powerpc64/mode64/addmul_1.asm	Thu Mar 11 22:32:33 2010 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,185 +0,0 @@
-dnl  PowerPC-64 mpn_addmul_1 -- Multiply a limb vector with a limb and add
-dnl  the result to a second limb vector.
-
-dnl  Copyright 1999, 2000, 2001, 2003, 2004, 2005, 2006 Free Software
-dnl  Foundation, Inc.
-
-dnl  This file is part of the GNU MP Library.
-
-dnl  The GNU MP Library is free software; you can redistribute it and/or modify
-dnl  it under the terms of the GNU Lesser General Public License as published
-dnl  by the Free Software Foundation; either version 3 of the License, or (at
-dnl  your option) any later version.
-
-dnl  The GNU MP Library is distributed in the hope that it will be useful, but
-dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
-dnl  License for more details.
-
-dnl  You should have received a copy of the GNU Lesser General Public License
-dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C		cycles/limb
-C POWER3/PPC630:    6-18
-C POWER4/PPC970:     8
-C POWER5:            8
-
-C TODO
-C  * Reduce the number of registers used.  Some mul destination registers could
-C    be coalesced.
-C  * Delay std for preserving registers, and suppress them for n=1.
-C  * Write faster feed-in code.  If nothing else, avoid one or two up updates.
-
-C INPUT PARAMETERS
-define(`rp', `r3')
-define(`up', `r4')
-define(`n', `r5')
-define(`vl', `r6')
-
-ASM_START()
-PROLOGUE(mpn_addmul_1)
-	std	r31, -8(r1)
-	std	r30, -16(r1)
-	std	r29, -24(r1)
-	std	r28, -32(r1)
-	std	r27, -40(r1)
-	std	r26, -48(r1)
-
-	rldicl.	r0, n, 0,62	C r0 = n & 3, set cr0
-	cmpdi	cr6, r0, 2
-	addi	n, n, 3		C compute count...
-	srdi	n, n, 2		C ...for ctr
-	mtctr	n		C copy count into ctr
-	beq	cr0, L(b00)
-	blt	cr6, L(b01)
-	beq	cr6, L(b10)
-
-L(b11):	ld	r26, 0(up)
-	ld	r28, 0(rp)
-	addi	up, up, 8
-	nop
-	mulld	r0, r26, r6
-	mulhdu	r12, r26, r6
-	addc	r0, r0, r28
-	std	r0, 0(rp)
-	addi	rp, rp, 8
-	b	L(fic)
-
-L(b00):	ld	r26, 0(up)
-	ld	r27, 8(up)
-	ld	r28, 0(rp)
-	ld	r29, 8(rp)
-	addi	up, up, 16
-	nop
-	mulld	r0, r26, r6
-	mulhdu	r5, r26, r6
-	mulld	r7, r27, r6
-	mulhdu	r8, r27, r6
-	addc	r7, r7, r5
-	addze	r12, r8
-	addc	r0, r0, r28
-	std	r0, 0(rp)
-	adde	r7, r7, r29
-	std	r7, 8(rp)
-	addi	rp, rp, 16
-	b	L(fic)
-
-L(b01):	bdnz	L(gt1)
-	ld	r26, 0(up)
-	ld	r28, 0(rp)
-	mulld	r0, r26, r6
-	mulhdu	r8, r26, r6
-	addc	r0, r0, r28
-	std	r0, 0(rp)
-	b	L(ret)
-L(gt1):	ld	r26, 0(up)
-	ld	r27, 8(up)
-	mulld	r0, r26, r6
-	mulhdu	r5, r26, r6
-	ld	r26, 16(up)
-	ld	r28, 0(rp)
-	mulld	r7, r27, r6
-	mulhdu	r8, r27, r6
-	ld	r29, 8(rp)
-	ld	r30, 16(rp)
-	mulld	r9, r26, r6
-	mulhdu	r10, r26, r6
-	addc	r7, r7, r5
-	adde	r9, r9, r8
-	addze	r12, r10
-	addc	r0, r0, r28
-	std	r0, 0(rp)
-	adde	r7, r7, r29
-	std	r7, 8(rp)
-	adde	r9, r9, r30
-	std	r9, 16(rp)
-	addi	up, up, 24
-	addi	rp, rp, 24
-	b	L(fic)
-
-L(b10):	addic	r0, r0, 0
-	li	r12, 0		C cy_limb = 0
-L(fic):	ld	r26, 0(up)
-	ld	r27, 8(up)
-	addi	up, up, 16
-	bdz	L(end)
-				C registers dying
-L(top):	mulld	r0, r26, r6	C
-	mulhdu	r5, r26, r6	C 26
-	ld	r26, 0(up)	C
-	ld	r28, 0(rp)	C
-	mulld	r7, r27, r6	C
-	mulhdu	r8, r27, r6	C 27
-	ld	r27, 8(up)	C
-	ld	r29, 8(rp)	C
-	adde	r0, r0, r12	C 0 12
-	adde	r7, r7, r5	C 5 7
-	mulld	r9, r26, r6	C
-	mulhdu	r10, r26, r6	C 26
-	ld	r26, 16(up)	C
-	ld	r30, 16(rp)	C
-	mulld	r11, r27, r6	C
-	mulhdu	r12, r27, r6	C 27
-	ld	r27, 24(up)	C
-	ld	r31, 24(rp)	C
-	adde	r9, r9, r8	C 8 9
-	adde	r11, r11, r10	C 10 11
-	addze	r12, r12	C 12
-	addc	r0, r0, r28	C 0 28
-	std	r0, 0(rp)	C 0
-	adde	r7, r7, r29	C 7 29
-	std	r7, 8(rp)	C 7
-	adde	r9, r9, r30	C 9 30
-	std	r9, 16(rp)	C 9
-	adde	r11, r11, r31	C 11 31
-	std	r11, 24(rp)	C 11
-	addi	up, up, 32	C
-	addi	rp, rp, 32	C
-	bdnz	L(top)		C
-
-L(end):	mulld	r0, r26, r6
-	mulhdu	r5, r26, r6
-	ld	r28, 0(rp)
-	nop
-	mulld	r7, r27, r6
-	mulhdu	r8, r27, r6
-	ld	r29, 8(rp)
-	nop
-	adde	r0, r0, r12
-	adde	r7, r7, r5
-	addze	r8, r8
-	addc	r0, r0, r28
-	std	r0, 0(rp)
-	adde	r7, r7, r29
-	std	r7, 8(rp)
-L(ret):	addze	r3, r8
-	ld	r31, -8(r1)
-	ld	r30, -16(r1)
-	ld	r29, -24(r1)
-	ld	r28, -32(r1)
-	ld	r27, -40(r1)
-	ld	r26, -48(r1)
-	blr
-EPILOGUE()
diff -r c10243298cf0 -r 252d1c121103 mpn/powerpc64/mode64/aorsmul_1.asm
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/powerpc64/mode64/aorsmul_1.asm	Sat Mar 13 23:10:05 2010 +0100
@@ -0,0 +1,209 @@
+dnl  PowerPC-64 mpn_addmul_1 and mpn_submul_1.
+
+dnl  Copyright 1999, 2000, 2001, 2003, 2004, 2005, 2006, 2010 Free Software
+dnl  Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of the GNU Lesser General Public License as published
+dnl  by the Free Software Foundation; either version 3 of the License, or (at
+dnl  your option) any later version.
+
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+dnl  License for more details.
+
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C		mpn_addmul_1	mpn_submul_1
+C		cycles/limb	cycles/limb
+C POWER3/PPC630:    6-18	    6-18
+C POWER4/PPC970:    8		    8.3
+C POWER5:           8		    ?
+
+C TODO
+C  * Try to reduce the number of needed live registers
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`up', `r4')
+define(`n',  `r5')
+define(`vl', `r6')
+
+ifdef(`OPERATION_addmul_1',`
+  define(ADDSUBC,	adde)
+  define(ADDSUB,	addc)
+  define(func,		mpn_addmul_1)
+  define(func_nc,	mpn_addmul_1c)
+  define(INVCY,		`')
+')
+ifdef(`OPERATION_submul_1',`
+  define(ADDSUBC,	subfe)
+  define(ADDSUB,	subfc)
+  define(func,		mpn_submul_1)
+  define(func_nc,	mpn_submul_1c)
+  define(INVCY,		`subfe	$1, $1, $1
+			addic	$1, $1,1')
+')
+
+ASM_START()
+PROLOGUE(func_nc)
+EPILOGUE()
+
+PROLOGUE(func)
+	std	r31, -8(r1)
+	rldicl.	r0, n, 0,62	C r0 = n & 3, set cr0
+	std	r30, -16(r1)
+	cmpdi	cr6, r0, 2
+	std	r29, -24(r1)
+	addi	n, n, 3		C compute count...
+	std	r28, -32(r1)
+	srdi	n, n, 2		C ...for ctr
+	std	r27, -40(r1)
+	mtctr	n		C copy count into ctr
+	beq	cr0, L(b00)
+	blt	cr6, L(b01)
+	beq	cr6, L(b10)
+
+L(b11):	ld	r9, 0(up)
+	ld	r28, 0(rp)
+	mulld	r0, r9, r6
+	mulhdu	r12, r9, r6
+	ADDSUB	r0, r0, r28
+	std	r0, 0(rp)
+	addi	rp, rp, 8