[Gmp-commit] /var/hg/gmp: 4 new changesets

Tue Nov 1 13:38:44 CET 2011

details:   /var/hg/gmp/rev/fd40089e595c
changeset: 14399:fd40089e595c
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Tue Nov 01 13:25:47 2011 +0100
description:
New file.

details:   /var/hg/gmp/rev/737ccf954dcf
changeset: 14400:737ccf954dcf
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Tue Nov 01 13:28:11 2011 +0100
description:
Move sqr_diagonal.asm into mode32 dir.

details:   /var/hg/gmp/rev/28838840da66
changeset: 14401:28838840da66
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Tue Nov 01 13:31:12 2011 +0100
description:
Rewrite sqr_diag_addlsh1 code.

details:   /var/hg/gmp/rev/d60b0a9068cd
changeset: 14402:d60b0a9068cd
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Tue Nov 01 13:35:59 2011 +0100
description:
Add more cycle counts.

diffstat:

 mpn/powerpc64/mode32/sqr_diagonal.asm     |  107 +++++++++++++
 mpn/powerpc64/mode64/sqr_diag_addlsh1.asm |  238 ++++++++++++++++++++++++++++++
 mpn/powerpc64/sqr_diagonal.asm            |  107 -------------
 mpn/s390_32/esame/sqr_basecase.asm        |   44 ++---
 mpn/s390_64/bdiv_dbm1c.asm                |    2 +-
 mpn/s390_64/lshift.asm                    |    2 +-
 mpn/s390_64/lshiftc.asm                   |    2 +-
 mpn/s390_64/rshift.asm                    |    2 +-
 mpn/s390_64/sqr_basecase.asm              |   44 ++---
 9 files changed, 383 insertions(+), 165 deletions(-)

diffs (truncated from 638 to 300 lines):

diff -r 09fabd6fab76 -r d60b0a9068cd mpn/powerpc64/mode32/sqr_diagonal.asm

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/powerpc64/mode32/sqr_diagonal.asm	Tue Nov 01 13:35:59 2011 +0100
@@ -0,0 +1,107 @@
+dnl  PowerPC-64 mpn_sqr_diagonal.
+
+dnl  Copyright 2001, 2002, 2003, 2005, 2006, 20010 Free Software Foundation,
+dnl  Inc.
+
+dnl  This file is part of the GNU MP Library.
+
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of the GNU Lesser General Public License as published
+dnl  by the Free Software Foundation; either version 3 of the License, or (at
+dnl  your option) any later version.
+
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+dnl  License for more details.
+
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C		    cycles/limb
+C POWER3/PPC630		18
+C POWER4/PPC970		 ?
+C POWER5		 7.25
+C POWER6		 9.5
+
+C INPUT PARAMETERS
+define(`rp',  r3)
+define(`up',  r4)
+define(`n',   r5)
+
+ASM_START()
+PROLOGUE(mpn_sqr_diagonal)
+ifdef(`HAVE_ABI_mode32',
+`	rldicl	n, n, 0, 32')		C zero extend n
+
+	rldicl.	r0, n, 0,62		C r0 = n & 3, set cr0
+	addi	n, n, 3			C compute count...
+	cmpdi	cr6, r0, 2
+	srdi	n, n, 2			C ...for ctr
+	mtctr	n			C copy count into ctr
+	beq	cr0, L(b00)
+	blt	cr6, L(b01)
+	beq	cr6, L(b10)
+
+L(b11):	ld	r0, 0(up)
+	ld	r10, 8(up)
+	ld	r12, 16(up)
+	addi	rp, rp, -16
+	mulld	r7, r0, r0
+	mulhdu	r8, r0, r0
+	mulld	r9, r10, r10
+	mulhdu	r10, r10, r10
+	mulld	r11, r12, r12
+	mulhdu	r12, r12, r12
+	addi	up, up, 24
+	b	L(11)
+
+	ALIGN(16)
+L(b01):	ld	r0, 0(up)
+	addi	rp, rp, -48
+	addi	up, up, 8
+	mulld	r11, r0, r0
+	mulhdu	r12, r0, r0
+	b	L(01)
+
+	ALIGN(16)
+L(b10):	ld	r0, 0(up)
+	ld	r12, 8(up)
+	addi	rp, rp, -32
+	addi	up, up, 16
+	mulld	r9, r0, r0
+	mulhdu	r10, r0, r0
+	mulld	r11, r12, r12
+	mulhdu	r12, r12, r12
+	b	L(10)
+
+	ALIGN(32)
+L(b00):
+L(top):	ld	r0, 0(up)
+	ld	r8, 8(up)
+	ld	r10, 16(up)
+	ld	r12, 24(up)
+	mulld	r5, r0, r0
+	mulhdu	r6, r0, r0
+	mulld	r7, r8, r8
+	mulhdu	r8, r8, r8
+	mulld	r9, r10, r10
+	mulhdu	r10, r10, r10
+	mulld	r11, r12, r12
+	mulhdu	r12, r12, r12
+	addi	up, up, 32
+	std	r5, 0(rp)
+	std	r6, 8(rp)
+L(11):	std	r7, 16(rp)
+	std	r8, 24(rp)
+L(10):	std	r9, 32(rp)
+	std	r10, 40(rp)
+L(01):	std	r11, 48(rp)
+	std	r12, 56(rp)
+	addi	rp, rp, 64
+	bdnz	L(top)
+
+	blr
+EPILOGUE()
diff -r 09fabd6fab76 -r d60b0a9068cd mpn/powerpc64/mode64/sqr_diag_addlsh1.asm
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/powerpc64/mode64/sqr_diag_addlsh1.asm	Tue Nov 01 13:35:59 2011 +0100
@@ -0,0 +1,238 @@
+dnl  PowerPC-64 mpn_sqr_diag_addlsh1
+
+dnl  Copyright 2011 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of the GNU Lesser General Public License as published
+dnl  by the Free Software Foundation; either version 3 of the License, or (at
+dnl  your option) any later version.
+
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+dnl  License for more details.
+
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C		cycles/limb
+C POWER3/PPC630   10
+C POWER4/PPC970	   6
+C POWER5	   5.375
+C POWER6	   8.5
+
+C NOTES
+C  * This was written for POWER6 and its preferences for adjacent integer
+C    multiply insns.  The cost is that we get a large set of live registers,
+C    and therefore need to save 9 callee-saves registers.  Except for the
+C    multiply insns, the code was not carefully optimised for POWER6 or any
+C    other CPU.
+C  * Perform some cross-jumping in the feed-in code, into the loop's tail.
+
+C refmpn_sqr_diag_addlsh1 (mp_ptr rp, mp_srcptr tp, mp_srcptr up, mp_size_t n)
+
+C INPUT PARAMETERS
+define(`rp',  `r3')
+define(`tp',  `r4')
+define(`up',  `r5')
+define(`n',   `r6')
+
+define(`climb',	`r0')
+
+ASM_START()
+PROLOGUE(mpn_sqr_diag_addlsh1)
+	std	r31,  -8(r1)
+	std	r30, -16(r1)
+	std	r29, -24(r1)
+	std	r28, -32(r1)
+	std	r27, -40(r1)
+	std	r26, -48(r1)
+	std	r25, -56(r1)
+	std	r24, -64(r1)
+	std	r23, -72(r1)
+
+	rldicl.	r0, n, 0,62		C r0 = n & 3, set cr0
+	cmpdi	cr6, r0, 2
+	addi	n, n, 2	 		C compute count...
+	srdi	n, n, 2			C ...for ctr
+	mtctr	n			C put loop count into ctr
+	beq	cr0, L(b0)
+	blt	cr6, L(b1)
+	beq	cr6, L(b2)
+
+L(b3):	ld	r6,   0(up)
+	ld	r7,   8(up)
+	ld	r12, 16(up)
+	addi	up, up, 24
+	mulld	r24, r6, r6
+	mulhdu	r25, r6, r6
+	mulld	r26, r7, r7
+	mulhdu	r27, r7, r7
+	mulld	r28, r12, r12
+	mulhdu	r29, r12, r12
+	ld	r10, 0(tp)
+	ld	r11, 8(tp)
+	ld	r6, 16(tp)
+	ld	r7, 24(tp)
+	addi	tp, tp, 32
+	addc	r10, r10, r10
+	adde	r11, r11, r11
+	adde	r6, r6, r6
+	adde	r7, r7, r7
+	addze	climb, r29
+	addc	r10, r10, r25
+	adde	r11, r11, r26
+	adde	r6, r6, r27
+	adde	r7, r7, r28
+	std	r24,  0(rp)
+	std	r10,  8(rp)
+	std	r11, 16(rp)
+	std	r6,  24(rp)
+	std	r7,  32(rp)
+	addi	rp, rp, 40
+	bdnz	L(top)
+	b	L(end)
+
+L(b2):	ld	r6,  0(up)
+	ld	r7,  8(up)
+	addi	up, up, 16
+	mulld	r24, r6, r6
+	mulhdu	r25, r6, r6
+	mulld	r26, r7, r7
+	mulhdu	r27, r7, r7
+	ld	r10,  0(tp)
+	ld	r11,  8(tp)
+	addi	tp, tp, 16
+	addc	r10, r10, r10
+	adde	r11, r11, r11
+	addze	climb, r27
+	addc	r10, r10, r25
+	adde	r11, r11, r26
+	std	r24,  0(rp)
+	std	r10,  8(rp)
+	std	r11, 16(rp)
+	addi	rp, rp, 24
+	bdnz	L(top)
+	b	L(end)
+
+L(b0):	ld	r6,   0(up)
+	ld	r7,   8(up)
+	ld	r12, 16(up)
+	ld	r23, 24(up)
+	addi	up, up, 32
+	mulld	r24, r6, r6
+	mulhdu	r25, r6, r6
+	mulld	r26, r7, r7
+	mulhdu	r27, r7, r7
+	mulld	r28, r12, r12
+	mulhdu	r29, r12, r12
+	mulld	r30, r23, r23
+	mulhdu	r31, r23, r23
+	ld	r10,  0(tp)
+	ld	r11,  8(tp)
+	ld	r6,  16(tp)
+	ld	r7,  24(tp)
+	ld	r12, 32(tp)
+	ld	r23, 40(tp)
+	addi	tp, tp, 48
+	addc	r10, r10, r10
+	adde	r11, r11, r11
+	adde	r6, r6, r6
+	adde	r7, r7, r7
+	adde	r12, r12, r12
+	adde	r23, r23, r23
+	addze	climb, r31
+	std	r24,  0(rp)
+	addc	r10, r10, r25
+	std	r10,  8(rp)
+	adde	r11, r11, r26
+	std	r11, 16(rp)
+	adde	r6, r6, r27
+	std	r6,  24(rp)
+	adde	r7, r7, r28
+	std	r7,  32(rp)
+	adde	r12, r12, r29
+	std	r12, 40(rp)
+	adde	r23, r23, r30
+	std	r23, 48(rp)
+	addi	rp, rp, 56
+	bdnz	L(top)
+	b	L(end)
+
+L(b1):	ld	r6, 0(up)
+	addi	up, up, 8
+	mulld	r24, r6, r6
+	mulhdu	climb, r6, r6
+	std	r24, 0(rp)
+	addic	rp, rp, 8		C clear carry as side-effect
+
+	ALIGN(32)
+L(top):	ld	r6,   0(up)
+	ld	r7,   8(up)
+	ld	r12, 16(up)
+	ld	r23, 24(up)
+	addi	up, up, 32
+	mulld	r24, r6, r6
+	mulhdu	r25, r6, r6
+	mulld	r26, r7, r7
+	mulhdu	r27, r7, r7
+	mulld	r28, r12, r12
+	mulhdu	r29, r12, r12
+	mulld	r30, r23, r23