[Gmp-commit] /var/hg/gmp: 4 new changesets
mercurial at gmplib.org
mercurial at gmplib.org
Tue Nov 1 13:38:44 CET 2011
details: /var/hg/gmp/rev/fd40089e595c
changeset: 14399:fd40089e595c
user: Torbjorn Granlund <tege at gmplib.org>
date: Tue Nov 01 13:25:47 2011 +0100
description:
New file.
details: /var/hg/gmp/rev/737ccf954dcf
changeset: 14400:737ccf954dcf
user: Torbjorn Granlund <tege at gmplib.org>
date: Tue Nov 01 13:28:11 2011 +0100
description:
Move sqr_diagonal.asm into mode32 dir.
details: /var/hg/gmp/rev/28838840da66
changeset: 14401:28838840da66
user: Torbjorn Granlund <tege at gmplib.org>
date: Tue Nov 01 13:31:12 2011 +0100
description:
Rewrite sqr_diag_addlsh1 code.
details: /var/hg/gmp/rev/d60b0a9068cd
changeset: 14402:d60b0a9068cd
user: Torbjorn Granlund <tege at gmplib.org>
date: Tue Nov 01 13:35:59 2011 +0100
description:
Add more cycle counts.
diffstat:
mpn/powerpc64/mode32/sqr_diagonal.asm | 107 +++++++++++++
mpn/powerpc64/mode64/sqr_diag_addlsh1.asm | 238 ++++++++++++++++++++++++++++++
mpn/powerpc64/sqr_diagonal.asm | 107 -------------
mpn/s390_32/esame/sqr_basecase.asm | 44 ++---
mpn/s390_64/bdiv_dbm1c.asm | 2 +-
mpn/s390_64/lshift.asm | 2 +-
mpn/s390_64/lshiftc.asm | 2 +-
mpn/s390_64/rshift.asm | 2 +-
mpn/s390_64/sqr_basecase.asm | 44 ++---
9 files changed, 383 insertions(+), 165 deletions(-)
diffs (truncated from 638 to 300 lines):
diff -r 09fabd6fab76 -r d60b0a9068cd mpn/powerpc64/mode32/sqr_diagonal.asm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/powerpc64/mode32/sqr_diagonal.asm Tue Nov 01 13:35:59 2011 +0100
@@ -0,0 +1,107 @@
+dnl PowerPC-64 mpn_sqr_diagonal.
+
+dnl Copyright 2001, 2002, 2003, 2005, 2006, 20010 Free Software Foundation,
+dnl Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C POWER3/PPC630 18
+C POWER4/PPC970 ?
+C POWER5 7.25
+C POWER6 9.5
+
+C INPUT PARAMETERS
+define(`rp', r3)
+define(`up', r4)
+define(`n', r5)
+
+ASM_START()
+PROLOGUE(mpn_sqr_diagonal)
+ifdef(`HAVE_ABI_mode32',
+` rldicl n, n, 0, 32') C zero extend n
+
+ rldicl. r0, n, 0,62 C r0 = n & 3, set cr0
+ addi n, n, 3 C compute count...
+ cmpdi cr6, r0, 2
+ srdi n, n, 2 C ...for ctr
+ mtctr n C copy count into ctr
+ beq cr0, L(b00)
+ blt cr6, L(b01)
+ beq cr6, L(b10)
+
+L(b11): ld r0, 0(up)
+ ld r10, 8(up)
+ ld r12, 16(up)
+ addi rp, rp, -16
+ mulld r7, r0, r0
+ mulhdu r8, r0, r0
+ mulld r9, r10, r10
+ mulhdu r10, r10, r10
+ mulld r11, r12, r12
+ mulhdu r12, r12, r12
+ addi up, up, 24
+ b L(11)
+
+ ALIGN(16)
+L(b01): ld r0, 0(up)
+ addi rp, rp, -48
+ addi up, up, 8
+ mulld r11, r0, r0
+ mulhdu r12, r0, r0
+ b L(01)
+
+ ALIGN(16)
+L(b10): ld r0, 0(up)
+ ld r12, 8(up)
+ addi rp, rp, -32
+ addi up, up, 16
+ mulld r9, r0, r0
+ mulhdu r10, r0, r0
+ mulld r11, r12, r12
+ mulhdu r12, r12, r12
+ b L(10)
+
+ ALIGN(32)
+L(b00):
+L(top): ld r0, 0(up)
+ ld r8, 8(up)
+ ld r10, 16(up)
+ ld r12, 24(up)
+ mulld r5, r0, r0
+ mulhdu r6, r0, r0
+ mulld r7, r8, r8
+ mulhdu r8, r8, r8
+ mulld r9, r10, r10
+ mulhdu r10, r10, r10
+ mulld r11, r12, r12
+ mulhdu r12, r12, r12
+ addi up, up, 32
+ std r5, 0(rp)
+ std r6, 8(rp)
+L(11): std r7, 16(rp)
+ std r8, 24(rp)
+L(10): std r9, 32(rp)
+ std r10, 40(rp)
+L(01): std r11, 48(rp)
+ std r12, 56(rp)
+ addi rp, rp, 64
+ bdnz L(top)
+
+ blr
+EPILOGUE()
diff -r 09fabd6fab76 -r d60b0a9068cd mpn/powerpc64/mode64/sqr_diag_addlsh1.asm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/powerpc64/mode64/sqr_diag_addlsh1.asm Tue Nov 01 13:35:59 2011 +0100
@@ -0,0 +1,238 @@
+dnl PowerPC-64 mpn_sqr_diag_addlsh1
+
+dnl Copyright 2011 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C POWER3/PPC630 10
+C POWER4/PPC970 6
+C POWER5 5.375
+C POWER6 8.5
+
+C NOTES
+C * This was written for POWER6 and its preferences for adjacent integer
+C multiply insns. The cost is that we get a large set of live registers,
+C and therefore need to save 9 callee-saves registers. Except for the
+C multiply insns, the code was not carefully optimised for POWER6 or any
+C other CPU.
+C * Perform some cross-jumping in the feed-in code, into the loop's tail.
+
+C refmpn_sqr_diag_addlsh1 (mp_ptr rp, mp_srcptr tp, mp_srcptr up, mp_size_t n)
+
+C INPUT PARAMETERS
+define(`rp', `r3')
+define(`tp', `r4')
+define(`up', `r5')
+define(`n', `r6')
+
+define(`climb', `r0')
+
+ASM_START()
+PROLOGUE(mpn_sqr_diag_addlsh1)
+ std r31, -8(r1)
+ std r30, -16(r1)
+ std r29, -24(r1)
+ std r28, -32(r1)
+ std r27, -40(r1)
+ std r26, -48(r1)
+ std r25, -56(r1)
+ std r24, -64(r1)
+ std r23, -72(r1)
+
+ rldicl. r0, n, 0,62 C r0 = n & 3, set cr0
+ cmpdi cr6, r0, 2
+ addi n, n, 2 C compute count...
+ srdi n, n, 2 C ...for ctr
+ mtctr n C put loop count into ctr
+ beq cr0, L(b0)
+ blt cr6, L(b1)
+ beq cr6, L(b2)
+
+L(b3): ld r6, 0(up)
+ ld r7, 8(up)
+ ld r12, 16(up)
+ addi up, up, 24
+ mulld r24, r6, r6
+ mulhdu r25, r6, r6
+ mulld r26, r7, r7
+ mulhdu r27, r7, r7
+ mulld r28, r12, r12
+ mulhdu r29, r12, r12
+ ld r10, 0(tp)
+ ld r11, 8(tp)
+ ld r6, 16(tp)
+ ld r7, 24(tp)
+ addi tp, tp, 32
+ addc r10, r10, r10
+ adde r11, r11, r11
+ adde r6, r6, r6
+ adde r7, r7, r7
+ addze climb, r29
+ addc r10, r10, r25
+ adde r11, r11, r26
+ adde r6, r6, r27
+ adde r7, r7, r28
+ std r24, 0(rp)
+ std r10, 8(rp)
+ std r11, 16(rp)
+ std r6, 24(rp)
+ std r7, 32(rp)
+ addi rp, rp, 40
+ bdnz L(top)
+ b L(end)
+
+L(b2): ld r6, 0(up)
+ ld r7, 8(up)
+ addi up, up, 16
+ mulld r24, r6, r6
+ mulhdu r25, r6, r6
+ mulld r26, r7, r7
+ mulhdu r27, r7, r7
+ ld r10, 0(tp)
+ ld r11, 8(tp)
+ addi tp, tp, 16
+ addc r10, r10, r10
+ adde r11, r11, r11
+ addze climb, r27
+ addc r10, r10, r25
+ adde r11, r11, r26
+ std r24, 0(rp)
+ std r10, 8(rp)
+ std r11, 16(rp)
+ addi rp, rp, 24
+ bdnz L(top)
+ b L(end)
+
+L(b0): ld r6, 0(up)
+ ld r7, 8(up)
+ ld r12, 16(up)
+ ld r23, 24(up)
+ addi up, up, 32
+ mulld r24, r6, r6
+ mulhdu r25, r6, r6
+ mulld r26, r7, r7
+ mulhdu r27, r7, r7
+ mulld r28, r12, r12
+ mulhdu r29, r12, r12
+ mulld r30, r23, r23
+ mulhdu r31, r23, r23
+ ld r10, 0(tp)
+ ld r11, 8(tp)
+ ld r6, 16(tp)
+ ld r7, 24(tp)
+ ld r12, 32(tp)
+ ld r23, 40(tp)
+ addi tp, tp, 48
+ addc r10, r10, r10
+ adde r11, r11, r11
+ adde r6, r6, r6
+ adde r7, r7, r7
+ adde r12, r12, r12
+ adde r23, r23, r23
+ addze climb, r31
+ std r24, 0(rp)
+ addc r10, r10, r25
+ std r10, 8(rp)
+ adde r11, r11, r26
+ std r11, 16(rp)
+ adde r6, r6, r27
+ std r6, 24(rp)
+ adde r7, r7, r28
+ std r7, 32(rp)
+ adde r12, r12, r29
+ std r12, 40(rp)
+ adde r23, r23, r30
+ std r23, 48(rp)
+ addi rp, rp, 56
+ bdnz L(top)
+ b L(end)
+
+L(b1): ld r6, 0(up)
+ addi up, up, 8
+ mulld r24, r6, r6
+ mulhdu climb, r6, r6
+ std r24, 0(rp)
+ addic rp, rp, 8 C clear carry as side-effect
+
+ ALIGN(32)
+L(top): ld r6, 0(up)
+ ld r7, 8(up)
+ ld r12, 16(up)
+ ld r23, 24(up)
+ addi up, up, 32
+ mulld r24, r6, r6
+ mulhdu r25, r6, r6
+ mulld r26, r7, r7
+ mulhdu r27, r7, r7
+ mulld r28, r12, r12
+ mulhdu r29, r12, r12
+ mulld r30, r23, r23
More information about the gmp-commit
mailing list