[Gmp-commit] /var/hg/gmp: 4 new changesets
mercurial at gmplib.org
mercurial at gmplib.org
Thu Apr 18 18:45:06 CEST 2013
details: /var/hg/gmp/rev/6e8cd45b217f
changeset: 15736:6e8cd45b217f
user: Torbjorn Granlund <tege at gmplib.org>
date: Wed Apr 17 20:34:41 2013 +0200
description:
Remove an obsolete comment.
details: /var/hg/gmp/rev/80cf30dfce25
changeset: 15737:80cf30dfce25
user: Torbjorn Granlund <tege at gmplib.org>
date: Wed Apr 17 20:36:33 2013 +0200
description:
Rewrite powerpc32 tabselect.
details: /var/hg/gmp/rev/c075c1c2c56f
changeset: 15738:c075c1c2c56f
user: Torbjorn Granlund <tege at gmplib.org>
date: Thu Apr 18 18:44:51 2013 +0200
description:
New alpha tabselect.
details: /var/hg/gmp/rev/1dde616d03b2
changeset: 15739:1dde616d03b2
user: Torbjorn Granlund <tege at gmplib.org>
date: Thu Apr 18 18:45:02 2013 +0200
description:
ChangeLog
diffstat:
ChangeLog | 6 +
mpn/alpha/tabselect.asm | 126 ++++++++++++++++++++++++++++++++++++
mpn/powerpc32/tabselect.asm | 152 ++++++++++++++++++++++++++-----------------
mpn/powerpc64/tabselect.asm | 3 -
4 files changed, 224 insertions(+), 63 deletions(-)
diffs (truncated from 341 to 300 lines):
diff -r 3d08c3752df9 -r 1dde616d03b2 ChangeLog
--- a/ChangeLog Wed Apr 17 17:05:56 2013 +0200
+++ b/ChangeLog Thu Apr 18 18:45:02 2013 +0200
@@ -1,5 +1,11 @@
+2013-04-18 Torbjorn Granlund <tege at gmplib.org>
+
+ * mpn/alpha/tabselect.asm: New file.
+
2013-04-17 Torbjorn Granlund <tege at gmplib.org>
+ * mpn/powerpc32/tabselect.asm: New file.
+
* longlong.h (arm64 count_trailing_zeros): New.
* mpn/arm64/invert_limb.asm: New file.
diff -r 3d08c3752df9 -r 1dde616d03b2 mpn/alpha/tabselect.asm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/alpha/tabselect.asm Thu Apr 18 18:45:02 2013 +0200
@@ -0,0 +1,126 @@
+dnl Alpha mpn_tabselect.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2011, 2012, 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C EV4: ?
+C EV5: 2.25
+C EV6: 1.64
+
+define(`rp', `r16')  C destination pointer; selected limbs are stored here
+define(`tp', `r17')  C table pointer; limbs are loaded from here
+define(`n', `r18')  C limbs per table entry (tp advances 8*n bytes per entry)
+define(`nents', `r19')  C number of table entries; inner loops run nents times
+define(`which', `r20')  C index of the entry to keep
+
+define(`i', `r21')  C inner loop counter, counts down from nents to 0
+define(`j', `r22')  C outer loop counter, starts at n-4 and drops by 4
+define(`stride', `r23')  C NOTE(review): not referenced by the Alpha code in this file
+define(`mask', `r24')  C all ones for the selected entry, zero otherwise
+define(`k', `r25')  C nents - which, the value of i at the selected entry
+
+
+ASM_START()
+PROLOGUE(mpn_tabselect)  C store entry which (n limbs) of the nents-entry table at tp to rp
+ subq n, 4, j C outer loop induction variable
+
+ blt j, L(outer_end)  C n < 4: skip the 4-limb loop
+L(outer_top):
+ mov tp, r8  C remember where this group of 4 limbs starts in entry 0
+ lda r0, 0(r31)  C clear the four accumulators r0-r3
+ lda r1, 0(r31)
+ lda r2, 0(r31)
+ lda r3, 0(r31)
+ subq j, 4, j C outer loop induction variable
+ subq nents, which, k  C k = nents - which
+ mov nents, i  C i counts the entries down from nents
+
+ ALIGN(16)
+L(top): ldq r4, 0(tp)  C load limbs 0 and 1 of the current entry
+ ldq r5, 8(tp)
+ cmpeq k, i, mask  C mask = 1 if i equals k (the selected entry), else 0
+ subq i, 1, i
+ subq r31, mask, mask  C mask = 0 - mask: all ones or zero
+ ldq r6, 16(tp)  C load limbs 2 and 3 of the current entry
+ ldq r7, 24(tp)
+ and r4, mask, r4  C zero the limbs unless this entry is selected
+ and r5, mask, r5
+ or r0, r4, r0  C merge into the accumulators
+ or r1, r5, r1
+ and r6, mask, r6
+ and r7, mask, r7
+ or r2, r6, r2
+ or r3, r7, r3
+ s8addq n, tp, tp  C tp += 8*n: same limbs in the next entry
+ bne i, L(top)  C loop until all nents entries have been read
+
+ stq r0, 0(rp)  C store the four accumulated limbs
+ stq r1, 8(rp)
+ stq r2, 16(rp)
+ stq r3, 24(rp)
+ addq r8, 32, tp  C back to entry 0, advanced by 4 limbs
+ addq rp, 32, rp
+ bge j, L(outer_top)  C repeat while another full group of 4 limbs remains
+L(outer_end):
+
+ and n, 2, r0  C bit 1 of n set: handle a group of 2 limbs
+ beq r0, L(b0x)
+L(b1x): mov tp, r8  C remember where this group of 2 limbs starts in entry 0
+ lda r0, 0(r31)  C clear the two accumulators
+ lda r1, 0(r31)
+ subq nents, which, k  C k = nents - which
+ mov nents, i  C i counts the entries down from nents
+ ALIGN(16)
+L(tp2): ldq r4, 0(tp)
+ ldq r5, 8(tp)
+ cmpeq k, i, mask  C mask = 1 if i equals k (the selected entry), else 0
+ subq i, 1, i
+ subq r31, mask, mask  C mask = 0 - mask: all ones or zero
+ and r4, mask, r4
+ and r5, mask, r5
+ or r0, r4, r0
+ or r1, r5, r1
+ s8addq n, tp, tp  C tp += 8*n: same limbs in the next entry
+ bne i, L(tp2)
+ stq r0, 0(rp)
+ stq r1, 8(rp)
+ addq r8, 16, tp  C back to entry 0, advanced by 2 limbs
+ addq rp, 16, rp
+
+L(b0x): and n, 1, r0  C bit 0 of n set: handle one final limb
+ beq r0, L(b00)
+L(b01): lda r0, 0(r31)  C clear the accumulator
+ subq nents, which, k  C k = nents - which
+ mov nents, i  C i counts the entries down from nents
+ ALIGN(16)
+L(tp1): ldq r4, 0(tp)
+ cmpeq k, i, mask  C mask = 1 if i equals k (the selected entry), else 0
+ subq i, 1, i
+ subq r31, mask, mask  C mask = 0 - mask: all ones or zero
+ and r4, mask, r4
+ or r0, r4, r0
+ s8addq n, tp, tp  C tp += 8*n: same limb in the next entry
+ bne i, L(tp1)
+ stq r0, 0(rp)
+
+L(b00): ret r31, (r26), 1
+EPILOGUE()
diff -r 3d08c3752df9 -r 1dde616d03b2 mpn/powerpc32/tabselect.asm
--- a/mpn/powerpc32/tabselect.asm Wed Apr 17 17:05:56 2013 +0200
+++ b/mpn/powerpc32/tabselect.asm Thu Apr 18 18:45:02 2013 +0200
@@ -1,6 +1,8 @@
dnl PowerPC-32 mpn_tabselect.
-dnl Copyright 2011 Free Software Foundation, Inc.
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2011, 2012, 2013 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
@@ -19,80 +21,110 @@
include(`../config.m4')
-C cycles/limb
-C 603e: ?
-C 604e: ?
-C 75x (G3): ?
-C 7400,7410 (G4): ?
-C 744x,745x (G4+): ?
-C power4/ppc970: 3.3
-C power5: ?
+C cycles/limb
+C 603e: ?
+C 604e: ?
+C 75x (G3): ?
+C 7400,7410 (G4): 2.5
+C 744x,745x (G4+): 2.0
+C power4/ppc970: 2.0
+C power5: ?
-C NOTES
-C * This has not been tuned for any specific processor. Its speed should not
-C be too bad, though.
-C * Using VMX could result in significant speedup for certain CPUs.
-
-C mpn_tabselect (mp_limb_t *rp, mp_limb_t *tp, mp_size_t n, mp_size_t nents, mp_size_t which)
define(`rp', `r3')
define(`tp', `r4')
define(`n', `r5')
define(`nents', `r6')
define(`which', `r7')
-define(`mask', `r8')
+define(`i', `r8')
+define(`j', `r9')
+define(`stride', `r12')
+define(`mask', `r11')
+
ASM_START()
- TEXT
- ALIGN(16)
PROLOGUE(mpn_tabselect)
- addi r0, n, 1
- srwi r0, r0, 1 C inner loop count
- andi. r9, n, 1 C set cr0 for use in inner loop
- subf which, nents, which
- slwi n, n, 2
+ addic. j, n, -4 C outer loop induction variable
+ stmw r27, -32(r1)
+ slwi stride, n, 2
-L(outer):
- mtctr r0 C put inner loop count in ctr
-
- add r9, which, nents C are we at the selected table entry?
- addic r9, r9, -1 C set CF iff not selected entry
- subfe mask, r0, r0
-
- beq cr0, L(top) C branch to loop entry if n even
-
- lwz r9, 0(tp)
- addi tp, tp, 4
- and r9, r9, mask
- lwz r11, 0(rp)
- andc r11, r11, mask
- or r9, r9, r11
- stw r9, 0(rp)
- addi rp, rp, 4
- bdz L(end)
+ blt cr0, L(outer_end)
+L(outer_top):
+ mtctr nents
+ mr r10, tp
+ li r28, 0
+ li r29, 0
+ li r30, 0
+ li r31, 0
+ addic. j, j, -4 C outer loop induction variable
+ mr i, which
ALIGN(16)
-L(top): lwz r9, 0(tp)
- lwz r10, 4(tp)
- addi tp, tp, 8
- nop
- and r9, r9, mask
- and r10, r10, mask
- lwz r11, 0(rp)
- lwz r12, 4(rp)
- andc r11, r11, mask
- andc r12, r12, mask
- or r9, r9, r11
- or r10, r10, r12
- stw r9, 0(rp)
- stw r10, 4(rp)
- addi rp, rp, 8
+L(top): addic i, i, -1 C set carry iff i != 0
+ subfe mask, mask, mask
+ lwz r0, 0(tp)
+ lwz r27, 4(tp)
+ and r0, r0, mask
+ and r27, r27, mask
+ or r28, r28, r0
+ or r29, r29, r27
+ lwz r0, 8(tp)
+ lwz r27, 12(tp)
+ and r0, r0, mask
+ and r27, r27, mask
+ or r30, r30, r0
+ or r31, r31, r27
+ add tp, tp, stride
bdnz L(top)
-L(end): subf rp, n, rp C move rp back to beginning
- cmpwi cr6, nents, 1
- addi nents, nents, -1
- bne cr6, L(outer)
+ stw r28, 0(rp)
+ stw r29, 4(rp)
+ stw r30, 8(rp)
+ stw r31, 12(rp)
+ addi tp, r10, 16
+ addi rp, rp, 16
+ bge cr0, L(outer_top)
+L(outer_end):
+ andi. r0, n, 2
+ beq cr0, L(b0x)
+L(b1x): mtctr nents
+ mr r10, tp
+ li r28, 0
+ li r29, 0
+ mr i, which
+ ALIGN(16)
+L(tp2): addic i, i, -1
+ subfe mask, mask, mask
+ lwz r0, 0(tp)
+ lwz r27, 4(tp)
+ and r0, r0, mask
+ and r27, r27, mask
More information about the gmp-commit mailing list