[Gmp-commit] /var/hg/gmp: 6 new changesets
mercurial at gmplib.org
mercurial at gmplib.org
Fri Apr 26 00:13:55 CEST 2013
details: /var/hg/gmp/rev/66d488db0ee2
changeset: 15747:66d488db0ee2
user: Torbjorn Granlund <tege at gmplib.org>
date: Thu Apr 25 22:08:07 2013 +0200
description:
ARM A15 submul_1.
details: /var/hg/gmp/rev/229974a3a698
changeset: 15748:229974a3a698
user: Torbjorn Granlund <tege at gmplib.org>
date: Thu Apr 25 22:08:45 2013 +0200
description:
ARM A15 com.
details: /var/hg/gmp/rev/f36214a935f6
changeset: 15749:f36214a935f6
user: Torbjorn Granlund <tege at gmplib.org>
date: Thu Apr 25 22:13:22 2013 +0200
description:
Conditionally suppress conditionally used code.
details: /var/hg/gmp/rev/b79fabc0d7eb
changeset: 15750:b79fabc0d7eb
user: Torbjorn Granlund <tege at gmplib.org>
date: Thu Apr 25 22:15:58 2013 +0200
description:
Clear carry smarter.
details: /var/hg/gmp/rev/f8b08b239a9a
changeset: 15751:f8b08b239a9a
user: Torbjorn Granlund <tege at gmplib.org>
date: Thu Apr 25 22:16:44 2013 +0200
description:
Collect header comments.
details: /var/hg/gmp/rev/fe5fa317ad04
changeset: 15752:fe5fa317ad04
user: Torbjorn Granlund <tege at gmplib.org>
date: Fri Apr 26 00:13:51 2013 +0200
description:
ChangeLog
diffstat:
ChangeLog | 11 ++
mpn/arm/mod_34lsub1.asm | 4 +-
mpn/arm/v6/submul_1.asm | 10 +-
mpn/arm/v7a/cora15/com.asm | 169 ++++++++++++++++++++++++++++++++++++++++
mpn/arm/v7a/cora15/logops_n.asm | 2 +
mpn/arm/v7a/cora15/submul_1.asm | 148 +++++++++++++++++++++++++++++++++++
6 files changed, 337 insertions(+), 7 deletions(-)
diffs (truncated from 407 to 300 lines):
diff -r 46bfe0a1bb40 -r fe5fa317ad04 ChangeLog
--- a/ChangeLog Wed Apr 24 01:13:44 2013 +0200
+++ b/ChangeLog Fri Apr 26 00:13:51 2013 +0200
@@ -1,5 +1,16 @@
+2013-04-25 Torbjorn Granlund <tege at gmplib.org>
+
+ * mpn/arm/mod_34lsub1.asm: Clear carry smarter.
+
+ * mpn/arm/v7a/cora15/logops_n.asm: Conditionally suppress conditionally
+ used code.
+
+ * mpn/arm/v7a/cora15/submul_1.asm: New file.
+
2013-04-24 Torbjorn Granlund <tege at gmplib.org>
+ * mpn/arm/v7a/cora15/com.asm: New file.
+
* mpn/arm/v7a/cora15/logops_n.asm: New file.
2013-04-19 Torbjorn Granlund <tege at gmplib.org>
diff -r 46bfe0a1bb40 -r fe5fa317ad04 mpn/arm/mod_34lsub1.asm
--- a/mpn/arm/mod_34lsub1.asm Wed Apr 24 01:13:44 2013 +0200
+++ b/mpn/arm/mod_34lsub1.asm Fri Apr 26 00:13:51 2013 +0200
@@ -1,6 +1,6 @@
dnl ARM mpn_mod_34lsub1 -- remainder modulo 2^24-1.
-dnl Copyright 2012 Free Software Foundation, Inc.
+dnl Copyright 2012, 2013 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
@@ -49,7 +49,7 @@
ldmia ap!, { r2, r3, r12 }
subs n, n, #3
blt L(sum) C n <= 5
- adds r0, r0, #0 C clear carry
+ cmn r0, #0 C clear carry
sub n, n, #3
b L(mid)
diff -r 46bfe0a1bb40 -r fe5fa317ad04 mpn/arm/v6/submul_1.asm
--- a/mpn/arm/v6/submul_1.asm Wed Apr 24 01:13:44 2013 +0200
+++ b/mpn/arm/v6/submul_1.asm Fri Apr 26 00:13:51 2013 +0200
@@ -27,6 +27,11 @@
C Cortex-A9 3.75
C Cortex-A15 4.0
+C This loop complements U on the fly,
+C U' = B^n - 1 - U
+C and then uses that
+C R - U*v = R + U'*v + v - B^n v
+
C TODO
C * Micro-optimise feed-in code.
C * Optimise for n=1,2 by delaying register saving.
@@ -37,11 +42,6 @@
define(`n', `r2')
define(`v0',`r3')
-C This loop complements U on the fly,
-C U' = B^n - 1 - U
-C and then uses that
-C R - U*v = R + U'*v + v - B^n v
-
ASM_START()
PROLOGUE(mpn_submul_1)
stmfd sp!, { r4, r5, r6, r7 }
diff -r 46bfe0a1bb40 -r fe5fa317ad04 mpn/arm/v7a/cora15/com.asm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/arm/v7a/cora15/com.asm Fri Apr 26 00:13:51 2013 +0200
@@ -0,0 +1,169 @@
+dnl ARM mpn_com optimised for A15.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C StrongARM ?
+C XScale ?
+C Cortex-A7 ?
+C Cortex-A8 ?
+C Cortex-A9 2.5
+C Cortex-A15 1.0
+
+C This is great A15 core register code, but it is a bit large.
+C We use FEEDIN_VARIANT 1 to save some space, but use 8-way unrolling.
+
+C Architecture requirements:
+C v5 -
+C v5t -
+C v5te ldrd strd
+C v6 -
+C v6t2 -
+C v7a -
+
+define(`FEEDIN_VARIANT', 1) C alternatives: 0 1 2
+define(`UNROLL', 4x2) C alternatives: 4 4x2
+
+define(`rp', `r0')
+define(`up', `r1')
+define(`n', `r2')
+
+ASM_START()
+PROLOGUE(mpn_com)
+ push { r4-r5,r8-r9 }
+
+ifelse(FEEDIN_VARIANT,0,`
+ ands r12, n, #3
+ mov n, n, lsr #2
+ beq L(b00a)
+ tst r12, #1
+ beq L(bx0)
+ ldr r5, [up], #4
+ mvn r9, r5
+ str r9, [rp], #4
+ tst r12, #2
+ beq L(b00)
+L(bx0): ldrd r4, r5, [up, #0]
+ sub rp, rp, #8
+ b L(lo)
+L(b00): tst n, n
+ beq L(wd1)
+L(b00a):ldrd r4, r5, [up], #-8
+ sub rp, rp, #16
+ b L(mid)
+')
+ifelse(FEEDIN_VARIANT,1,`
+ and r12, n, #3
+ mov n, n, lsr #2
+ tst r12, #1
+ beq L(bx0)
+ ldr r5, [up], #4
+ mvn r9, r5
+ str r9, [rp], #4
+L(bx0): tst r12, #2
+ beq L(b00)
+ ldrd r4, r5, [up, #0]
+ sub rp, rp, #8
+ b L(lo)
+L(b00): tst n, n
+ beq L(wd1)
+ ldrd r4, r5, [up], #-8
+ sub rp, rp, #16
+ b L(mid)
+')
+ifelse(FEEDIN_VARIANT,2,`
+ ands r12, n, #3
+ mov n, n, lsr #2
+ beq L(b00)
+ cmp r12, #2
+ bcc L(b01)
+ beq L(b10)
+
+L(b11): ldr r5, [up], #4
+ mvn r9, r5
+ ldrd r4, r5, [up, #0]
+ str r9, [rp], #-4
+ b L(lo)
+
+L(b00): ldrd r4, r5, [up], #-8
+ sub rp, rp, #16
+ b L(mid)
+
+L(b01): ldr r5, [up], #-4
+ mvn r9, r5
+ str r9, [rp], #-12
+ tst n, n
+ beq L(wd1)
+L(gt1): ldrd r4, r5, [up, #8]
+ b L(mid)
+
+L(b10): ldrd r4, r5, [up]
+ sub rp, rp, #8
+ b L(lo)
+')
+ ALIGN(16)
+ifelse(UNROLL,4,`
+L(top): ldrd r4, r5, [up, #8]
+ strd r8, r9, [rp, #8]
+L(mid): mvn r8, r4
+ mvn r9, r5
+ ldrd r4, r5, [up, #16]!
+ strd r8, r9, [rp, #16]!
+ sub n, n, #1
+L(lo): mvn r8, r4
+ mvn r9, r5
+ tst n, n
+ bne L(top)
+')
+ifelse(UNROLL,4x2,`
+L(top): ldrd r4, r5, [up, #8]
+ strd r8, r9, [rp, #8]
+L(mid): mvn r8, r4
+ mvn r9, r5
+ ldrd r4, r5, [up, #16]
+ strd r8, r9, [rp, #16]
+ mvn r8, r4
+ mvn r9, r5
+ sub n, n, #2
+ tst n, n
+ bmi L(dne)
+ ldrd r4, r5, [up, #24]
+ strd r8, r9, [rp, #24]
+ mvn r8, r4
+ mvn r9, r5
+ ldrd r4, r5, [up, #32]!
+ strd r8, r9, [rp, #32]!
+L(lo): mvn r8, r4
+ mvn r9, r5
+ tst n, n
+ bne L(top)
+')
+
+L(end): strd r8, r9, [rp, #8]
+L(wd1): pop { r4-r5,r8-r9 }
+ bx r14
+ifelse(UNROLL,4x2,`
+L(dne): strd r8, r9, [rp, #24]
+ pop { r4-r5,r8-r9 }
+ bx r14
+')
+EPILOGUE()
diff -r 46bfe0a1bb40 -r fe5fa317ad04 mpn/arm/v7a/cora15/logops_n.asm
--- a/mpn/arm/v7a/cora15/logops_n.asm Wed Apr 24 01:13:44 2013 +0200
+++ b/mpn/arm/v7a/cora15/logops_n.asm Fri Apr 26 00:13:51 2013 +0200
@@ -232,9 +232,11 @@
strd r8, r9, [rp, #8]
L(wd1): pop { r4-r9 }
bx r14
+ifelse(UNROLL,4x2,`
L(dne): POSTOP( r8)
POSTOP( r9)
strd r8, r9, [rp, #24]
pop { r4-r9 }
bx r14
+')
EPILOGUE()
diff -r 46bfe0a1bb40 -r fe5fa317ad04 mpn/arm/v7a/cora15/submul_1.asm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/arm/v7a/cora15/submul_1.asm Fri Apr 26 00:13:51 2013 +0200
@@ -0,0 +1,148 @@
+dnl ARM mpn_submul_1 optimised for A15.
+
+dnl Copyright 2012, 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb best
+C StrongARM: -
+C XScale ?
+C Cortex-A7 ?
+C Cortex-A8 ?
+C Cortex-A9 5.75 3.75
+C Cortex-A15 2.32 this
+
+C This code uses umlal and umaal for adding in the rp[] data, keeping the
+C recurrence path separate from any multiply instructions. It performs well on
+C A15, but not quite at the multiply bandwidth like the corresponding addmul_1
+C code.
+C
+C We don't use r12 due to ldrd and strd limitations.
+C
+C This loop complements U on the fly,
+C U' = B^n - 1 - U
+C and then uses that
+C R - U*v = R + U'*v + v - B^n v
+
More information about the gmp-commit
mailing list