[Gmp-commit] /var/hg/gmp: Add ARM addlsh1_n and sublsh1_n.
mercurial at gmplib.org
mercurial at gmplib.org
Sun Apr 29 17:37:59 CEST 2012
details: /var/hg/gmp/rev/642cd5ba0529
changeset: 14907:642cd5ba0529
user: Torbjorn Granlund <tege at gmplib.org>
date: Sun Apr 29 17:37:53 2012 +0200
description:
Add ARM addlsh1_n and sublsh1_n.
diffstat:
ChangeLog | 2 +
mpn/arm/aorslsh1_n.asm | 155 +++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 157 insertions(+), 0 deletions(-)
diffs (170 lines):
diff -r 521b092f6517 -r 642cd5ba0529 ChangeLog
--- a/ChangeLog Sun Apr 29 03:05:52 2012 +0200
+++ b/ChangeLog Sun Apr 29 17:37:53 2012 +0200
@@ -1,5 +1,7 @@
2012-04-29 Torbjorn Granlund <tege at gmplib.org>
+ * mpn/arm/aorslsh1_n.asm: New file.
+
* mpn/arm/mod_34lsub1.asm: New file.
* mpn/arm/v6t2/divrem_1.asm: New file.
diff -r 521b092f6517 -r 642cd5ba0529 mpn/arm/aorslsh1_n.asm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/arm/aorslsh1_n.asm Sun Apr 29 17:37:53 2012 +0200
@@ -0,0 +1,155 @@
+dnl ARM mpn_addlsh1_n and mpn_sublsh1_n
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2012 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C addlsh1_n sublsh1_n
+C cycles/limb cycles/limb
+C StrongARM ? ?
+C XScale ? ?
+C Cortex-A8 ? ?
+C Cortex-A9 3.17 3.7
+C Cortex-A15 ? ?
+
+C TODO
+C * The addlsh1_n code runs well, but is only barely faster than mpn_addmul_1.
+C The sublsh1_n code could surely be tweaked, its REVCY slows down things
+C very much. If two insns are really needed, it might help to separate them
+C for better micro-parallelism.
+
+define(`rp', `r0')
+define(`up', `r1')
+define(`vp', `r2')
+define(`n', `r3')
+
+ifdef(`OPERATION_addlsh1_n', `
+ define(`ADDSUB', adds)
+ define(`ADDSUBC', adcs)
+ define(`SETCY', `cmp $1, #1')
+ define(`RETVAL', `adc r0, $1, #2')
+ define(`SAVECY', `sbc $1, $2, #0')
+ define(`RESTCY', `adds $1, $1, #1')
+ define(`REVCY', `')
+ define(`INICYR', `mov $1, #0')
+ define(`r10r11', `r11')
+ define(`func', mpn_addlsh1_n)
+ define(`func_nc', mpn_addlsh1_nc)')
+ifdef(`OPERATION_sublsh1_n', `
+ define(`ADDSUB', subs)
+ define(`ADDSUBC', sbcs)
+ define(`SETCY', `rsbs $1, $1, #0')
+ define(`RETVAL', `adc r0, $1, #1')
+ define(`SAVECY', `sbc $1, $1, $1')
+ define(`RESTCY', `adds $1, $1, #1')
+ define(`REVCY', `sbc $1, $1, $1
+ adds $1, $1, #1')
+ define(`INICYR', `mov $1, #-1')
+ define(`r10r11', `r10')
+ define(`func', mpn_sublsh1_n)
+ define(`func_nc', mpn_sublsh1_nc)')
+
+MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_sublsh1_n)
+
+ASM_START()
+PROLOGUE(func)
+ push {r4-r10r11, r14}
+
+ifdef(`OPERATION_addlsh1_n', `
+ mvn r11, #0
+')
+ INICYR( r14)
+ subs n, n, #3
+ blt L(le2) C carry clear on branch path
+
+ cmn r0, #0 C clear carry
+ ldmia vp!, {r8, r9, r10}
+ b L(mid)
+
+L(top): RESTCY( r14)
+ ADDSUBC r4, r4, r8
+ ADDSUBC r5, r5, r9
+ ADDSUBC r6, r6, r10
+ ldmia vp!, {r8, r9, r10}
+ REVCY(r14)
+ stmia rp!, {r4, r5, r6}
+ adcs r8, r8, r8
+ adcs r9, r9, r9
+ adcs r10, r10, r10
+ ldmia up!, {r4, r5, r6}
+ SAVECY( r14, r11)
+ subs n, n, #3
+ blt L(exi)
+ RESTCY( r12)
+ ADDSUBC r4, r4, r8
+ ADDSUBC r5, r5, r9
+ ADDSUBC r6, r6, r10
+ ldmia vp!, {r8, r9, r10}
+ REVCY(r12)
+ stmia rp!, {r4, r5, r6}
+L(mid): adcs r8, r8, r8
+ adcs r9, r9, r9
+ adcs r10, r10, r10
+ ldmia up!, {r4, r5, r6}
+ SAVECY( r12, r11)
+ subs n, n, #3
+ bge L(top)
+
+ mov r7, r12 C swap alternating...
+ mov r12, r14 C ...carry-save...
+ mov r14, r7 C ...registers
+
+L(exi): RESTCY( r12)
+ ADDSUBC r4, r4, r8
+ ADDSUBC r5, r5, r9
+ ADDSUBC r6, r6, r10
+ stmia rp!, {r4, r5, r6}
+
+ REVCY(r12)
+L(le2): tst n, #1 C n = {-1,-2,-3} map to [2], [1], [0]
+ beq L(e1)
+
+L(e02): tst n, #2
+ beq L(rt0)
+ ldm vp, {r8, r9}
+ adcs r8, r8, r8
+ adcs r9, r9, r9
+ ldm up, {r4, r5}
+ SAVECY( r12, r11)
+ RESTCY( r14)
+ ADDSUBC r4, r4, r8
+ ADDSUBC r5, r5, r9
+ stm rp, {r4, r5}
+ b L(rt1)
+
+L(e1): ldr r8, [vp]
+ adcs r8, r8, r8
+ ldr r4, [up]
+ SAVECY( r12, r11)
+ RESTCY( r14)
+ ADDSUBC r4, r4, r8
+ str r4, [rp]
+
+L(rt1): mov r14, r12
+ REVCY(r12)
+L(rt0): RETVAL( r14)
+ pop {r4-r10r11, r14}
+ bx r14
+EPILOGUE()
More information about the gmp-commit
mailing list