[Gmp-commit] /var/hg/gmp: 2 new changesets
mercurial at gmplib.org
Mon Mar 4 20:49:35 CET 2013
details: /var/hg/gmp/rev/654ec9828029
changeset: 15524:654ec9828029
user: Torbjorn Granlund <tege at gmplib.org>
date: Mon Mar 04 20:49:18 2013 +0100
description:
Add arm/neon lshift and rshift.
details: /var/hg/gmp/rev/c726c5bd016e
changeset: 15525:c726c5bd016e
user: Torbjorn Granlund <tege at gmplib.org>
date: Mon Mar 04 20:49:26 2013 +0100
description:
ChangeLog.
diffstat:
ChangeLog | 7 +-
mpn/arm/neon/lorrshift.asm | 267 +++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 273 insertions(+), 1 deletions(-)
diffs (295 lines):
diff -r 14723c542cc4 -r c726c5bd016e ChangeLog
--- a/ChangeLog Sun Mar 03 21:49:40 2013 +0100
+++ b/ChangeLog Mon Mar 04 20:49:26 2013 +0100
@@ -1,3 +1,7 @@
+2013-03-04 Torbjorn Granlund <tege at gmplib.org>
+
+ * mpn/arm/neon/lorrshift.asm: New file.
+
2013-03-03 Torbjorn Granlund <tege at gmplib.org>
* mpn/arm/v7a/cora15/copyd.asm: New file.
@@ -9,10 +13,11 @@
* mpn/arm64/addmul_1.asm: Remove.
* mpn/arm64/aors_n.asm: Complete rewrite.
+ * mpn/arm/tabselect.asm: New file.
* mpn/arm/neon/tabselect.asm: New file.
+
* mpn/arm/copyi.asm: Software pipeline.
* mpn/arm/copyd.asm: Likewise.
- * mpn/arm/tabselect.asm: New file.
* config.guess: Rework tmp file handling to resemble configfsf.guess's.
diff -r 14723c542cc4 -r c726c5bd016e mpn/arm/neon/lorrshift.asm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/arm/neon/lorrshift.asm Mon Mar 04 20:49:26 2013 +0100
@@ -0,0 +1,267 @@
+dnl ARM Neon mpn_lshift and mpn_rshift.
+
+dnl Contributed to the GNU project by Torbjörn Granlund.
+
+dnl Copyright 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                   cycles/limb    cycles/limb    cycles/limb     good
+C                     aligned       unaligned      best seen    for cpu?
+C StrongARM            n/a
+C XScale               n/a
+C Cortex-A8             ?
+C Cortex-A9              3              3                           Y
+C Cortex-A15            1.5            1.5                          Y
+
+
+C We read 64 bits at a time at 32-bit aligned addresses, and except for the
+C first and last store, we write using 64-bit aligned addresses. All shifting
+C is done on 64-bit words in 'extension' registers.
+C
+C It should be possible to read also using 64-bit alignment, by manipulating
+C the shift count for unaligned operands. Not done, since it does not seem to
+C matter for A9 or A15.
+C
+C This will not work in big-endian mode.
+
+C TODO
+C * Try using 128-bit operations. Note that Neon lacks pure 128-bit shifts,
+C which might make it tricky.
+C * Clean up and simplify.
+C * Consider sharing most of the code for lshift and rshift, since the feed-in code,
+C the loop, and most of the wind-down code are identical.
+C * Replace the basecase code with code using 'extension' registers.
+C * Optimise. It is not clear that this loop insn permutation is optimal for
+C either A9 or A15.
+
+C INPUT PARAMETERS
+define(`rp', `r0')
+define(`ap', `r1')
+define(`n', `r2')
+define(`cnt', `r3')
+
+ifdef(`OPERATION_lshift',`
+ define(`IFLSH', `$1')
+ define(`IFRSH', `')
+ define(`X',`0')
+ define(`Y',`1')
+ define(`func',`mpn_lshift')
+')
+ifdef(`OPERATION_rshift',`
+ define(`IFLSH', `')
+ define(`IFRSH', `$1')
+ define(`X',`1')
+ define(`Y',`0')
+ define(`func',`mpn_rshift')
+')
+
+MULFUNC_PROLOGUE(mpn_lshift mpn_rshift)
+
+ASM_START()
+ TEXT
+ ALIGN(64)
+PROLOGUE(func)
+IFLSH(` mov r12, n, lsl #2 ')
+IFLSH(` add rp, rp, r12 ')
+IFLSH(` add ap, ap, r12 ')
+
+ cmp n, #4 C SIMD code n limit
+ ble L(base)
+
+ifdef(`OPERATION_lshift',`
+ vdup.32 d6, r3 C left shift count is positive
+ sub r3, r3, #64 C right shift count is negative
+ vdup.32 d7, r3
+ mov r12, #-8') C lshift pointer update offset
+ifdef(`OPERATION_rshift',`
+ rsb r3, r3, #0 C right shift count is negative
+ vdup.32 d6, r3
+ add r3, r3, #64 C left shift count is positive
+ vdup.32 d7, r3
+ mov r12, #8') C rshift pointer update offset
+
+IFLSH(` sub ap, ap, #8 ')
+ vld1.32 {d19}, [ap], r12 C load initial 2 limbs
+ vshl.u64 d18, d19, d7 C retval
+
+ tst rp, #4 C is rp 64-bit aligned already?
+ beq L(rp_aligned) C yes, skip
+IFLSH(` add ap, ap, #4 ') C move back ap pointer
+IFRSH(` sub ap, ap, #4 ') C move back ap pointer
+ vshl.u64 d4, d19, d6
+ sub n, n, #1 C first limb handled
+IFLSH(` sub rp, rp, #4 ')
+ vst1.32 {d4[Y]}, [rp]IFRSH(!) C store first limb, rp gets aligned
+ vld1.32 {d19}, [ap], r12 C load ap[1] and ap[2]
+
+L(rp_aligned):
+IFLSH(` sub rp, rp, #8 ')
+ subs n, n, #6
+ blt L(two_or_three_more)
+ tst n, #2
+ beq L(2)
+
+L(1): vld1.32 {d17}, [ap], r12
+ vshl.u64 d5, d19, d6
+ vld1.32 {d16}, [ap], r12
+ vshl.u64 d0, d17, d7
+ vshl.u64 d4, d17, d6
+ sub n, n, #2
+ b L(mid)
+
+L(2): vld1.32 {d16}, [ap], r12
+ vshl.u64 d4, d19, d6
+ vld1.32 {d17}, [ap], r12
+ vshl.u64 d1, d16, d7
+ vshl.u64 d5, d16, d6
+ subs n, n, #4
+ blt L(end)
+
+L(top): vld1.32 {d16}, [ap], r12
+ vorr d2, d4, d1
+ vshl.u64 d0, d17, d7
+ vshl.u64 d4, d17, d6
+ vst1.32 {d2}, [rp:64], r12
+L(mid): vld1.32 {d17}, [ap], r12
+ vorr d3, d5, d0
+ vshl.u64 d1, d16, d7
+ vshl.u64 d5, d16, d6
+ vst1.32 {d3}, [rp:64], r12
+ subs n, n, #4
+ bge L(top)
+
+L(end): tst n, #1
+ beq L(evn)
+
+ vorr d2, d4, d1
+ vst1.32 {d2}, [rp:64], r12
+ b L(cj1)
+
+L(evn): vorr d2, d4, d1
+ vshl.u64 d0, d17, d7
+ vshl.u64 d16, d17, d6
+ vst1.32 {d2}, [rp:64], r12
+ vorr d2, d5, d0
+ b L(cj2)
+
+C Load last 2 - 3 limbs, store last 4 - 5 limbs
+L(two_or_three_more):
+ tst n, #1
+ beq L(l2)
+
+L(l3): vshl.u64 d5, d19, d6
+ vld1.32 {d17}, [ap], r12
+L(cj1): veor d16, d16, d16
+IFLSH(` add ap, ap, #4 ')
+ vld1.32 {d16[Y]}, [ap], r12
+ vshl.u64 d0, d17, d7
+ vshl.u64 d4, d17, d6
+ vorr d3, d5, d0
+ vshl.u64 d1, d16, d7
+ vshl.u64 d5, d16, d6
+ vst1.32 {d3}, [rp:64], r12
+ vorr d2, d4, d1
+ vst1.32 {d2}, [rp:64], r12
+IFLSH(` add rp, rp, #4 ')
+ vst1.32 {d5[Y]}, [rp]
+ vmov.32 r0, d18[X]
+ bx lr
+
+L(l2): vld1.32 {d16}, [ap], r12
+ vshl.u64 d4, d19, d6
+ vshl.u64 d1, d16, d7
+ vshl.u64 d16, d16, d6
+ vorr d2, d4, d1
+L(cj2): vst1.32 {d2}, [rp:64], r12
+ vst1.32 {d16}, [rp]
+ vmov.32 r0, d18[X]
+ bx lr
+
+
+define(`tnc', `r12')
+L(base):
+ push {r4, r6, r7, r8}
+ifdef(`OPERATION_lshift',`
+ ldr r4, [ap, #-4]!
+ rsb tnc, cnt, #32
+
+ lsl r7, r4, cnt
+ tst n, #1
+ beq L(ev) C n even
+
+L(od): subs n, n, #2
+ bcc L(ed1) C n = 1
+ ldr r8, [ap, #-4]!
+ b L(md) C n = 3
+
+L(ev): ldr r6, [ap, #-4]!
+ subs n, n, #2
+ beq L(ed) C n = 2
+ C n = 4
+L(tp): ldr r8, [ap, #-4]!
+ orr r7, r7, r6, lsr tnc
+ str r7, [rp, #-4]!
+ lsl r7, r6, cnt
+L(md): ldr r6, [ap, #-4]!
+ orr r7, r7, r8, lsr tnc
+ str r7, [rp, #-4]!
+ lsl r7, r8, cnt
+
+L(ed): orr r7, r7, r6, lsr tnc
+ str r7, [rp, #-4]!
+ lsl r7, r6, cnt
+L(ed1): str r7, [rp, #-4]
+ lsr r0, r4, tnc
+')
+ifdef(`OPERATION_rshift',`
+ ldr r4, [ap]
+ rsb tnc, cnt, #32
+
+ lsr r7, r4, cnt
+ tst n, #1
+ beq L(ev) C n even
+
+L(od): subs n, n, #2
+ bcc L(ed1) C n = 1
+ ldr r8, [ap, #4]!
+ b L(md) C n = 3
+
+L(ev): ldr r6, [ap, #4]!
+ subs n, n, #2
+ beq L(ed) C n = 2
+ C n = 4
+
+L(tp): ldr r8, [ap, #4]!
+ orr r7, r7, r6, lsl tnc
+ str r7, [rp], #4
+ lsr r7, r6, cnt
+L(md): ldr r6, [ap, #4]!
+ orr r7, r7, r8, lsl tnc
+ str r7, [rp], #4
+ lsr r7, r8, cnt
+
+L(ed): orr r7, r7, r6, lsl tnc
+ str r7, [rp], #4
+ lsr r7, r6, cnt
+L(ed1): str r7, [rp], #4
+ lsl r0, r4, tnc
+')
+ pop {r4, r6, r7, r8}
+ bx r14
+EPILOGUE()
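
For readers not steeped in the mpn layer, here is a rough, illustrative C sketch of the
semantics that both the NEON path and the basecase in the patch above must implement.
This is not GMP's code: it assumes 32-bit limbs (as on ARM), a shift count with
1 <= cnt <= 31, and uses hypothetical ref_ names. mpn_lshift shifts an n-limb operand
left by cnt bits, stores the n result limbs, and returns the bits shifted out of the top
limb; mpn_rshift mirrors this at the low end.

/* Illustrative reference for mpn_lshift/mpn_rshift semantics, assuming
   32-bit limbs and 1 <= cnt <= 31.  Not GMP's implementation.  */
typedef unsigned int limb_t;                   /* one 32-bit limb */

limb_t ref_lshift (limb_t *rp, const limb_t *ap, long n, unsigned cnt)
{
  unsigned tnc = 32 - cnt;                     /* cf. 'rsb tnc, cnt, #32' above */
  limb_t retval = ap[n - 1] >> tnc;            /* bits shifted out at the top */
  for (long i = n - 1; i > 0; i--)             /* high to low, so rp may equal ap */
    rp[i] = (ap[i] << cnt) | (ap[i - 1] >> tnc);
  rp[0] = ap[0] << cnt;
  return retval;
}

limb_t ref_rshift (limb_t *rp, const limb_t *ap, long n, unsigned cnt)
{
  unsigned tnc = 32 - cnt;
  limb_t retval = ap[0] << tnc;                /* bits shifted out at the bottom */
  for (long i = 0; i < n - 1; i++)             /* low to high, so rp may equal ap */
    rp[i] = (ap[i] >> cnt) | (ap[i + 1] << tnc);
  rp[n - 1] = ap[n - 1] >> cnt;
  return retval;
}

The NEON path computes the same two partial shifts per 64-bit word with vshl.u64, using
one positive and one negative shift count (d6/d7; vshl.u64 with a negative count shifts
right) and combining adjacent pieces with vorr, while the basecase does the equivalent
with lsl/lsr/orr on single limbs.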