[Gmp-commit] /var/hg/gmp: 5 new changesets
mercurial at gmplib.org
mercurial at gmplib.org
Sat Apr 13 21:34:51 CEST 2013
details: /var/hg/gmp/rev/e9d8a836c909
changeset: 15709:e9d8a836c909
user: Torbjorn Granlund <tege at gmplib.org>
date: Sat Apr 13 21:27:42 2013 +0200
description:
(SPEED_ROUTINE_MPN_TABSELECT): Implement special code, making .r argument be table width.
details: /var/hg/gmp/rev/032964a299a6
changeset: 15710:032964a299a6
user: Torbjorn Granlund <tege at gmplib.org>
date: Sat Apr 13 21:28:18 2013 +0200
description:
Amend.
details: /var/hg/gmp/rev/e2b2f9038bf9
changeset: 15711:e2b2f9038bf9
user: Torbjorn Granlund <tege at gmplib.org>
date: Sat Apr 13 21:30:42 2013 +0200
description:
Add a comment.
details: /var/hg/gmp/rev/844f528093f2
changeset: 15712:844f528093f2
user: Torbjorn Granlund <tege at gmplib.org>
date: Sat Apr 13 21:34:10 2013 +0200
description:
Major tabselect overhaul.
details: /var/hg/gmp/rev/7b281907ba4f
changeset: 15713:7b281907ba4f
user: Torbjorn Granlund <tege at gmplib.org>
date: Sat Apr 13 21:34:47 2013 +0200
description:
ChangeLog
diffstat:
ChangeLog | 11 ++
mpn/arm/neon/tabselect.asm | 153 +++++++++++++++++++-------------
mpn/arm/tabselect.asm | 120 +++++++++++++++----------
mpn/powerpc64/lshiftc.asm | 1 +
mpn/powerpc64/tabselect.asm | 157 +++++++++++++++++++++------------
mpn/x86_64/fastsse/README | 3 +-
mpn/x86_64/fastsse/tabselect.asm | 181 +++++++++++++++++++++++++++++++++++++++
mpn/x86_64/tabselect.asm | 156 ++++++++++++++++++++++-----------
tune/speed.h | 33 ++++++-
9 files changed, 591 insertions(+), 224 deletions(-)
diffs (truncated from 997 to 300 lines):
diff -r a6c2a0e50fb8 -r 7b281907ba4f ChangeLog
--- a/ChangeLog Sat Apr 13 01:38:58 2013 +0200
+++ b/ChangeLog Sat Apr 13 21:34:47 2013 +0200
@@ -1,3 +1,14 @@
+2013-04-13 Torbjorn Granlund <tege at gmplib.org>
+
+ * mpn/x86_64/fastsse/tabselect.asm: New file.
+ * mpn/arm/neon/tabselect.asm: Rewrite.
+ * mpn/arm/tabselect.asm: Rewrite.
+ * mpn/powerpc64/tabselect.asm: Rewrite.
+ * mpn/x86_64/tabselect.asm: Rewrite.
+
+ * tune/speed.h (SPEED_ROUTINE_MPN_TABSELECT): Implement special code,
+ making .r argument be table width.
+
2013-04-11 David S. Miller <davem at davemloft.net>
* mpn/sparc32/sparc-defs.m4 (LEA): Remove unused local label.
diff -r a6c2a0e50fb8 -r 7b281907ba4f mpn/arm/neon/tabselect.asm
--- a/mpn/arm/neon/tabselect.asm Sat Apr 13 01:38:58 2013 +0200
+++ b/mpn/arm/neon/tabselect.asm Sat Apr 13 21:34:47 2013 +0200
@@ -1,8 +1,8 @@
-dnl ARM Neon mpn_tabselect
+dnl ARM Neon mpn_tabselect.
dnl Contributed to the GNU project by Torbjörn Granlund.
-dnl Copyright 2013 Free Software Foundation, Inc.
+dnl Copyright 2011, 2012, 2013 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
@@ -21,82 +21,109 @@
include(`../config.m4')
+
C cycles/limb
C StrongARM -
C XScale -
C Cortex-A7 ?
C Cortex-A8 ?
-C Cortex-A9 2.25
-C Cortex-A15 0.95
+C Cortex-A9 1.15
+C Cortex-A15 0.65
-C This is a basic implementation using 64-bit Neon, with shallow software
-C pipelining and no unrolling. It is probably close to optimal for A9, while
-C 128-bit Neon code might run faster on A15.
+define(`rp', `r0')
+define(`tp', `r1')
+define(`n', `r2')
+define(`nents', `r3')
+C define(`which', on stack)
-define(`rp', `r0')
-define(`tabp', `r1')
-define(`n', `r2')
-define(`nents', `r3')
-C which on stack
+define(`i', `r4')
+define(`j', `r5')
-define(`mask', `r6')
-define(`maskv', `d6')
-C
+define(`maskq', `q10')
+define(`maskd', `d20')
+
ASM_START()
PROLOGUE(mpn_tabselect)
- push {r4-r7}
- ldr r7, [sp, #16]
- sub r7, r7, nents
-L(outer):
- add mask, r7, nents
- subs mask, mask, #1
- sbc mask, mask, mask
- mov r4, rp
- mov r5, rp
- mov r12, n
+ push {r4-r5}
- vdup.32 maskv, mask
+ add r4, sp, #8
+ vld1.32 {d30[], d31[]}, [r4] C 4 `which' copies
+ vmov.i32 q14, #1 C 4 copies of 1
- tst rp, #4 C Is rp 64-bit aligned?
- beq L(ali) C Yes, skip!
- vld1.32 {d4[0]}, [tabp]! C Else perform one 32-bit...
- vld1.32 {d0[0]}, [r4]! C ...operation in...
- vbit d0, d4, maskv C ...order to make...
- subs r12, r12, #1 C ...aligned operations...
- vst1.32 {d0[0]}, [r5]! C ...on rp[] in loop.
+ subs j, n, #8
+ bmi L(outer_end)
-L(ali): subs r12, r12, #4
- blt L(ed1)
- subs r12, r12, #4
- vld1.32 {d4,d5}, [tabp]!
- b L(mid)
+L(outer_top):
+ mov i, nents
+ mov r12, tp C preserve tp
+ veor q13, q13, q13 C 4 counter copies
+ veor q2, q2, q2
+ veor q3, q3, q3
+ ALIGN(16)
+L(top): vceq.i32 maskq, q13, q15 C compare idx copies to `which' copies
+ vld1.32 {q0,q1}, [tp]
+ vadd.i32 q13, q13, q14
+ vbit q2, q0, maskq
+ vbit q3, q1, maskq
+ add tp, tp, n, lsl #2
+ subs i, i, #1
+ bne L(top)
+ vst1.32 {q2,q3}, [rp]!
+ add tp, r12, #32 C restore tp, point to next slice
+ subs j, j, #8
+ bpl L(outer_top)
+L(outer_end):
-L(top): subs r12, r12, #4
- vld1.32 {d4,d5}, [tabp]!
- vst1.32 {d0,d1}, [r5:64]!
-L(mid): vld1.32 {d0,d1}, [r4:64]!
- vbit d0, d4, maskv
- vbit d1, d5, maskv
- bge L(top)
+ tst n, #4
+ beq L(b0xx)
+L(b1xx):mov i, nents
+ mov r12, tp
+ veor q13, q13, q13
+ veor q2, q2, q2
+ ALIGN(16)
+L(tp4): vceq.i32 maskq, q13, q15
+ vld1.32 {q0}, [tp]
+ vadd.i32 q13, q13, q14
+ vbit q2, q0, maskq
+ add tp, tp, n, lsl #2
+ subs i, i, #1
+ bne L(tp4)
+ vst1.32 {q2}, [rp]!
+ add tp, r12, #16
-L(end): vst1.32 {d0,d1}, [r5:64]!
+L(b0xx):tst n, #2
+ beq L(b00x)
+L(b01x):mov i, nents
+ mov r12, tp
+ veor d26, d26, d26
+ veor d4, d4, d4
+ ALIGN(16)
+L(tp2): vceq.i32 maskd, d26, d30
+ vld1.32 {d0}, [tp]
+ vadd.i32 d26, d26, d28
+ vbit d4, d0, maskd
+ add tp, tp, n, lsl #2
+ subs i, i, #1
+ bne L(tp2)
+ vst1.32 {d4}, [rp]!
+ add tp, r12, #8
-L(ed1): tst r12, #2 C 2 or 3 more limbs to go?
- beq L(ed2) C No, skip!
- vld1.32 {d4}, [tabp]! C Handle 2 limbs
- vld1.32 {d0}, [r4:64]!
- vbit d0, d4, maskv
- vst1.32 {d0}, [r5:64]!
+L(b00x):tst n, #1
+ beq L(b000)
+L(b001):mov i, nents
+ mov r12, tp
+ veor d26, d26, d26
+ veor d4, d4, d4
+ ALIGN(16)
+L(tp1): vceq.i32 maskd, d26, d30
+ vld1.32 {d0[0]}, [tp]
+ vadd.i32 d26, d26, d28
+ vbit d4, d0, maskd
+ add tp, tp, n, lsl #2
+ subs i, i, #1
+ bne L(tp1)
+ vst1.32 {d4[0]}, [rp]!
-L(ed2): tst r12, #1 C One more limb to go?
- beq L(ed3) C No, skip!
- vld1.32 {d4[0]}, [tabp]! C Handle last limb
- vld1.32 {d0[0]}, [r4]
- vbit d0, d4, maskv
- vst1.32 {d0[0]}, [r5]
-
-L(ed3): subs nents, nents, #1
- bne L(outer)
- pop {r4-r7}
- bx lr
+L(b000):pop {r4-r5}
+ bx r14
EPILOGUE()
diff -r a6c2a0e50fb8 -r 7b281907ba4f mpn/arm/tabselect.asm
--- a/mpn/arm/tabselect.asm Sat Apr 13 01:38:58 2013 +0200
+++ b/mpn/arm/tabselect.asm Sat Apr 13 21:34:47 2013 +0200
@@ -26,11 +26,12 @@
C XScale ?
C Cortex-A7 ?
C Cortex-A8 ?
-C Cortex-A9 3.33
-C Cortex-A15 3
+C Cortex-A9 2.33
+C Cortex-A15 2.2
-C This is an OK core register implementation, with 3-way unrolling. Software
-C pipelining might be hard since we run out of registers.
+C TODO
+C * Consider using special code for small nents, either swapping the inner and
+C outer loops, or providing a few variants that completely unroll the inner loop.
define(`rp', `r0')
define(`tp', `r1')
@@ -38,59 +39,82 @@
define(`nents', `r3')
C which on stack
+define(`i', `r11')
+define(`j', `r12')
+define(`c', `r14')
+define(`mask', `r7')
+
ASM_START()
PROLOGUE(mpn_tabselect)
push {r4-r11, r14}
- ldr r11, [sp, #36]
- sub r11, r11, r3
-L(outer):
- add r7, r11, r3
- subs r7, r7, #1
- sbc r7, r7, r7
- mov r6, r0
- mov r14, r2
- subs r14, r14, #3
- blt L(end)
+ subs j, n, #3
+ bmi L(outer_end)
+L(outer_top):
+ ldr c, [sp, #36]
+ mov i, nents
+ push {tp}
-L(top): subs r14, r14, #3
- ldm r6, {r4,r5,r8}
- ldmia r1!, {r9,r10,r12}
- bic r4, r4, r7
- bic r5, r5, r7
- bic r8, r8, r7
- and r9, r9, r7
- and r10, r10, r7
- and r12, r12, r7
- orr r9, r9, r4
- orr r10, r10, r5
- orr r12, r12, r8
- stmia r6!, {r9,r10,r12}
+ mov r8, #0
+ mov r9, #0
+ mov r10, #0
+
+L(top): subs c, c, #1
+ ldm tp, {r4,r5,r6}
+ sbc mask, mask, mask
+ subs i, i, #1
+ add tp, tp, n, lsl #2
+ and r4, r4, mask
+ and r5, r5, mask
+ and r6, r6, mask
+ orr r8, r8, r4
+ orr r9, r9, r5
+ orr r10, r10, r6
bge L(top)
-L(end): cmp r14, #-2
- bls L(1)
- ldm r6, {r4,r5}
- ldmia r1!, {r9,r10}
- bic r4, r4, r7
- bic r5, r5, r7
- and r9, r9, r7
- and r10, r10, r7
- orr r9, r9, r4
- orr r10, r10, r5
- stmia r6!, {r9,r10}
- b L(2)
+ stmia rp!, {r8,r9,r10}
+ pop {tp}
+ add tp, tp, #12
+ subs j, j, #3
+ bpl L(outer_top)
More information about the gmp-commit mailing list