[Gmp-commit] /var/hg/gmp: Implement temporary workaround to overlap issue.
mercurial at gmplib.org
mercurial at gmplib.org
Tue Apr 17 22:35:30 CEST 2012
details: /var/hg/gmp/rev/ed9a4b434f69
changeset: 14845:ed9a4b434f69
user: Torbjorn Granlund <tege at gmplib.org>
date: Tue Apr 17 22:35:26 2012 +0200
description:
Implement temporary workaround to overlap issue.
diffstat:
ChangeLog | 5 +++++
mpn/x86_64/fastsse/copyi-palignr.asm | 36 +++++++++++++++++++++++-------------
2 files changed, 28 insertions(+), 13 deletions(-)
diffs (85 lines):
diff -r 06616f5ca1d5 -r ed9a4b434f69 ChangeLog
--- a/ChangeLog Tue Apr 17 22:22:00 2012 +0200
+++ b/ChangeLog Tue Apr 17 22:35:26 2012 +0200
@@ -1,3 +1,8 @@
+2012-04-17 Torbjorn Granlund <tege at gmplib.org>
+
+ * mpn/x86_64/fastsse/copyi-palignr.asm: Implement temporary workaround
+ to overlap issue.
+
2012-04-17 Marco Bodrato <bodrato at mail.dm.unipi.it>
* mpz/bin_uiui.c: Support small limbs (fallback on bin_ui).
diff -r 06616f5ca1d5 -r ed9a4b434f69 mpn/x86_64/fastsse/copyi-palignr.asm
--- a/mpn/x86_64/fastsse/copyi-palignr.asm Tue Apr 17 22:22:00 2012 +0200
+++ b/mpn/x86_64/fastsse/copyi-palignr.asm Tue Apr 17 22:35:26 2012 +0200
@@ -25,13 +25,13 @@
C aligned unaligned best seen for cpu?
C AMD K8,K9 2.0 illop 1.0/1.0 N
C AMD K10 0.85 illop Y/N
-C AMD bd1 1.39 1.45 Y/N
-C AMD bobcat 1.97 8.17 1.5/1.5 N
+C AMD bd1 1.39 ? 1.45 Y/N
+C AMD bobcat 1.97 ? 8.17 1.5/1.5 N
C Intel P4 2.26 illop Y/N
-C Intel core2 0.52 0.80 opt/0.74 Y
-C Intel NHM 0.52 0.64 opt/opt Y
-C Intel SBR 0.51 0.54 opt/0.51 Y
-C Intel atom 1.16 1.66 opt/opt Y
+C Intel core2 0.52 0.82 opt/0.74 Y
+C Intel NHM 0.52 0.65 opt/opt Y
+C Intel SBR 0.51 0.55 opt/0.51 Y
+C Intel atom 1.16 1.70 opt/opt Y
C VIA nano 1.09 1.10 opt/opt Y
C We use only 16-byte operations, except for unaligned top-most and bottom-most
@@ -114,20 +114,30 @@
1: DOS64_EXIT()
ret
-L(uent):sub $16, n
+L(uent):
+C Code handling up - rp = 8 (mod 16)
+
+C FIXME: The code below only handles overlap if it is close to complete, or
+C quite separate: up-rp < 5 or up-rp > 15 limbs
+ lea -40(up), %rax C 40 = 5 * GMP_LIMB_BYTES
+ sub rp, %rax
+ cmp $80, %rax C 80 = (15-5) * GMP_LIMB_BYTES
+ jbe L(bc) C deflect to plain loop
+
+ sub $16, n
jc L(uend)
movdqa 120(up), %xmm3
- movdqa 104(up), %xmm2
+
sub $16, n
jmp L(um)
ALIGN(16)
L(utop):movdqa 120(up), %xmm3
+ movdqa %xmm0, -128(rp)
sub $16, n
- movdqa 104(up), %xmm2
- movdqa %xmm0, -128(rp)
-L(um): palignr($8, %xmm2, %xmm3)
+L(um): movdqa 104(up), %xmm2
+ palignr($8, %xmm2, %xmm3)
movdqa 88(up), %xmm1
movdqa %xmm3, 112(rp)
palignr($8, %xmm1, %xmm2)
@@ -218,11 +228,11 @@
lea 32(up), up
mov %r8, -24(rp)
mov %r9, -16(rp)
-ifelse(eval(COPYI_SSE_THRESHOLD >= 8),1,
+ifelse(eval(1 || COPYI_SSE_THRESHOLD >= 8),1,
` sub $4, R32(n)')
mov %r10, -8(rp)
mov %r11, (rp)
-ifelse(eval(COPYI_SSE_THRESHOLD >= 8),1,
+ifelse(eval(1 || COPYI_SSE_THRESHOLD >= 8),1,
` jnc L(top)')
L(end): bt $0, R32(n)
More information about the gmp-commit
mailing list