[Gmp-commit] /var/hg/gmp: Really handle overlap correctly in rp != up (mod 16) code.
mercurial at gmplib.org
mercurial at gmplib.org
Tue Sep 10 19:54:06 CEST 2013
details: /var/hg/gmp/rev/ba38250cfecc
changeset: 15980:ba38250cfecc
user: Torbjorn Granlund <tege at gmplib.org>
date: Tue Sep 10 19:53:54 2013 +0200
description:
Really handle overlap correctly in rp != up (mod 16) code.
diffstat:
mpn/x86_64/fastsse/copyi-palignr.asm | 64 +++++++++++++++++------------------
1 files changed, 31 insertions(+), 33 deletions(-)
diffs (134 lines):
diff -r 35f194fcbf1d -r ba38250cfecc mpn/x86_64/fastsse/copyi-palignr.asm
--- a/mpn/x86_64/fastsse/copyi-palignr.asm Tue Sep 10 00:49:36 2013 +0200
+++ b/mpn/x86_64/fastsse/copyi-palignr.asm Tue Sep 10 19:53:54 2013 +0200
@@ -28,15 +28,15 @@
C AMD bull 0.70 0.66 Y
C AMD pile 0.68 0.66 Y
C AMD steam ? ?
-C AMD bobcat 1.97 8.28 1.5/1.5 N
+C AMD bobcat 1.97 8.16 1.5/1.5 N
C AMD jaguar ? ?
C Intel P4 2.26 illop Y/N
C Intel core 0.52 0.64 opt/opt Y
C Intel NHM 0.52 0.71 opt/? Y
-C Intel SBR 0.51 0.57 opt/0.51 Y
+C Intel SBR 0.51 0.54 opt/0.51 Y
C Intel IBR ? ? Y
C Intel HWL 0.51 0.52 0.25/0.25 N
-C Intel atom 1.16 1.66 opt/opt Y
+C Intel atom 1.16 1.61 opt/opt Y
C VIA nano 1.09 1.08 opt/opt Y
C We use only 16-byte operations, except for unaligned top-most and bottom-most
@@ -54,7 +54,7 @@
C There are three instructions for loading an aligned 128-bit quantity. We use
C movaps, since it has the shortest coding.
-define(`movdqa', ``movaps'')
+dnl define(`movdqa', ``movaps'')
ifdef(`COPYI_SSE_THRESHOLD',`',`define(`COPYI_SSE_THRESHOLD', 7)')
@@ -125,52 +125,50 @@
cmp $16, n
jc L(ued0)
- movdqa 120(up), %xmm7
- movdqa 104(up), %xmm6
- movdqa 88(up), %xmm5
- movdqa 72(up), %xmm4
- movdqa 56(up), %xmm3
+ movaps 120(up), %xmm7
+ movaps 104(up), %xmm6
+ movaps 88(up), %xmm5
+ movaps 72(up), %xmm4
+ movaps 56(up), %xmm3
+ movaps 40(up), %xmm2
lea 128(up), up
sub $32, n
jc L(ued1)
ALIGN(16)
-L(utop):
- movdqa -88(up), %xmm2
+L(utop):movaps -104(up), %xmm1
sub $16, n
- movdqa -104(up), %xmm1
+ movaps -120(up), %xmm0
palignr($8, %xmm6, %xmm7)
+ movaps -136(up), %xmm8
movdqa %xmm7, 112(rp)
- movdqa -120(up), %xmm0
palignr($8, %xmm5, %xmm6)
+ movaps 120(up), %xmm7
movdqa %xmm6, 96(rp)
- movdqa -136(up), %xmm8
palignr($8, %xmm4, %xmm5)
+ movaps 104(up), %xmm6
movdqa %xmm5, 80(rp)
- movdqa 120(up), %xmm7
palignr($8, %xmm3, %xmm4)
+ movaps 88(up), %xmm5
movdqa %xmm4, 64(rp)
- movdqa 104(up), %xmm6
palignr($8, %xmm2, %xmm3)
+ movaps 72(up), %xmm4
movdqa %xmm3, 48(rp)
- movdqa 88(up), %xmm5
palignr($8, %xmm1, %xmm2)
+ movaps 56(up), %xmm3
movdqa %xmm2, 32(rp)
- movdqa 72(up), %xmm4
palignr($8, %xmm0, %xmm1)
+ movaps 40(up), %xmm2
movdqa %xmm1, 16(rp)
- movdqa 56(up), %xmm3
palignr($8, %xmm8, %xmm0)
+ lea 128(up), up
movdqa %xmm0, (rp)
- lea 128(up), up
lea 128(rp), rp
jnc L(utop)
-L(ued1):
- movdqa -88(up), %xmm2
- movdqa -104(up), %xmm1
- movdqa -120(up), %xmm0
- movdqa -136(up), %xmm8
+L(ued1):movaps -104(up), %xmm1
+ movaps -120(up), %xmm0
+ movaps -136(up), %xmm8
palignr($8, %xmm6, %xmm7)
movdqa %xmm7, 112(rp)
palignr($8, %xmm5, %xmm6)
@@ -191,11 +189,11 @@
L(ued0):test $8, R8(n)
jz 1f
- movdqa 56(up), %xmm3
- movdqa 40(up), %xmm2
- movdqa 24(up), %xmm1
- movdqa 8(up), %xmm0
- movdqa -8(up), %xmm8
+ movaps 56(up), %xmm3
+ movaps 40(up), %xmm2
+ movaps 24(up), %xmm1
+ movaps 8(up), %xmm0
+ movaps -8(up), %xmm8
palignr($8, %xmm2, %xmm3)
movdqa %xmm3, 48(rp)
palignr($8, %xmm1, %xmm2)
@@ -209,10 +207,10 @@
1: test $4, R8(n)
jz 1f
- movdqa 24(up), %xmm1
- movdqa 8(up), %xmm0
+ movaps 24(up), %xmm1
+ movaps 8(up), %xmm0
palignr($8, %xmm0, %xmm1)
- movdqa -8(up), %xmm3
+ movaps -8(up), %xmm3
movdqa %xmm1, 16(rp)
palignr($8, %xmm3, %xmm0)
lea 32(up), up
More information about the gmp-commit
mailing list