[Gmp-commit] /var/hg/gmp: 2 new changesets
mercurial at gmplib.org
Tue Sep 10 01:03:48 CEST 2013
details: /var/hg/gmp/rev/9b6be23315d5
changeset: 15978:9b6be23315d5
user: Torbjorn Granlund <tege at gmplib.org>
date: Tue Sep 10 00:48:45 2013 +0200
description:
Refresh cycle table.
details: /var/hg/gmp/rev/35f194fcbf1d
changeset: 15979:35f194fcbf1d
user: Torbjorn Granlund <tege at gmplib.org>
date: Tue Sep 10 00:49:36 2013 +0200
description:
Rewrite rp != up (mod 16) code to make it handle any allowed overlap.
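For context: mpn_copyi copies limbs from the low end upward, and the usual rule for an incrementing copy is that overlap is only allowed when rp <= up; the FIXME removed in the diff below notes that the previous SSE path only coped with overlaps that were either nearly complete or well separated. A minimal C sketch of the reference semantics the rewritten loop has to preserve (copyi_ref is a hypothetical name, not GMP's implementation; it assumes <gmp.h> for the mpn types):

/* Reading each limb before anything at or above it is overwritten makes
   a low-to-high copy correct both for disjoint operands and for the
   allowed overlap rp <= up.  */
#include <gmp.h>

static void
copyi_ref (mp_ptr rp, mp_srcptr up, mp_size_t n)
{
  mp_size_t i;
  for (i = 0; i < n; i++)
    rp[i] = up[i];
}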
diffstat:
mpn/x86_64/fastsse/copyd-palignr.asm | 15 ++-
mpn/x86_64/fastsse/copyi-palignr.asm | 122 ++++++++++++++++++++--------------
2 files changed, 83 insertions(+), 54 deletions(-)
diffs (197 lines):
diff -r d0d816cb0b8e -r 35f194fcbf1d mpn/x86_64/fastsse/copyd-palignr.asm
--- a/mpn/x86_64/fastsse/copyd-palignr.asm Mon Sep 09 01:16:54 2013 +0200
+++ b/mpn/x86_64/fastsse/copyd-palignr.asm Tue Sep 10 00:49:36 2013 +0200
@@ -25,14 +25,19 @@
C aligned unaligned best seen for cpu?
C AMD K8,K9 2.0 illop 1.0/1.0 N
C AMD K10 0.85 illop Y/N
-C AMD bd1 1.39 1.40 Y
-C AMD bobcat 1.97 8.35 1.5/1.5 N
+C AMD bull 0.70 0.70 Y
+C AMD pile 0.68 0.68 Y
+C AMD steam ? ?
+C AMD bobcat 1.97 8.24 1.5/1.5 N
+C AMD jaguar ? ?
C Intel P4 2.26 illop Y/N
-C Intel core2 0.52 0.68-0.80 opt/0.68 Y
+C Intel core 0.52 0.68-0.80 opt/0.64 Y
C Intel NHM 0.52 0.64 opt/opt Y
-C Intel SBR 0.51 0.54 opt/0.51 Y
+C Intel SBR 0.51 0.51 opt/0.51 Y
+C Intel IBR ? ? Y
+C Intel HWL 0.51 0.51 0.25/0.25 N
C Intel atom 1.16 1.66 opt/opt Y
-C VIA nano 1.09 1.07 opt/opt Y
+C VIA nano 1.08 1.06 opt/opt Y
C We use only 16-byte operations, except for unaligned top-most and bottom-most
C limbs. We use the SSSE3 palignr instruction when rp - up = 8 (mod 16).
diff -r d0d816cb0b8e -r 35f194fcbf1d mpn/x86_64/fastsse/copyi-palignr.asm
--- a/mpn/x86_64/fastsse/copyi-palignr.asm Mon Sep 09 01:16:54 2013 +0200
+++ b/mpn/x86_64/fastsse/copyi-palignr.asm Tue Sep 10 00:49:36 2013 +0200
@@ -1,8 +1,8 @@
dnl AMD64 mpn_copyi optimised for CPUs with fast SSE copying and SSSE3.
-dnl Copyright 2012 Free Software Foundation, Inc.
+dnl Copyright 2012, 2013 Free Software Foundation, Inc.
-dnl Contributed to the GNU project by Torbjorn Granlund.
+dnl Contributed to the GNU project by Torbjörn Granlund.
dnl This file is part of the GNU MP Library.
@@ -25,14 +25,19 @@
C aligned unaligned best seen for cpu?
C AMD K8,K9 2.0 illop 1.0/1.0 N
C AMD K10 0.85 illop Y/N
-C AMD bd1 1.39 ? 1.45 Y/N
-C AMD bobcat 1.97 ? 8.17 1.5/1.5 N
+C AMD bull 0.70 0.66 Y
+C AMD pile 0.68 0.66 Y
+C AMD steam ? ?
+C AMD bobcat 1.97 8.28 1.5/1.5 N
+C AMD jaguar ? ?
C Intel P4 2.26 illop Y/N
-C Intel core2 0.52 0.82 opt/0.74 Y
-C Intel NHM 0.52 0.65 opt/opt Y
-C Intel SBR 0.51 0.55 opt/0.51 Y
-C Intel atom 1.16 1.70 opt/opt Y
-C VIA nano 1.09 1.10 opt/opt Y
+C Intel core 0.52 0.64 opt/opt Y
+C Intel NHM 0.52 0.71 opt/? Y
+C Intel SBR 0.51 0.57 opt/0.51 Y
+C Intel IBR ? ? Y
+C Intel HWL 0.51 0.52 0.25/0.25 N
+C Intel atom 1.16 1.66 opt/opt Y
+C VIA nano 1.09 1.08 opt/opt Y
C We use only 16-byte operations, except for unaligned top-most and bottom-most
C limbs. We use the SSSE3 palignr instruction when rp - up = 8 (mod 16). That
@@ -117,68 +122,87 @@
L(uent):
C Code handling up - rp = 8 (mod 16)
-C FIXME: The code below only handles overlap if it is close to complete, or
-C quite separate: up-rp < 5 or up-up > 15 limbs
- lea -40(up), %rax C 40 = 5 * GMP_LIMB_BYTES
- sub rp, %rax
- cmp $80, %rax C 80 = (15-5) * GMP_LIMB_BYTES
- jbe L(bc) C deflect to plain loop
+ cmp $16, n
+ jc L(ued0)
- sub $16, n
- jc L(uend)
-
- movdqa 120(up), %xmm3
-
- sub $16, n
- jmp L(um)
+ movdqa 120(up), %xmm7
+ movdqa 104(up), %xmm6
+ movdqa 88(up), %xmm5
+ movdqa 72(up), %xmm4
+ movdqa 56(up), %xmm3
+ lea 128(up), up
+ sub $32, n
+ jc L(ued1)
ALIGN(16)
-L(utop):movdqa 120(up), %xmm3
- movdqa %xmm0, -128(rp)
+L(utop):
+ movdqa -88(up), %xmm2
sub $16, n
-L(um): movdqa 104(up), %xmm2
+ movdqa -104(up), %xmm1
+ palignr($8, %xmm6, %xmm7)
+ movdqa %xmm7, 112(rp)
+ movdqa -120(up), %xmm0
+ palignr($8, %xmm5, %xmm6)
+ movdqa %xmm6, 96(rp)
+ movdqa -136(up), %xmm8
+ palignr($8, %xmm4, %xmm5)
+ movdqa %xmm5, 80(rp)
+ movdqa 120(up), %xmm7
+ palignr($8, %xmm3, %xmm4)
+ movdqa %xmm4, 64(rp)
+ movdqa 104(up), %xmm6
palignr($8, %xmm2, %xmm3)
- movdqa 88(up), %xmm1
- movdqa %xmm3, 112(rp)
+ movdqa %xmm3, 48(rp)
+ movdqa 88(up), %xmm5
palignr($8, %xmm1, %xmm2)
- movdqa 72(up), %xmm0
- movdqa %xmm2, 96(rp)
+ movdqa %xmm2, 32(rp)
+ movdqa 72(up), %xmm4
palignr($8, %xmm0, %xmm1)
+ movdqa %xmm1, 16(rp)
movdqa 56(up), %xmm3
- movdqa %xmm1, 80(rp)
- palignr($8, %xmm3, %xmm0)
- movdqa 40(up), %xmm2
- movdqa %xmm0, 64(rp)
- palignr($8, %xmm2, %xmm3)
- movdqa 24(up), %xmm1
- movdqa %xmm3, 48(rp)
- palignr($8, %xmm1, %xmm2)
- movdqa 8(up), %xmm0
- movdqa %xmm2, 32(rp)
- palignr($8, %xmm0, %xmm1)
- movdqa -8(up), %xmm3
- movdqa %xmm1, 16(rp)
- palignr($8, %xmm3, %xmm0)
+ palignr($8, %xmm8, %xmm0)
+ movdqa %xmm0, (rp)
lea 128(up), up
lea 128(rp), rp
jnc L(utop)
- movdqa %xmm0, -128(rp)
+L(ued1):
+ movdqa -88(up), %xmm2
+ movdqa -104(up), %xmm1
+ movdqa -120(up), %xmm0
+ movdqa -136(up), %xmm8
+ palignr($8, %xmm6, %xmm7)
+ movdqa %xmm7, 112(rp)
+ palignr($8, %xmm5, %xmm6)
+ movdqa %xmm6, 96(rp)
+ palignr($8, %xmm4, %xmm5)
+ movdqa %xmm5, 80(rp)
+ palignr($8, %xmm3, %xmm4)
+ movdqa %xmm4, 64(rp)
+ palignr($8, %xmm2, %xmm3)
+ movdqa %xmm3, 48(rp)
+ palignr($8, %xmm1, %xmm2)
+ movdqa %xmm2, 32(rp)
+ palignr($8, %xmm0, %xmm1)
+ movdqa %xmm1, 16(rp)
+ palignr($8, %xmm8, %xmm0)
+ movdqa %xmm0, (rp)
+ lea 128(rp), rp
-L(uend):test $8, R8(n)
+L(ued0):test $8, R8(n)
jz 1f
movdqa 56(up), %xmm3
movdqa 40(up), %xmm2
+ movdqa 24(up), %xmm1
+ movdqa 8(up), %xmm0
+ movdqa -8(up), %xmm8
palignr($8, %xmm2, %xmm3)
- movdqa 24(up), %xmm1
movdqa %xmm3, 48(rp)
palignr($8, %xmm1, %xmm2)
- movdqa 8(up), %xmm0
movdqa %xmm2, 32(rp)
palignr($8, %xmm0, %xmm1)
- movdqa -8(up), %xmm3
movdqa %xmm1, 16(rp)
- palignr($8, %xmm3, %xmm0)
+ palignr($8, %xmm8, %xmm0)
lea 64(up), up
movdqa %xmm0, (rp)
lea 64(rp), rp
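Both files' header comments state that the SSSE3 palignr instruction is used when rp - up = 8 (mod 16): every bulk load and store stays 16-byte aligned, and each misaligned 16-byte source chunk is rebuilt by merging two aligned loads. A rough C intrinsics sketch of that merge, outside GMP and with a hypothetical helper name; it assumes rp is 16-byte aligned, up sits one limb past a 16-byte boundary, and 2*nblocks + 1 limbs are copied:

/* Sketch only: the single bottom and top limbs are folded in crudely;
   the real code also handles short counts, rp alignment and the other
   residue cases that are omitted here.  */
#include <stdint.h>
#include <tmmintrin.h>                  /* SSSE3: _mm_alignr_epi8 */

static void
copyi_palignr_demo (uint64_t *rp, const uint64_t *up, long nblocks)
{
  /* Place up[0] in the high half so the first merge picks it up.  */
  __m128i prev = _mm_set_epi64x ((long long) up[0], 0);
  long i;
  for (i = 0; i < nblocks; i++)
    {
      /* Aligned 16-byte load of { up[2i+1], up[2i+2] }.  */
      __m128i next = _mm_load_si128 ((const __m128i *) (up + 1 + 2 * i));
      /* Byte-wise right shift of next:prev by 8 yields the high limb of
         prev and the low limb of next, i.e. { up[2i], up[2i+1] }.  */
      __m128i merged = _mm_alignr_epi8 (next, prev, 8);
      _mm_store_si128 ((__m128i *) (rp + 2 * i), merged);  /* aligned */
      prev = next;
    }
  rp[2 * nblocks] = up[2 * nblocks];    /* top-most limb, scalar */
}

The rewritten loop in copyi-palignr.asm performs the same merge on eight 16-byte blocks (128 bytes) per iteration and handles the leading and trailing limbs and short operands separately.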