[Gmp-commit] /var/hg/gmp: 10 new changesets
mercurial at gmplib.org
mercurial at gmplib.org
Sun Sep 15 23:39:11 CEST 2013
details: /var/hg/gmp/rev/08f8e88f4ae1
changeset: 15986:08f8e88f4ae1
user: Torbjorn Granlund <tege at gmplib.org>
date: Sun Sep 15 17:20:43 2013 +0200
description:
Rewrite for a slight speed-up for small and large operands.
details: /var/hg/gmp/rev/065727b4471e
changeset: 15987:065727b4471e
user: Torbjorn Granlund <tege at gmplib.org>
date: Sun Sep 15 23:09:28 2013 +0200
description:
Replace mul_1 code.
details: /var/hg/gmp/rev/1186865d021c
changeset: 15988:1186865d021c
user: Torbjorn Granlund <tege at gmplib.org>
date: Sun Sep 15 23:11:24 2013 +0200
description:
Complement c/l table.
details: /var/hg/gmp/rev/9ab0a10854a4
changeset: 15989:9ab0a10854a4
user: Torbjorn Granlund <tege at gmplib.org>
date: Sun Sep 15 23:12:08 2013 +0200
description:
Complement c/l table.
details: /var/hg/gmp/rev/a1c7092df8ac
changeset: 15990:a1c7092df8ac
user: Torbjorn Granlund <tege at gmplib.org>
date: Sun Sep 15 23:12:31 2013 +0200
description:
Correct c/l table.
details: /var/hg/gmp/rev/6713d83a375e
changeset: 15991:6713d83a375e
user: Torbjorn Granlund <tege at gmplib.org>
date: Sun Sep 15 23:12:58 2013 +0200
description:
Use R8 for bit testing.
details: /var/hg/gmp/rev/115a99d93773
changeset: 15992:115a99d93773
user: Torbjorn Granlund <tege at gmplib.org>
date: Sun Sep 15 23:15:18 2013 +0200
description:
Edit NEWS items
details: /var/hg/gmp/rev/8617df92ca04
changeset: 15993:8617df92ca04
user: Torbjorn Granlund <tege at gmplib.org>
date: Sun Sep 15 23:15:57 2013 +0200
description:
Modernise list of CPUs with asm support.
details: /var/hg/gmp/rev/92ed543aaeed
changeset: 15994:92ed543aaeed
user: Torbjorn Granlund <tege at gmplib.org>
date: Sun Sep 15 23:16:51 2013 +0200
description:
Fix a comment.
details: /var/hg/gmp/rev/c86c76910610
changeset: 15995:c86c76910610
user: Torbjorn Granlund <tege at gmplib.org>
date: Sun Sep 15 23:39:03 2013 +0200
description:
ChangeLog
diffstat:
ChangeLog | 10 +
NEWS | 12 +-
doc/gmp.texi | 32 +---
gmp-impl.h | 3 +-
mpn/x86_64/coreihwl/mul_basecase.asm | 256 +++++++++++++++++-----------------
mpn/x86_64/coreisbr/aorsmul_1.asm | 214 +++++++++++++++-------------
mpn/x86_64/divrem_2.asm | 14 +-
mpn/x86_64/fastsse/copyi-palignr.asm | 2 +-
mpn/x86_64/sqr_diag_addlsh1.asm | 4 +-
mpn/x86_64/tabselect.asm | 4 +-
10 files changed, 288 insertions(+), 263 deletions(-)
diffs (truncated from 821 to 300 lines):
diff -r 436888a19cec -r c86c76910610 ChangeLog
--- a/ChangeLog Fri Sep 13 22:06:55 2013 +0200
+++ b/ChangeLog Sun Sep 15 23:39:03 2013 +0200
@@ -1,5 +1,15 @@
+2013-09-15 Torbjorn Granlund <tege at gmplib.org>
+
+ * mpn/x86_64/tabselect.asm: Use R8 for bit testing.
+
+ * mpn/x86_64/coreihwl/mul_basecase.asm: Replace mul_1 code.
+
+ * mpn/x86_64/coreisbr/aorsmul_1.asm: Rewrite.
+
2013-09-12 Torbjorn Granlund <tege at gmplib.org>
+ * mpn/ia64/gcd_1.asm: Use dep for combining table base and low bits.
+
* mpn/x86_64/fastsse/com-palignr.asm: Implement temp fix to properly
handle overlap.
diff -r 436888a19cec -r c86c76910610 NEWS
--- a/NEWS Fri Sep 13 22:06:55 2013 +0200
+++ b/NEWS Sun Sep 15 23:39:03 2013 +0200
@@ -16,7 +16,11 @@
* Major speedup for ARM, in particular ARM Cortex-A15, thanks to improved
assembly.
- * Major speedup for SPARC T4/T5 and speedup also for T3.
+ * Major speedup for SPARC T4/T5 and speedup also for T3, thanks to much new
+ assembly.
+
+ * Speedup for Intel Sandy Bridge, Ivy Bridge, Haswell, thanks to rewritten
+ and vastly expanded assembly support.
FEATURES
* Support for new Intel and AMD CPUs.
@@ -26,9 +30,9 @@
* New functions mpn_cnd_add_n and mpn_cnd_sub_n. Side-channel silent
conditional addition and subtraction.
- * Better support for applications which use the mpz_t type, but
- nevertheless need to call some of the lower-level mpn functions.
- See the documentation for mpz_limbs_read and related functions.
+ * Better support for applications which use the mpz_t type, but nevertheless
+ need to call some of the lower-level mpn functions. See the documentation
+ for mpz_limbs_read and related functions.
MISC
* None.
diff -r 436888a19cec -r c86c76910610 doc/gmp.texi
--- a/doc/gmp.texi Fri Sep 13 22:06:55 2013 +0200
+++ b/doc/gmp.texi Sun Sep 15 23:39:03 2013 +0200
@@ -473,29 +473,17 @@
There is assembly code for these CPUs:
@cindex CPU types
-ARM,
+ARM Cortex-A9, Cortex-A15, and generic ARM,
DEC Alpha 21064, 21164, and 21264,
-AMD 29000,
-AMD K6, K6-2, Athlon, and Athlon64,
-Hitachi SuperH and SH-2,
-HPPA 1.0, 1.1 and 2.0,
-Intel Pentium, Pentium Pro/II/III, Pentium 4, generic x86,
-Intel IA-64, i960,
-Motorola MC68000, MC68020, MC88100, and MC88110,
-Motorola/IBM PowerPC 32 and 64,
-National NS32000,
-IBM POWER,
-MIPS R3000, R4000,
-SPARCv7, SuperSPARC, generic SPARCv8, UltraSPARC,
-DEC VAX,
-and
-Zilog Z8000.
-Some optimizations also for
-Cray vector systems,
-Clipper,
-IBM ROMP (RT),
-and
-Pyramid AP/XP.
+AMD K8 and K10 (sold under many brands, e.g. Athlon64, Phenom, Opteron),
+Bulldozer, and Bobcat,
+Intel Pentium, Pentium Pro/II/III, Pentium 4, Core2, Nehalem, Sandy Bridge, Haswell, generic x86,
+Intel IA-64,
+Motorola/IBM PowerPC 32 and 64 such as POWER970, POWER5, POWER6, and POWER7,
+MIPS 32-bit and 64-bit,
+SPARC 32-bit and 64-bit with special support for all UltraSPARC models.
+There is also assembly code for many obsolete CPUs.
+
@cindex Home page
@cindex Web page
diff -r 436888a19cec -r c86c76910610 gmp-impl.h
--- a/gmp-impl.h Fri Sep 13 22:06:55 2013 +0200
+++ b/gmp-impl.h Sun Sep 15 23:39:03 2013 +0200
@@ -4587,7 +4587,8 @@
#if WANT_FAT_BINARY && (HAVE_HOST_CPU_FAMILY_x86 || HAVE_HOST_CPU_FAMILY_x86_64)
/* NOTE: The function pointers in this struct are also in CPUVEC_FUNCS_LIST
- in mpn/x86/x86-defs.m4. Be sure to update that when changing here. */
+ in mpn/x86/x86-defs.m4 and mpn/x86_64/x86_64-defs.m4. Be sure to update
+ those when changing here. */
struct cpuvec_t {
DECL_add_n ((*add_n));
DECL_addlsh1_n ((*addlsh1_n));
diff -r 436888a19cec -r c86c76910610 mpn/x86_64/coreihwl/mul_basecase.asm
--- a/mpn/x86_64/coreihwl/mul_basecase.asm Fri Sep 13 22:06:55 2013 +0200
+++ b/mpn/x86_64/coreihwl/mul_basecase.asm Sun Sep 15 23:39:03 2013 +0200
@@ -23,38 +23,30 @@
include(`../config.m4')
C cycles/limb mul_1 mul_2 mul_3 addmul_2
-C AMD K8,K9 ? n/a - n/a
-C AMD K10 ? n/a - n/a
-C AMD bull ? n/a - n/a
-C AMD pile ? n/a - n/a
-C AMD steam ? ? ? ?
-C AMD bobcat ? n/a - n/a
-C AMD jaguar ? ? ? ?
-C Intel P4 ? n/a - n/a
-C Intel core ? n/a - n/a
-C Intel NHM ? n/a - n/a
-C Intel SBR ? n/a - n/a
-C Intel IBR ? n/a - n/a
-C Intel HWL 2.45 1.86 - 2.15
-C Intel BWL ? ? ? ?
-C Intel atom ? n/a - n/a
-C VIA nano ? n/a - n/a
+C AMD K8,K9 n/a n/a - n/a
+C AMD K10 n/a n/a - n/a
+C AMD bull n/a n/a - n/a
+C AMD pile n/a n/a - n/a
+C AMD steam ? ? - ?
+C AMD bobcat n/a n/a - n/a
+C AMD jaguar ? ? - ?
+C Intel P4 n/a n/a - n/a
+C Intel core n/a n/a - n/a
+C Intel NHM n/a n/a - n/a
+C Intel SBR n/a n/a - n/a
+C Intel IBR n/a n/a - n/a
+C Intel HWL 1.77 1.86 - 2.15
+C Intel BWL ? ? - ?
+C Intel atom n/a n/a - n/a
+C VIA nano n/a n/a - n/a
C The inner loops of this code are the result of running a code generation and
-C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+C optimisation tool suite written by David Harvey and Torbjörn Granlund.
C TODO
-C * Merge Haswell-specific mul_1, then, if new code does not use indexing,
-C clean up pointer updates. Current Haswell mul_1.asm uses an unfortunate
-C number of regs, thus awkward to use here.
C * Adjoin a mul_3.
C * Further micro-optimise.
-C When playing with pointers, set this to $2 to fall back to conservative
-C indexing in wind-down code.
-define(`I',`$1')
-
-
define(`rp', `%rdi')
define(`up', `%rsi')
define(`un_param',`%rdx')
@@ -81,108 +73,115 @@
IFDOS(` mov 56(%rsp), %r8d ')
push %rbx
push %rbp
+ push %r12
+ push %r13
+ push %r14
mov un_param, un C free up rdx
neg un
- mov (up), %rax C shared for mul_1 and mul_2
- lea (up,un_param,8), up C point at operand end
- lea (rp,un_param,8), rp C point at rp[un-1]
-
- mov (vp), v0 C shared for mul_1 and mul_2
- mul v0 C shared for mul_1 and mul_2
+ mov un_param, n C FIXME: share
+ sar $2, n C FIXME: share
test $1, R8(vn)
jz L(do_mul_2)
+define(`w4', `%r9')
+define(`w5', `%r14')
+
+ mov (vp), %rdx
+
L(do_mul_1):
test $1, R8(un)
jnz L(m1x1)
-L(m1x0):mov %rax, w0 C un = 2, 4, 6, 8, ...
- mov %rdx, w1
- mov 8(up,un,8), %rax
- test $2, R8(un)
+L(m1x0):test $2, R8(un)
jnz L(m110)
-L(m100):lea 2(un), n C un = 4, 8, 12, ...
+L(m100):
+ mulx( (up), w5, w2)
+ mulx( 8,(up), w1, w3)
+ lea -24(rp), rp
jmp L(m1l0)
-L(m110):lea (un), n C un = 2, 6, 10, ...
+L(m110):
+ mulx( (up), w3, w4)
+ mulx( 8,(up), w1, w5)
+ lea -8(rp), rp
+ test n, n
+ jz L(cj2)
+ mulx( 16,(up), w0, w2)
+ lea 16(up), up
jmp L(m1l2)
-L(m1x1):mov %rax, w1 C un = 1, 3, 5, 7, ...
- mov %rdx, w0
- test $2, R8(un)
+L(m1x1):test $2, R8(un)
jz L(m111)
-L(m101):lea 3(un), n C un = 1, 5, 9, ...
+L(m101):
+ mulx( (up), w4, w5)
+ lea -16(rp), rp
test n, n
- js L(m1l1)
- mov %rax, -8(rp)
- mov %rdx, (rp)
- pop %rbp
- pop %rbx
- FUNC_EXIT()
- ret
+ jz L(cj1)
+ mulx( 8,(up), w0, w2)
+ lea 8(up), up
+ jmp L(m1l1)
-L(m111):lea 1(un), n C un = 3, 7, 11, ...
- mov 8(up,un,8), %rax
+L(m111):
+ mulx( (up), w2, w3)
+ mulx( 8,(up), w0, w4)
+ mulx( 16,(up), w1, w5)
+ lea 24(up), up
+ test n, n
+ jnz L(gt3)
+ add w0, w3
+ jmp L(cj3)
+L(gt3): add w0, w3
jmp L(m1l3)
- ALIGN(16) C FIXME?
-L(m1tp):mov %rdx, w0
- add %rax, w1
-L(m1l1):mov -16(up,n,8), %rax
- adc $0, w0
- mul v0
- add %rax, w0
- mov w1, -24(rp,n,8)
- mov -8(up,n,8), %rax
- mov %rdx, w1
- adc $0, w1
-L(m1l0):mul v0
- mov w0, -16(rp,n,8)
- add %rax, w1
- mov %rdx, w0
- mov (up,n,8), %rax
- adc $0, w0
-L(m1l3):mul v0
- mov w1, -8(rp,n,8)
- mov %rdx, w1
- add %rax, w0
- mov 8(up,n,8), %rax
- adc $0, w1
-L(m1l2):mul v0
- mov w0, (rp,n,8)
- add $4, n
- jnc L(m1tp)
+ ALIGN(32)
+L(m1tp):lea 32(rp), rp
+L(m1l3):mov w2, (rp)
+ mulx( (up), w0, w2)
+L(m1l2):mov w3, 8(rp)
+ adc w1, w4
+L(m1l1):adc w0, w5
+ mov w4, 16(rp)
+ mulx( 8,(up), w1, w3)
+L(m1l0):mov w5, 24(rp)
+ mulx( 16,(up), w0, w4)
+ adc w1, w2
+ mulx( 24,(up), w1, w5)
+ adc w0, w3
+ lea 32(up), up
+ dec n
+ jnz L(m1tp)
More information about the gmp-commit
mailing list