[Gmp-commit] /var/hg/gmp: Complete rewrite.
mercurial at gmplib.org
mercurial at gmplib.org
Tue Aug 6 01:22:02 CEST 2013
details: /var/hg/gmp/rev/a658ab80f7b6
changeset: 15926:a658ab80f7b6
user: Torbjorn Granlund <tege at gmplib.org>
date: Mon Aug 05 13:18:00 2013 +0200
description:
Complete rewrite.
diffstat:
mpn/x86_64/coreisbr/aors_n.asm | 232 +++++++++++++++++++++++-----------------
1 files changed, 131 insertions(+), 101 deletions(-)
diffs (277 lines):
diff -r 3b7f3825b746 -r a658ab80f7b6 mpn/x86_64/coreisbr/aors_n.asm
--- a/mpn/x86_64/coreisbr/aors_n.asm Sun Aug 04 22:16:30 2013 +0200
+++ b/mpn/x86_64/coreisbr/aors_n.asm Mon Aug 05 13:18:00 2013 +0200
@@ -1,7 +1,8 @@
-dnl X86-64 mpn_add_n, mpn_sub_n, optimized for Intel Sandy Bridge.
+dnl AMD64 mpn_add_n, mpn_sub_n optimised for Sandy bridge, Ivy bridge, and
+dnl Haswell.
-dnl Copyright 2003, 2004, 2005, 2007, 2008, 2010, 2011, 2012 Free Software
-dnl Foundation, Inc.
+dnl Copyright 2003, 2004, 2005, 2007, 2008, 2010, 2011, 2012, 2013 Free
+dnl Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
@@ -20,32 +21,46 @@
include(`../config.m4')
+C cycles/limb
+C AMD K8,K9
+C AMD K10
+C AMD bull 1.82 average over 400-600
+C AMD pile 1.83 average over 400-600
+C AMD bobcat
+C AMD jaguar
+C Intel P4
+C Intel core
+C Intel NHM
+C Intel SBR 1.55 fluctuates
+C Intel IBR 1.55 fluctuates
+C Intel HWL 1.33 fluctuates
+C Intel BWL
+C Intel atom
+C VIA nano
-C cycles/limb
-C AMD K8,K9 1.85
-C AMD K10 ?
-C Intel P4 ?
-C Intel core2 5
-C Intel NHM 5.5
-C Intel SBR 1.61
-C Intel atom 3
-C VIA nano 3
+C The loop of this code was manually written. It runs close to optimally on
+C Intel SBR, IBR, and HWL as far as we know, except for the fluctuation problems.
+C It also runs slightly faster on average on AMD bull and pile.
+C
+C No micro-optimisation has been done.
+C
+C N.B.! The loop alignment padding insns are executed. If editing the code,
+C make sure the padding does not become excessive. It is now a 4-byte nop.
-C INPUT PARAMETERS
-define(`rp', `%rdi')
-define(`up', `%rsi')
-define(`vp', `%rdx')
-define(`n', `%rcx')
-define(`cy', `%r8') C (only for mpn_add_nc and mpn_sub_nc)
+define(`rp', `%rdi') C rcx
+define(`up', `%rsi') C rdx
+define(`vp', `%rdx') C r8
+define(`n', `%rcx') C r9
+define(`cy', `%r8') C rsp+40 (mpn_add_nc and mpn_sub_nc)
ifdef(`OPERATION_add_n', `
- define(ADCSBB, adc)
- define(func, mpn_add_n)
- define(func_nc, mpn_add_nc)')
+ define(ADCSBB, adc)
+ define(func, mpn_add_n)
+ define(func_nc, mpn_add_nc)')
ifdef(`OPERATION_sub_n', `
- define(ADCSBB, sbb)
- define(func, mpn_sub_n)
- define(func_nc, mpn_sub_nc)')
+ define(ADCSBB, sbb)
+ define(func, mpn_sub_n)
+ define(func_nc, mpn_sub_nc)')
MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
@@ -54,101 +69,116 @@
ASM_START()
TEXT
- ALIGN(16)
+ ALIGN(32)
PROLOGUE(func)
FUNC_ENTRY(4)
xor %r8, %r8
+
L(ent): mov R32(n), R32(%rax)
shr $2, n
- and $3, R32(%rax)
- jz L(b0)
- cmp $2, R32(%rax)
- jz L(b2)
- jg L(b3)
-L(b1): mov (up), %r10
- test n, n
- jnz L(gt1)
- neg R32(%r8) C set CF from argument
- ADCSBB (vp), %r10
- mov %r10, (rp)
- mov R32(n), R32(%rax) C zero rax
- adc R32(%rax), R32(%rax)
+ test $1, R8(%rax)
+ jnz L(bx1)
+
+L(bx0): test $2, R8(%rax)
+ jnz L(b10)
+
+L(b00): neg %r8
+ mov (up), %r8
+ mov 8(up), %r9
+ ADCSBB (vp), %r8
+ ADCSBB 8(vp), %r9
+ mov 16(up), %r10
+ mov 24(up), %r11
+ lea 32(up), up
+ ADCSBB 16(vp), %r10
+ ADCSBB 24(vp), %r11
+ lea 32(vp), vp
+ lea -16(rp), rp
+ jmp L(lo0)
+
+L(b10): neg %r8
+ mov (up), %r10
+ mov 8(up), %r11
+ ADCSBB 0(vp), %r10
+ ADCSBB 8(vp), %r11
+ jrcxz L(e2)
+ mov 16(up), %r8
+ mov 24(up), %r9
+ lea 16(up), up
+ ADCSBB 16(vp), %r8
+ ADCSBB 24(vp), %r9
+ lea 16(vp), vp
+ lea (rp), rp
+ jmp L(lo2)
+
+L(e2): mov %r10, (rp)
+ mov %r11, 8(rp)
+ setc R8(%rax)
FUNC_EXIT()
ret
-L(gt1): neg R32(%r8)
- ADCSBB (vp), %r10
- mov 8(up), %r11
- lea 16(up), up
- lea -16(vp), vp
- lea -16(rp), rp
- jmp L(m1)
-L(b3): mov (up), %rax
+L(bx1): test $2, R8(%rax)
+ jnz L(b11)
+
+L(b01): neg %r8
+ mov (up), %r11
+ ADCSBB (vp), %r11
+ jrcxz L(e1)
+ mov 8(up), %r8
+ mov 16(up), %r9
+ lea 8(up), up
+ lea -8(rp), rp
+ ADCSBB 8(vp), %r8
+ ADCSBB 16(vp), %r9
+ lea 8(vp), vp
+ jmp L(lo1)
+
+L(e1): mov %r11, (rp)
+ setc R8(%rax)
+ FUNC_EXIT()
+ ret
+
+L(b11): neg %r8
+ mov (up), %r9
+ ADCSBB (vp), %r9
+ mov 8(up), %r10
+ mov 16(up), %r11
+ lea 24(up), up
+ ADCSBB 8(vp), %r10
+ ADCSBB 16(vp), %r11
+ lea 24(vp), vp
+ mov %r9, (rp)
+ lea 8(rp), rp
+ jrcxz L(end)
+
+ ALIGN(32)
+L(top): mov (up), %r8
mov 8(up), %r9
+ ADCSBB (vp), %r8
+ ADCSBB 8(vp), %r9
+L(lo2): mov %r10, (rp)
+L(lo1): mov %r11, 8(rp)
mov 16(up), %r10
- test n, n
- jnz L(gt3)
- neg R32(%r8)
- lea -32(rp), rp
- jmp L(e3)
-L(gt3): neg R32(%r8)
- ADCSBB (vp), %rax
- jmp L(m3)
-
- nop C alignment
- nop C alignment
-L(b0): mov (up), %r11
- neg R32(%r8)
- lea -24(vp), vp
- lea -24(rp), rp
- lea 8(up), up
- jmp L(m0)
-
-L(b2): mov (up), %r9
- mov 8(up), %r10
- lea -8(vp), vp
- test n, n
- jnz L(gt2)
- neg R32(%r8)
- lea -40(rp), rp
- jmp L(e2)
-L(gt2): neg R32(%r8)
- lea -8(up), up
- lea -8(rp), rp
- jmp L(m2)
-
- ALIGN(8)
-L(top): mov %r11, 24(rp)
- ADCSBB (vp), %rax
+ mov 24(up), %r11
+ lea 32(up), up
+ ADCSBB 16(vp), %r10
+ ADCSBB 24(vp), %r11
+ lea 32(vp), vp
+L(lo0): mov %r8, 16(rp)
+L(lo3): mov %r9, 24(rp)
lea 32(rp), rp
-L(m3): mov %rax, (rp)
-L(m2): ADCSBB 8(vp), %r9
- mov 24(up), %r11
- mov %r9, 8(rp)
- ADCSBB 16(vp), %r10
- lea 32(up), up
-L(m1): mov %r10, 16(rp)
-L(m0): ADCSBB 24(vp), %r11
- mov (up), %rax
- mov 8(up), %r9
- lea 32(vp), vp
dec n
- mov 16(up), %r10
jnz L(top)
- mov %r11, 24(rp)
-L(e3): ADCSBB (vp), %rax
- mov %rax, 32(rp)
-L(e2): ADCSBB 8(vp), %r9
- mov %r9, 40(rp)
-L(e1): ADCSBB 16(vp), %r10
- mov %r10, 48(rp)
- mov R32(n), R32(%rax) C zero rax
- adc R32(%rax), R32(%rax)
+L(end): mov R32(n), R32(%rax) C zero rax
+ mov %r10, (rp)
+ mov %r11, 8(rp)
+ setc R8(%rax)
FUNC_EXIT()
ret
EPILOGUE()
+ ALIGN(16)
PROLOGUE(func_nc)
FUNC_ENTRY(4)
IFDOS(` mov 56(%rsp), %r8 ')
More information about the gmp-commit
mailing list