[Gmp-commit] /var/hg/gmp: Complete rewrite.

mercurial at gmplib.org mercurial at gmplib.org
Tue Aug 6 01:22:02 CEST 2013


details:   /var/hg/gmp/rev/a658ab80f7b6
changeset: 15926:a658ab80f7b6
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Mon Aug 05 13:18:00 2013 +0200
description:
Complete rewrite.

diffstat:

 mpn/x86_64/coreisbr/aors_n.asm |  232 +++++++++++++++++++++++-----------------
 1 files changed, 131 insertions(+), 101 deletions(-)

diffs (277 lines):

diff -r 3b7f3825b746 -r a658ab80f7b6 mpn/x86_64/coreisbr/aors_n.asm
--- a/mpn/x86_64/coreisbr/aors_n.asm	Sun Aug 04 22:16:30 2013 +0200
+++ b/mpn/x86_64/coreisbr/aors_n.asm	Mon Aug 05 13:18:00 2013 +0200
@@ -1,7 +1,8 @@
-dnl  X86-64 mpn_add_n, mpn_sub_n, optimized for Intel Sandy Bridge.
+dnl  AMD64 mpn_add_n, mpn_sub_n optimised for Sandy Bridge, Ivy Bridge, and
+dnl  Haswell.
 
-dnl  Copyright 2003, 2004, 2005, 2007, 2008, 2010, 2011, 2012 Free Software
-dnl  Foundation, Inc.
+dnl  Copyright 2003, 2004, 2005, 2007, 2008, 2010, 2011, 2012, 2013 Free
+dnl  Software Foundation, Inc.
 
 dnl  This file is part of the GNU MP Library.
 
@@ -20,32 +21,46 @@
 
 include(`../config.m4')
 
+C	     cycles/limb
+C AMD K8,K9
+C AMD K10
+C AMD bull	 1.82		average over 400-600
+C AMD pile	 1.83		average over 400-600
+C AMD bobcat
+C AMD jaguar
+C Intel P4
+C Intel core
+C Intel NHM
+C Intel SBR	 1.55		fluctuates
+C Intel IBR	 1.55		fluctuates
+C Intel HWL	 1.33		fluctuates
+C Intel BWL
+C Intel atom
+C VIA nano
 
-C	     cycles/limb
-C AMD K8,K9	 1.85
-C AMD K10	 ?
-C Intel P4	 ?
-C Intel core2	 5
-C Intel NHM	 5.5
-C Intel SBR	 1.61
-C Intel atom	 3
-C VIA nano	 3
+C The loop of this code was manually written.  It runs close to optimally on
+C Intel SBR, IBR, and HWL as far as we know, except for the fluctuation problems.
+C It also runs slightly faster on average on AMD bull and pile.
+C
+C No micro-optimisation has been done.
+C
+C N.B.!  The loop alignment padding insns are executed.  If editing the code,
+C make sure the padding does not become excessive.  It is now a 4-byte nop.
 
-C INPUT PARAMETERS
-define(`rp',	`%rdi')
-define(`up',	`%rsi')
-define(`vp',	`%rdx')
-define(`n',	`%rcx')
-define(`cy',	`%r8')		C (only for mpn_add_nc and mpn_sub_nc)
+define(`rp',	`%rdi')	C rcx
+define(`up',	`%rsi')	C rdx
+define(`vp',	`%rdx')	C r8
+define(`n',	`%rcx')	C r9
+define(`cy',	`%r8')	C rsp+40    (mpn_add_nc and mpn_sub_nc)
 
 ifdef(`OPERATION_add_n', `
-	define(ADCSBB,	      adc)
-	define(func,	      mpn_add_n)
-	define(func_nc,	      mpn_add_nc)')
+  define(ADCSBB,    adc)
+  define(func,      mpn_add_n)
+  define(func_nc,   mpn_add_nc)')
 ifdef(`OPERATION_sub_n', `
-	define(ADCSBB,	      sbb)
-	define(func,	      mpn_sub_n)
-	define(func_nc,	      mpn_sub_nc)')
+  define(ADCSBB,    sbb)
+  define(func,      mpn_sub_n)
+  define(func_nc,   mpn_sub_nc)')
 
 MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
 
@@ -54,101 +69,116 @@
 
 ASM_START()
 	TEXT
-	ALIGN(16)
+	ALIGN(32)
 PROLOGUE(func)
 	FUNC_ENTRY(4)
 	xor	%r8, %r8
+
 L(ent):	mov	R32(n), R32(%rax)
 	shr	$2, n
-	and	$3, R32(%rax)
-	jz	L(b0)
-	cmp	$2, R32(%rax)
-	jz	L(b2)
-	jg	L(b3)
 
-L(b1):	mov	(up), %r10
-	test	n, n
-	jnz	L(gt1)
-	neg	R32(%r8)		C set CF from argument
-	ADCSBB	(vp), %r10
-	mov	%r10, (rp)
-	mov	R32(n), R32(%rax)	C zero rax
-	adc	R32(%rax), R32(%rax)
+	test	$1, R8(%rax)
+	jnz	L(bx1)
+
+L(bx0):	test	$2, R8(%rax)
+	jnz	L(b10)
+
+L(b00):	neg	%r8
+	mov	(up), %r8
+	mov	8(up), %r9
+	ADCSBB	(vp), %r8
+	ADCSBB	8(vp), %r9
+	mov	16(up), %r10
+	mov	24(up), %r11
+	lea	32(up), up
+	ADCSBB	16(vp), %r10
+	ADCSBB	24(vp), %r11
+	lea	32(vp), vp
+	lea	-16(rp), rp
+	jmp	L(lo0)
+
+L(b10):	neg	%r8
+	mov	(up), %r10
+	mov	8(up), %r11
+	ADCSBB	0(vp), %r10
+	ADCSBB	8(vp), %r11
+	jrcxz	L(e2)
+	mov	16(up), %r8
+	mov	24(up), %r9
+	lea	16(up), up
+	ADCSBB	16(vp), %r8
+	ADCSBB	24(vp), %r9
+	lea	16(vp), vp
+	lea	(rp), rp
+	jmp	L(lo2)
+
+L(e2):	mov	%r10, (rp)
+	mov	%r11, 8(rp)
+	setc	R8(%rax)
 	FUNC_EXIT()
 	ret
-L(gt1):	neg	R32(%r8)
-	ADCSBB	(vp), %r10
-	mov	8(up), %r11
-	lea	16(up), up
-	lea	-16(vp), vp
-	lea	-16(rp), rp
-	jmp	L(m1)
 
-L(b3):	mov	(up), %rax
+L(bx1):	test	$2, R8(%rax)
+	jnz	L(b11)
+
+L(b01):	neg	%r8
+	mov	(up), %r11
+	ADCSBB	(vp), %r11
+	jrcxz	L(e1)
+	mov	8(up), %r8
+	mov	16(up), %r9
+	lea	8(up), up
+	lea	-8(rp), rp
+	ADCSBB	8(vp), %r8
+	ADCSBB	16(vp), %r9
+	lea	8(vp), vp
+	jmp	L(lo1)
+
+L(e1):	mov	%r11, (rp)
+	setc	R8(%rax)
+	FUNC_EXIT()
+	ret
+
+L(b11):	neg	%r8
+	mov	(up), %r9
+	ADCSBB	(vp), %r9
+	mov	8(up), %r10
+	mov	16(up), %r11
+	lea	24(up), up
+	ADCSBB	8(vp), %r10
+	ADCSBB	16(vp), %r11
+	lea	24(vp), vp
+	mov	%r9, (rp)
+	lea	8(rp), rp
+	jrcxz	L(end)
+
+	ALIGN(32)
+L(top):	mov	(up), %r8
 	mov	8(up), %r9
+	ADCSBB	(vp), %r8
+	ADCSBB	8(vp), %r9
+L(lo2):	mov	%r10, (rp)
+L(lo1):	mov	%r11, 8(rp)
 	mov	16(up), %r10
-	test	n, n
-	jnz	L(gt3)
-	neg	R32(%r8)
-	lea	-32(rp), rp
-	jmp	L(e3)
-L(gt3):	neg	R32(%r8)
-	ADCSBB	(vp), %rax
-	jmp	L(m3)
-
-	nop				C alignment
-	nop				C alignment
-L(b0):	mov	(up), %r11
-	neg	R32(%r8)
-	lea	-24(vp), vp
-	lea	-24(rp), rp
-	lea	8(up), up
-	jmp	L(m0)
-
-L(b2):	mov	(up), %r9
-	mov	8(up), %r10
-	lea	-8(vp), vp
-	test	n, n
-	jnz	L(gt2)
-	neg	R32(%r8)
-	lea	-40(rp), rp
-	jmp	L(e2)
-L(gt2):	neg	R32(%r8)
-	lea	-8(up), up
-	lea	-8(rp), rp
-	jmp	L(m2)
-
-	ALIGN(8)
-L(top):	mov	%r11, 24(rp)
-	ADCSBB	(vp), %rax
+	mov	24(up), %r11
+	lea	32(up), up
+	ADCSBB	16(vp), %r10
+	ADCSBB	24(vp), %r11
+	lea	32(vp), vp
+L(lo0):	mov	%r8, 16(rp)
+L(lo3):	mov	%r9, 24(rp)
 	lea	32(rp), rp
-L(m3):	mov	%rax, (rp)
-L(m2):	ADCSBB	8(vp), %r9
-	mov	24(up), %r11
-	mov	%r9, 8(rp)
-	ADCSBB	16(vp), %r10
-	lea	32(up), up
-L(m1):	mov	%r10, 16(rp)
-L(m0):	ADCSBB	24(vp), %r11
-	mov	(up), %rax
-	mov	8(up), %r9
-	lea	32(vp), vp
 	dec	n
-	mov	16(up), %r10
 	jnz	L(top)
 
-	mov	%r11, 24(rp)
-L(e3):	ADCSBB	(vp), %rax
-	mov	%rax, 32(rp)
-L(e2):	ADCSBB	8(vp), %r9
-	mov	%r9, 40(rp)
-L(e1):	ADCSBB	16(vp), %r10
-	mov	%r10, 48(rp)
-	mov	R32(n), R32(%rax)	C zero rax
-	adc	R32(%rax), R32(%rax)
+L(end):	mov	R32(n), R32(%rax)	C zero rax
+	mov	%r10, (rp)
+	mov	%r11, 8(rp)
+	setc	R8(%rax)
 	FUNC_EXIT()
 	ret
 EPILOGUE()
+	ALIGN(16)
 PROLOGUE(func_nc)
 	FUNC_ENTRY(4)
 IFDOS(`	mov	56(%rsp), %r8	')


More information about the gmp-commit mailing list