[Gmp-commit] /var/hg/gmp: 4 new changesets

mercurial at gmplib.org mercurial at gmplib.org
Sun Apr 23 22:22:09 UTC 2017


details:   /var/hg/gmp/rev/7bd9f4521ecf
changeset: 17363:7bd9f4521ecf
user:      Torbjorn Granlund <tg at gmplib.org>
date:      Thu Apr 20 03:00:05 2017 +0200
description:
Add more c/l numbers.

details:   /var/hg/gmp/rev/9bcf001debdc
changeset: 17364:9bcf001debdc
user:      Torbjorn Granlund <tg at gmplib.org>
date:      Thu Apr 20 03:08:59 2017 +0200
description:
Add more c/l numbers.

details:   /var/hg/gmp/rev/b8d7c87ee026
changeset: 17365:b8d7c87ee026
user:      Torbjorn Granlund <tg at gmplib.org>
date:      Sun Apr 23 21:01:06 2017 +0200
description:
Rewrite feed-in code and add mul_1c entry point.

details:   /var/hg/gmp/rev/daaf1eaf2767
changeset: 17366:daaf1eaf2767
user:      Torbjorn Granlund <tg at gmplib.org>
date:      Sun Apr 23 21:39:07 2017 +0200
description:
Replace "bt" by "test".

diffstat:

 mpn/x86_64/aorsmul_1.asm      |   39 ++++----
 mpn/x86_64/coreisbr/mul_1.asm |  174 ++++++++++++++++++++++++-----------------
 mpn/x86_64/mod_34lsub1.asm    |   10 ++
 3 files changed, 132 insertions(+), 91 deletions(-)

diffs (truncated from 303 to 300 lines):

diff -r 021277dcb21f -r daaf1eaf2767 mpn/x86_64/aorsmul_1.asm
--- a/mpn/x86_64/aorsmul_1.asm	Tue Apr 18 23:47:55 2017 +0200
+++ b/mpn/x86_64/aorsmul_1.asm	Sun Apr 23 21:39:07 2017 +0200
@@ -31,25 +31,26 @@
 include(`../config.m4')
 
 C	     cycles/limb
-C AMD K8,K9      2.52
-C AMD K10        2.51
-C AMD bull       4.43
-C AMD pile       5.03    5.63
-C AMD steam
-C AMD excavator
-C AMD bobcat     6.20
-C AMD jaguar     5.57    6.56
-C Intel P4      14.9    17.1
-C Intel core2    5.15
-C Intel NHM      4.93
-C Intel SBR      3.95
-C Intel IBR      3.75
-C Intel HWL      3.62
-C Intel BWL      2.53
-C Intel SKL      2.53
-C Intel atom    21.3
-C Intel SLM      9.0
-C VIA nano       5.0
+C AMD K8,K9	 2.52
+C AMD K10	 2.51
+C AMD bd1	 4.43
+C AMD bd2	 5.03	 5.63
+C AMD bd3	 ?
+C AMD bd4	 ?
+C AMD zen	 ?
+C AMD bobcat	 6.20
+C AMD jaguar	 5.57	 6.56
+C Intel P4	14.9	17.1
+C Intel core2	 5.15
+C Intel NHM	 4.93
+C Intel SBR	 3.95
+C Intel IBR	 3.75
+C Intel HWL	 3.62
+C Intel BWL	 2.53
+C Intel SKL	 2.53
+C Intel atom	21.3
+C Intel SLM	 9.0
+C VIA nano	 5.0
 
 C The loop of this code is the result of running a code generation and
 C optimization tool suite written by David Harvey and Torbjorn Granlund.
diff -r 021277dcb21f -r daaf1eaf2767 mpn/x86_64/coreisbr/mul_1.asm
--- a/mpn/x86_64/coreisbr/mul_1.asm	Tue Apr 18 23:47:55 2017 +0200
+++ b/mpn/x86_64/coreisbr/mul_1.asm	Sun Apr 23 21:39:07 2017 +0200
@@ -2,7 +2,8 @@
 
 dnl  Contributed to the GNU project by Torbjörn Granlund.
 
-dnl  Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
+dnl  Copyright 2003-2005, 2007, 2008, 2011-2013, 2017 Free Software Foundation,
+dnl  Inc.
 
 dnl  This file is part of the GNU MP Library.
 dnl
@@ -56,114 +57,143 @@
 C The loop of this code is the result of running a code generation and
 C optimisation tool suite written by David Harvey and Torbjorn Granlund.
 
-C TODO
-C  * The loop is great, but the prologue code was quickly written.  Tune it!
-C  * Add mul_1c entry point.
-C  * We could preserve one less register under DOS64 calling conventions, using
-C    r10 instead of rsi.
-
 define(`rp',      `%rdi')   C rcx
-define(`up',      `%rsi')   C rdx
+define(`up_param',`%rsi')   C rdx
 define(`n_param', `%rdx')   C r8
 define(`v0',      `%rcx')   C r9
+define(`cin',     `%r8')    C stack
 
-define(`n',	  `%r11')
+define(`up',      `%rsi')   C same as up_param
+define(`n',	  `%r9')
 
 ABI_SUPPORT(DOS64)
 ABI_SUPPORT(STD64)
 
-IFDOS(`	define(`up',     ``%rsi'')') dnl
-IFDOS(`	define(`rp',     ``%rcx'')') dnl
-IFDOS(`	define(`v0',     ``%r9'')') dnl
-IFDOS(`	define(`r9',     ``rdi'')') dnl
-IFDOS(`	define(`n_param',``%r8'')') dnl
-IFDOS(`	define(`n',      ``%r8'')') dnl
-IFDOS(`	define(`r8',     ``r11'')') dnl
+IFDOS(`	define(`rp',      `%rcx')')
+IFDOS(`	define(`up_param',`%rdx')')
+IFDOS(`	define(`n_param', `%r8')')
+IFDOS(`	define(`v0',      `%r9')')
+IFDOS(`	define(`cin',     `48(%rsp)')')
+
+IFDOS(`	define(`up',      `%rsi')')
+IFDOS(`	define(`n',       `%r8')')
 
 ASM_START()
 	TEXT
 	ALIGN(16)
 PROLOGUE(mpn_mul_1)
-
-IFDOS(``push	%rsi		'')
-IFDOS(``push	%rdi		'')
-IFDOS(``mov	%rdx, %rsi	'')
-
-	mov	(up), %rax
-	mov	R32(`n_param'), R32(%r10)
-IFSTD(`	mov	n_param, n		')
-
-	lea	(up,n_param,8), up
+IFDOS(`	push	%rsi		')
+	mov	(up_param), %rax
+IFSTD(`	mov	n_param, n	')
+	lea	(up_param,n_param,8), up
 	lea	-8(rp,n_param,8), rp
 	neg	n
 	mul	v0
-	and	$3, R32(%r10)
-	jz	L(b0)
-	cmp	$2, R32(%r10)
-	jb	L(b1)
-	jz	L(b2)
 
-L(b3):	add	$-1, n
-	mov	%rax, %r9
-	mov	%rdx, %r8
-	mov	16(up,n,8), %rax
+	test	$1, R8(n)
+	jz	L(x0)
+L(x1):	mov	%rax, %r11
+	mov	%rdx, %r10
+	test	$2, R8(n)
+	jnz	L(01)
+
+L(11):	mov	8(up,n,8), %rax
+	dec	n
 	jmp	L(L3)
 
-L(b1):	mov	%rax, %r9
-	mov	%rdx, %r8
-	add	$1, n
-	jnc	L(L1)
+L(01):	inc	n
+	jnz	L(L1)
 	mov	%rax, (rp)
 	mov	%rdx, %rax
-IFDOS(``pop	%rdi		'')
-IFDOS(``pop	%rsi		'')
+IFDOS(`	pop	%rsi		')
 	ret
 
-L(b2):	add	$-2, n
-	mov	%rax, %r8
-	mov	%rdx, %r9
-	mov	24(up,n,8), %rax
+L(x0):	mov	%rax, %r10
+	mov	%rdx, %r11
+	mov	8(up,n,8), %rax
+	test	$2, R8(n)
+	jz	L(L0)
+
+L(10):	add	$-2, n
 	jmp	L(L2)
 
-L(b0):	mov	%rax, %r8
-	mov	%rdx, %r9
-	mov	8(up,n,8), %rax
-	jmp	L(L0)
-
 	ALIGN(8)
-L(top):	mov	%rdx, %r8
-	add	%rax, %r9
+L(top):	mov	%rdx, %r10
+	add	%rax, %r11
 L(L1):	mov	0(up,n,8), %rax
-	adc	$0, %r8
+	adc	$0, %r10
 	mul	v0
-	add	%rax, %r8
-	mov	%r9, 0(rp,n,8)
+	add	%rax, %r10
+	mov	%r11, 0(rp,n,8)
 	mov	8(up,n,8), %rax
-	mov	%rdx, %r9
-	adc	$0, %r9
+	mov	%rdx, %r11
+L(L0c):	adc	$0, %r11
 L(L0):	mul	v0
-	mov	%r8, 8(rp,n,8)
-	add	%rax, %r9
-	mov	%rdx, %r8
-	mov	16(up,n,8), %rax
-	adc	$0, %r8
+	mov	%r10, 8(rp,n,8)
+	add	%rax, %r11
+	mov	%rdx, %r10
+L(L3c):	mov	16(up,n,8), %rax
+	adc	$0, %r10
 L(L3):	mul	v0
-	mov	%r9, 16(rp,n,8)
-	mov	%rdx, %r9
-	add	%rax, %r8
-	mov	24(up,n,8), %rax
-	adc	$0, %r9
+	mov	%r11, 16(rp,n,8)
+	mov	%rdx, %r11
+	add	%rax, %r10
+L(L2c):	mov	24(up,n,8), %rax
+	adc	$0, %r11
 L(L2):	mul	v0
-	mov	%r8, 24(rp,n,8)
+	mov	%r10, 24(rp,n,8)
 	add	$4, n
 	jnc	L(top)
 
-L(end):	add	%rax, %r9
+L(end):	add	%rax, %r11
 	mov	%rdx, %rax
 	adc	$0, %rax
-	mov	%r9, (rp)
+	mov	%r11, (rp)
 
-IFDOS(``pop	%rdi		'')
-IFDOS(``pop	%rsi		'')
+IFDOS(`	pop	%rsi		')
 	ret
 EPILOGUE()
+
+	ALIGN(16)
+PROLOGUE(mpn_mul_1c)
+IFDOS(`	push	%rsi		')
+	mov	(up_param), %rax
+IFSTD(`	mov	n_param, n	')
+	lea	(up_param,n_param,8), up
+	lea	-8(rp,n_param,8), rp
+	neg	n
+	mul	v0
+
+	test	$1, R8(n)
+	jz	L(x0c)
+L(x1c):	mov	%rax, %r11
+	mov	%rdx, %r10
+	test	$2, R8(n)
+	jnz	L(01c)
+
+L(11c):	add	cin, %r11
+	dec	n
+	jmp	L(L3c)
+
+L(01c):	add	cin, %r11
+	inc	n
+	jnz	L(L1)
+	mov	%r11, (rp)
+	mov	%rdx, %rax
+	adc	$0, %rax
+IFDOS(`	pop	%rsi		')
+	ret
+
+L(x0c):	mov	%rax, %r10
+	mov	%rdx, %r11
+	test	$2, R8(n)
+	jz	L(00c)
+
+L(10c):	add	$-2, n
+	add	cin, %r10
+	jmp	L(L2c)
+
+L(00c):	add	cin, %r10
+	mov	8(up,n,8), %rax
+	jmp	L(L0c)
+EPILOGUE()
diff -r 021277dcb21f -r daaf1eaf2767 mpn/x86_64/mod_34lsub1.asm
--- a/mpn/x86_64/mod_34lsub1.asm	Tue Apr 18 23:47:55 2017 +0200
+++ b/mpn/x86_64/mod_34lsub1.asm	Sun Apr 23 21:39:07 2017 +0200
@@ -36,12 +36,22 @@
 C AMD K8,K9	 0.67	   0.583 is possible with zero-reg instead of $0, 4-way
 C AMD K10	 0.67	   this seems hard to beat
 C AMD bd1	 1
+C AMD bd2	 ?
+C AMD bd3	 ?
+C AMD bd4	 ?
+C AMD zen	 0.62
 C AMD bobcat	 1.07
+C AMD jaguar	 1
 C Intel P4	 7.35	   terrible, use old code
 C Intel core2	 1.25	   1+epsilon with huge unrolling
 C Intel NHM	 1.15	   this seems hard to beat
 C Intel SBR	 0.93
+C Intel IBR	 0.93
+C Intel HWL	 0.82
+C Intel BWL	 0.64
+C Intel SKL	 0.60
 C Intel atom	 2.5
+C Intel SLM      1.59


More information about the gmp-commit mailing list