[Gmp-commit] /var/hg/gmp: 7 new changesets

mercurial at gmplib.org mercurial at gmplib.org
Sat Aug 3 00:39:04 CEST 2013


details:   /var/hg/gmp/rev/f1551bf500b7
changeset: 15906:f1551bf500b7
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Fri Aug 02 18:31:20 2013 +0200
description:
Complete rewrite of sandybridge addmul_2.

details:   /var/hg/gmp/rev/4c12b13e8e37
changeset: 15907:4c12b13e8e37
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Fri Aug 02 18:39:55 2013 +0200
description:
Fix typo.

details:   /var/hg/gmp/rev/8757d877662f
changeset: 15908:8757d877662f
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Sat Aug 03 00:30:40 2013 +0200
description:
Provide sandybridge mul_2.

details:   /var/hg/gmp/rev/9ac67df52775
changeset: 15909:9ac67df52775
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Sat Aug 03 00:35:16 2013 +0200
description:
Support DOS64.

details:   /var/hg/gmp/rev/2482bb627173
changeset: 15910:2482bb627173
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Sat Aug 03 00:35:58 2013 +0200
description:
Spacing.

details:   /var/hg/gmp/rev/efb65b3b6a6c
changeset: 15911:efb65b3b6a6c
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Sat Aug 03 00:38:35 2013 +0200
description:
Save some O(n) and O(1) cycles.

details:   /var/hg/gmp/rev/d62fd347f7c7
changeset: 15912:d62fd347f7c7
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Sat Aug 03 00:39:01 2013 +0200
description:
ChangeLog

diffstat:

 ChangeLog                            |   10 +
 mpn/x86_64/bd1/mul_basecase.asm      |    1 +
 mpn/x86_64/coreisbr/addmul_2.asm     |  296 +++++++++++++++++-----------------
 mpn/x86_64/coreisbr/mul_2.asm        |  153 ++++++++++++++++++
 mpn/x86_64/coreisbr/mul_basecase.asm |   29 +--
 5 files changed, 328 insertions(+), 161 deletions(-)

diffs (truncated from 628 to 300 lines):

diff -r 43339e712783 -r d62fd347f7c7 ChangeLog
--- a/ChangeLog	Fri Aug 02 13:23:26 2013 +0200
+++ b/ChangeLog	Sat Aug 03 00:39:01 2013 +0200
@@ -1,3 +1,13 @@
+2013-08-03  Torbjorn Granlund  <tege at gmplib.org>
+
+	* mpn/x86_64/coreisbr/mul_basecase.asm: Save some O(n) and O(1) cycles.
+
+	* mpn/x86_64/coreisbr/mul_2.asm: New file.
+
+2013-08-02  Torbjorn Granlund  <tege at gmplib.org>
+
+	* mpn/x86_64/coreisbr/addmul_2.asm: Complete rewrite.
+
 2013-08-01  Torbjorn Granlund  <tege at gmplib.org>
 
 	* mpn/x86_64/bd1/mul_basecase.asm: New file.
diff -r 43339e712783 -r d62fd347f7c7 mpn/x86_64/bd1/mul_basecase.asm
--- a/mpn/x86_64/bd1/mul_basecase.asm	Fri Aug 02 13:23:26 2013 +0200
+++ b/mpn/x86_64/bd1/mul_basecase.asm	Sat Aug 03 00:39:01 2013 +0200
@@ -3,6 +3,7 @@
 dnl  Contributed to the GNU project by Torbjörn Granlund.
 
 dnl  Copyright 2003, 2004, 2005, 2007, 2008, 2011, 2012, 2013 Free Software
+dnl  Foundation, Inc.
 
 dnl  This file is part of the GNU MP Library.
 
diff -r 43339e712783 -r d62fd347f7c7 mpn/x86_64/coreisbr/addmul_2.asm
--- a/mpn/x86_64/coreisbr/addmul_2.asm	Fri Aug 02 13:23:26 2013 +0200
+++ b/mpn/x86_64/coreisbr/addmul_2.asm	Sat Aug 03 00:39:01 2013 +0200
@@ -1,6 +1,9 @@
-dnl  X86-64 mpn_addmul_2 optimised for Intel Sandy Bridge.
+dnl  AMD64 mpn_addmul_2 optimised for Intel Sandy Bridge.
 
-dnl  Copyright 2008, 2011, 2012 Free Software Foundation, Inc.
+dnl  Contributed to the GNU project by Torbjörn Granlund.
+
+dnl  Copyright 2003, 2004, 2005, 2007, 2008, 2011, 2012, 2013 Free Software
+dnl  Foundation, Inc.
 
 dnl  This file is part of the GNU MP Library.
 
@@ -19,188 +22,193 @@
 
 include(`../config.m4')
 
-C	     cycles/limb
+C	     cycles/limb	best
 C AMD K8,K9
-C AMD K10	 4.07
-C AMD bd1
-C AMD bobcat	 5.25
-C Intel P4	16.1
-C Intel core2
+C AMD K10
+C AMD bull
+C AMD pile
+C AMD bobcat
+C AMD jaguar
+C Intel P4
+C Intel core
 C Intel NHM
-C Intel SBR	 3.2
+C Intel SBR	 2.93		this
+C Intel IBR	 2.66		this
+C Intel HWL	 2.5		 2.0
+C Intel BWL
 C Intel atom
-C VIA nano	 5.23
+C VIA nano
 
 C This code is the result of running a code generation and optimisation tool
 C suite written by David Harvey and Torbjorn Granlund.
 
-C TODO
-C  * Tune feed-in and wind-down code.
+C When playing with pointers, set this to $2 to fall back to conservative
+C indexing in wind-down code.
+define(`I',`$1')
 
-C INPUT PARAMETERS
-define(`rp',     `%rdi')
-define(`up',     `%rsi')
-define(`n_param',`%rdx')
-define(`vp',     `%rcx')
 
-define(`v0', `%r12')
-define(`v1', `%r13')
-define(`n',  `%r11')
+define(`rp',      `%rdi')   C rcx
+define(`up',      `%rsi')   C rdx
+define(`n_param', `%rdx')   C r8
+define(`vp',      `%rcx')   C r9
+
+define(`n',	  `%rcx')
+define(`v0',      `%rbx')
+define(`v1',      `%rbp')
+define(`w0',      `%r8')
+define(`w1',      `%r9')
+define(`w2',      `%r10')
+define(`w3',      `%r11')
+define(`X0',      `%r12')
+define(`X1',      `%r13')
 
 ABI_SUPPORT(DOS64)
 ABI_SUPPORT(STD64)
 
 ASM_START()
 	TEXT
-	ALIGN(16)
+	ALIGN(32)
 PROLOGUE(mpn_addmul_2)
 	FUNC_ENTRY(4)
 	push	%rbx
+	push	%rbp
 	push	%r12
 	push	%r13
-	push	%r14
+
+	mov	(vp), v0
+	mov	8(vp), v1
 
 	mov	(up), %rax
 
 	mov	n_param, n
-	mov	0(vp), v0
-	mov	8(vp), v1
-	shr	$2, n
-	and	$3, R32(n_param)
-	jz	L(b0)
-	cmp	$2, R32(n_param)
-	jb	L(b1)
-	jz	L(b2)
+	neg	n
 
-L(b3):	mov	(rp), %r10
-	mov	$0, R32(%rcx)
+	lea	(up,n_param,8), up
+	lea	8(rp,n_param,8), rp
 	mul	v0
-	add	%rax, %r10
-	mov	%rdx, %r14
-	adc	$0, %r14
-	lea	-16(rp), rp
-	lea	-16(up), up
-	mov	$0, R32(%r9)
-	mov	$0, R32(%rbx)
+
+	test	$1, R8(n)
+	jnz	L(bx1)
+
+L(bx0):	mov	-8(rp,n,8), X0
+	mov	%rdx, w1
+	add	%rax, X0
+	adc	$0, w1
+	mov	(up,n,8), %rax
+	xor	w0, w0
+	xor	w3, w3
+	test	$2, R8(n)
+	jnz	L(b10)
+
+L(b00):	nop				C this nop makes the loop go faster on SBR!
+	mul	v1
+	mov	(rp,n,8), X1
+	jmp	L(lo0)
+
+L(b10):	lea	-2(n), n
+	jmp	L(lo2)
+
+L(bx1):	mov	-8(rp,n,8), X1
+	mov	%rdx, w3
+	add	%rax, X1
+	adc	$0, w3
+	mov	(up,n,8), %rax
+	xor	w1, w1
+	xor	w2, w2
+	test	$2, R8(n)
+	jz	L(b11)
+
+L(b01):	mov	(rp,n,8), X0
 	inc	n
-	jmp	L(L3)
+	jmp	L(lo1)
 
-L(b0):	mov	(rp), %r8
-	mul	v0
-	add	%rax, %r8
-	mov	%rdx, %r9
-	adc	$0, %r9
-	mov	$0, R32(%rbx)
-	lea	-8(rp), rp
-	lea	-8(up), up
-	jmp	L(L0)
-
-L(b1):	mov	(rp), %r10
-	mov	$0, R32(%rcx)
-	mul	v0
-	add	%rax, %r10
-	mov	%rdx, %r14
-	adc	$0, %r14
-	mov	%r10, 0(rp)
-	jmp	L(L1)
-
-L(b2):	mov	(rp), %r8
-	mul	v0
-	add	%rax, %r8
-	mov	$0, R32(%rbx)
-	mov	%rdx, %r9
-	adc	$0, %r9
-	lea	-24(rp), rp
-	lea	-24(up), up
-	inc	n
-	jmp	L(L2)
+L(b11):	dec	n
+	jmp	L(lo3)
 
 	ALIGN(32)
-L(top):	mov	%r10, 32(rp)
-	adc	%rbx, %r14		C 10
-	lea	32(rp), rp
-L(L1):	mov	0(up), %rax
-	adc	$0, R32(%rcx)
+L(top):
+L(lo1):	mul	v1
+	mov	%rdx, w0		C 1
+	add	%rax, X0		C 0
+	adc	$0, w0			C 1
+	add	w1, X1			C 3
+	adc	$0, w3			C 0
+	add	w2, X0			C 0
+	adc	$0, w0			C 1
+	mov	(up,n,8), %rax
+	mul	v0
+	add	%rax, X0		C 0
+	mov	%rdx, w1		C 1
+	adc	$0, w1			C 1
+	mov	(up,n,8), %rax
 	mul	v1
-	mov	$0, R32(%rbx)
-	mov	8(rp), %r8
-	add	%rax, %r8
-	mov	%rdx, %r9
-	mov	8(up), %rax
-	adc	$0, %r9
+	mov	X1, -16(rp,n,8)		C 3
+	mov	(rp,n,8), X1		C 1
+	add	w3, X0			C 0
+	adc	$0, w1			C 1
+L(lo0):	mov	%rdx, w2		C 2
+	mov	X0, -8(rp,n,8)		C 0
+	add	%rax, X1		C 1
+	adc	$0, w2			C 2
+	mov	8(up,n,8), %rax
+	add	w0, X1			C 1
+	adc	$0, w2			C 2
 	mul	v0
-	add	%rax, %r8
-	adc	%rdx, %r9
-	adc	$0, R32(%rbx)
-	add	%r14, %r8		C 0 12
-	adc	%rcx, %r9		C 1
-L(L0):	mov	8(up), %rax
-	adc	$0, R32(%rbx)
-	mov	16(rp), %r10
-	mul	v1
-	add	%rax, %r10
-	mov	%rdx, %r14
-	mov	16(up), %rax
-	mov	$0, R32(%rcx)
-	adc	$0, %r14
+	add	%rax, X1		C 1
+	mov	%rdx, w3		C 2
+	adc	$0, w3			C 2
+	mov	8(up,n,8), %rax
+L(lo3):	mul	v1
+	add	w1, X1			C 1
+	mov	8(rp,n,8), X0		C 2
+	adc	$0, w3			C 2
+	mov	%rdx, w0		C 3
+	add	%rax, X0		C 2
+	adc	$0, w0			C 3
+	mov	16(up,n,8), %rax
 	mul	v0
-	add	%rax, %r10
-	adc	%rdx, %r14
-	adc	$0, R32(%rcx)
-	mov	%r8, 8(rp)
-L(L3):	mov	24(rp), %r8
-	mov	16(up), %rax
-	mul	v1
-	add	%r9, %r10		C 3
-	adc	%rbx, %r14		C 4
-	adc	$0, R32(%rcx)
-	add	%rax, %r8
-	mov	%rdx, %r9
-	adc	$0, %r9
-	mov	24(up), %rax
+	add	w2, X0			C 2
+	mov	X1, (rp,n,8)		C 1


More information about the gmp-commit mailing list