[Gmp-commit] /var/hg/gmp: 5 new changesets
mercurial at gmplib.org
mercurial at gmplib.org
Wed Sep 18 00:47:29 CEST 2013
details: /var/hg/gmp/rev/7ace27253c1e
changeset: 15997:7ace27253c1e
user: Torbjorn Granlund <tege at gmplib.org>
date: Wed Sep 18 00:35:04 2013 +0200
description:
Provide SBR and HWL mullo_basecase.
details: /var/hg/gmp/rev/f1003a731f99
changeset: 15998:f1003a731f99
user: Torbjorn Granlund <tege at gmplib.org>
date: Wed Sep 18 00:37:50 2013 +0200
description:
Cleanup, streamline.
details: /var/hg/gmp/rev/53a7c1b2f7ba
changeset: 15999:53a7c1b2f7ba
user: Torbjorn Granlund <tege at gmplib.org>
date: Wed Sep 18 00:44:34 2013 +0200
description:
Provide mul_basecase and sqr_basecase for Conroe, Wolfdale, Nehalem, Westmere.
details: /var/hg/gmp/rev/a0045d16ef28
changeset: 16000:a0045d16ef28
user: Torbjorn Granlund <tege at gmplib.org>
date: Wed Sep 18 00:45:47 2013 +0200
description:
Cosmetic fix to mulx byte output.
details: /var/hg/gmp/rev/eb6a8d6a0c8f
changeset: 16001:eb6a8d6a0c8f
user: Torbjorn Granlund <tege at gmplib.org>
date: Wed Sep 18 00:46:07 2013 +0200
description:
ChangeLog
diffstat:
ChangeLog | 12 +
mpn/x86_64/core2/mul_basecase.asm | 962 ++++++++++++++++++++++++++++++++
mpn/x86_64/core2/sqr_basecase.asm | 971 +++++++++++++++++++++++++++++++++
mpn/x86_64/coreihwl/mullo_basecase.asm | 416 ++++++++++++++
mpn/x86_64/coreisbr/mul_basecase.asm | 150 ++--
mpn/x86_64/coreisbr/mullo_basecase.asm | 360 ++++++++++++
mpn/x86_64/coreisbr/sqr_basecase.asm | 138 ++--
mpn/x86_64/x86_64-defs.m4 | 2 +-
8 files changed, 2868 insertions(+), 143 deletions(-)
diffs (truncated from 3164 to 300 lines):
diff -r 52121fb11f27 -r eb6a8d6a0c8f ChangeLog
--- a/ChangeLog Mon Sep 16 19:57:46 2013 +0200
+++ b/ChangeLog Wed Sep 18 00:46:07 2013 +0200
@@ -1,3 +1,15 @@
+2013-09-18 Torbjorn Granlund <tege at gmplib.org>
+
+ * mpn/x86_64/core2/mul_basecase.asm: New file.
+ * mpn/x86_64/core2/sqr_basecase.asm: New file.
+
+ * mpn/x86_64/coreihwl/mullo_basecase.asm: New file.
+ * mpn/x86_64/coreisbr/mullo_basecase.asm: New file.
+
+2013-09-16 Torbjorn Granlund <tege at gmplib.org>
+
+ * mpn/x86_64/fastsse/copyi-palignr.asm: Preserve xmm6-xmm8 under DOS.
+
2013-09-15 Torbjorn Granlund <tege at gmplib.org>
* mpn/x86_64/tabselect.asm: Use R8 for bit testing.
diff -r 52121fb11f27 -r eb6a8d6a0c8f mpn/x86_64/core2/mul_basecase.asm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/x86_64/core2/mul_basecase.asm Wed Sep 18 00:46:07 2013 +0200
@@ -0,0 +1,962 @@
+dnl X86-64 mpn_mul_basecase optimised for Intel Nehalem/Westmere.
+dnl It also seems good for Conroe/Wolfdale.
+
+dnl Copyright 2008, 2011, 2012, 2013 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb mul_1 mul_2 mul_3 addmul_2
+C AMD K8,K9
+C AMD K10
+C AMD bull
+C AMD pile
+C AMD steam
+C AMD bobcat
+C AMD jaguar
+C Intel P4
+C Intel core 4.0 4.0 - 4.18-4.25
+C Intel NHM 3.75 3.8 - 4.06-4.2
+C Intel SBR
+C Intel IBR
+C Intel HWL
+C Intel BWL
+C Intel atom
+C VIA nano
+
+C The inner loops of this code are the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjörn Granlund.
+
+C Code structure:
+C
+C
+C m_1(0m4) m_1(1m4) m_1(2m4) m_1(3m4)
+C | | | |
+C m_2(0m4) | m_2(1m4) | m_2(2m4) | m_2(3m4) |
+C | / | / | / | /
+C | / | / | / | /
+C | / | / | / | /
+C \|/ |/_ \|/ |/_ \|/ |/_ \|/ |/_
+C _____ _____ _____ _____
+C / \ / \ / \ / \
+C \|/ | \|/ | \|/ | \|/ |
+C am_2(0m4) | am_2(1m4) | am_2(2m4) | am_2(3m4) |
+C \ /|\ \ /|\ \ /|\ \ /|\
+C \_____/ \_____/ \_____/ \_____/
+
+C TODO
+C * Tune. None done so far.
+C * Currently 2687 bytes, making it smaller would be nice.
+C * Implement some basecases, say for un < 4.
+C * Try zeroing with xor in m2 loops.
+C * Try re-rolling the m2 loops to avoid the current 9 insn code duplication
+C between loop header and wind-down code.
+C * Consider adc reg,reg instead of adc $0,reg in m2 loops. This saves a byte.
+
+C When playing with pointers, set this to $2 to fall back to conservative
+C indexing in wind-down code.
+define(`I',`$1')
+
+C Define this to $1 to use late loop index variable as zero, $2 to use an
+C explicit $0.
+define(`Z',`$1')
+
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`un_param', `%rdx')
+define(`vp_param', `%rcx') C FIXME reallocate vp to rcx but watch performance!
+define(`vn_param', `%r8')
+
+define(`un', `%r9')
+define(`vn', `(%rsp)')
+
+define(`v0', `%r10')
+define(`v1', `%r11')
+define(`w0', `%rbx')
+define(`w1', `%rcx')
+define(`w2', `%rbp')
+define(`w3', `%r12')
+define(`i', `%r13')
+define(`vp', `%r14')
+
+define(`X0', `%r8')
+define(`X1', `%r15')
+
+C rax rbx rcx rdx rdi rsi rbp r8 r9 r10 r11 r12 r13 r14 r15
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+define(`ALIGNx', `ALIGN(16)')
+
+define(`N', 85)
+ifdef(`N',,`define(`N',0)')
+define(`MOV', `ifelse(eval(N & $3),0,`mov $1, $2',`lea ($1), $2')')
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_mul_basecase)
+ FUNC_ENTRY(4)
+IFDOS(` mov 56(%rsp), %r8d ')
+ mov (up), %rax C shared for mul_1 and mul_2
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+
+ mov (vp_param), v0 C shared for mul_1 and mul_2
+
+ xor un, un
+ sub un_param, un C un = -un_param
+
+ lea (up,un_param,8), up
+ lea (rp,un_param,8), rp
+
+ mul v0 C shared for mul_1 and mul_2
+
+ test $1, R8(vn_param)
+ jz L(m2)
+
+ lea 8(vp_param), vp C FIXME: delay until known needed
+
+ test $1, R8(un)
+ jnz L(m1x1)
+
+L(m1x0):test $2, R8(un)
+ jnz L(m1s2)
+
+L(m1s0):
+ lea (un), i
+ mov %rax, (rp,un,8)
+ mov 8(up,un,8), %rax
+ mov %rdx, w0 C FIXME: Use lea?
+ lea L(do_am0)(%rip), %rbp
+ jmp L(m1e0)
+
+L(m1s2):
+ lea 2(un), i
+ mov %rax, (rp,un,8)
+ mov 8(up,un,8), %rax
+ mov %rdx, w0 C FIXME: Use lea?
+ mul v0
+ lea L(do_am2)(%rip), %rbp
+ test i, i
+ jnz L(m1e2)
+ add %rax, w0
+ adc $0, %rdx
+ mov w0, I(-8(rp),8(rp,un,8))
+ mov %rdx, I((rp),16(rp,un,8))
+ jmp L(ret2)
+
+L(m1x1):test $2, R8(un)
+ jz L(m1s3)
+
+L(m1s1):
+ lea 1(un), i
+ mov %rax, (rp,un,8)
+ test i, i
+ jz L(1)
+ mov 8(up,un,8), %rax
+ mov %rdx, w1 C FIXME: Use lea?
+ lea L(do_am1)(%rip), %rbp
+ jmp L(m1e1)
+L(1): mov %rdx, I((rp),8(rp,un,8))
+ jmp L(ret2)
+
+L(m1s3):
+ lea -1(un), i
+ mov %rax, (rp,un,8)
+ mov 8(up,un,8), %rax
+ mov %rdx, w1 C FIXME: Use lea?
+ lea L(do_am3)(%rip), %rbp
+ jmp L(m1e3)
+
+ ALIGNx
+L(m1top):
+ mul v0
+ mov w1, -16(rp,i,8)
+L(m1e2):xor R32(w1), R32(w1)
+ add %rax, w0
+ mov (up,i,8), %rax
+ adc %rdx, w1
+ mov w0, -8(rp,i,8)
+L(m1e1):xor R32(w0), R32(w0)
+ mul v0
+ add %rax, w1
+ mov 8(up,i,8), %rax
+ adc %rdx, w0
+ mov w1, (rp,i,8)
+L(m1e0):xor R32(w1), R32(w1)
+ mul v0
+ add %rax, w0
+ mov 16(up,i,8), %rax
+ adc %rdx, w1
+ mov w0, 8(rp,i,8)
+L(m1e3):xor R32(w0), R32(w0)
+ mul v0
+ add %rax, w1
+ mov 24(up,i,8), %rax
+ adc %rdx, w0
+ add $4, i
+ js L(m1top)
+
+ mul v0
+ mov w1, I(-16(rp),-16(rp,i,8))
+ add %rax, w0
+ adc $0, %rdx
+ mov w0, I(-8(rp),-8(rp,i,8))
+ mov %rdx, I((rp),(rp,i,8))
+
+ dec vn_param
+ jz L(ret2)
+ lea -8(rp), rp
+ jmp *%rbp
+
+L(m2):
+ mov 8(vp_param), v1
+ lea 16(vp_param), vp C FIXME: delay until known needed
+
+ test $1, R8(un)
+ jnz L(bx1)
+
+L(bx0): test $2, R8(un)
+ jnz L(b10)
+
+L(b00): lea (un), i
+ mov %rax, (rp,un,8)
+ mov %rdx, w1 C FIXME: Use lea?
+ mov (up,un,8), %rax
+ mov $0, R32(w2)
+ jmp L(m2e0)
+
+L(b10): lea -2(un), i
+ mov %rax, w2 C FIXME: Use lea?
+ mov (up,un,8), %rax
+ mov %rdx, w3 C FIXME: Use lea?
+ mov $0, R32(w0)
+ jmp L(m2e2)
+
+L(bx1): test $2, R8(un)
+ jz L(b11)
+
+L(b01): lea 1(un), i
+ mov %rax, (rp,un,8)
+ mov (up,un,8), %rax
+ mov %rdx, w0 C FIXME: Use lea?
+ mov $0, R32(w1)
+ jmp L(m2e1)
+
+L(b11): lea -1(un), i
+ mov %rax, w1 C FIXME: Use lea?
+ mov (up,un,8), %rax
+ mov %rdx, w2 C FIXME: Use lea?
+ mov $0, R32(w3)
+ jmp L(m2e3)
+
+ ALIGNx
+L(m2top0):
+ mul v0
+ add %rax, w3
+ mov -8(up,i,8), %rax
+ mov w3, -8(rp,i,8)
More information about the gmp-commit
mailing list