[Gmp-commit] /var/hg/gmp: Provide Broadwell mul_basecase and sqr_basecase.

Sun Apr 26 01:12:51 UTC 2015

details:   /var/hg/gmp/rev/d25a13804dae
changeset: 16594:d25a13804dae
user:      Torbjorn Granlund <torbjorng at google.com>
date:      Sun Apr 26 03:12:29 2015 +0200
description:
Provide Broadwell mul_basecase and sqr_basecase.

diffstat:

 ChangeLog                            |    5 +
 mpn/x86_64/coreibwl/mul_basecase.asm |  381 +++++++++++++++
 mpn/x86_64/coreibwl/sqr_basecase.asm |  848 +++++++++++++++++++++++++++++++++++
 3 files changed, 1234 insertions(+), 0 deletions(-)

diffs (truncated from 1249 to 300 lines):

diff -r 391231d8dc7d -r d25a13804dae ChangeLog

--- a/ChangeLog	Sat Apr 25 20:04:46 2015 +0200
+++ b/ChangeLog	Sun Apr 26 03:12:29 2015 +0200
@@ -1,3 +1,8 @@
+2015-04-26    <torbjorng at google.com>
+
+	* mpn/x86_64/coreibwl/mul_basecase.asm: New file.
+	* mpn/x86_64/coreibwl/sqr_basecase.asm: New file.
+
 2015-04-25 Marco Bodrato <bodrato at mail.dm.unipi.it>
 
 	* mpn/generic/invert.c: Split add in the correction test.
diff -r 391231d8dc7d -r d25a13804dae mpn/x86_64/coreibwl/mul_basecase.asm
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/x86_64/coreibwl/mul_basecase.asm	Sun Apr 26 03:12:29 2015 +0200
@@ -0,0 +1,381 @@
+dnl  AMD64 mpn_mul_basecase optimised for Intel Broadwell.
+
+dnl  Copyright 2015 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb	mul_1		addmul_1
+C AMD K8,K9	n/a		n/a
+C AMD K10	n/a		n/a
+C AMD bull	n/a		n/a
+C AMD pile	n/a		n/a
+C AMD steam	n/a		n/a
+C AMD excavator	 ?		 ?
+C AMD bobcat	n/a		n/a
+C AMD jaguar	n/a		n/a
+C Intel P4	n/a		n/a
+C Intel core2	n/a		n/a
+C Intel NHM	n/a		n/a
+C Intel SBR	n/a		n/a
+C Intel IBR	n/a		n/a
+C Intel HWL	 1.68		n/a
+C Intel BWL	 1.69	      1.8-1.9
+C Intel atom	n/a		n/a
+C Intel SLM	n/a		n/a
+C VIA nano	n/a		n/a
+
+C The inner loops of this code are the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+
+C TODO
+C  * Tune non-loop code.  Very little effort has been spent there.
+C  * When changing this, make sure the code which falls into the inner loops
+C    does not execute too many no-ops.
+C  * Eliminate rp_save and up_save by keeping un_save as a negated, scaled
+C    counter, similar to the sqr_basecase of this directory.
+
+define(`rp',      `%rdi')
+define(`up',      `%rsi')
+define(`un_param',`%rdx')
+define(`vp_param',`%rcx')
+define(`vn',      `%r8')
+
+define(`n',       `%rcx')
+define(`rp_save', `%r13')
+define(`up_save', `%rbx')
+define(`un_save', `%rbp')
+define(`vp',      `%r14')
+define(`v0',      `%rdx')
+define(`jaddr',   `%rax')
+
+define(`w0',	`%r12')
+define(`w1',	`%r9')
+define(`w2',	`%r10')
+define(`w3',	`%r11')
+
+C %rax %rbx %rcx %rdx %rdi %rsi %rbp
+C %r8  %r9  %r10 %r11 %r12 %r13 %r14 %r15
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_mul_basecase)
+	FUNC_ENTRY(4)
+IFDOS(`	mov	56(%rsp), %r8d	')
+
+	cmp	$2, un_param
+	ja	L(gen)
+	mov	(vp_param), %rdx
+	mulx(	(up), %rax, %r9)	C 0 1
+	je	L(s2x)
+
+C Small operand exits (un <= 2).  These paths return directly, so each one
+C must undo FUNC_ENTRY with FUNC_EXIT before ret.  The 56(%rsp) offset used
+C for the fifth argument under DOS64 shows that FUNC_ENTRY has pushed 16
+C bytes there; a bare ret would return through a saved register instead of
+C the return address.  FUNC_EXIT is presumably empty for STD64.
+L(s11):	mov	%rax, (rp)
+	mov	%r9, 8(rp)
+	FUNC_EXIT()
+	ret
+
+L(s2x):	cmp	$2, vn
+	mulx(	8,(up), %r8, %r10)	C 1 2
+	je	L(s22)
+
+L(s21):	add	%r8, %r9
+	adc	$0, %r10
+	mov	%rax, (rp)
+	mov	%r9, 8(rp)
+	mov	%r10, 16(rp)
+	FUNC_EXIT()
+	ret
+
+L(s22):	add	%r8, %r9		C 1
+	adc	$0, %r10		C 2
+	mov	8(vp_param), %rdx
+	mov	%rax, (rp)
+	mulx(	(up), %r8, %r11)	C 1 2
+	mulx(	8,(up), %rax, %rdx)	C 2 3
+	add	%r11, %rax		C 2
+	adc	$0, %rdx		C 3
+	add	%r8, %r9		C 1
+	adc	%rax, %r10		C 2
+	adc	$0, %rdx		C 3
+	mov	%r9, 8(rp)
+	mov	%r10, 16(rp)
+	mov	%rdx, 24(rp)
+	FUNC_EXIT()
+	ret
+
+	ALIGN(16)
+L(gen):
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+
+	mov	vp_param, vp
+	mov	un_param, un_save
+	mov	rp, rp_save
+	mov	up, up_save
+
+	mov	R32(un_save), R32(%rax)
+	shr	$3, un_save
+	and	$7, R32(%rax)		C clear CF for adc as side-effect
+					C note that rax lives very long
+	mov	un_save, n
+	mov	(vp), v0
+	lea	8(vp), vp
+
+C Dispatch on un mod 8 into the mul_1 feed-in code.  The PIC variant needs a
+C scratch register for the table entry.  It must not be r14, since that is vp
+C and vp is read again at the top of the outer loop.  r11 is w3, which every
+C mf feed-in path rewrites with mulx before reading it, just like r10.
+	lea	L(mtab)(%rip), %r10
+ifdef(`PIC',
+`	movslq	(%r10,%rax,4), %r11
+	lea	(%r11, %r10), %r10
+	jmp	*%r10
+',`
+	jmp	*(%r10,%rax,8)
+')
+
+L(mf0):	mulx(	(up_save), w2, w3)
+	lea	56(up_save), up
+	lea	-8(rp_save), rp
+	jmp	L(mb0)
+
+L(mf3):	mulx(	(up_save), w0, w1)
+	lea	16(up_save), up
+	lea	16(rp_save), rp
+	inc	n
+	jmp	L(mb3)
+
+L(mf4):	mulx(	(up_save), w2, w3)
+	lea	24(up_save), up
+	lea	24(rp_save), rp
+	inc	n
+	jmp	L(mb4)
+
+L(mf5):	mulx(	(up_save), w0, w1)
+	lea	32(up_save), up
+	lea	32(rp_save), rp
+	inc	n
+	jmp	L(mb5)
+
+L(mf6):	mulx(	(up_save), w2, w3)
+	lea	40(up_save), up
+	lea	40(rp_save), rp
+	inc	n
+	jmp	L(mb6)
+
+L(mf7):	mulx(	(up_save), w0, w1)
+	lea	48(up_save), up
+	lea	48(rp_save), rp
+	inc	n
+	jmp	L(mb7)
+
+L(mf1):	mulx(	(up_save), w0, w1)
+	jmp	L(mb1)
+
+L(mf2):	mulx(	(up_save), w2, w3)
+	lea	8(up_save), up
+	lea	8(rp_save), rp
+	mulx(	(up), w0, w1)
+	test	n, n
+	jz	L(m1end)
+
+	ALIGN(32)
+L(m1top):
+	mov	w2, -8(rp)
+	adc	w3, w0
+L(mb1):	mulx(	8,(up), w2, w3)
+	adc	w1, w2
+	lea	64(up), up
+	mov	w0, (rp)
+L(mb0):	mov	w2, 8(rp)
+	mulx(	-48,(up), w0, w1)
+	lea	64(rp), rp
+	adc	w3, w0
+L(mb7):	mulx(	-40,(up), w2, w3)
+	mov	w0, -48(rp)
+	adc	w1, w2
+L(mb6):	mov	w2, -40(rp)
+	mulx(	-32,(up), w0, w1)
+	adc	w3, w0
+L(mb5):	mulx(	-24,(up), w2, w3)
+	mov	w0, -32(rp)
+	adc	w1, w2
+L(mb4):	mulx(	-16,(up), w0, w1)
+	mov	w2, -24(rp)
+	adc	w3, w0
+L(mb3):	mulx(	-8,(up), w2, w3)
+	adc	w1, w2
+	mov	w0, -16(rp)
+	dec	n
+	mulx(	(up), w0, w1)
+	jnz	L(m1top)
+
+L(m1end):
+	mov	w2, -8(rp)
+	adc	w3, w0
+	mov	w0, (rp)
+	adc	%rcx, w1		C relies on rcx = 0
+	mov	w1, 8(rp)
+
+	dec	vn
+	jz	L(done)
+
+	lea	L(atab)(%rip), %r10
+ifdef(`PIC',
+`	movslq	(%r10,%rax,4), %rax
+	lea	(%rax, %r10), jaddr
+',`
+	mov	(%r10,%rax,8), jaddr
+')
+
+L(outer):
+	mov	un_save, n
+	mov	(vp), v0
+	lea	8(vp), vp
+	lea	8(rp_save), rp_save
+	jmp	*jaddr
+
+C addmul_1
+L(f0):	mulx(	(up_save), w2, w3)
+	lea	-8(up_save), up
+	lea	-8(rp_save), rp
+	lea	-1(n), n
+	jmp	L(b0)
+
+L(f3):	mulx(	(up_save), w0, w1)
+	lea	16(up_save), up
+	lea	-48(rp_save), rp
+	jmp	L(b3)
+
+L(f4):	mulx(	(up_save), w2, w3)
+	lea	24(up_save), up
+	lea	-40(rp_save), rp
+	jmp	L(b4)
+
+L(f5):	mulx(	(up_save), w0, w1)
+	lea	32(up_save), up
+	lea	-32(rp_save), rp
+	jmp	L(b5)
+