[Gmp-commit] /var/hg/gmp: Provide mpn_sbpi1_bdiv_r for bwl/skl/zen.

mercurial at gmplib.org mercurial at gmplib.org
Sun Jan 17 21:19:29 UTC 2021


details:   /var/hg/gmp/rev/c452b2e6f681
changeset: 18198:c452b2e6f681
user:      Torbjorn Granlund <tg at gmplib.org>
date:      Sun Jan 17 22:19:23 2021 +0100
description:
Provide mpn_sbpi1_bdiv_r for bwl/skl/zen.

diffstat:

 mpn/x86_64/coreibwl/sbpi1_bdiv_r.asm |  710 +++++++++++++++++++++++++++++++++++
 1 files changed, 710 insertions(+), 0 deletions(-)

diffs (truncated from 714 to 300 lines):

diff -r 5e97d3815b09 -r c452b2e6f681 mpn/x86_64/coreibwl/sbpi1_bdiv_r.asm
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/x86_64/coreibwl/sbpi1_bdiv_r.asm	Sun Jan 17 22:19:23 2021 +0100
@@ -0,0 +1,710 @@
+dnl  AMD64 mpn_sbpi1_bdiv_r optimised for Intel Broadwell.
+
+dnl  Copyright 2015, 2021 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb	mul_1		addmul_1
+C AMD K8,K9	n/a		n/a
+C AMD K10	n/a		n/a
+C AMD bd1	n/a		n/a
+C AMD bd2	n/a		n/a
+C AMD bd3	n/a		n/a
+C AMD bd4	 ?		 ?
+C AMD zn1	 ?		 ?
+C AMD zn2	 ?		 ?
+C AMD zn3	 ?		 ?
+C AMD bt1	n/a		n/a
+C AMD bt2	n/a		n/a
+C Intel P4	n/a		n/a
+C Intel PNR	n/a		n/a
+C Intel NHM	n/a		n/a
+C Intel SBR	n/a		n/a
+C Intel IBR	n/a		n/a
+C Intel HWL	 1.68		n/a
+C Intel BWL	 1.51	      1.67-1.74
+C Intel SKL	 1.52	      1.63-1.71
+C Intel atom	n/a		n/a
+C Intel SLM	n/a		n/a
+C VIA nano	n/a		n/a
+
+C The inner loops of this code are the result of running a code generation and
+C optimisation tool suite written by David Harvey and Torbjorn Granlund.
+
+C TODO
+C  * Do overlapped software pipelining.
+C  * Reduce register use, i.e., by combining n_neg and n_save.
+C  * Suppress initial store through up, it's always a zero.
+C  * Streamline up and dp setup.
+C  * When changing this, make sure the code which falls into the inner loops
+C    does not execute too many no-ops (for both PIC and non-PIC).
+
+dnl  mp_limb_t
+dnl  mpn_sbpi1_bdiv_r (mp_ptr up, mp_size_t un,
+dnl		       mp_srcptr dp, mp_size_t dn, mp_limb_t dinv)
+
+define(`up',      `%rdi')
+define(`un',      `%rsi')
+define(`dp_param',`%rdx')
+define(`dn_param',`%rcx')
+define(`dinv',    `%r8')
+
+define(`n',       `%rcx')
+define(`n_save',  `%rbp')
+define(`dp',      `%r14')
+define(`n_neg',   `%rbx')
+define(`q',       `%rdx')
+define(`jaddr',   `%rax')
+
+define(`w0',	`%r12')
+define(`w1',	`%r9')
+define(`w2',	`%r10')
+define(`w3',	`%r11')
+
+ifdef(`MAX_SPECIAL',,`
+define(`MAX_SPECIAL', 8)')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_sbpi1_bdiv_r)
+	FUNC_ENTRY(4)
+IFDOS(`	mov	56(%rsp), %r8	')
+
+	lea	L(atab)(%rip), %r10
+
+	cmp	$MAX_SPECIAL, dn_param
+	jbe	L(sma)
+
+ifelse(MAX_SPECIAL,8,,`
+forloop(i,eval(MAX_SPECIAL+1),9,`L(i):
+')')
+
+L(gen):	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+
+	mov	dp_param, dp		C free up rdx
+	xor	%r13, %r13
+
+	sub	dn_param, un		C outer loop count
+
+	lea	-8(,dn_param,8), n_neg
+	neg	n_neg
+	mov	dn_param, n_save
+	mov	R32(dn_param), R32(%rax)
+	shr	$3, n_save		C loop count
+	and	$7, R32(%rax)		C clear CF and OF as side-effect
+
+ifdef(`PIC',
+`	movslq	(%r10,%rax,4), %rax
+	lea	(%rax,%r10), jaddr
+',`
+	mov	(%r10,%rax,8), jaddr
+')
+	mov	(up), q
+	imul	dinv, q
+	jmp	L(outer)
+
+L(f0):	mulx(	(dp), w2, w3)
+	lea	-1(n), n
+	mulx(	8,(dp), w0, w1)
+	lea	-8(dp), dp
+	adcx(	w3, w0)
+	adox(	(up), w2)
+	lea	-8(up), up
+	jmp	L(b0x)
+
+L(f3):	mulx(	(dp), w0, w1)
+	mulx(	8,(dp), w2, w3)
+	adox(	(up), w0)
+	lea	-48(up), up
+	lea	16(dp), dp
+	jmp	L(b3x)
+
+L(f4):	mulx(	(dp), w2, w3)
+	mulx(	8,(dp), w0, w1)
+	lea	24(dp), dp
+	adox(	(up), w2)
+	lea	-40(up), up
+	adcx(	w3, w0)
+	jmp	L(b4x)
+
+L(f5):	mulx(	(dp), w0, w1)
+	mulx(	8,(dp), w2, w3)
+	lea	32(dp), dp
+	adcx(	w1, w2)
+	adox(	(up), w0)
+	lea	-32(up), up
+	jmp	L(b5x)
+
+L(f6):	mulx(	(dp), w2, w3)
+	mulx(	8,(dp), w0, w1)
+	lea	40(dp), dp
+	adox(	(up), w2)
+	lea	-24(up), up
+	adcx(	w3, w0)
+	jmp	L(b6x)
+
+L(f7):	mulx(	(dp), w0, w1)
+	mulx(	8,(dp), w2, w3)
+	lea	48(dp), dp
+	adcx(	w1, w2)
+	adox(	(up), w0)
+	lea	-16(up), up
+	jmp	L(b7x)
+
+L(f1):	mulx(	(dp), w0, w1)
+	mulx(	8,(dp), w2, w3)
+	adox(	(up), w0)
+	lea	-1(n), n
+	jmp	L(b1x)
+
+L(f2):	mulx(	(dp), w2, w3)
+	mulx(	8,(dp), w0, w1)
+	lea	8(dp), dp
+	adox(	(up), w2)
+	lea	8(up), up
+	adcx(	w3, w0)
+	jmp	L(b2x)
+
+L(end):	adox(	(up), w0)
+	adox(	%rcx, w1)		C relies on rcx = 0
+	mov	w0, (up)
+	adc	%rcx, w1		C relies on rcx = 0
+	mov	8(up,n_neg), q		C Compute next quotient early...
+	mulx(	dinv, q, %r12)		C ...(unused in last iteration)
+	bt	$0, R32(%r13)
+	adc	w1, 8(up)
+	setc	R8(%r13)
+	dec	un			C clear OF as side-effect
+	jz	L(done)
+
+	lea	(dp,n_neg), dp		C reset dp to D[]'s beginning
+	lea	8(up,n_neg), up		C point up to U[]'s current beginning
+L(outer):
+	mov	n_save, n
+	test	%eax, %eax		C clear CF and OF
+	jmp	*jaddr
+
+	ALIGN(16)
+L(top):	adox(	-8,(up), w2)
+	adcx(	w3, w0)
+	mov	w2, -8(up)
+	jrcxz	L(end)
+L(b2x):	mulx(	8,(dp), w2, w3)
+	adox(	(up), w0)
+	lea	-1(n), n
+	mov	w0, (up)
+L(b1x):	adcx(	w1, w2)
+	mulx(	16,(dp), w0, w1)
+	adcx(	w3, w0)
+	adox(	8,(up), w2)
+	mov	w2, 8(up)
+L(b0x):	mulx(	24,(dp), w2, w3)
+	lea	64(dp), dp
+	adcx(	w1, w2)
+	adox(	16,(up), w0)
+	mov	w0, 16(up)
+L(b7x):	mulx(	-32,(dp), w0, w1)
+	adox(	24,(up), w2)
+	adcx(	w3, w0)
+	mov	w2, 24(up)
+L(b6x):	mulx(	-24,(dp), w2, w3)
+	adcx(	w1, w2)
+	adox(	32,(up), w0)
+	mov	w0, 32(up)
+L(b5x):	mulx(	-16,(dp), w0, w1)
+	adox(	40,(up), w2)
+	adcx(	w3, w0)
+	mov	w2, 40(up)
+L(b4x):	adox(	48,(up), w0)
+	mulx(	-8,(dp), w2, w3)
+	mov	w0, 48(up)
+L(b3x):	lea	64(up), up
+	adcx(	w1, w2)
+	mulx(	(dp), w0, w1)
+	jmp	L(top)
+
+L(done):mov	%r13, %rax
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	FUNC_EXIT()
+	ret
+
+L(sma):
+ifdef(`PIC',
+`	movslq	28(%r10,dn_param,4), %rax
+	lea	(%rax,%r10), jaddr
+',`
+	mov	56(%r10,dn_param,8), jaddr
+')
+	jmp	*jaddr
+
+L(1):	mov	(dp_param), %r10
+	xor	R32(%rax), R32(%rax)
+	mov	(up), %rdx
+	dec	un
+	mov	%rdx, %r9
+L(o1):	mulx(	dinv, %rdx, %r11)	C next quotient
+	lea	8(up), up
+	mulx(	%r10, %rcx, %rdx)	C 0 1
+	add	%r9, %rcx		C 0
+	adc	%rax, %rdx		C 1
+	add	(up), %rdx		C 1
+	setc	R8(%rax)		C 2
+	mov	%rdx, %r9		C 1
+	dec	un
+	jnz	L(o1)
+	mov	%r9, (up)
+
+	FUNC_EXIT()
+	ret


More information about the gmp-commit mailing list