[Gmp-commit] /var/hg/gmp: Replace grabber with bt1 optimised code.

Sun Sep 1 00:21:16 UTC 2019

details:   /var/hg/gmp/rev/228585220bca
changeset: 17857:228585220bca
user:      Torbjorn Granlund <tg at gmplib.org>
date:      Sun Sep 01 02:13:52 2019 +0200
description:
Replace grabber with bt1 optimised code.

diffstat:

 mpn/x86_64/bt1/gcd_11.asm |  89 ++++++++++++++++++++++++++++++++++++++++++++--
 1 files changed, 85 insertions(+), 4 deletions(-)

diffs (104 lines):

diff -r 01aceb2081ea -r 228585220bca mpn/x86_64/bt1/gcd_11.asm

--- a/mpn/x86_64/bt1/gcd_11.asm	Fri Aug 30 23:53:19 2019 +0200
+++ b/mpn/x86_64/bt1/gcd_11.asm	Sun Sep 01 02:13:52 2019 +0200
@@ -1,6 +1,10 @@
-dnl  AMD64 mpn_gcd_11.
+dnl  AMD64 mpn_gcd_11 -- 1 x 1 gcd.
 
-dnl  Copyright 2012 Free Software Foundation, Inc.
+dnl  Based on the K7 gcd_1.asm, by Kevin Ryde.  Rehacked for AMD64 by Torbjorn
+dnl  Granlund.
+
+dnl  Copyright 2000-2002, 2005, 2009, 2011, 2012, 2017 Free Software
+dnl  Foundation, Inc.
 
 dnl  This file is part of the GNU MP Library.
 dnl
@@ -30,8 +34,85 @@
 
 include(`../config.m4')
 
+
+C	     cycles/bit
+C AMD K8,K9	 ?
+C AMD K10	 ?
+C AMD bd1	 ?
+C AMD bd2	 ?
+C AMD bd3	 ?
+C AMD bd4	 ?
+C AMD bt1	 5.4
+C AMD bt2	 ?
+C AMD zn1	 ?
+C AMD zn2	 ?
+C Intel P4	 ?
+C Intel CNR	 ?
+C Intel PNR	 ?
+C Intel NHM	 ?
+C Intel WSM	 ?
+C Intel SBR	 ?
+C Intel IBR	 ?
+C Intel HWL	 ?
+C Intel BWL	 ?
+C Intel SKL	 ?
+C Intel atom	 ?
+C Intel SLM	 ?
+C Intel GLM	 ?
+C Intel GLM+	 ?
+C VIA nano	 ?
+
+
+C ctz_table[n] is the number of trailing zeros in n, or MAXSHIFT if n==0.
+
+deflit(MAXSHIFT, 8)		C number of low bits used to index ctz_table
+deflit(MASK, eval((m4_lshift(1,MAXSHIFT))-1))	C index mask, entries 1..MASK are generated below
+
+DEF_OBJECT(ctz_table,64)
+	.byte	MAXSHIFT	C entry for n==0
+forloop(i,1,MASK,
+`	.byte	m4_count_trailing_zeros(i)
+')
+END_OBJECT(ctz_table)
+
+define(`u0',    `%rdi')	C operand u, replaced by |u - v| each iteration
+define(`v0',    `%rsi')	C operand v, replaced by min(u,v) each iteration
+
+define(`cnt',   `%rcx')	C shift count, its low byte is the shr count operand
+define(`s0',    `%rax')	C copy of u taken at the loop top
+define(`t0',    `%rdx')	C v - u, later masked into the ctz_table index
+
 ABI_SUPPORT(DOS64)
 ABI_SUPPORT(STD64)
 
-MULFUNC_PROLOGUE(mpn_gcd_11)
-include_mpn(`x86_64/gcd_11.asm')
+ASM_START()
+	TEXT
+	ALIGN(64)
+PROLOGUE(mpn_gcd_11)			C in: u0, v0 (NOTE(review): loop appears to assume both odd -- confirm); out: rax
+	FUNC_ENTRY(2)
+	LEA(	ctz_table, %r10)	C r10 = base address of ctz_table
+	mov	v0, t0
+	sub	u0, t0			C t0 = v - u
+	jz	L(end)			C u == v, so v already is the result
+
+	ALIGN(16)
+L(top):	mov	u0, s0			C keep the old u for the v update
+	sub	v0, u0			C u - v, carry set iff u < v
+	cmovc	t0, u0		C u = |u - v|
+	cmovc	s0, v0		C v = min(u,v)
+	and	$MASK, R32(t0)		C low MAXSHIFT bits of v - u, ZF set if all are zero
+	movzbl	(%r10,t0), R32(cnt)	C table lookup, mov leaves ZF from the and intact
+	jz	L(count_better)		C masked bits all zero, table count not usable
+L(shr):	shr	R8(cnt), u0		C strip the trailing zeros from u
+	mov	v0, t0
+	sub	u0, t0			C t0 = v - u, zero once u == v
+	jnz	L(top)
+
+L(end):	mov	v0, %rax		C rdx = 0 for the benefit of internal gcd_22 call
+	FUNC_EXIT()			C pairs with FUNC_ENTRY(2) above, must precede ret
+	ret
+
+L(count_better):
+	bsf	u0, cnt			C count over the whole of u = |u - v| instead
+	jmp	L(shr)
+EPILOGUE()