[Gmp-commit] /var/hg/gmp: 2 new changesets

Thu Aug 8 14:33:10 UTC 2019

details:   /var/hg/gmp/rev/72f92e32a0d2
changeset: 17796:72f92e32a0d2
user:      Torbjorn Granlund <tg at gmplib.org>
date:      Thu Aug 08 14:32:57 2019 +0200
description:
Make arm cpu types match what config.guess returns.

details:   /var/hg/gmp/rev/b252c7e4f9b6
changeset: 17797:b252c7e4f9b6
user:      Torbjorn Granlund <tg at gmplib.org>
date:      Thu Aug 08 16:29:36 2019 +0200
description:
Provide generic x86 gcd_11.

diffstat:

 config.sub         |    2 +-
 mpn/x86/gcd_11.asm |  126 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 127 insertions(+), 1 deletions(-)

diffs (142 lines):

diff -r bc1bba5df9bf -r b252c7e4f9b6 config.sub

--- a/config.sub	Thu Aug 08 12:15:59 2019 +0200
+++ b/config.sub	Thu Aug 08 16:29:36 2019 +0200
@@ -130,7 +130,7 @@
 armsa1 | armxscale | arm9tdmi | arm9te |				\
 arm10* | arm11mpcore | armsa1 | arm1136 | arm1156 | arm1176 |		\
 armcortex[arm][0-9] | armcortex[arm][0-9][0-9] |			\
-arm*neon | xgene1 | exynosm1 | thunderx)
+arm*neon | armxgene1 | armexynosm1 | armthunderx)
   test_cpu="arm";;
 
 *)
diff -r bc1bba5df9bf -r b252c7e4f9b6 mpn/x86/gcd_11.asm
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/x86/gcd_11.asm	Thu Aug 08 16:29:36 2019 +0200
@@ -0,0 +1,126 @@
+dnl  x86 mpn_gcd_11 optimised for processors with slow BSF.
+
+dnl  Based on C version.
+
+dnl  Copyright 2019 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+dnl  Rudimentary code for x86-32, i.e. for CPUs without cmov.  Also, the bsf
+dnl  instruction is assumed to be so slow it is useless.  Instead a teble is
+dnl  used.
+dnl  
+dnl  The loop benefits from OoO, in-order CPUs might want a different loop.
+dnl  The ebx and ecx registers could be combined if the assigment of ecx were
+dnl  postponed until ebx died, but that would at least hurt in-order CPUs.
+
+C	     cycles/bit (approx)
+C AMD K7	 ?
+C AMD K8,K9	 ?
+C AMD K10	 ?
+C AMD bd1	 ?
+C AMD bd2	 ?
+C AMD bd3	 ?
+C AMD bd4	 ?
+C AMD bt1	 ?
+C AMD bt2	 ?
+C AMD zn1	 ?
+C AMD zn2	 ?
+C Intel P4-2	 ?
+C Intel P4-3/4	 ?
+C Intel P6/13	 ?
+C Intel CNR	 ?
+C Intel NHM	 ?
+C Intel SBR	 ?
+C Intel IBR	 ?
+C Intel HWL	 ?
+C Intel BWL	 ?
+C Intel SKL	 ?
+C Intel atom	 ?
+C Intel SLM	 ?
+C Intel GLM	 ?
+C Intel GLM+	 ?
+C VIA nano	 ?
+C Numbers measured with: speed -CD -s8-32 -t24 mpn_gcd_1
+
+deflit(MAXSHIFT, 6)
+deflit(MASK, eval((m4_lshift(1,MAXSHIFT))-1))
+
+DEF_OBJECT(ctz_table,64)
+	.byte	MAXSHIFT
+forloop(i,1,MASK,
+`	.byte	m4_count_trailing_zeros(i)
+')
+END_OBJECT(ctz_table)
+
+define(`u0',    `%eax')
+define(`v0',    `%edx')
+
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_gcd_11)
+	push	%edi
+	push	%esi
+	push	%ebx
+
+	mov	16(%esp), u0
+	mov	20(%esp), v0
+	LEAL(	ctz_table, %esi)
+	sub	v0, u0			C u = u - v		0
+	jz	L(end)
+
+	ALIGN(16)
+L(top):	sbb	%ebx, %ebx		C mask			1
+	mov	u0, %edi		C			1
+	mov	u0, %ecx		C			1
+	and	%ebx, %edi		C			2
+	xor	%ebx, u0		C			2
+	add	%edi, v0		C v = min(u.v)		3
+	sub	%ebx, u0		C u = |u - v|		3
+L(mid):	and	$MASK, %ecx		C			2
+	movzbl	(%esi,%ecx), %ecx	C			3
+	jz	L(shift_alot)
+	shr	%cl, u0			C			4
+	sub	v0, u0			C u = u - v		0,5
+	jnz	L(top)
+
+L(end):	mov	v0, %eax
+	pop	%ebx
+	pop	%esi
+	pop	%edi
+	ret
+
+L(shift_alot):
+	shr	$MAXSHIFT, u0
+	mov	u0, %ecx
+	jmp	L(mid)
+EPILOGUE()
+ASM_END()