changeset 17981:f5fc3e7389b4

Remove all gcd_1.asm files.
author Torbjorn Granlund <tg@gmplib.org>
date Fri, 29 Nov 2019 00:53:11 +0100
parents 69d225a203b9
children 5f237941addc
files mpn/alpha/ev67/gcd_1.asm mpn/arm/v5/gcd_1.asm mpn/arm/v6t2/gcd_1.asm mpn/arm64/gcd_1.asm mpn/ia64/gcd_1.asm mpn/powerpc64/mode64/gcd_1.asm mpn/powerpc64/mode64/p7/gcd_1.asm mpn/powerpc64/mode64/p9/gcd_1.asm mpn/sparc64/gcd_1.asm mpn/x86/k6/gcd_1.asm mpn/x86/k7/gcd_1.asm mpn/x86/p6/gcd_1.asm mpn/x86_64/bd1/gcd_1.asm mpn/x86_64/bd2/gcd_1.asm mpn/x86_64/bt2/gcd_1.asm mpn/x86_64/core2/gcd_1.asm mpn/x86_64/gcd_1.asm mpn/x86_64/k10/gcd_1.asm mpn/x86_64/nano/gcd_1.asm mpn/x86_64/zen/gcd_1.asm
diffstat 20 files changed, 0 insertions(+), 2603 deletions(-) [+]
line wrap: on
line diff
--- a/mpn/alpha/ev67/gcd_1.asm	Sun Nov 24 23:02:33 2019 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,145 +0,0 @@
-dnl  Alpha ev67 mpn_gcd_1 -- Nx1 greatest common divisor.
-
-dnl  Copyright 2003, 2004 Free Software Foundation, Inc.
-
-dnl  This file is part of the GNU MP Library.
-dnl
-dnl  The GNU MP Library is free software; you can redistribute it and/or modify
-dnl  it under the terms of either:
-dnl
-dnl    * the GNU Lesser General Public License as published by the Free
-dnl      Software Foundation; either version 3 of the License, or (at your
-dnl      option) any later version.
-dnl
-dnl  or
-dnl
-dnl    * the GNU General Public License as published by the Free Software
-dnl      Foundation; either version 2 of the License, or (at your option) any
-dnl      later version.
-dnl
-dnl  or both in parallel, as here.
-dnl
-dnl  The GNU MP Library is distributed in the hope that it will be useful, but
-dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-dnl  for more details.
-dnl
-dnl  You should have received copies of the GNU General Public License and the
-dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
-dnl  see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C ev67: 3.4 cycles/bitpair for 1x1 part
-
-
-C mp_limb_t mpn_gcd_1 (mp_srcptr xp, mp_size_t xsize, mp_limb_t y);
-C
-C In the 1x1 part, the algorithm is to change x,y to abs(x-y),min(x,y) and
-C strip trailing zeros from abs(x-y) to maintain x and y both odd.
-C
-C The trailing zeros are calculated from just x-y, since in twos-complement
-C there's the same number of trailing zeros on d or -d.  This means the cttz
-C runs in parallel with abs(x-y).
-C
-C The loop takes 5 cycles, which at 0.68 iterations per bit for two N-bit
-C operands with this algorithm gives the measured 3.4 cycles/bitpair.
-C
-C The slottings shown are for SVR4 style systems, Unicos differs in the
-C initial gp setup and the LEA.
-C
-C Enhancement:
-C
-C On the jsr, !lituse_jsr! (when available) would allow the linker to relax
-C it to a bsr, but probably only in a static binary.  Plain "jsr foo" gives
-C the right object code for relaxation, and ought to be available
-C everywhere, but we prefer to schedule the GOT ldq (LEA) back earlier, for
-C the usual case of running in a shared library.
-C
-C bsr could perhaps be used explicitly anyway.  We should be able to assume
-C modexact is in the same module as us (ie. shared library or mainline).
-C Would there be any worries about the size of the displacement?  Could
-C always put modexact and gcd_1 in the same .o to be certain.
-
-ASM_START()
-PROLOGUE(mpn_gcd_1, gp)
-
-	C r16	xp
-	C r17	size
-	C r18	y
-
-	C ldah				C l
-	C lda				C u
-
-	ldq	r0, 0(r16)		C L   x = xp[0]
-	lda	r30, -32(r30)		C u   alloc stack
-
-	LEA(  r27, mpn_modexact_1c_odd)	C L   modexact addr, ldq (gp)
-	stq	r10, 16(r30)		C L   save r10
-	cttz	r18, r10		C U0  y twos
-	cmpeq	r17, 1, r5		C u   test size==1
-
-	stq	r9, 8(r30)		C L   save r9
-	clr	r19			C u   zero c for modexact
-	unop				C     filler, presumably for slotting
-	unop				C     filler, presumably for slotting
-
-	cttz	r0, r6			C U0  x twos
-	stq	r26, 0(r30)		C L   save ra
-
-	srl	r18, r10, r18		C U   y odd
-
-	mov	r18, r9			C l   hold y across call
-
-	cmpult	r6, r10, r2		C u   test x_twos < y_twos
-
-	cmovne	r2, r6, r10		C l   common_twos = min(x_twos,y_twos)
-	bne	r5, L(one)		C U   no modexact if size==1
-	jsr	r26, (r27), mpn_modexact_1c_odd   C L0  call modexact, r0 = new x
-
-	LDGP(	r29, 0(r26))		C u,l ldah,lda
-	cttz	r0, r6			C U0  new x twos
-	ldq	r26, 0(r30)		C L   restore ra
-
-L(one):
-	mov	r9, r1			C u   y
-	ldq	r9, 8(r30)		C L   restore r9
-	mov	r10, r2			C u   common twos
-	ldq	r10, 16(r30)		C L   restore r10
-
-	lda	r30, 32(r30)		C l   free stack
-	beq	r0, L(done)		C U   return y if x%y==0
-
-	srl	r0, r6, r0		C U   x odd
-	unop				C     filler ahead of the aligned loop
-
-	ALIGN(16)
-L(top):
-	C r0	x
-	C r1	y
-	C r2	common twos, for use at end
-
-	subq	r0, r1, r7		C l0  d = x - y
-	cmpult	r0, r1, r16		C u0  r16 = 1 if x < y, else 0
-
-	subq	r1, r0, r4		C l0  new_x = y - x
-	cttz	r7, r8			C U0  d twos
-
-	cmoveq	r16, r7, r4		C l0  new_x = d if x>=y
-	cmovne	r16, r0, r1		C u0  y = x if x<y
-	unop				C l   \ force cmoveq into l0
-	unop				C u   /
-
-	C				C cmoveq2 L0, cmovne2 U0
-
-	srl	r4, r8, r0		C U0  x = new_x >> twos
-	bne	r7, L(top)		C U1  stop when d==0
-
-
-L(done):
-	sll	r1, r2, r0		C U0  return y << common_twos
-	ret	r31, (r26), 1		C L0
-
-EPILOGUE()
-ASM_END()
--- a/mpn/arm/v5/gcd_1.asm	Sun Nov 24 23:02:33 2019 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,123 +0,0 @@
-dnl  ARM v5 mpn_gcd_1.
-
-dnl  Based on the K7 gcd_1.asm, by Kevin Ryde.  Rehacked for ARM by Torbjörn
-dnl  Granlund.
-
-dnl  Copyright 2000-2002, 2005, 2009, 2011, 2012 Free Software Foundation, Inc.
-
-dnl  This file is part of the GNU MP Library.
-dnl
-dnl  The GNU MP Library is free software; you can redistribute it and/or modify
-dnl  it under the terms of either:
-dnl
-dnl    * the GNU Lesser General Public License as published by the Free
-dnl      Software Foundation; either version 3 of the License, or (at your
-dnl      option) any later version.
-dnl
-dnl  or
-dnl
-dnl    * the GNU General Public License as published by the Free Software
-dnl      Foundation; either version 2 of the License, or (at your option) any
-dnl      later version.
-dnl
-dnl  or both in parallel, as here.
-dnl
-dnl  The GNU MP Library is distributed in the hope that it will be useful, but
-dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-dnl  for more details.
-dnl
-dnl  You should have received copies of the GNU General Public License and the
-dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
-dnl  see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C	     cycles/bit (approx)
-C StrongARM	 -
-C XScale	 ?
-C Cortex-A5	 6.45
-C Cortex-A7	 6.41
-C Cortex-A8	 5.0
-C Cortex-A9	 5.9
-C Cortex-A15	 4.40
-C Cortex-A17	 5.68
-C Cortex-A53	 4.37
-C Numbers measured with: speed -CD -s8-32 -t24 mpn_gcd_1
-
-C TODO
-C  * Optimise inner-loop better.
-
-C Threshold of when to call bmod when U is one limb.  Should be about
-C (time_in_cycles(bmod_1,1) + call_overhead) / (cycles/bit).
-define(`BMOD_THRES_LOG2', 6)
-
-C INPUT PARAMETERS
-define(`up',    `r0')
-define(`n',     `r1')
-define(`v0',    `r2')
-
-ifdef(`BMOD_1_TO_MOD_1_THRESHOLD',,
-  `define(`BMOD_1_TO_MOD_1_THRESHOLD',0xffffffff)')
-
-ASM_START()
-	TEXT
-	ALIGN(16)
-PROLOGUE(mpn_gcd_1)
-	push	{r4, r7, lr}	C save r4, r7 and the return address
-	ldr	r3, [up]	C U low limb
-
-	orr	r3, r3, v0	C r3 = u0 | v0
-	rsb	r4, r3, #0	C r4 = -(u0 | v0)
-	and	r4, r4, r3	C isolate lowest set bit of u0 | v0
-	clz	r4, r4		C min(ctz(u0),ctz(v0))
-	rsb	r4, r4, #31	C r4 = 31 - clz, the common twos count
-
-	rsb	r12, v0, #0	C r12 = -v0
-	and	r12, r12, v0	C isolate lowest set bit of v0
-	clz	r12, r12
-	rsb	r12, r12, #31	C r12 = ctz(v0)
-	mov	v0, v0, lsr r12	C strip trailing zeros from v0
-
-	mov	r7, v0		C r7 = y, the stripped v0 used by the loop
-
-	cmp	n, #1
-	bne	L(nby1)		C n != 1, go pick a reduction routine
-
-C Both U and V are single limbs, reduce with bmod if u0 >> v0.
-	ldr	r3, [up]	C reload u0, r3 was overwritten above
-	cmp	v0, r3, lsr #BMOD_THRES_LOG2
-	bhi	L(red1)		C v0 > u0 >> BMOD_THRES_LOG2, skip the call
-
-L(bmod):mov	r3, #0		C carry argument
-	bl	mpn_modexact_1c_odd
-	b	L(red0)
-
-L(nby1):cmp	n, #BMOD_1_TO_MOD_1_THRESHOLD
-	blo	L(bmod)		C n below threshold, use modexact
-
-	bl	mpn_mod_1
-
-L(red0):mov	r3, r0		C r3 = x, the value returned by the call
-L(red1):rsbs	r12, r3, #0	C r12 = -x, Z flag set iff x == 0
-	and	r12, r12, r3	C isolate lowest set bit of x
-	clz	r12, r12
-	rsb	r12, r12, #31	C r12 = ctz(x), flags untouched
-	bne	L(mid)		C x != 0, enter the loop
-	b	L(end)		C x == 0, result comes from y alone
-
-	ALIGN(8)
-L(top):	rsb	r12, r12, #31	C r12 = 31 - clz, the shift count
-	movcc	r3, r1		C if x-y < 0
-	movcc	r7, r0		C use x,y-x
-L(mid):	mov	r3, r3, lsr r12	C strip trailing zeros from x
-	mov	r0, r3		C r0 = x
-	sub	r1, r7, r3	C r1 = y - x
-	rsbs	r3, r7, r3	C r3 = x - y, sets flags
-	and	r12, r1, r3	C lowest set bit of the difference
-	clz	r12, r12	C r12 = clz of that bit
-	bne	L(top)		C loop until x == y
-
-L(end):	mov	r0, r7, lsl r4	C return y << common twos
-	pop	{r4, r7, pc}	C restore r4, r7 and return
-EPILOGUE()
--- a/mpn/arm/v6t2/gcd_1.asm	Sun Nov 24 23:02:33 2019 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,118 +0,0 @@
-dnl  ARM v6t2 mpn_gcd_1.
-
-dnl  Based on the K7 gcd_1.asm, by Kevin Ryde.  Rehacked for ARM by Torbjörn
-dnl  Granlund.
-
-dnl  Copyright 2000-2002, 2005, 2009, 2011, 2012 Free Software Foundation, Inc.
-
-dnl  This file is part of the GNU MP Library.
-dnl
-dnl  The GNU MP Library is free software; you can redistribute it and/or modify
-dnl  it under the terms of either:
-dnl
-dnl    * the GNU Lesser General Public License as published by the Free
-dnl      Software Foundation; either version 3 of the License, or (at your
-dnl      option) any later version.
-dnl
-dnl  or
-dnl
-dnl    * the GNU General Public License as published by the Free Software
-dnl      Foundation; either version 2 of the License, or (at your option) any
-dnl      later version.
-dnl
-dnl  or both in parallel, as here.
-dnl
-dnl  The GNU MP Library is distributed in the hope that it will be useful, but
-dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-dnl  for more details.
-dnl
-dnl  You should have received copies of the GNU General Public License and the
-dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
-dnl  see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C	     cycles/bit (approx)
-C StrongARM	 -
-C XScale	 -
-C Cortex-A5	 5.75
-C Cortex-A7	 6.38
-C Cortex-A8	 5.0
-C Cortex-A9	 5.3
-C Cortex-A15	 2.92
-C Cortex-A17	 5.63
-C Cortex-A53	 4.25
-C Numbers measured with: speed -CD -s8-32 -t24 mpn_gcd_1
-
-C TODO
-C  * Optimise inner-loop better.
-C  * Push saving/restoring of callee-user regs into call code
-
-C Threshold of when to call bmod when U is one limb.  Should be about
-C (time_in_cycles(bmod_1,1) + call_overhead) / (cycles/bit).
-define(`BMOD_THRES_LOG2', 7)
-
-C INPUT PARAMETERS
-define(`up',    `r0')
-define(`n',     `r1')
-define(`v0',    `r2')
-
-ifdef(`BMOD_1_TO_MOD_1_THRESHOLD',,
-  `define(`BMOD_1_TO_MOD_1_THRESHOLD',0xffffffff)')
-
-ASM_START()
-	TEXT
-	ALIGN(16)
-PROLOGUE(mpn_gcd_1)
-	push	{r4, r7, lr}	C save r4, r7 and the return address
-	ldr	r3, [up]	C U low limb
-
-	orr	r3, r3, v0	C r3 = u0 | v0
-	rbit	r4, r3		C bit-reverse, so clz counts trailing zeros
-	clz	r4, r4		C min(ctz(u0),ctz(v0))
-
-	rbit	r12, v0		C bit-reverse v0
-	clz	r12, r12	C r12 = ctz(v0)
-	mov	v0, v0, lsr r12	C strip trailing zeros from v0
-
-	mov	r7, v0		C r7 = y, the stripped v0 used by the loop
-
-	cmp	n, #1
-	bne	L(nby1)		C n != 1, go pick a reduction routine
-
-C Both U and V are single limbs, reduce with bmod if u0 >> v0.
-	ldr	r3, [up]	C reload u0, r3 was overwritten above
-	cmp	v0, r3, lsr #BMOD_THRES_LOG2
-	bhi	L(red1)		C v0 > u0 >> BMOD_THRES_LOG2, skip the call
-
-L(bmod):mov	r3, #0		C carry argument
-	bl	mpn_modexact_1c_odd
-	b	L(red0)
-
-L(nby1):cmp	n, #BMOD_1_TO_MOD_1_THRESHOLD
-	blo	L(bmod)		C n below threshold, use modexact
-
-	bl	mpn_mod_1
-
-L(red0):mov	r3, r0		C r3 = x, the value returned by the call
-L(red1):cmp	r3, #0		C Z flag set iff x == 0
-	rbit	r12, r3		C bit-reverse x
-	clz	r12, r12	C r12 = ctz(x), flags untouched
-	bne	L(mid)		C x != 0, enter the loop
-	b	L(end)		C x == 0, result comes from y alone
-
-	ALIGN(8)
-L(top):	movcs	r3, r1		C if x-y < 0
-	movcs	r7, r0		C use x,y-x
-L(mid):	mov	r3, r3, lsr r12	C strip trailing zeros from x
-	mov	r0, r3		C r0 = x
-	subs	r1, r7, r3	C r1 = y - x, sets flags
-	rsb	r3, r7, r3	C r3 = x - y
-	rbit	r12, r1		C bit-reverse y - x
-	clz	r12, r12	C r12 = ctz(y - x)
-	bne	L(top)		C loop until x == y
-
-L(end):	mov	r0, r7, lsl r4	C return y << common twos
-	pop	{r4, r7, pc}	C restore r4, r7 and return
-EPILOGUE()
--- a/mpn/arm64/gcd_1.asm	Sun Nov 24 23:02:33 2019 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,125 +0,0 @@
-dnl  ARM v8a mpn_gcd_1.
-
-dnl  Based on the K7 gcd_1.asm, by Kevin Ryde.  Rehacked for ARM by Torbjorn
-dnl  Granlund.
-
-dnl  Copyright 2000-2002, 2005, 2009, 2011-2013 Free Software Foundation, Inc.
-
-dnl  This file is part of the GNU MP Library.
-dnl
-dnl  The GNU MP Library is free software; you can redistribute it and/or modify
-dnl  it under the terms of either:
-dnl
-dnl    * the GNU Lesser General Public License as published by the Free
-dnl      Software Foundation; either version 3 of the License, or (at your
-dnl      option) any later version.
-dnl
-dnl  or
-dnl
-dnl    * the GNU General Public License as published by the Free Software
-dnl      Foundation; either version 2 of the License, or (at your option) any
-dnl      later version.
-dnl
-dnl  or both in parallel, as here.
-dnl
-dnl  The GNU MP Library is distributed in the hope that it will be useful, but
-dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-dnl  for more details.
-dnl
-dnl  You should have received copies of the GNU General Public License and the
-dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
-dnl  see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-changecom(blah)
-
-C	     cycles/bit (approx)
-C Cortex-A53	 ?
-C Cortex-A57	 ?
-
-C TODO
-C  * Optimise inner-loop better.
-C  * Push saving/restoring of callee-user regs into call code
-
-C Threshold of when to call bmod when U is one limb.  Should be about
-C (time_in_cycles(bmod_1,1) + call_overhead) / (cycles/bit).
-define(`BMOD_THRES_LOG2', 7)
-
-C INPUT PARAMETERS
-define(`up',    `x0')
-define(`n',     `x1')
-define(`v0',    `x2')
-
-ifdef(`BMOD_1_TO_MOD_1_THRESHOLD',,
-  `define(`BMOD_1_TO_MOD_1_THRESHOLD',30)')
-
-ASM_START()
-	TEXT
-	ALIGN(16)
-PROLOGUE(mpn_gcd_1)
-	stp	x29, x30, [sp,#-32]!	C push x29, x30 and reserve 32 bytes
-	ldr	x3, [up]		C U low limb
-	stp     x19, x20, [sp,#16]	C save x19, x20 in the same frame
-
-	orr	x3, x3, v0		C x3 = u0 | v0
-	rbit	x4, x3			C bit-reverse, so clz counts trailing zeros
-	clz	x20, x4			C min(ctz(u0),ctz(v0))
-
-	rbit	x12, v0			C bit-reverse v0
-	clz	x12, x12		C x12 = ctz(v0)
-	lsr	v0, v0, x12		C strip trailing zeros from v0
-
-	mov	x19, v0			C x19 = y, the stripped v0 used by the loop
-
-	cmp	n, #1
-	b.ne	L(nby1)			C n != 1, go pick a reduction routine
-
-C Both U and V are single limbs, reduce with bmod if u0 >> v0.
-	ldr	x3, [up]		C reload u0, x3 was overwritten above
-	cmp	v0, x3, lsr #BMOD_THRES_LOG2
-	b.hi	L(red1)			C v0 > u0 >> BMOD_THRES_LOG2, skip the call
-
-L(bmod):mov	x3, #0			C carry argument
-	bl	mpn_modexact_1c_odd
-	b	L(red0)
-
-L(nby1):cmp	n, #BMOD_1_TO_MOD_1_THRESHOLD
-	b.lo	L(bmod)			C n below threshold, use modexact
-
-	bl	mpn_mod_1
-
-L(red0):mov	x3, x0			C x3 = x, the value returned by the call
-L(red1):cmp	x3, #0			C Z flag set iff x == 0
-	rbit	x12, x3			C bit-reverse x
-	clz	x12, x12		C x12 = ctz(x), flags untouched
-	b.ne	L(mid)			C x != 0, enter the loop
-	b	L(end)			C x == 0, result comes from y alone
-
-	ALIGN(8)
-L(top):
-ifelse(1,1,`
-C This shorter variant makes full use of armv8 insns
-	csneg	x3, x1, x1, cs		C if x-y < 0
-	csel	x19, x4, x19, cs	C use x,y-x
-L(mid):	lsr	x4, x3, x12		C x4 = x with trailing zeros stripped
-	subs	x1, x19, x4		C x1 = y - x, sets flags
-',`
-C This variant is akin to the 32-bit v6t2 code
-	csel	x3, x1, x3, cs		C if x-y < 0
-	csel	x19, x0, x19, cs	C use x,y-x
-L(mid):	lsr	x3, x3, x12		C strip trailing zeros from x
-	mov	x0, x3			C x0 = x
-	subs	x1, x19, x3		C x1 = y - x, sets flags
-	sub	x3, x3, x19		C x3 = x - y
-')
-	rbit	x12, x1			C bit-reverse y - x
-	clz	x12, x12		C x12 = ctz(y - x)
-	b.ne	L(top)			C loop until x == y
-
-L(end):	lsl	x0, x19, x20		C return y << common twos
-	ldp     x19, x20, [sp,#16]	C restore x19, x20
-	ldp	x29, x30, [sp],#32	C restore x29, x30 and release the frame
-	ret
-EPILOGUE()
--- a/mpn/ia64/gcd_1.asm	Sun Nov 24 23:02:33 2019 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,238 +0,0 @@
-dnl  Itanium-2 mpn_gcd_1 -- mpn by 1 gcd.
-
-dnl  Contributed to the GNU project by Kevin Ryde, innerloop by Torbjorn
-dnl  Granlund.
-
-dnl  Copyright 2002-2005, 2012, 2013, 2015 Free Software Foundation, Inc.
-
-dnl  This file is part of the GNU MP Library.
-dnl
-dnl  The GNU MP Library is free software; you can redistribute it and/or modify
-dnl  it under the terms of either:
-dnl
-dnl    * the GNU Lesser General Public License as published by the Free
-dnl      Software Foundation; either version 3 of the License, or (at your
-dnl      option) any later version.
-dnl
-dnl  or
-dnl
-dnl    * the GNU General Public License as published by the Free Software
-dnl      Foundation; either version 2 of the License, or (at your option) any
-dnl      later version.
-dnl
-dnl  or both in parallel, as here.
-dnl
-dnl  The GNU MP Library is distributed in the hope that it will be useful, but
-dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-dnl  for more details.
-dnl
-dnl  You should have received copies of the GNU General Public License and the
-dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
-dnl  see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C           cycles/bitpair (1x1 gcd)
-C Itanium:       ?
-C Itanium 2:     5.1
-
-
-C mpn_gcd_1 (mp_srcptr xp, mp_size_t xsize, mp_limb_t y);
-C
-C The entry sequence is designed to expect xsize>1 and hence a modexact
-C call.  This ought to be more common than a 1x1 operation.  Our critical
-C path is thus stripping factors of 2 from y, calling modexact, then
-C stripping factors of 2 from the x remainder returned.
-C
-C The common factors of 2 between x and y must be determined using the
-C original x, not the remainder from the modexact.  This is done with
-C x_orig which is xp[0].  There's plenty of time to do this while the rest
-C of the modexact etc is happening.
-C
-C It's possible xp[0] is zero.  In this case the trailing zeros calculation
-C popc((x-1)&~x) gives 64, and that's clearly no less than what y will
-C have, making min(x_twos,y_twos) == y_twos.
-C
-C The main loop consists of transforming x,y to abs(x-y),min(x,y), and then
-C stripping factors of 2 from abs(x-y).  Those factors of two are
-C determined from just y-x, without the abs(), since there's the same
-C number of trailing zeros on n or -n in twos complement.  That makes the
-C dependent chain 8 cycles deep.
-C
-C The selection of x-y versus y-x for abs(x-y), and the selection of the
-C minimum of x and y, is done in parallel with the critical path.
-C
-C The algorithm takes about 0.68 iterations per bit (two N bit operands) on
-C average, hence the final 5.8 cycles/bitpair.
-C
-C Not done:
-C
-C An alternate algorithm which didn't strip all twos, but instead applied
-C tbit and predicated extr on x, and then y, was attempted.  The loop was 6
-C cycles, but the algorithm is an average 1.25 iterations per bitpair for a
-C total 7.25 c/bp, which is slower than the current approach.
-C
-C Alternatives:
-C
-C Perhaps we could do something tricky by extracting a few high bits and a
-C few low bits from the operands, and looking up a table which would give a
-C set of predicates to control some shifts or subtracts or whatever.  That
-C could knock off multiple bits per iteration.
-C
-C The right shifts are a bit of a bottleneck (shr at 2 or 3 cycles, or extr
-C only going down I0), perhaps it'd be possible to shift left instead,
-C using add.  That would mean keeping track of the lowest not-yet-zeroed
-C bit, using some sort of mask.
-C
-C TODO:
-C  * Once mod_1_N exists in assembly for Itanium, add conditional calls.
-C  * Call bmod_1 even for n=1 when up[0] >> v0 (like other gcd_1 impls).
-C  * Probably avoid popcnt also outside of loop, instead use ctz_table.
-
-ASM_START()
-	.explicit				C What does this mean?
-
-C HP's assembler requires these declarations for importing mpn_modexact_1c_odd
-	.global	mpn_modexact_1c_odd
-	.type	mpn_modexact_1c_odd,@function
-
-C ctz_table[n] is the number of trailing zeros on n, or MAXSHIFT if n==0.
-
-deflit(MAXSHIFT, 7)
-deflit(MASK, eval((m4_lshift(1,MAXSHIFT))-1))
-
-C	.section	".rodata"
-	.rodata
-	ALIGN(m4_lshift(1,MAXSHIFT))	C align table to allow using dep
-ctz_table:
-	data1	MAXSHIFT
-forloop(i,1,MASK,
-`	data1	m4_count_trailing_zeros(i)
-')
-
-PROLOGUE(mpn_gcd_1)
-
-		C r32	xp
-		C r33	xsize
-		C r34	y
-
-define(x,           r8)
-define(xp_orig,     r32)
-define(xsize,       r33)
-define(y,           r34)  define(inputs, 3)
-define(save_rp,     r35)
-define(save_pfs,    r36)
-define(x_orig,      r37)
-define(x_orig_one,  r38)
-define(y_twos,      r39)  define(locals, 5)
-define(out_xp,      r40)
-define(out_xsize,   r41)
-define(out_divisor, r42)
-define(out_carry,   r43)  define(outputs, 4)
-
-	.prologue
- {.mmi;
-ifdef(`HAVE_ABI_32',
-`		addp4	r9 = 0, xp_orig   define(xp,r9)',	C M0
-`					  define(xp,xp_orig)')
-	.save ar.pfs, save_pfs
-		alloc	save_pfs = ar.pfs, inputs, locals, outputs, 0 C M2
-	.save rp, save_rp
-		mov	save_rp = b0			C I0
-}{.mbb;	.body
-		add	r10 = -1, y			C M3  y-1
-		nop.b	0				C B0
-		nop.b	0				C B1
-	;;
-
-}{.mmi;		ld8	x = [xp]			C M0  x = xp[0] if no modexact
-		ld8	x_orig = [xp]			C M1  orig x for common twos
-		cmp.ne	p6,p0 = 1, xsize		C I0
-}{.mmi;		andcm	y_twos = r10, y			C M2  (y-1)&~y
-		mov	out_xp = xp_orig		C M3
-		mov	out_xsize = xsize		C I1
-	;;
-}{.mmi;		mov	out_carry = 0			C M0
-		nop.m	0				C M1
-		popcnt	y_twos = y_twos			C I0  y twos
-	;;
-}{.mmi;		add	x_orig_one = -1, x_orig		C M0  orig x-1
-		nop.m	0				C M1
-		shr.u	out_divisor = y, y_twos		C I0  y without twos
-}{.mib;		nop.m	0				C M2
-		shr.u	y = y, y_twos			C I1  y without twos
-	(p6)	br.call.sptk.many b0 = mpn_modexact_1c_odd  C if xsize>1
-	;;
-}
-	C modexact can leave x==0
- {.mmi;		cmp.eq	p6,p0 = 0, x			C M0  if {xp,xsize} % y == 0
-		andcm	x_orig = x_orig_one, x_orig	C M1  orig (x-1)&~x
-		add	r9 = -1, x			C I0  x-1
-	;;
-}{.mmi;		andcm	r9 = r9, x			C M0  (x-1)&~x
-		nop.m	0				C M1
-		mov	b0 = save_rp			C I0
-	;;
-}{.mii;		nop.m	0				C M0
-		popcnt	x_orig = x_orig			C I0  orig x twos
-		popcnt	r9 = r9				C I0  x twos
-	;;
-}{.mmi;		cmp.lt	p7,p0 = x_orig, y_twos		C M0  orig x_twos < y_twos
-		addl	r22 = @ltoff(ctz_table), r1	C     linkage slot for ctz_table, loaded below
-		shr.u	x = x, r9			C I0  x odd
-	;;
-}{.mib;
-	(p7)	mov	y_twos = x_orig		C M0  common twos
-		add	r10 = -1, y		C I0  y-1
-	(p6)	br.dpnt.few L(done_y)		C B0  x%y==0 then result y
-	;;
-}
-		mov	r25 = m4_lshift(MASK, MAXSHIFT)	C mask for the next MAXSHIFT bits up
-		ld8	r22 = [r22]			C r22 = address of ctz_table
-		br	L(ent)				C enter the loop at the compare
-	;;
-
-	ALIGN(32)
-L(top):
-	.pred.rel "mutex", p6,p7
- {.mmi;	(p7)	mov	y = x				C x < y, so the new y is x
-	(p6)	sub	x = x, y			C x > y, so x = x - y
-		dep	r21 = r19, r22, 0, MAXSHIFT	C concat(table,lowbits)
-}{.mmi;		and	r20 = MASK, r19			C low MAXSHIFT bits of y - x
-	(p7)	mov	x = r19				C x < y, so x = y - x
-		nop	0
-	;;
-}
-L(mid):
-{.mmb;		ld1	r16 = [r21]			C shift count from ctz_table
-		cmp.eq	p10,p0 = 0, r20			C p10 = tested bits are all zero
-	(p10)	br.spnt.few.clr	 L(shift_alot)		C then drop MAXSHIFT bits and retry
-	;;
-}{.mmi;		nop	0
-		nop	0
-		shr.u	x = x, r16			C strip trailing zeros from x
-	;;
-}
-L(ent):
- {.mmi;		sub	r19 = y, x			C r19 = y - x
-		cmp.gtu	p6,p7 = x, y			C p6 = x > y, p7 = x <= y, unsigned
-		cmp.ne	p8,p0 = x, y			C p8 = x != y
-}{.mmb;		nop	0
-		nop	0
-	(p8)	br.sptk.few.clr L(top)			C loop until x == y
-}
-
-L(done_y):	C result is y
-		mov	ar.pfs = save_pfs	C I0
-		shl	r8 = y, y_twos		C I   common factors of 2
-		br.ret.sptk.many b0
-
-L(shift_alot):
-		and	r20 = x, r25			C next MAXSHIFT bits of x, tested at mid
-		shr.u	x = x, MAXSHIFT			C drop MAXSHIFT zero bits
-	;;
-		dep	r21 = x, r22, 0, MAXSHIFT	C table address for the new low bits
-		br	L(mid)				C look up the remaining shift
-EPILOGUE()
--- a/mpn/powerpc64/mode64/gcd_1.asm	Sun Nov 24 23:02:33 2019 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,125 +0,0 @@
-dnl  PowerPC-64 mpn_gcd_1.
-
-dnl  Copyright 2000-2002, 2005, 2009, 2011-2013 Free Software Foundation, Inc.
-
-dnl  This file is part of the GNU MP Library.
-dnl
-dnl  The GNU MP Library is free software; you can redistribute it and/or modify
-dnl  it under the terms of either:
-dnl
-dnl    * the GNU Lesser General Public License as published by the Free
-dnl      Software Foundation; either version 3 of the License, or (at your
-dnl      option) any later version.
-dnl
-dnl  or
-dnl
-dnl    * the GNU General Public License as published by the Free Software
-dnl      Foundation; either version 2 of the License, or (at your option) any
-dnl      later version.
-dnl
-dnl  or both in parallel, as here.
-dnl
-dnl  The GNU MP Library is distributed in the hope that it will be useful, but
-dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-dnl  for more details.
-dnl
-dnl  You should have received copies of the GNU General Public License and the
-dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
-dnl  see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C		    cycles/bit (approx)
-C POWER3/PPC630		 ?
-C POWER4/PPC970		 8.5
-C POWER5		 ?
-C POWER6		10.1
-C POWER7		 9.4
-C Numbers measured with: speed -CD -s16-64 -t48 mpn_gcd_1
-
-C INPUT PARAMETERS
-define(`up',    `r3')
-define(`n',     `r4')
-define(`v0',    `r5')
-
-ifdef(`BMOD_1_TO_MOD_1_THRESHOLD',,
-  `define(`BMOD_1_TO_MOD_1_THRESHOLD',30)')
-
-EXTERN_FUNC(mpn_mod_1)
-EXTERN_FUNC(mpn_modexact_1c_odd)
-
-ASM_START()
-PROLOGUE(mpn_gcd_1,toc)
-	mflr	r0			C r0 = return address
-	std	r30, -16(r1)		C save r30
-	std	r31, -8(r1)		C save r31
-	std	r0, 16(r1)		C save the return address
-	stdu	r1, -128(r1)		C allocate a 128 byte frame
-
-	ld	r7, 0(up)		C U low limb
-	or	r0, r5, r7		C x | y
-
-	neg	r6, r0			C r6 = -(x | y)
-	and	r6, r6, r0		C isolate lowest set bit of x | y
-	cntlzd	r31, r6			C common twos
-	subfic	r31, r31, 63		C r31 = 63 - clz, the trailing zero count
-
-	neg	r6, r5			C r6 = -v0
-	and	r6, r6, r5		C isolate lowest set bit of v0
-	cntlzd	r8, r6
-	subfic	r8, r8, 63		C r8 = ctz(v0)
-	srd	r5, r5, r8		C strip trailing zeros from v0
-	mr	r30, r5			C v0 saved
-
-	cmpdi	r4, BMOD_1_TO_MOD_1_THRESHOLD
-	blt	L(bmod)			C n below threshold, use modexact
-	CALL(	mpn_mod_1)
-	b	L(reduced)
-L(bmod):
-	li	r6, 0			C zero carry argument for modexact
-	CALL(	mpn_modexact_1c_odd)
-L(reduced):
-
-define(`mask', `r0')dnl
-define(`a1',   `r4')dnl
-define(`a2',   `r5')dnl
-define(`d1',   `r6')dnl
-define(`d2',   `r7')dnl
-define(`cnt',  `r9')dnl
-
-	neg.	r6, r3			C r6 = -a, cr0 records whether a == 0
-	and	r6, r6, r3		C isolate lowest set bit of a
-	cntlzd	cnt, r6
-	subfic	cnt, cnt, 63		C cnt = ctz(a)
-	li	r12, 63			C constant for cnt = 63 - clz in the loop
-	bne	L(mid)			C a != 0, enter the loop
-	b	L(end)			C a == 0, result comes from d alone
-
-	ALIGN(16)
-L(top):
-	and	a1, r10, mask		C d - a
-	andc	a2, r11,  mask		C a - d
-	and	d1, r3, mask		C a
-	andc	d2, r30, mask		C d
-	or	r3, a1, a2		C new a
-	subf	cnt, cnt, r12		C cnt = 63 - clz, the shift count
-	or	r30, d1, d2		C new d
-L(mid):	srd	r3, r3, cnt		C strip trailing zeros from a
-	sub.	r10, r30, r3		C r10 = d - a
-	subc	r11, r3, r30		C r11 = a - d
-	neg	r8, r10			C r8 = -(d - a)
-	and	r8, r8, r10		C isolate lowest set bit of d - a
-	subfe	mask, mask, mask	C mask = -1 if a < d else 0, from subc carry
-	cntlzd	cnt, r8
-	bne	L(top)			C loop until d == a, cr0 set by sub.
-
-L(end):	sld	r3, r30, r31		C return d << common twos
-
-	addi	r1, r1, 128		C release the frame
-	ld	r0, 16(r1)		C reload the return address
-	ld	r30, -16(r1)		C restore r30
-	ld	r31, -8(r1)		C restore r31
-	mtlr	r0
-	blr
-EPILOGUE()
--- a/mpn/powerpc64/mode64/p7/gcd_1.asm	Sun Nov 24 23:02:33 2019 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,110 +0,0 @@
-dnl  PowerPC-64 mpn_gcd_1.
-
-dnl  Copyright 2000-2002, 2005, 2009, 2011-2013 Free Software Foundation, Inc.
-
-dnl  This file is part of the GNU MP Library.
-dnl
-dnl  The GNU MP Library is free software; you can redistribute it and/or modify
-dnl  it under the terms of either:
-dnl
-dnl    * the GNU Lesser General Public License as published by the Free
-dnl      Software Foundation; either version 3 of the License, or (at your
-dnl      option) any later version.
-dnl
-dnl  or
-dnl
-dnl    * the GNU General Public License as published by the Free Software
-dnl      Foundation; either version 2 of the License, or (at your option) any
-dnl      later version.
-dnl
-dnl  or both in parallel, as here.
-dnl
-dnl  The GNU MP Library is distributed in the hope that it will be useful, but
-dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-dnl  for more details.
-dnl
-dnl  You should have received copies of the GNU General Public License and the
-dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
-dnl  see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C		    cycles/bit (approx)
-C POWER3/PPC630		 -
-C POWER4/PPC970		 -
-C POWER5		 -
-C POWER6		 -
-C POWER7		 7.6
-C Numbers measured with: speed -CD -s16-64 -t48 mpn_gcd_1
-
-C INPUT PARAMETERS
-define(`up',    `r3')
-define(`n',     `r4')
-define(`v0',    `r5')
-
-EXTERN_FUNC(mpn_mod_1)
-EXTERN_FUNC(mpn_modexact_1c_odd)
-
-ASM_START()
-PROLOGUE(mpn_gcd_1,toc)
-	mflr	r0			C r0 = return address
-	std	r30, -16(r1)		C save r30
-	std	r31, -8(r1)		C save r31
-	std	r0, 16(r1)		C save the return address
-	stdu	r1, -128(r1)		C allocate a 128 byte frame
-
-	ld	r7, 0(up)		C U low limb
-	or	r0, r5, r7		C x | y
-
-	neg	r6, r0			C r6 = -(x | y)
-	and	r6, r6, r0		C isolate lowest set bit of x | y
-	cntlzd	r31, r6			C common twos
-	subfic	r31, r31, 63		C r31 = 63 - clz, the trailing zero count
-
-	neg	r6, r5			C r6 = -v0
-	and	r6, r6, r5		C isolate lowest set bit of v0
-	cntlzd	r8, r6
-	subfic	r8, r8, 63		C r8 = ctz(v0)
-	srd	r5, r5, r8		C strip trailing zeros from v0
-	mr	r30, r5			C v0 saved
-
-	cmpdi	r4, BMOD_1_TO_MOD_1_THRESHOLD	C NOTE(review): no fallback define in this file - confirm
-	blt	L(bmod)			C n below threshold, use modexact
-	CALL(	mpn_mod_1)
-	b	L(reduced)
-L(bmod):
-	li	r6, 0			C zero carry argument for modexact
-	CALL(	mpn_modexact_1c_odd)
-L(reduced):
-
-define(`cnt',  `r9')dnl
-
-	neg.	r6, r3			C r6 = -x, cr0 records whether x == 0
-	and	r6, r6, r3		C isolate lowest set bit of x
-	cntlzd	cnt, r6			C converted to a shift count at mid
-	li	r12, 63			C constant for cnt = 63 - clz
-	bne	L(mid)			C x != 0, enter the loop
-	b	L(end)			C x == 0, result comes from y alone
-
-	ALIGN(16)
-L(top):	isel	r30, r3, r30, 29	C y = min(x,y)
-	isel	r3, r10, r11, 29	C x = |y - x|
-L(mid):	subf	cnt, cnt, r12		C cnt = 63-cnt
-	srd	r3, r3, cnt		C strip trailing zeros from x
-	subf	r10, r3, r30		C r10 = y - x
-	subf	r11, r30, r3		C r11 = x - y
-	cmpld	cr7, r30, r3		C cr7 = unsigned compare of y with x
-	and	r8, r11, r10		C isolate lsb
-	cntlzd	cnt, r8
-	bne	cr7, L(top)		C loop until x == y
-
-L(end):	sld	r3, r30, r31		C return y << common twos
-
-	addi	r1, r1, 128		C release the frame
-	ld	r0, 16(r1)		C reload the return address
-	ld	r30, -16(r1)		C restore r30
-	ld	r31, -8(r1)		C restore r31
-	mtlr	r0
-	blr
-EPILOGUE()
--- a/mpn/powerpc64/mode64/p9/gcd_1.asm	Sun Nov 24 23:02:33 2019 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,101 +0,0 @@
-dnl  PowerPC-64 mpn_gcd_1.
-
-dnl  Copyright 2000-2002, 2005, 2009, 2011-2013, 2019 Free Software Foundation,
-dnl  Inc.
-
-dnl  This file is part of the GNU MP Library.
-dnl
-dnl  The GNU MP Library is free software; you can redistribute it and/or modify
-dnl  it under the terms of either:
-dnl
-dnl    * the GNU Lesser General Public License as published by the Free
-dnl      Software Foundation; either version 3 of the License, or (at your
-dnl      option) any later version.
-dnl
-dnl  or
-dnl
-dnl    * the GNU General Public License as published by the Free Software
-dnl      Foundation; either version 2 of the License, or (at your option) any
-dnl      later version.
-dnl
-dnl  or both in parallel, as here.
-dnl
-dnl  The GNU MP Library is distributed in the hope that it will be useful, but
-dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-dnl  for more details.
-dnl
-dnl  You should have received copies of the GNU General Public License and the
-dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
-dnl  see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-C		    cycles/bit (approx)
-C POWER3/PPC630		 -
-C POWER4/PPC970		 -
-C POWER5		 -
-C POWER6		 -
-C POWER7		 -
-C POWER8		 -
-C POWER9		 5.75
-C Numbers measured with: speed -CD -s16-64 -t48 mpn_gcd_1
-
-C INPUT PARAMETERS
-define(`up',    `r3')
-define(`n',     `r4')
-define(`v0',    `r5')
-
-EXTERN_FUNC(mpn_mod_1)
-EXTERN_FUNC(mpn_modexact_1c_odd)
-
-ASM_START()
-PROLOGUE(mpn_gcd_1,toc)
-	mflr	r0			C r0 = return address
-	std	r30, -16(r1)		C save r30
-	std	r31, -8(r1)		C save r31
-	std	r0, 16(r1)		C save the return address
-	stdu	r1, -128(r1)		C allocate a 128 byte frame
-
-	ld	r7, 0(up)		C U low limb
-	or	r0, r5, r7		C x | y
-	cnttzd	r31, r0			C common twos
-	cnttzd	r8, r5			C r8 = ctz(v0)
-	srd	r5, r5, r8		C strip trailing zeros from v0
-	mr	r30, r5			C v0 saved
-
-	cmpdi	r4, BMOD_1_TO_MOD_1_THRESHOLD	C NOTE(review): no fallback define in this file - confirm
-	blt	L(bmod)			C n below threshold, use modexact
-	CALL(	mpn_mod_1)
-	b	L(reduced)
-L(bmod):
-	li	r6, 0			C zero carry argument for modexact
-	CALL(	mpn_modexact_1c_odd)
-L(reduced):
-
-define(`cnt',  `r9')dnl
-
-	cmpdi	r3, 0			C cr0 records whether x == 0
-	cnttzd	cnt, r3			C cnt = ctz(x)
-	bne	L(mid)			C x != 0, enter the loop
-	b	L(end)			C x == 0, result comes from y alone
-
-	ALIGN(16)
-L(top):	isel	r30, r3, r30, 29	C y = min(x,y)
-	isel	r3, r10, r11, 29	C x = |y - x|
-L(mid):	srd	r3, r3, cnt		C strip trailing zeros from x
-	subf	r10, r3, r30		C r10 = y - x
-	subf	r11, r30, r3		C r11 = x - y
-	cmpld	cr7, r30, r3		C cr7 = unsigned compare of y with x
-	cnttzd	cnt, r10		C cnt = ctz(y - x)
-	bne	cr7, L(top)		C loop until x == y
-
-L(end):	sld	r3, r30, r31		C return y << common twos
-
-	addi	r1, r1, 128		C release the frame
-	ld	r0, 16(r1)		C reload the return address
-	ld	r30, -16(r1)		C restore r30
-	ld	r31, -8(r1)		C restore r31
-	mtlr	r0
-	blr
-EPILOGUE()
--- a/mpn/sparc64/gcd_1.asm	Sun Nov 24 23:02:33 2019 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,135 +0,0 @@
-dnl  SPARC64 mpn_gcd_1.
-
-dnl  Based on the K7 gcd_1.asm, by Kevin Ryde.  Rehacked for SPARC by Torbjörn
-dnl  Granlund.
-
-dnl  Copyright 2000-2002, 2005, 2009, 2011-2013 Free Software Foundation, Inc.
-
-dnl  This file is part of the GNU MP Library.
-dnl
-dnl  The GNU MP Library is free software; you can redistribute it and/or modify
-dnl  it under the terms of either:
-dnl
-dnl    * the GNU Lesser General Public License as published by the Free
-dnl      Software Foundation; either version 3 of the License, or (at your
-dnl      option) any later version.
-dnl
-dnl  or
-dnl
-dnl    * the GNU General Public License as published by the Free Software
-dnl      Foundation; either version 2 of the License, or (at your option) any
-dnl      later version.
-dnl
-dnl  or both in parallel, as here.
-dnl
-dnl  The GNU MP Library is distributed in the hope that it will be useful, but
-dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-dnl  for more details.
-dnl
-dnl  You should have received copies of the GNU General Public License and the
-dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
-dnl  see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C		  cycles/bit (approx)
-C UltraSPARC 1&2:	 5.1
-C UltraSPARC 3:		 5.0
-C UltraSPARC T1:	11.4
-C UltraSPARC T3:	10
-C UltraSPARC T4:	 6
-C Numbers measured with: speed -CD -s32-64 -t32 mpn_gcd_1
-
-C ctz_table[n] is the number of trailing zeros on n, or MAXSHIFT if n==0.
-
-deflit(MAXSHIFT, 7)
-deflit(MASK, eval((m4_lshift(1,MAXSHIFT))-1))
-
-	RODATA
-	TYPE(ctz_table,object)
-ctz_table:
-	.byte	MAXSHIFT
-forloop(i,1,MASK,
-`	.byte	m4_count_trailing_zeros(i)
-')
-	SIZE(ctz_table,.-ctz_table)
-
-C Threshold of when to call bmod when U is one limb.  Should be about
-C (time_in_cycles(bmod_1,1) + call_overhead) / (cycles/bit).
-define(`BMOD_THRES_LOG2', 14)
-
-C INPUT PARAMETERS
-define(`up',    `%i0')
-define(`n',     `%i1')
-define(`v0',    `%i2')
-
-
-ASM_START()
-	REGISTER(%g2,#scratch)
-	REGISTER(%g3,#scratch)
-PROLOGUE(mpn_gcd_1)
-	save	%sp, -192, %sp		C new register window, 192-byte frame
-	ldx	[up+0], %g1		C U low limb
-	mov	-1, %i4			C common twos count, incremented below
-	or	v0, %g1, %g2		C x | y
-
-L(twos):
-	inc	%i4			C i4 ends as trailing zeros of x | y
-	andcc	%g2, 1, %g0		C low bit set?
-	bz,a	%xcc, L(twos)		C no, shift and count again
-	 srlx	%g2, 1, %g2		C (delay slot, annulled on fall through)
-
-L(divide_strip_y):
-	andcc	v0, 1, %g0		C v0 odd yet?
-	bz,a	%xcc, L(divide_strip_y)
-	 srlx	v0, 1, v0		C (delay slot, annulled on fall through)
-
-	cmp	n, 1			C if n > 1 we need
-	bnz	%xcc, L(bmod)		C to call bmod_1
-	 nop
-
-C Both U and V are single limbs, reduce with bmod if u0 >> v0.
-	srlx	%g1, BMOD_THRES_LOG2, %g2	C u0 >> BMOD_THRES_LOG2
-	cmp	%g2, v0
-	bleu	%xcc, L(noreduce)	C not above v0, skip the call
-	 mov	%g1, %o0		C (delay slot) u = u0
-
-L(bmod):
-	mov	up, %o0			C param 1
-	mov	n, %o1			C param 2
-	mov	v0, %o2			C param 3
-	call	mpn_modexact_1c_odd
-	 mov	0, %o3			C (delay slot) param 4 = 0
-
-L(noreduce):
-
-	LEA64(ctz_table, i5, g4)	C table address, used via %i5 below
-
-	cmp	%o0, 0			C is u zero?
-	bnz	%xcc, L(mid)		C no, run the loop
-	 and	%o0, MASK, %g3		C (delay slot) low MAXSHIFT bits of u
-
-	return	%i7+8			C u == 0, return v0 << common twos
-	 sllx	%o2, %o4, %o0		C CAUTION: v0 alias for o2
-
-	ALIGN(16)
-L(top):	movcc	%xcc, %l4, v0		C v = min(u,v)
-	movcc	%xcc, %l2, %o0		C u = |v - u|
-L(mid):	ldub	[%i5+%g3], %g5		C g5 = ctz_table entry for the low bits
-	brz,a,pn %g3, L(shift_alot)	C low MAXSHIFT bits all zero
-	 srlx	%o0, MAXSHIFT, %o0
-	srlx	%o0, %g5, %l4		C new u, odd
-	subcc	v0, %l4, %l2		C v - u, set flags for branch and movcc
-	sub	%l4, v0, %o0		C u - v
-	bnz,pt	%xcc, L(top)		C loop until u == v
-	 and	%l2, MASK, %g3		C extract low MAXSHIFT bits from (v-u)
-
-	return	%i7+8
-	 sllx	%o2, %o4, %o0		C CAUTION: v0 alias for o2
-
-L(shift_alot):
-	b	L(mid)
-	 and	%o0, MASK, %g3		C (delay slot) low bits of the shifted u
-EPILOGUE()
--- a/mpn/x86/k6/gcd_1.asm	Sun Nov 24 23:02:33 2019 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,359 +0,0 @@
-dnl  AMD K6 mpn_gcd_1 -- mpn by 1 gcd.
-
-dnl  Copyright 2000-2002, 2004, 2014 Free Software Foundation, Inc.
-
-dnl  This file is part of the GNU MP Library.
-dnl
-dnl  The GNU MP Library is free software; you can redistribute it and/or modify
-dnl  it under the terms of either:
-dnl
-dnl    * the GNU Lesser General Public License as published by the Free
-dnl      Software Foundation; either version 3 of the License, or (at your
-dnl      option) any later version.
-dnl
-dnl  or
-dnl
-dnl    * the GNU General Public License as published by the Free Software
-dnl      Foundation; either version 2 of the License, or (at your option) any
-dnl      later version.
-dnl
-dnl  or both in parallel, as here.
-dnl
-dnl  The GNU MP Library is distributed in the hope that it will be useful, but
-dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-dnl  for more details.
-dnl
-dnl  You should have received copies of the GNU General Public License and the
-dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
-dnl  see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C K6: 9.5 cycles/bit (approx)   1x1 gcd
-C     11.0 cycles/limb          Nx1 reduction (modexact_1_odd)
-
-
-C mp_limb_t mpn_gcd_1 (mp_srcptr src, mp_size_t size, mp_limb_t y);
-C
-C This code is nothing very special, but offers a speedup over what gcc 2.95
-C can do with mpn/generic/gcd_1.c.
-C
-C Future:
-C
-C Using a lookup table to count trailing zeros seems a touch quicker, but
-C after a slightly longer startup.  Might be worthwhile if an mpn_gcd_2 used
-C it too.
-
-
-dnl  If size==1 and x (the larger operand) is more than DIV_THRESHOLD bits
-dnl  bigger than y, then a division x%y is done to reduce it.
-dnl
-dnl  A divl is 20 cycles and the loop runs at about 9.5 cycles/bitpair so
-dnl  there should be an advantage in the divl at about 4 or 5 bits, which is
-dnl  what's found.
-
-deflit(DIV_THRESHOLD, 5)
-
-
-defframe(PARAM_LIMB, 12)
-defframe(PARAM_SIZE,  8)
-defframe(PARAM_SRC,   4)
-
-	TEXT
-	ALIGN(16)
-
-PROLOGUE(mpn_gcd_1)
-deflit(`FRAME',0)
-
-	ASSERT(ne, `cmpl $0, PARAM_LIMB')
-	ASSERT(ae, `cmpl $1, PARAM_SIZE')
-
-
-	movl	PARAM_SRC, %eax
-	pushl	%ebx			FRAME_pushl()
-
-	movl	PARAM_LIMB, %edx	C y
-	movl	$-1, %ecx		C common twos count, incremented in loop
-
-	movl	(%eax), %ebx		C src low limb
-
-	movl	%ebx, %eax		C src low limb
-	orl	%edx, %ebx		C x | y
-
-L(common_twos):
-	shrl	%ebx			C low bit of x | y into carry
-	incl	%ecx
-
-	jnc	L(common_twos)		C 1/4 chance on random data
-	shrl	%cl, %edx		C y
-
-	cmpl	$1, PARAM_SIZE
-	ja	L(size_two_or_more)
-
-
-	ASSERT(nz, `orl	%eax, %eax')	C should have src limb != 0
-
-	shrl	%cl, %eax		C x
-
-
-	C Swap if necessary to make x>=y.  Measures a touch quicker as a
-	C jump than a branch free calculation.
-	C
-	C eax	x
-	C ebx
-	C ecx	common twos
-	C edx	y
-
-	movl	%eax, %ebx
-	cmpl	%eax, %edx		C y - x
-
-	jb	L(noswap)		C y < x, already ordered
-	movl	%edx, %eax
-
-	movl	%ebx, %edx
-	movl	%eax, %ebx
-L(noswap):
-
-
-	C See if it's worth reducing x with a divl.
-	C
-	C eax	x
-	C ebx	x
-	C ecx	common twos
-	C edx	y
-
-	shrl	$DIV_THRESHOLD, %ebx	C x >> DIV_THRESHOLD
-
-	cmpl	%ebx, %edx
-	ja	L(nodiv)		C y is above that, skip the divl
-
-
-	C Reduce x to x%y.
-	C
-	C eax	x
-	C ebx
-	C ecx	common twos
-	C edx	y
-
-	movl	%edx, %ebx		C divisor = y
-	xorl	%edx, %edx		C high half of dividend = 0
-
-	divl	%ebx			C eax = x / y, edx = x % y
-
-	orl	%edx, %edx	C new y = x % y, sets ZF when it is zero
-	nop	C code alignment
-
-	movl	%ebx, %eax	C new x = old y
-	jz	L(done_shll)	C remainder zero, result is old y << twos
-L(nodiv):
-
-
-	C eax	x
-	C ebx
-	C ecx	common twos
-	C edx	y
-	C esi
-	C edi
-	C ebp
-
-L(strip_y):
-	shrl	%edx
-	jnc	L(strip_y)	C loop until a one bit is shifted out
-
-	leal	1(%edx,%edx), %edx	C y = 2*y+1, puts that one bit back
-	movl	%ecx, %ebx	C common twos
-
-	leal	1(%eax), %ecx	C x+1, low bit is the inverse of x low bit
-	jmp	L(strip_x_and)
-
-
-C Calculating a %cl shift based on the low bit 0 or 1 avoids doing a branch
-C on a 50/50 chance of 0 or 1.  The chance of the next bit also being 0 is
-C only 1/4.
-C
-C A second computed %cl shift was tried, but that measured a touch slower
-C than branching back.
-C
-C A branch-free abs(x-y) and min(x,y) calculation was tried, but that
-C measured about 1 cycle/bit slower.
-
-	C eax	x
-	C ebx	common twos
-	C ecx	scratch
-	C edx	y
-
-	ALIGN(4)
-L(swap):
-	addl	%eax, %edx	C x-y+y = x
-	negl	%eax		C -(x-y) = y-x
-
-L(strip_x):
-	shrl	%eax		C odd-odd = even, so always one to strip
-	ASSERT(nz)
-
-L(strip_x_leal):
-	leal	1(%eax), %ecx
-
-L(strip_x_and):
-	andl	$1, %ecx	C (x^1)&1
-
-	shrl	%cl, %eax	C shift if x even
-
-	testb	$1, %al
-	jz	L(strip_x)	C x still even, strip another bit
-
-	ASSERT(nz,`testl $1, %eax')	C x, y odd
-	ASSERT(nz,`testl $1, %edx')
-
-	subl	%edx, %eax	C x - y
-	jb	L(swap)		C x < y
-	ja	L(strip_x)	C x > y
-
-
-	movl	%edx, %eax	C x == y, result is y
-	movl	%ebx, %ecx	C common twos
-
-L(done_shll):
-	shll	%cl, %eax	C put the common twos back
-	popl	%ebx
-
-	ret
-
-
-C -----------------------------------------------------------------------------
-C Two or more limbs.
-C
-C x={src,size} is reduced modulo y using either a plain mod_1 style
-C remainder, or a modexact_1 style exact division.
-
-deflit(MODEXACT_THRESHOLD, ifdef(`PIC', 4, 4))
-
-	ALIGN(8)
-L(size_two_or_more):
-	C eax
-	C ebx
-	C ecx	common twos
-	C edx	y, without common twos
-	C esi
-	C edi
-	C ebp
-
-deflit(FRAME_TWO_OR_MORE, FRAME)
-
-	pushl	%edi		defframe_pushl(SAVE_EDI)
-	movl	PARAM_SRC, %ebx
-
-L(y_twos):
-	shrl	%edx
-	jnc	L(y_twos)	C loop until a one bit is shifted out
-
-	movl	%ecx, %edi		C common twos
-	movl	PARAM_SIZE, %ecx
-
-	pushl	%esi		defframe_pushl(SAVE_ESI)
-	leal	1(%edx,%edx), %esi	C y (odd)
-
-	movl	-4(%ebx,%ecx,4), %eax	C src high limb
-
-	cmpl	%edx, %eax		C carry if high<divisor
-
-	sbbl	%edx, %edx		C -1 if high<divisor
-
-	addl	%edx, %ecx		C skip one limb if high<divisor
-	andl	%eax, %edx		C high limb as initial remainder, else 0
-
-	cmpl	$MODEXACT_THRESHOLD, %ecx
-	jae	L(modexact)
-
-
-L(divide_top):
-	C eax	scratch (quotient)
-	C ebx	src
-	C ecx	counter, size-1 to 1
-	C edx	carry (remainder)
-	C esi	divisor (odd)
-	C edi
-	C ebp
-
-	movl	-4(%ebx,%ecx,4), %eax
-	divl	%esi
-	loop	L(divide_top)	C decrement ecx, repeat while nonzero
-
-
-	movl	%edx, %eax	C x
-	movl	%esi, %edx	C y (odd)
-
-	movl	%edi, %ebx	C common twos
-	popl	%esi
-
-	popl	%edi
-	leal	1(%eax), %ecx	C x+1 as expected at L(strip_x_and)
-
-	orl	%eax, %eax
-	jnz	L(strip_x_and)	C remainder nonzero, go to the 1x1 loop
-
-
-	movl	%ebx, %ecx	C remainder zero, result is y << twos
-	movl	%edx, %eax
-
-	shll	%cl, %eax
-	popl	%ebx
-
-	ret
-
-
-	ALIGN(8)
-L(modexact):
-	C eax
-	C ebx	src ptr
-	C ecx	size or size-1
-	C edx
-	C esi	y odd
-	C edi	common twos
-	C ebp
-
-	movl	PARAM_SIZE, %eax
-	pushl	%esi		FRAME_pushl()
-
-	pushl	%eax		FRAME_pushl()
-
-	pushl	%ebx		FRAME_pushl()
-
-ifdef(`PIC_WITH_EBX',`
-	nop	C code alignment
-	call	L(movl_eip_ebx)
-	add	$_GLOBAL_OFFSET_TABLE_, %ebx
-')
-	CALL(	mpn_modexact_1_odd)
-
-	movl	%esi, %edx		C y odd
-	movl	SAVE_ESI, %esi
-
-	movl	%edi, %ebx		C common twos
-	movl	SAVE_EDI, %edi
-
-	addl	$eval(FRAME - FRAME_TWO_OR_MORE), %esp	C drop stack used since FRAME_TWO_OR_MORE
-	orl	%eax, %eax
-
-	leal	1(%eax), %ecx		C x+1 as expected at L(strip_x_and)
-	jnz	L(strip_x_and)		C flags from orl, leal leaves them alone
-
-
-	movl	%ebx, %ecx	C x is zero, result is y << twos
-	movl	%edx, %eax
-
-	shll	%cl, %eax
-	popl	%ebx
-
-	ret
-
-
-ifdef(`PIC_WITH_EBX',`
-L(movl_eip_ebx):
-	movl	(%esp), %ebx
-	ret_internal
-')
-
-EPILOGUE()
--- a/mpn/x86/k7/gcd_1.asm	Sun Nov 24 23:02:33 2019 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,193 +0,0 @@
-dnl  x86 mpn_gcd_1 optimised for AMD K7.
-
-dnl  Contributed to the GNU project by Kevin Ryde.  Rehacked by Torbjorn
-dnl  Granlund.
-
-dnl  Copyright 2000-2002, 2005, 2009, 2011, 2012, 2014, 2015 Free Software
-dnl  Foundation, Inc.
-
-dnl  This file is part of the GNU MP Library.
-dnl
-dnl  The GNU MP Library is free software; you can redistribute it and/or modify
-dnl  it under the terms of either:
-dnl
-dnl    * the GNU Lesser General Public License as published by the Free
-dnl      Software Foundation; either version 3 of the License, or (at your
-dnl      option) any later version.
-dnl
-dnl  or
-dnl
-dnl    * the GNU General Public License as published by the Free Software
-dnl      Foundation; either version 2 of the License, or (at your option) any
-dnl      later version.
-dnl
-dnl  or both in parallel, as here.
-dnl
-dnl  The GNU MP Library is distributed in the hope that it will be useful, but
-dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-dnl  for more details.
-dnl
-dnl  You should have received copies of the GNU General Public License and the
-dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
-dnl  see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C	     cycles/bit (approx)
-C AMD K7	 5.31
-C AMD K8,K9	 5.33
-C AMD K10	 5.30
-C AMD bd1	 ?
-C AMD bobcat	 7.02
-C Intel P4-2	10.1
-C Intel P4-3/4	10.0
-C Intel P6/13	 5.88
-C Intel core2	 6.26
-C Intel NHM	 6.83
-C Intel SBR	 8.50
-C Intel atom	 8.90
-C VIA nano	 ?
-C Numbers measured with: speed -CD -s16-32 -t16 mpn_gcd_1
-
-C TODO
-C  * Tune overhead, this takes 2-3 cycles more than old code when v0 is tiny.
-C  * Stream things better through registers, avoiding some copying.
-C  * For ELF, avoid putting GOT base in both ebx and esi.  Needs special
-C    LEA/LEAL or else discrete code here.
-
-C ctz_table[n] is the number of trailing zeros on n, or MAXSHIFT if n==0.
-
-deflit(MAXSHIFT, 6)
-deflit(MASK, eval((m4_lshift(1,MAXSHIFT))-1))
-
-DEF_OBJECT(ctz_table,64)
-	.byte	MAXSHIFT
-forloop(i,1,MASK,
-`	.byte	m4_count_trailing_zeros(i)
-')
-END_OBJECT(ctz_table)
-
-C Threshold of when to call bmod when U is one limb.  Should be about
-C (time_in_cycles(bmod_1,1) + call_overhead) / (cycles/bit).
-define(`DIV_THRES_LOG2', 7)
-
-
-define(`up',    `%edi')
-define(`n',     `%esi')
-define(`v0',    `%edx')
-
-
-ASM_START()
-	TEXT
-	ALIGN(16)
-PROLOGUE(mpn_gcd_1)
-	push	%edi			C save registers used for up and n
-	push	%esi
-
-	mov	12(%esp), up		C param 1, above two pushes and return address
-	mov	16(%esp), n		C param 2
-	mov	20(%esp), v0		C param 3
-
-	mov	(up), %eax		C U low limb
-	or	v0, %eax		C x | y
-	mov	$-1, %ecx		C common twos count, incremented in loop
-
-L(twos):
-	inc	%ecx
-	shr	%eax			C low bit of x | y into carry
-	jnc	L(twos)
-
-	shr	%cl, v0			C remove common twos from v0
-	mov	%ecx, %eax		C common twos
-
-L(divide_strip_y):
-	shr	v0
-	jnc	L(divide_strip_y)	C loop until a one bit is shifted out
-	adc	v0, v0			C v0 = 2*v0 + 1, puts that one bit back
-
-	push	%eax			C common twos, popped at L(end)
-	push	v0			C v0, popped at L(reduced)
-
-	cmp	$1, n
-	jnz	L(reduce_nby1)
-
-C Both U and V are single limbs, reduce with div if u0 >> v0.
-	mov	(up), %ecx
-	mov	%ecx, %eax		C x = u0
-	shr	$DIV_THRES_LOG2, %ecx	C u0 >> DIV_THRES_LOG2
-	cmp	%ecx, v0
-	ja	L(reduced)		C v0 is above that, skip the div
-
-	mov	v0, %esi		C divisor, edx is needed for the dividend
-	xor	%edx, %edx		C high half of dividend = 0
-	div	%esi			C eax = u0 / v0, edx = u0 % v0
-	mov	%edx, %eax		C x = u0 % v0
-	jmp	L(reduced)
-
-L(reduce_nby1):
-ifdef(`PIC_WITH_EBX',`dnl
-	push	%ebx
-	add	$-4, %esp
-	call	L(movl_eip_ebx)
-	add	$_GLOBAL_OFFSET_TABLE_, %ebx
-')
-	push	v0			C param 3
-	push	n			C param 2
-	push	up			C param 1
-	cmp	$BMOD_1_TO_MOD_1_THRESHOLD, n
-	jl	L(bmod)			C n below threshold, use modexact
-	CALL(	mpn_mod_1)
-	jmp	L(called)
-L(bmod):
-	CALL(	mpn_modexact_1_odd)
-
-L(called):
-ifdef(`PIC_WITH_EBX',`dnl
-	add	$16, %esp		C deallocate params
-	pop	%ebx
-',`
-	add	$12, %esp		C deallocate params
-')
-L(reduced):
-	pop	%edx			C y = v0 pushed earlier
-
-	LEAL(	ctz_table, %esi)	C table address, indexed via esi below
-	test	%eax, %eax		C is x zero?
-	mov	%eax, %ecx		C copy of x for the table index
-	jnz	L(mid)			C x != 0, run the loop
-	jmp	L(end)			C x == 0, result is y << twos
-
-	ALIGN(16)			C               K8    BC    P4    NHM   SBR
-L(top):	cmovc(	%ecx, %eax)		C if x-y < 0	0
-	cmovc(	%edi, %edx)		C use x,y-x	0
-L(mid):	and	$MASK, %ecx		C		0
-	movzbl	(%esi,%ecx), %ecx	C		1
-	jz	L(shift_alot)		C		1
-	shr	%cl, %eax		C		3
-	mov	%eax, %edi		C		4
-	mov	%edx, %ecx		C		3
-	sub	%eax, %ecx		C		4
-	sub	%edx, %eax		C		4
-	jnz	L(top)			C		5
-
-L(end):	pop	%ecx			C common twos
-	mov	%edx, %eax
-	shl	%cl, %eax		C result = y << common twos
-	pop	%esi
-	pop	%edi
-	ret
-
-L(shift_alot):
-	shr	$MAXSHIFT, %eax		C low MAXSHIFT bits of x were all zero
-	mov	%eax, %ecx
-	jmp	L(mid)
-
-ifdef(`PIC_WITH_EBX',`dnl
-L(movl_eip_ebx):
-	mov	(%esp), %ebx
-	ret
-')
-EPILOGUE()
-ASM_END()
--- a/mpn/x86/p6/gcd_1.asm	Sun Nov 24 23:02:33 2019 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,161 +0,0 @@
-dnl  x86 mpn_gcd_1 optimised for processors with fast BSF.
-
-dnl  Based on the K7 gcd_1.asm, by Kevin Ryde.  Rehacked by Torbjorn Granlund.
-
-dnl  Copyright 2000-2002, 2005, 2009, 2011, 2012, 2015 Free Software
-dnl  Foundation, Inc.
-
-dnl  This file is part of the GNU MP Library.
-dnl
-dnl  The GNU MP Library is free software; you can redistribute it and/or modify
-dnl  it under the terms of either:
-dnl
-dnl    * the GNU Lesser General Public License as published by the Free
-dnl      Software Foundation; either version 3 of the License, or (at your
-dnl      option) any later version.
-dnl
-dnl  or
-dnl
-dnl    * the GNU General Public License as published by the Free Software
-dnl      Foundation; either version 2 of the License, or (at your option) any
-dnl      later version.
-dnl
-dnl  or both in parallel, as here.
-dnl
-dnl  The GNU MP Library is distributed in the hope that it will be useful, but
-dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-dnl  for more details.
-dnl
-dnl  You should have received copies of the GNU General Public License and the
-dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
-dnl  see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C	     cycles/bit (approx)
-C AMD K7	 7.80
-C AMD K8,K9	 7.79
-C AMD K10	 4.08
-C AMD bd1	 ?
-C AMD bobcat	 7.82
-C Intel P4-2	14.9
-C Intel P4-3/4	14.0
-C Intel P6/13	 5.09
-C Intel core2	 4.22
-C Intel NHM	 5.00
-C Intel SBR	 5.00
-C Intel atom	17.1
-C VIA nano	?
-C Numbers measured with: speed -CD -s16-32 -t16 mpn_gcd_1
-
-C Threshold of when to call bmod when U is one limb.  Should be about
-C (time_in_cycles(bmod_1,1) + call_overhead) / (cycles/bit).
-define(`BMOD_THRES_LOG2', 6)
-
-
-define(`up',    `%edi')
-define(`n',     `%esi')
-define(`v0',    `%edx')
-
-
-ASM_START()
-	TEXT
-	ALIGN(16)
-PROLOGUE(mpn_gcd_1)
-	push	%edi		C save registers used for up and n
-	push	%esi
-
-	mov	12(%esp), up	C param 1, above two pushes and return address
-	mov	16(%esp), n	C param 2
-	mov	20(%esp), v0	C param 3
-
-	mov	(up), %eax	C U low limb
-	or	v0, %eax	C x | y
-	bsf	%eax, %eax	C min(ctz(u0),ctz(v0))
-
-	bsf	v0, %ecx	C ctz(v0)
-	shr	%cl, v0		C shift trailing zeros out of v0
-
-	push	%eax		C preserve common twos over call
-	push	v0		C preserve v0 argument over call
-
-	cmp	$1, n
-	jnz	L(reduce_nby1)
-
-C Both U and V are single limbs, reduce with bmod if u0 >> v0.
-	mov	(up), %ecx
-	mov	%ecx, %eax	C x = u0
-	shr	$BMOD_THRES_LOG2, %ecx	C u0 >> BMOD_THRES_LOG2
-	cmp	%ecx, v0
-	ja	L(reduced)	C v0 is above that, skip the reduction
-	jmp	L(bmod)
-
-L(reduce_nby1):
-	cmp	$BMOD_1_TO_MOD_1_THRESHOLD, n
-	jl	L(bmod)		C n below threshold, use modexact
-ifdef(`PIC_WITH_EBX',`dnl
-	push	%ebx
-	add	$-4, %esp
-	call	L(movl_eip_to_ebx)
-	add	$_GLOBAL_OFFSET_TABLE_, %ebx
-')
-	push	v0		C param 3
-	push	n		C param 2
-	push	up		C param 1
-	CALL(	mpn_mod_1)
-	jmp	L(called)
-
-L(bmod):
-ifdef(`PIC_WITH_EBX',`dnl
-	push	%ebx
-	add	$-4, %esp
-	call	L(movl_eip_to_ebx)
-	add	$_GLOBAL_OFFSET_TABLE_, %ebx
-')
-	push	v0		C param 3
-	push	n		C param 2
-	push	up		C param 1
-	CALL(	mpn_modexact_1_odd)
-
-L(called):
-ifdef(`PIC_WITH_EBX',`dnl
-	add	$16, %esp	C deallocate params
-	pop	%ebx
-',`
-	add	$12, %esp	C deallocate params
-')
-L(reduced):
-	pop	%edx		C y = v0 pushed earlier
-
-	bsf	%eax, %ecx	C ctz(x), ZF set when x is zero
-C	test	%eax, %eax	C FIXME: does this lower latency?
-	jnz	L(mid)		C x != 0, run the loop
-	jmp	L(end)		C x == 0, result is y << twos
-
-	ALIGN(16)		C               K10   BD    C2    NHM   SBR
-L(top):	cmovc(	%esi, %eax)	C if x-y < 0    0,3   0,3   0,6   0,5   0,5
-	cmovc(	%edi, %edx)	C use x,y-x     0,3   0,3   2,8   1,7   1,7
-L(mid):	shr	%cl, %eax	C               1,7   1,6   2,8   2,8   2,8
-	mov	%edx, %esi	C               1     1     4     3     3
-	sub	%eax, %esi	C               2     2     5     4     4
-	bsf	%esi, %ecx	C               3     3     6     5     5
-	mov	%eax, %edi	C               2     2     3     3     4
-	sub	%edx, %eax	C               2     2     4     3     4
-	jnz	L(top)		C loop until x == y
-
-L(end):	pop	%ecx		C common twos
-	mov	%edx, %eax
-	shl	%cl, %eax	C result = y << common twos
-
-	pop	%esi
-	pop	%edi
-	ret
-
-ifdef(`PIC_WITH_EBX',`dnl
-L(movl_eip_to_ebx):
-	mov	(%esp), %ebx
-	ret
-')
-EPILOGUE()
--- a/mpn/x86_64/bd1/gcd_1.asm	Sun Nov 24 23:02:33 2019 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,37 +0,0 @@
-dnl  AMD64 mpn_gcd_1.
-
-dnl  Copyright 2012 Free Software Foundation, Inc.
-
-dnl  This file is part of the GNU MP Library.
-dnl
-dnl  The GNU MP Library is free software; you can redistribute it and/or modify
-dnl  it under the terms of either:
-dnl
-dnl    * the GNU Lesser General Public License as published by the Free
-dnl      Software Foundation; either version 3 of the License, or (at your
-dnl      option) any later version.
-dnl
-dnl  or
-dnl
-dnl    * the GNU General Public License as published by the Free Software
-dnl      Foundation; either version 2 of the License, or (at your option) any
-dnl      later version.
-dnl
-dnl  or both in parallel, as here.
-dnl
-dnl  The GNU MP Library is distributed in the hope that it will be useful, but
-dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-dnl  for more details.
-dnl
-dnl  You should have received copies of the GNU General Public License and the
-dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
-dnl  see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-MULFUNC_PROLOGUE(mpn_gcd_1)
-include_mpn(`x86_64/core2/gcd_1.asm')
--- a/mpn/x86_64/bd2/gcd_1.asm	Sun Nov 24 23:02:33 2019 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,164 +0,0 @@
-dnl  AMD64 mpn_gcd_1 optimised for AMD BD2-BD4, Zen.
-
-dnl  Based on the K7 gcd_1.asm, by Kevin Ryde.  Rehacked for AMD64 by Torbjorn
-dnl  Granlund.
-
-dnl  Copyright 2000-2002, 2005, 2009, 2011, 2012, 2017, 2019 Free Software
-dnl  Foundation, Inc.
-
-dnl  This file is part of the GNU MP Library.
-dnl
-dnl  The GNU MP Library is free software; you can redistribute it and/or modify
-dnl  it under the terms of either:
-dnl
-dnl    * the GNU Lesser General Public License as published by the Free
-dnl      Software Foundation; either version 3 of the License, or (at your
-dnl      option) any later version.
-dnl
-dnl  or
-dnl
-dnl    * the GNU General Public License as published by the Free Software
-dnl      Foundation; either version 2 of the License, or (at your option) any
-dnl      later version.
-dnl
-dnl  or both in parallel, as here.
-dnl
-dnl  The GNU MP Library is distributed in the hope that it will be useful, but
-dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-dnl  for more details.
-dnl
-dnl  You should have received copies of the GNU General Public License and the
-dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
-dnl  see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C	     cycles/bit (approx)
-C AMD K8,K9	 ?
-C AMD K10	 ?
-C AMD bd1	 ?
-C AMD bd2	 ?
-C AMD bd3	 ?
-C AMD bd4	 3.65
-C AMD bt1	 ?
-C AMD bt2	 ?
-C AMD zn1	 3.5
-C AMD zn2	 3.8
-C Intel P4	 ?
-C Intel core2	 ?
-C Intel NHM	 ?
-C Intel SBR	 ?
-C Intel IBR	 ?
-C Intel HWL	 ?
-C Intel BWL	 ?
-C Intel SKL	 ?
-C Intel atom	 ?
-C Intel SLM	 ?
-C Intel GLM	 ?
-C Intel GLM+	 ?
-C VIA nano	 ?
-C Numbers measured with: speed -CD -s16-64 -t48 mpn_gcd_1
-
-C TODO
-C  * Optimise inner-loop for specific CPUs.
-C  * Use DIV for 1-by-1 reductions, at least for some CPUs.
-
-C Threshold of when to call bmod when U is one limb.  Should be about
-C (time_in_cycles(bmod_1,1) + call_overhead) / (cycles/bit).
-define(`BMOD_THRES_LOG2', 6)
-
-C INPUT PARAMETERS
-define(`up',    `%rdi')
-define(`n',     `%rsi')
-define(`v0',    `%rdx')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-IFDOS(`define(`STACK_ALLOC', 40)')
-IFSTD(`define(`STACK_ALLOC', 8)')
-
-C Undo some configure cleverness.
-C The problem is that C only defines the '1c' variant, and that configure
-C therefore considers modexact_1c to be the base function.  It then adds a
-C special fat rule for mpn_modexact_1_odd, messing up things when a cpudep
-C gcd_1 exists without a corresponding cpudep mode1o.
-ifdef(`WANT_FAT_BINARY', `
-  define(`mpn_modexact_1_odd', `MPN_PREFIX`modexact_1_odd_x86_64'')')
-
-
-ASM_START()
-	TEXT
-	ALIGN(16)
-PROLOGUE(mpn_gcd_1)
-	FUNC_ENTRY(3)
-	mov	(up), %rax		C U low limb
-	or	v0, %rax		C x | y
-	bsf	%rax, %rax		C min(ctz(u0),ctz(v0))
-
-	bsf	v0, %rcx		C ctz(v0)
-	shr	R8(%rcx), v0		C shift trailing zeros out of v0
-
-	push	%rax			C preserve common twos over call
-
-	cmp	$1, n
-	jnz	L(reduce_nby1)
-
-C Both U and V are single limbs, reduce with bmod if u0 >> v0.
-	mov	(up), %r8
-	mov	%r8, %rax		C x = u0
-	shr	$BMOD_THRES_LOG2, %r8	C u0 >> BMOD_THRES_LOG2
-	cmp	%r8, v0
-	ja	L(reduced)		C v0 is above that, skip the reduction
-
-L(bmod):
-	push	v0			C preserve v0 argument over call
-	sub	$STACK_ALLOC, %rsp	C maintain ABI required rsp alignment
-IFDOS(`	mov	%rdx, %r8	')
-IFDOS(`	mov	%rsi, %rdx	')
-IFDOS(`	mov	%rdi, %rcx	')
-	ASSERT(nz, `test $15, %rsp')
-	CALL(	mpn_modexact_1_odd)
-
-L(called):
-	add	$STACK_ALLOC, %rsp	C undo the sub done before the call
-	pop	v0			C restore v0
-
-L(reduced):
-	bsf	%rax, %rcx		C ctz(x), ZF set when x is zero
-C	test	%rax, %rax		C FIXME: does this lower latency?
-	jnz	L(mid)			C x != 0, run the loop
-	jmp	L(end)			C x == 0, result is v0 << twos
-
-L(reduce_nby1):
-	cmp	$BMOD_1_TO_MOD_1_THRESHOLD, n
-	jl	L(bmod)			C n below threshold, use modexact
-
-	push	v0			C preserve v0 argument over call
-	sub	$STACK_ALLOC, %rsp	C maintain ABI required rsp alignment
-IFDOS(`	mov	%rdx, %r8	')
-IFDOS(`	mov	%rsi, %rdx	')
-IFDOS(`	mov	%rdi, %rcx	')
-	ASSERT(nz, `test $15, %rsp')
-	CALL(	mpn_mod_1)
-	jmp	L(called)		C shared stack cleanup
-
-	ALIGN(16)			C              K10 BD1 BD2 ZEN CNR NHM SBR
-L(top):	cmovc	%r10, %rax		C if x-y < 0   0,3 0,3 0,3 0,3 0,6 0,5 0,5
-	cmovc	%r9, v0			C use x,y-x    0,3 0,3 0,3 0,3 2,8 1,7 1,7
-L(mid):	shr	R8(%rcx), %rax		C              1,7 1,6 1,5 1,4 2,8 2,8 2,8
-	mov	v0, %r10		C              1   1   1   1   4   3   3
-	sub	%rax, %r10		C              2   2   2   1   5   4   4
-	rep;bsf	%r10, %rcx		C tzcnt!       3   3   3   2   6   5   5
-	mov	%rax, %r9		C              2   2   2   2   3   3   4
-	sub	v0, %rax		C              2   2   2   2   4   3   4
-	jnz	L(top)			C loop until x == y
-
-L(end):	pop	%rcx			C common twos
-	mov	v0, %rax
-	shl	R8(%rcx), %rax		C result = v0 << common twos
-	FUNC_EXIT()
-	ret
-EPILOGUE()
--- a/mpn/x86_64/bt2/gcd_1.asm	Sun Nov 24 23:02:33 2019 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,37 +0,0 @@
-dnl  AMD64 mpn_gcd_1.
-
-dnl  Copyright 2012 Free Software Foundation, Inc.
-
-dnl  This file is part of the GNU MP Library.
-dnl
-dnl  The GNU MP Library is free software; you can redistribute it and/or modify
-dnl  it under the terms of either:
-dnl
-dnl    * the GNU Lesser General Public License as published by the Free
-dnl      Software Foundation; either version 3 of the License, or (at your
-dnl      option) any later version.
-dnl
-dnl  or
-dnl
-dnl    * the GNU General Public License as published by the Free Software
-dnl      Foundation; either version 2 of the License, or (at your option) any
-dnl      later version.
-dnl
-dnl  or both in parallel, as here.
-dnl
-dnl  The GNU MP Library is distributed in the hope that it will be useful, but
-dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-dnl  for more details.
-dnl
-dnl  You should have received copies of the GNU General Public License and the
-dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
-dnl  see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-MULFUNC_PROLOGUE(mpn_gcd_1)
-include_mpn(`x86_64/bd2/gcd_1.asm')
--- a/mpn/x86_64/core2/gcd_1.asm	Sun Nov 24 23:02:33 2019 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,151 +0,0 @@
-dnl  AMD64 mpn_gcd_1 optimised for Intel C2, NHM, SBR and AMD K10, BD.
-
-dnl  Based on the K7 gcd_1.asm, by Kevin Ryde.  Rehacked for AMD64 by Torbjorn
-dnl  Granlund.
-
-dnl  Copyright 2000-2002, 2005, 2009, 2011, 2012, 2017 Free Software
-dnl  Foundation, Inc.
-
-dnl  This file is part of the GNU MP Library.
-dnl
-dnl  The GNU MP Library is free software; you can redistribute it and/or modify
-dnl  it under the terms of either:
-dnl
-dnl    * the GNU Lesser General Public License as published by the Free
-dnl      Software Foundation; either version 3 of the License, or (at your
-dnl      option) any later version.
-dnl
-dnl  or
-dnl
-dnl    * the GNU General Public License as published by the Free Software
-dnl      Foundation; either version 2 of the License, or (at your option) any
-dnl      later version.
-dnl
-dnl  or both in parallel, as here.
-dnl
-dnl  The GNU MP Library is distributed in the hope that it will be useful, but
-dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-dnl  for more details.
-dnl
-dnl  You should have received copies of the GNU General Public License and the
-dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
-dnl  see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C	     cycles/bit (approx)
-C AMD K8,K9	 8.50
-C AMD K10	 4.30
-C AMD bd1	 5.00
-C AMD bobcat	10.0
-C Intel P4	18.6
-C Intel core2	 3.83
-C Intel NHM	 5.17
-C Intel SBR	 4.69
-C Intel atom	17.0
-C VIA nano	 5.44
-C Numbers measured with: speed -CD -s16-64 -t48 mpn_gcd_1
-
-C TODO
-C  * Optimise inner-loop for specific CPUs.
-C  * Use DIV for 1-by-1 reductions, at least for some CPUs.
-
-C Threshold of when to call bmod when U is one limb.  Should be about
-C (time_in_cycles(bmod_1,1) + call_overhead) / (cycles/bit).
-define(`BMOD_THRES_LOG2', 6)
-
-C INPUT PARAMETERS
-define(`up',    `%rdi')
-define(`n',     `%rsi')
-define(`v0',    `%rdx')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-IFDOS(`define(`STACK_ALLOC', 40)')
-IFSTD(`define(`STACK_ALLOC', 8)')
-
-C Undo some configure cleverness.
-C The problem is that C only defines the '1c' variant, and that configure
-C therefore considers modexact_1c to be the base function.  It then adds a
-C special fat rule for mpn_modexact_1_odd, messing up things when a cpudep
-C gcd_1 exists without a corresponding cpudep mode1o.
-ifdef(`WANT_FAT_BINARY', `
-  define(`mpn_modexact_1_odd', `MPN_PREFIX`modexact_1_odd_x86_64'')')
-
-
-ASM_START()
-	TEXT
-	ALIGN(16)
-PROLOGUE(mpn_gcd_1)
-	FUNC_ENTRY(3)
-	mov	(up), %rax		C U low limb
-	or	v0, %rax		C x | y
-	bsf	%rax, %rax		C min(ctz(u0),ctz(v0))
-
-	bsf	v0, %rcx		C ctz(v0)
-	shr	R8(%rcx), v0		C shift trailing zeros out of v0
-
-	push	%rax			C preserve common twos over call
-
-	cmp	$1, n
-	jnz	L(reduce_nby1)
-
-C Both U and V are single limbs, reduce with bmod if u0 >> v0.
-	mov	(up), %r8
-	mov	%r8, %rax		C x = u0
-	shr	$BMOD_THRES_LOG2, %r8	C u0 >> BMOD_THRES_LOG2
-	cmp	%r8, v0
-	ja	L(reduced)		C v0 is above that, skip the reduction
-
-L(bmod):
-	push	v0			C preserve v0 argument over call
-	sub	$STACK_ALLOC, %rsp	C maintain ABI required rsp alignment
-IFDOS(`	mov	%rdx, %r8	')
-IFDOS(`	mov	%rsi, %rdx	')
-IFDOS(`	mov	%rdi, %rcx	')
-	ASSERT(nz, `test $15, %rsp')
-	CALL(	mpn_modexact_1_odd)
-
-L(called):
-	add	$STACK_ALLOC, %rsp	C undo the sub done before the call
-	pop	v0			C restore v0
-
-L(reduced):
-	bsf	%rax, %rcx		C ctz(x), ZF set when x is zero
-C	test	%rax, %rax		C FIXME: does this lower latency?
-	jnz	L(mid)			C x != 0, run the loop
-	jmp	L(end)			C x == 0, result is v0 << twos
-
-L(reduce_nby1):
-	cmp	$BMOD_1_TO_MOD_1_THRESHOLD, n
-	jl	L(bmod)			C n below threshold, use modexact
-
-	push	v0			C preserve v0 argument over call
-	sub	$STACK_ALLOC, %rsp	C maintain ABI required rsp alignment
-IFDOS(`	mov	%rdx, %r8	')
-IFDOS(`	mov	%rsi, %rdx	')
-IFDOS(`	mov	%rdi, %rcx	')
-	ASSERT(nz, `test $15, %rsp')
-	CALL(	mpn_mod_1)
-	jmp	L(called)		C shared stack cleanup
-
-	ALIGN(16)			C              K10  BD   C2   NHM  SBR
-L(top):	cmovc	%r10, %rax		C if x-y < 0   0,3  0,3  0,6  0,5  0,5
-	cmovc	%r9, v0			C use x,y-x    0,3  0,3  2,8  1,7  1,7
-L(mid):	shr	R8(%rcx), %rax		C              1,7  1,6  2,8  2,8  2,8
-	mov	v0, %r10		C              1    1    4    3    3
-	sub	%rax, %r10		C              2    2    5    4    4
-	bsf	%r10, %rcx		C              3    3    6    5    5
-	mov	%rax, %r9		C              2    2    3    3    4
-	sub	v0, %rax		C              2    2    4    3    4
-	jnz	L(top)			C loop until x == y
-
-L(end):	pop	%rcx			C common twos
-	mov	v0, %rax
-	shl	R8(%rcx), %rax		C result = v0 << common twos
-	FUNC_EXIT()
-	ret
-EPILOGUE()
--- a/mpn/x86_64/gcd_1.asm	Sun Nov 24 23:02:33 2019 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,170 +0,0 @@
-dnl  AMD64 mpn_gcd_1 -- mpn by 1 gcd.
-
-dnl  Based on the K7 gcd_1.asm, by Kevin Ryde.  Rehacked for AMD64 by Torbjorn
-dnl  Granlund.
-
-dnl  Copyright 2000-2002, 2005, 2009, 2011, 2012, 2017 Free Software
-dnl  Foundation, Inc.
-
-dnl  This file is part of the GNU MP Library.
-dnl
-dnl  The GNU MP Library is free software; you can redistribute it and/or modify
-dnl  it under the terms of either:
-dnl
-dnl    * the GNU Lesser General Public License as published by the Free
-dnl      Software Foundation; either version 3 of the License, or (at your
-dnl      option) any later version.
-dnl
-dnl  or
-dnl
-dnl    * the GNU General Public License as published by the Free Software
-dnl      Foundation; either version 2 of the License, or (at your option) any
-dnl      later version.
-dnl
-dnl  or both in parallel, as here.
-dnl
-dnl  The GNU MP Library is distributed in the hope that it will be useful, but
-dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-dnl  for more details.
-dnl
-dnl  You should have received copies of the GNU General Public License and the
-dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
-dnl  see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C	     cycles/bit (approx)
-C AMD K8,K9	 5.21                 (4.95)
-C AMD K10	 5.15                 (5.00)
-C AMD bd1	 5.42                 (5.14)
-C AMD bobcat	 6.71                 (6.56)
-C Intel P4	13.5                 (12.75)
-C Intel core2	 6.20                 (6.16)
-C Intel NHM	 6.49                 (6.25)
-C Intel SBR	 7.75                 (7.57)
-C Intel atom	 8.77                 (8.54)
-C VIA nano	 6.60                 (6.20)
-C Numbers measured with: speed -CD -s16-64 -t48 mpn_gcd_1
-
-C ctz_table[n] is the number of trailing zeros on n, or MAXSHIFT if n==0.
-
-deflit(MAXSHIFT, 7)		C number of low bits examined per table lookup
-deflit(MASK, eval((m4_lshift(1,MAXSHIFT))-1))	C mask selecting the low MAXSHIFT bits, also the last table index
-
-DEF_OBJECT(ctz_table,64)
-	.byte	MAXSHIFT		C entry for index 0
-forloop(i,1,MASK,
-`	.byte	m4_count_trailing_zeros(i)
-')
-END_OBJECT(ctz_table)		C MASK+1 one-byte entries in total
-
-C Threshold of when to call bmod when U is one limb.  Should be about
-C (time_in_cycles(bmod_1,1) + call_overhead) / (cycles/bit).
-define(`BMOD_THRES_LOG2', 8)	C single-limb case calls bmod when v0 <= u0 >> 8
-
-C INPUT PARAMETERS
-define(`up',    `%rdi')	C address of U, the low limb is read from here
-define(`n',     `%rsi')	C size of U in limbs, 1 means U is a single limb
-define(`v0',    `%rdx')	C the single limb V
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-IFDOS(`define(`STACK_ALLOC', 40)')	C bytes taken off rsp around each call under DOS64
-IFSTD(`define(`STACK_ALLOC', 8)')	C bytes taken off rsp around each call under STD64
-
-ASM_START()
-	TEXT
-	ALIGN(16)
-PROLOGUE(mpn_gcd_1)
-	FUNC_ENTRY(3)			C three arguments: up, n, v0
-	mov	(up), %rax		C U low limb
-	mov	$-1, R32(%rcx)		C zero counter, becomes 0 on the first pass
-	or	v0, %rax		C x | y
-
-L(twos):
-	inc	R32(%rcx)		C one more bit position examined
-	shr	%rax			C move the next low bit into CF
-	jnc	L(twos)			C loop until a one bit comes out
-
-	shr	R8(%rcx), v0		C remove the common twos from v0
-	push	%rcx			C common twos
-
-L(divide_strip_y):
-	shr	v0			C move the next low bit of v0 into CF
-	jnc	L(divide_strip_y)	C loop until a one bit comes out
-	adc	v0, v0			C v0 = 2*v0 + 1, puts the one bit back so v0 is odd
-
-	cmp	$1, n
-ifelse(BMOD_1_TO_MOD_1_THRESHOLD, MP_SIZE_T_MAX,`
-	jnz	L(bmod)			C n != 1: always reduce with mpn_modexact_1_odd
-',`
-	jnz	L(reduce_nby1)		C n != 1: choose the reduction routine by size
-')
-C Both U and V are single limbs, reduce with bmod if u0 >> v0.
-	mov	(up), %r8		C r8 = u0
-	mov	%r8, %rax		C rax = u0, used as x when no call is made
-	shr	$BMOD_THRES_LOG2, %r8	C r8 = u0 >> BMOD_THRES_LOG2
-	cmp	%r8, v0
-	ja	L(reduced)		C v0 > r8 (unsigned): skip the call
-
-L(bmod):
-	push	v0			C preserve v0 argument over call
-	sub	$STACK_ALLOC, %rsp	C maintain ABI required rsp alignment
-IFDOS(`	mov	%rdx, %r8	')	C DOS64 only: v0 to r8, done first because rdx is overwritten next
-IFDOS(`	mov	%rsi, %rdx	')	C DOS64 only: n to rdx
-IFDOS(`	mov	%rdi, %rcx	')	C DOS64 only: up to rcx
-	ASSERT(nz, `test $15, %rsp')	C assertion on the low four bits of rsp
-	CALL(	mpn_modexact_1_odd)	C the result is read from rax below
-
-L(called):
-	add	$STACK_ALLOC, %rsp	C release the alignment padding
-	pop	v0			C restore the odd v0
-
-L(reduced):
-	LEA(	ctz_table, %rsi)	C rsi = table address, replacing n
-	test	%rax, %rax		C ZF set when x == 0
-	mov	%rax, %rcx		C rcx = x for the first table lookup
-	jnz	L(mid)			C x != 0: enter the binary gcd loop
-	jmp	L(end)			C x == 0: the result is v0 times the common twos
-
-ifelse(BMOD_1_TO_MOD_1_THRESHOLD, `MP_SIZE_T_MAX',,`
-L(reduce_nby1):
-	cmp	$BMOD_1_TO_MOD_1_THRESHOLD, n
-	jl	L(bmod)			C n below the threshold: use mpn_modexact_1_odd
-
-	push	v0			C preserve v0 argument over call
-	sub	$STACK_ALLOC, %rsp	C maintain ABI required rsp alignment
-IFDOS(`	mov	%rdx, %r8	')
-IFDOS(`	mov	%rsi, %rdx	')
-IFDOS(`	mov	%rdi, %rcx	')
-	ASSERT(nz, `test $15, %rsp')
-	CALL(	mpn_mod_1)		C the result is read from rax at L(reduced)
-	jmp	L(called)		C share the stack cleanup with the bmod path
-')
-	ALIGN(16)			C              K8   BC   P4   NHM  SBR
-L(top):	cmovc	%rcx, %rax		C if x-y < 0   0
-	cmovc	%rdi, v0		C use x,y-x    0
-L(mid):	and	$MASK, R32(%rcx)	C low bits     0
-	movzbl	(%rsi,%rcx), R32(%rcx)	C table ctz    1
-	jz	L(shift_alot)		C all zero     1
-	shr	R8(%rcx), %rax		C x >>= ctz    3
-	mov	%rax, %rdi		C rdi = x      4
-	mov	v0, %rcx		C rcx = y      3
-	sub	%rax, %rcx		C rcx = y-x    4
-	sub	v0, %rax		C x-y, sets CF 4
-	jnz	L(top)			C repeat while x != y
-
-L(end):	pop	%rcx			C common twos
-	mov	v0, %rax		C return value starts as the odd part in v0
-	shl	R8(%rcx), %rax		C multiply the common power of two back in
-	FUNC_EXIT()
-	ret
-
-L(shift_alot):
-	shr	$MAXSHIFT, %rax		C low MAXSHIFT bits were all zero, drop them
-	mov	%rax, %rcx		C look up the next group of bits
-	jmp	L(mid)
-EPILOGUE()
--- a/mpn/x86_64/k10/gcd_1.asm	Sun Nov 24 23:02:33 2019 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,37 +0,0 @@
-dnl  AMD64 mpn_gcd_1.
-
-dnl  Copyright 2012 Free Software Foundation, Inc.
-
-dnl  This file is part of the GNU MP Library.
-dnl
-dnl  The GNU MP Library is free software; you can redistribute it and/or modify
-dnl  it under the terms of either:
-dnl
-dnl    * the GNU Lesser General Public License as published by the Free
-dnl      Software Foundation; either version 3 of the License, or (at your
-dnl      option) any later version.
-dnl
-dnl  or
-dnl
-dnl    * the GNU General Public License as published by the Free Software
-dnl      Foundation; either version 2 of the License, or (at your option) any
-dnl      later version.
-dnl
-dnl  or both in parallel, as here.
-dnl
-dnl  The GNU MP Library is distributed in the hope that it will be useful, but
-dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-dnl  for more details.
-dnl
-dnl  You should have received copies of the GNU General Public License and the
-dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
-dnl  see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-MULFUNC_PROLOGUE(mpn_gcd_1)
-include_mpn(`x86_64/core2/gcd_1.asm')
--- a/mpn/x86_64/nano/gcd_1.asm	Sun Nov 24 23:02:33 2019 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,37 +0,0 @@
-dnl  AMD64 mpn_gcd_1.
-
-dnl  Copyright 2012 Free Software Foundation, Inc.
-
-dnl  This file is part of the GNU MP Library.
-dnl
-dnl  The GNU MP Library is free software; you can redistribute it and/or modify
-dnl  it under the terms of either:
-dnl
-dnl    * the GNU Lesser General Public License as published by the Free
-dnl      Software Foundation; either version 3 of the License, or (at your
-dnl      option) any later version.
-dnl
-dnl  or
-dnl
-dnl    * the GNU General Public License as published by the Free Software
-dnl      Foundation; either version 2 of the License, or (at your option) any
-dnl      later version.
-dnl
-dnl  or both in parallel, as here.
-dnl
-dnl  The GNU MP Library is distributed in the hope that it will be useful, but
-dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-dnl  for more details.
-dnl
-dnl  You should have received copies of the GNU General Public License and the
-dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
-dnl  see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-MULFUNC_PROLOGUE(mpn_gcd_1)
-include_mpn(`x86_64/core2/gcd_1.asm')
--- a/mpn/x86_64/zen/gcd_1.asm	Sun Nov 24 23:02:33 2019 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,37 +0,0 @@
-dnl  AMD64 mpn_gcd_1.
-
-dnl  Copyright 2012 Free Software Foundation, Inc.
-
-dnl  This file is part of the GNU MP Library.
-dnl
-dnl  The GNU MP Library is free software; you can redistribute it and/or modify
-dnl  it under the terms of either:
-dnl
-dnl    * the GNU Lesser General Public License as published by the Free
-dnl      Software Foundation; either version 3 of the License, or (at your
-dnl      option) any later version.
-dnl
-dnl  or
-dnl
-dnl    * the GNU General Public License as published by the Free Software
-dnl      Foundation; either version 2 of the License, or (at your option) any
-dnl      later version.
-dnl
-dnl  or both in parallel, as here.
-dnl
-dnl  The GNU MP Library is distributed in the hope that it will be useful, but
-dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-dnl  for more details.
-dnl
-dnl  You should have received copies of the GNU General Public License and the
-dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
-dnl  see https://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-ABI_SUPPORT(DOS64)
-ABI_SUPPORT(STD64)
-
-MULFUNC_PROLOGUE(mpn_gcd_1)
-include_mpn(`x86_64/bd2/gcd_1.asm')