[Gmp-commit] /var/hg/gmp: 2 new changesets

Sat Mar 17 23:53:57 CET 2012

details:   /var/hg/gmp/rev/acaa1452a82e
changeset: 14762:acaa1452a82e
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Sat Mar 17 23:52:21 2012 +0100
description:
Restore DOS64 support.

details:   /var/hg/gmp/rev/775523081472
changeset: 14763:775523081472
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Sat Mar 17 23:53:55 2012 +0100
description:
Rewrite x86-32 gcd_1 support.

diffstat:

 ChangeLog                  |    3 +
 mpn/x86/k7/gcd_1.asm       |  405 +++++++++++++-------------------------------
 mpn/x86/p6/gcd_1.asm       |  142 +++++++++++++++
 mpn/x86_64/core2/gcd_1.asm |   16 +-
 mpn/x86_64/gcd_1.asm       |   16 +-
 5 files changed, 285 insertions(+), 297 deletions(-)

diffs (truncated from 727 to 300 lines):

diff -r b319b5e7816c -r 775523081472 ChangeLog

--- a/ChangeLog	Sat Mar 17 15:19:11 2012 +0100
+++ b/ChangeLog	Sat Mar 17 23:53:55 2012 +0100
@@ -1,5 +1,8 @@
 2012-03-17  Torbjorn Granlund  <tege at gmplib.org>
 
+	* mpn/x86/k7/gcd_1.asm: Rewrite.
+	* mpn/x86/p6/gcd_1.asm: New file.
+
 	* mpn/x86_64/core2/gcd_1.asm: Conditionally suppress reduction calls.
 	* mpn/x86_64/gcd_1.asm: Rewrite.
 
diff -r b319b5e7816c -r 775523081472 mpn/x86/k7/gcd_1.asm
--- a/mpn/x86/k7/gcd_1.asm	Sat Mar 17 15:19:11 2012 +0100
+++ b/mpn/x86/k7/gcd_1.asm	Sat Mar 17 23:53:55 2012 +0100
@@ -1,328 +1,169 @@
-dnl  AMD K7 mpn_gcd_1 -- mpn by 1 gcd.
+dnl  x86 mpn_gcd_1 optimised for AMD K7.
 
-dnl  Copyright 2000, 2001, 2002, 2009, 2010 Free Software Foundation, Inc.
-dnl
+dnl  Contributed to the GNU project by Kevin Ryde.  Rehacked by Torbjorn
+dnl  Granlund.
+
+dnl  Copyright 2000, 2001, 2002, 2005, 2009, 2011, 2012 Free Software
+dnl  Foundation, Inc.
+
 dnl  This file is part of the GNU MP Library.
-dnl
-dnl  The GNU MP Library is free software; you can redistribute it and/or
-dnl  modify it under the terms of the GNU Lesser General Public License as
-dnl  published by the Free Software Foundation; either version 3 of the
-dnl  License, or (at your option) any later version.
-dnl
-dnl  The GNU MP Library is distributed in the hope that it will be useful,
-dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
-dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-dnl  Lesser General Public License for more details.
-dnl
+
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of the GNU Lesser General Public License as published
+dnl  by the Free Software Foundation; either version 3 of the License, or (at
+dnl  your option) any later version.
+
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+dnl  License for more details.
+
 dnl  You should have received a copy of the GNU Lesser General Public License
 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
 
 include(`../config.m4')
 
 
-C K7: 6.75 cycles/bit (approx)  1x1 gcd
-C     11.0 cycles/limb          Nx1 reduction (modexact_1_odd)
+C	     cycles/bit (approx)
+C AMD K7	 5.31
+C AMD K8,K9	 5.33
+C AMD K10	 5.30
+C AMD bd1	 ?
+C AMD bobcat	 7.02
+C Intel P4-2	10.1
+C Intel P4-3/4	10.0
+C Intel P6/13	 5.88
+C Intel core2	 6.26
+C Intel NHM	 6.83
+C Intel SBR	 8.50
+C Intel atom	 8.90
+C VIA nano	 ?
+C Numbers measured with: speed -CD -s16-32 -t16 mpn_gcd_1
 
-C This code was modernised in 2010 to avoid most use of 'div', but not
-C completely cleaned up.  Presumably, we should remove last 'div' too,
-C and simplify the structure to save many 'mov' insns.
+C TODO
+C  * Tune overhead, this takes 2-3 cycles more than old code when v0 is tiny.
+C  * Stream things better through registers, avoiding some copying.
 
-C Reduce using x%y if x is more than DIV_THRESHOLD bits bigger than y,
-C where x is the larger of the two.  See tune/README for more.
-C
-C divl at 40 cycles compared to the gcd at about 7 cycles/bitpair
-C suggests 40/7*2=11.4 but 7 seems to be about right.
-
-deflit(DIV_THRESHOLD, 7)
-
-
-C table[n] is the number of trailing zeros on n, or MAXSHIFT if n==0.
-C
-C This is mixed in with the code, but as per the k7 optimization manual it's
-C a full cache line and suitably aligned so it won't get swapped between
-C code and data.  Having it in TEXT rather than RODATA saves needing a GOT
-C entry when PIC.
-C
-C Actually, there doesn't seem to be a measurable difference between this in
-C it's own cache line or plonked in the middle of the code.  Presumably
-C since TEXT is read-only there's no worries about coherency.
+C ctz_table[n] is the number of trailing zeros on n, or MAXSHIFT if n==0.
 
 deflit(MAXSHIFT, 6)
 deflit(MASK, eval((m4_lshift(1,MAXSHIFT))-1))
 
-	TEXT
-	ALIGN(64)
-L(table):
+DEF_OBJECT(ctz_table,64)
 	.byte	MAXSHIFT
 forloop(i,1,MASK,
 `	.byte	m4_count_trailing_zeros(i)
 ')
+END_OBJECT(ctz_table)
 
+C Threshold of when to call bmod when U is one limb.  Should be about
+C (time_in_cycles(bmod_1,1) + call_overhead) / (cycles/bit).
+define(`DIV_THRES_LOG2', 7)
 
-C mp_limb_t mpn_gcd_1 (mp_srcptr src, mp_size_t size, mp_limb_t limb);
-C
 
-defframe(PARAM_LIMB,   12)
-defframe(PARAM_SIZE,    8)
-defframe(PARAM_SRC,     4)
+define(`up',    `%edi')
+define(`n',     `%esi')
+define(`v0',    `%edx')
 
-defframe(SAVE_EBX,     -4)
-defframe(SAVE_ESI,     -8)
-defframe(SAVE_EDI,    -12)
-defframe(SAVE_EBP,    -16)
-defframe(CALL_DIVISOR,-20)
-defframe(CALL_SIZE,   -24)
-defframe(CALL_SRC,    -28)
 
-deflit(STACK_SPACE, 28)
-
+ASM_START()
 	TEXT
 	ALIGN(16)
+PROLOGUE(mpn_gcd_1)
+	push	%edi
+	push	%esi
 
-PROLOGUE(mpn_gcd_1)
-deflit(`FRAME',0)
+	mov	12(%esp), up
+	mov	16(%esp), n
+	mov	20(%esp), v0
 
-	ASSERT(ne, `cmpl $0, PARAM_LIMB')	C y!=0
-	ASSERT(ae, `cmpl $1, PARAM_SIZE')	C size>=1
-
-	mov	PARAM_SRC, %eax
-	mov	PARAM_LIMB, %edx
-	sub	$STACK_SPACE, %esp	deflit(`FRAME',STACK_SPACE)
-
-	mov	%esi, SAVE_ESI
-	mov	%ebx, SAVE_EBX
-
-	mov	(%eax), %esi		C src low limb
-
-ifdef(`PIC',`
-	mov	%edi, SAVE_EDI
-	call	L(movl_eip_to_edi)
-L(here):
-	add	$L(table)-L(here), %edi
-')
-
-	mov	%esi, %ebx
-	or	%edx, %esi	C x|y
+	mov	(up), %eax		C U low limb
+	or	v0, %eax		C x | y
 	mov	$-1, %ecx
 
 L(twos):
 	inc	%ecx
-	shr	%esi
-	jnc	L(twos)		C 3/4 chance of x or y odd already
+	shr	%eax
+	jnc	L(twos)
 
-	shr	%cl, %ebx
-	shr	%cl, %edx
-	mov	%ecx, %esi	C common twos
+	shr	%cl, v0
+	mov	%ecx, %eax		C common twos
 
-	mov	PARAM_SIZE, %ecx
-	cmp	$1, %ecx
-	ja	L(divide)
+L(divide_strip_y):
+	shr	v0
+	jnc	L(divide_strip_y)
+	adc	v0, v0
 
+	push	%eax
+	push	v0
 
-	C eax
-	C ebx	x
-	C ecx
-	C edx	y
-	C esi	common twos
-	C edi	[PIC] L(table)
-	C ebp
+	cmp	$1, n
+	jnz	L(reduce_nby1)
 
+C Both U and V are single limbs, reduce with bmod if u0 >> v0.
+	mov	(up), %ecx
+	mov	%ecx, %eax
+	shr	$DIV_THRES_LOG2, %ecx
+	cmp	%ecx, v0
+	ja	L(reduced)
+
+	mov	v0, %esi
+	xor	%edx, %edx
+	div	%esi
 	mov	%edx, %eax
-	cmp	%ebx, %edx
+	jmp	L(reduced)
 
-	cmovc(	%ebx, %eax)	C swap to make x bigger than y
-	cmovc(	%edx, %ebx)
+L(reduce_nby1):
+	push	v0			C param 3
+	push	n			C param 2
+	push	up			C param 1
+	cmp	$BMOD_1_TO_MOD_1_THRESHOLD, n
+	jl	L(bmod)
+ifdef(`PIC',`
+	call	GSYM_PREFIX`'mpn_mod_1@PLT
+',`
+	call	GSYM_PREFIX`'mpn_mod_1
+')
+	jmp	L(called)
+L(bmod):
+ifdef(`PIC',`
+	call	GSYM_PREFIX`'mpn_modexact_1_odd@PLT
+',`
+	call	GSYM_PREFIX`'mpn_modexact_1_odd
+')
+L(called):
+	add	$12, %esp		C deallocate params
+L(reduced):
+	pop	%edx
 
+	LEA(	ctz_table, %esi)
+	test	%eax, %eax
+	mov	%eax, %ecx
+	jnz	L(mid)
+	jmp	L(end)
 
-L(strip_y):
-	C eax	x
-	C ebx	y
-	C ecx
-	C edx
-	C esi	common twos
-	C edi	[PIC] L(table)
-	C ebp
+	ALIGN(16)			C               K8    BC    P4    NHM   SBR
+L(top):	cmovc(	%ecx, %eax)		C if x-y < 0	0
+	cmovc(	%edi, %edx)		C use x,y-x	0
+L(mid):	and	$MASK, %ecx		C		0
+	movzbl	(%esi,%ecx), %ecx	C		1
+	jz	L(shift_alot)		C		1
+	shr	%cl, %eax		C		3
+	mov	%eax, %edi		C		4
+	mov	%edx, %ecx		C		3
+	sub	%eax, %ecx		C		4
+	sub	%edx, %eax		C		4
+	jnz	L(top)			C		5
 
-	ASSERT(nz,`orl %ebx,%ebx')
-	shr	%ebx
-	jnc	L(strip_y)
-	rcl	%ebx
-
-
-	C eax	x
-	C ebx	y (odd)
-	C ecx
-	C edx
-	C esi	common twos
-	C edi	[PIC] L(table)
-	C ebp
-
-	mov	%eax, %ecx
-	mov	%ebx, %edx
-	shr	$DIV_THRESHOLD, %eax
-
-	cmp	%eax, %ebx
-	mov	%ecx, %eax
-	ja	L(strip_x_entry)	C do x%y if x much bigger than y
-
-