[Gmp-commit] /home/hgfiles/gmp: 2 new changesets

Tue Jan 25 17:36:12 CET 2011

details:   /home/hgfiles/gmp/rev/155ea5b007a6
changeset: 13771:155ea5b007a6
user:      Marco Bodrato <bodrato at mail.dm.unipi.it>
date:      Tue Jan 25 17:32:18 2011 +0100
description:
Native mpn{_pi1,}_bdiv_q_1 for x86/k7.

details:   /home/hgfiles/gmp/rev/df87adb1597b
changeset: 13772:df87adb1597b
user:      Marco Bodrato <bodrato at mail.dm.unipi.it>
date:      Tue Jan 25 17:33:42 2011 +0100
description:
Native mpn{_pi1,}_bdiv_q_1 for x86/pentium4.

diffstat:

 ChangeLog                          |    5 +
 mpn/x86/k7/bdiv_q_1.asm            |  233 +++++++++++++++++++++++++++++++++++++
 mpn/x86/pentium4/sse2/bdiv_q_1.asm |  222 +++++++++++++++++++++++++++++++++++
 3 files changed, 460 insertions(+), 0 deletions(-)

diffs (truncated from 475 to 300 lines):

diff -r 55daed086815 -r df87adb1597b ChangeLog

--- a/ChangeLog	Tue Jan 25 00:00:06 2011 +0100
+++ b/ChangeLog	Tue Jan 25 17:33:42 2011 +0100
@@ -1,3 +1,8 @@
+2011-01-25 Marco Bodrato <bodrato at mail.dm.unipi.it>
+
+	* mpn/x86/pentium4/sse2/bdiv_q_1.asm: New file.
+	* mpn/x86/k7/bdiv_q_1.asm: New file.
+
 2011-01-24  Torbjorn Granlund  <tege at gmplib.org>
 
 	* tune/tuneup.c (tune_mul_n, tune_sqr): Loop, re-measuring thresholds
diff -r 55daed086815 -r df87adb1597b mpn/x86/k7/bdiv_q_1.asm
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/x86/k7/bdiv_q_1.asm	Tue Jan 25 17:33:42 2011 +0100
@@ -0,0 +1,233 @@
+dnl  AMD K7 mpn_bdiv_q_1 -- mpn by limb exact division.
+
+dnl  Copyright 2001, 2002, 2004, 2007, 2011 Free Software Foundation, Inc.
+dnl
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  Rearranged from mpn/x86/k7/dive_1.asm by Marco Bodrato.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or
+dnl  modify it under the terms of the GNU Lesser General Public License as
+dnl  published by the Free Software Foundation; either version 3 of the
+dnl  License, or (at your option) any later version.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful,
+dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+dnl  Lesser General Public License for more details.
+dnl
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C          cycles/limb
+C Athlon:     11.0
+C Hammer:      9.0
+
+
+C mp_limb_t mpn_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                         mp_limb_t divisor);
+C
+C The dependent chain is mul+imul+sub for 11 cycles and that speed is
+C achieved with no special effort.  The load and shrld latencies are hidden
+C by out of order execution.
+C
+C It's a touch faster on size==1 to use the mul-by-inverse than divl.
+
+defframe(PARAM_SHIFT,  24)	C 6th arg, read by mpn_pi1_bdiv_q_1 only
+defframe(PARAM_INVERSE,20)	C 5th arg, read by mpn_pi1_bdiv_q_1 only
+defframe(PARAM_DIVISOR,16)	C 4th arg, mull operand in the loop
+defframe(PARAM_SIZE,   12)	C 3rd arg
+defframe(PARAM_SRC,    8)	C 2nd arg
+defframe(PARAM_DST,    4)	C 1st arg
+
+defframe(SAVE_EBX,     -4)	C spill slots: each register is stored
+defframe(SAVE_ESI,     -8)	C   here on entry and reloaded from
+defframe(SAVE_EDI,    -12)	C   here before every ret
+defframe(SAVE_EBP,    -16)
+defframe(VAR_INVERSE, -20)	C inverse, memory operand of imull
+defframe(VAR_DST_END, -24)	C dst end, since edi is a temp in the loop
+
+deflit(STACK_SPACE, 24)		C room for the six 4-byte slots above
+
+	TEXT
+
+C mp_limb_t
+C mpn_pi1_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor,
+C		    mp_limb_t inverse, int shift)
+	ALIGN(16)
+PROLOGUE(mpn_pi1_bdiv_q_1)
+deflit(`FRAME',0)
+
+	subl	$STACK_SPACE, %esp	deflit(`FRAME',STACK_SPACE)
+	movl	PARAM_SHIFT, %ecx	C shift count
+
+	movl	%ebp, SAVE_EBP		C spill ebp, reloaded before each ret
+	movl	PARAM_SIZE, %ebp	C size, in limbs
+
+	movl	%esi, SAVE_ESI		C spill esi
+	movl	PARAM_SRC, %esi
+
+	movl	%edi, SAVE_EDI		C spill edi
+	movl	PARAM_DST, %edi
+
+	movl	%ebx, SAVE_EBX		C spill ebx, it holds the carry bit below
+
+	leal	(%esi,%ebp,4), %esi	C src end
+	leal	(%edi,%ebp,4), %edi	C dst end
+	negl	%ebp			C -size
+
+	movl	PARAM_INVERSE, %eax	C inv
+
+L(common):				C mpn_bdiv_q_1 jumps here: eax inv, ecx shift
+	movl	%eax, VAR_INVERSE	C eax is needed for limbs from here on
+	movl	(%esi,%ebp,4), %eax	C src[0]
+
+	incl	%ebp			C counter = -(size-1)
+	jz	L(one)			C size==1, no loop needed
+
+	movl	(%esi,%ebp,4), %edx	C src[1]
+
+	shrdl(	%cl, %edx, %eax)	C src[0] >> shift, top bits from src[1]
+
+	movl	%edi, VAR_DST_END	C edi gets reused as a temp in the loop
+	xorl	%ebx, %ebx		C no carry bit yet
+	jmp	L(entry)		C first limb has no carry to apply
+
+	ALIGN(8)
+L(top):
+	C eax	q
+	C ebx	carry bit, 0 or 1
+	C ecx	shift
+	C edx
+	C esi	src end
+	C edi	dst end
+	C ebp	counter, limbs, negative
+
+	mull	PARAM_DIVISOR		C carry limb in edx
+
+	movl	-4(%esi,%ebp,4), %eax	C next src limb
+	movl	(%esi,%ebp,4), %edi	C the limb above it, edi as temp
+
+	shrdl(	%cl, %edi, %eax)	C src limb >> shift, top bits from edi
+
+	subl	%ebx, %eax		C apply carry bit
+	setc	%bl			C borrow out of that subtract
+	movl	VAR_DST_END, %edi	C edi = dst end again
+
+	subl	%edx, %eax		C apply carry limb
+	adcl	$0, %ebx		C add the borrow out of this subtract
+
+L(entry):
+	imull	VAR_INVERSE, %eax	C q = limb * inverse, low 32 bits
+
+	movl	%eax, -4(%edi,%ebp,4)	C store quotient limb
+	incl	%ebp
+	jnz	L(top)			C until the counter reaches zero
+
+
+	mull	PARAM_DIVISOR		C carry limb in edx
+
+	movl	-4(%esi), %eax		C src high limb
+	shrl	%cl, %eax		C no higher limb to shift in
+	movl	SAVE_ESI, %esi		C restore esi
+
+	subl	%ebx, %eax		C apply carry bit
+	movl	SAVE_EBX, %ebx		C restore ebx, carry bit is consumed
+	movl	SAVE_EBP, %ebp		C restore ebp
+
+	subl	%edx, %eax		C apply carry limb
+
+	imull	VAR_INVERSE, %eax	C last quotient limb
+
+	movl	%eax, -4(%edi)		C dst[size-1]
+	movl	SAVE_EDI, %edi		C restore edi
+	addl	$STACK_SPACE, %esp	C release the frame
+
+	ret
+
+L(one):					C size==1: eax = src[0], edi = dst end
+	shrl	%cl, %eax		C single limb, nothing to shift in
+	movl	SAVE_ESI, %esi		C restore esi
+	movl	SAVE_EBX, %ebx		C restore ebx
+
+	imull	VAR_INVERSE, %eax	C q = limb * inverse, low 32 bits
+
+	movl	SAVE_EBP, %ebp		C restore ebp
+
+	movl	%eax, -4(%edi)		C dst[0]
+	movl	SAVE_EDI, %edi		C restore edi
+	addl	$STACK_SPACE, %esp	C release the frame
+
+	ret
+EPILOGUE()
+
+C mp_limb_t mpn_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                           mp_limb_t divisor);
+C
+
+	ALIGN(16)
+PROLOGUE(mpn_bdiv_q_1)
+deflit(`FRAME',0)
+
+	movl	PARAM_DIVISOR, %eax	C d, must be nonzero, see strip_twos
+	subl	$STACK_SPACE, %esp	deflit(`FRAME',STACK_SPACE)
+	movl	$-1, %ecx		C shift count, 0 after the first incl
+
+	movl	%ebp, SAVE_EBP		C spill ebp, reloaded after L(common)
+	movl	PARAM_SIZE, %ebp	C size, in limbs
+
+	movl	%esi, SAVE_ESI		C spill esi
+	movl	%edi, SAVE_EDI		C spill edi
+
+	C If there's usually only one or two trailing zero bits then this
+	C should be faster than bsfl.
+L(strip_twos):
+	incl	%ecx			C ecx = zero bits stripped so far
+	shrl	%eax			C CF = bit shifted out
+	jnc	L(strip_twos)		C repeat while it was 0; d==0 never exits
+
+	movl	%ebx, SAVE_EBX		C spill ebx before using it for d
+	leal	1(%eax,%eax), %ebx	C d without twos
+	andl	$127, %eax		C d/2, 7 bits
+
+ifdef(`PIC',`
+	LEA(	binvert_limb_table, %edx)
+	movzbl	(%eax,%edx), %eax		C inv 8 bits
+',`
+	movzbl	binvert_limb_table(%eax), %eax	C inv 8 bits
+')
+
+	leal	(%eax,%eax), %edx	C 2*inv
+	movl	%ebx, PARAM_DIVISOR	C arg slot now holds d without twos
+
+	imull	%eax, %eax		C inv*inv
+
+	movl	PARAM_SRC, %esi
+	movl	PARAM_DST, %edi
+
+	imull	%ebx, %eax		C inv*inv*d
+
+	subl	%eax, %edx		C inv = 2*inv - inv*inv*d
+	leal	(%edx,%edx), %eax	C 2*inv
+
+	imull	%edx, %edx		C inv*inv
+
+	leal	(%esi,%ebp,4), %esi	C src end
+	leal	(%edi,%ebp,4), %edi	C dst end
+	negl	%ebp			C -size
+
+	imull	%ebx, %edx		C inv*inv*d
+
+	subl	%edx, %eax		C inv = 2*inv - inv*inv*d
+
+	ASSERT(e,`	C expect d*inv == 1 mod 2^GMP_LIMB_BITS
+	pushl	%eax	FRAME_pushl()
+	imull	PARAM_DIVISOR, %eax
+	cmpl	$1, %eax
+	popl	%eax	FRAME_popl()')
+
+	jmp	L(common)		C eax inv, ecx shift: finish in mpn_pi1_bdiv_q_1
+EPILOGUE()
diff -r 55daed086815 -r df87adb1597b mpn/x86/pentium4/sse2/bdiv_q_1.asm
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/x86/pentium4/sse2/bdiv_q_1.asm	Tue Jan 25 17:33:42 2011 +0100
@@ -0,0 +1,222 @@
+dnl  Intel Pentium-4 mpn_bdiv_q_1 -- mpn by limb exact division.
+
+dnl  Copyright 2001, 2002, 2007, 2011 Free Software Foundation, Inc.
+dnl
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  Rearranged from mpn/x86/pentium4/sse2/dive_1.asm by Marco Bodrato.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or
+dnl  modify it under the terms of the GNU Lesser General Public License as
+dnl  published by the Free Software Foundation; either version 3 of the
+dnl  License, or (at your option) any later version.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful,
+dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+dnl  Lesser General Public License for more details.
+dnl
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P4: 19.0 cycles/limb
+
+C Pairs of movd's are used to avoid unaligned loads.  Despite the loads not
+C being on the dependent chain and there being plenty of cycles available,
+C using an unaligned movq on every second iteration measured about 23 c/l.
+C
+
+defframe(PARAM_SHIFT,  24)	C 6th arg of mpn_pi1_bdiv_q_1
+defframe(PARAM_INVERSE,20)	C 5th arg of mpn_pi1_bdiv_q_1
+defframe(PARAM_DIVISOR,16)	C 4th arg
+defframe(PARAM_SIZE,   12)	C 3rd arg
+defframe(PARAM_SRC,    8)	C 2nd arg
+defframe(PARAM_DST,    4)	C 1st arg
+
+	TEXT
+
+C mp_limb_t
+C mpn_pi1_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor,
+C		    mp_limb_t inverse, int shift)
+	ALIGN(32)
+PROLOGUE(mpn_pi1_bdiv_q_1)
+deflit(`FRAME',0)
+