[Gmp-commit] /var/hg/gmp: Rewrite atom/32 addmul_1 and submul_1.

mercurial at gmplib.org mercurial at gmplib.org
Fri Feb 25 21:49:50 CET 2011


details:   /var/hg/gmp/rev/3ea54f250ae5
changeset: 13907:3ea54f250ae5
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Fri Feb 25 21:42:43 2011 +0100
description:
Rewrite atom/32 addmul_1 and submul_1.

diffstat:

 ChangeLog                       |    5 +
 mpn/x86/atom/aorsmul_1.asm      |  371 ----------------------------------------
 mpn/x86/atom/sse2/aorsmul_1.asm |  178 +++++++++++++++++++
 3 files changed, 183 insertions(+), 371 deletions(-)

diffs (truncated from 569 to 300 lines):

diff -r c1f2195121bd -r 3ea54f250ae5 ChangeLog
--- a/ChangeLog	Fri Feb 25 10:25:53 2011 +0100
+++ b/ChangeLog	Fri Feb 25 21:42:43 2011 +0100
@@ -1,3 +1,8 @@
+2011-02-25  Torbjorn Granlund  <tege at gmplib.org>
+
+	* mpn/x86/atom/sse2/aorsmul_1.asm: New file.
+	* mpn/x86/atom/aorsmul_1.asm: File removed.
+
 2011-02-25 Marco Bodrato <bodrato at mail.dm.unipi.it>
 
 	* mpn/x86/atom/sse2/divrem_1.asm: New file (was in x86/atom).
diff -r c1f2195121bd -r 3ea54f250ae5 mpn/x86/atom/aorsmul_1.asm
--- a/mpn/x86/atom/aorsmul_1.asm	Fri Feb 25 10:25:53 2011 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,371 +0,0 @@
-dnl  Intel Atom mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple.
-
-dnl  Copyright 1999, 2000, 2001, 2002, 2005, 2011 Free Software Foundation, Inc.
-dnl
-dnl  This file is part of the GNU MP Library.
-dnl
-dnl  The GNU MP Library is free software; you can redistribute it and/or
-dnl  modify it under the terms of the GNU Lesser General Public License as
-dnl  published by the Free Software Foundation; either version 3 of the
-dnl  License, or (at your option) any later version.
-dnl
-dnl  The GNU MP Library is distributed in the hope that it will be useful,
-dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
-dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-dnl  Lesser General Public License for more details.
-dnl
-dnl  You should have received a copy of the GNU Lesser General Public License
-dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C			    cycles/limb
-C P5
-C P6 model 0-8,10-12		 6.35
-C P6 model 9  (Banias)
-C P6 model 13 (Dothan)		 6.25
-C P4 model 0  (Willamette)
-C P4 model 1  (?)
-C P4 model 2  (Northwood)
-C P4 model 3  (Prescott)
-C P4 model 4  (Nocona)
-C Intel Atom			11
-C AMD K6
-C AMD K7			 3.9
-C AMD K8
-C AMD K10
-
-dnl  K7: UNROLL_COUNT  cycles/limb
-dnl           4            4.42
-dnl           8            4.16
-dnl          16            3.9
-dnl          32            3.9
-dnl          64            3.87
-dnl  Maximum possible with the current code is 64.
-
-deflit(UNROLL_COUNT, 16)
-
-
-ifdef(`OPERATION_addmul_1',`
-	define(M4_inst,        addl)
-	define(M4_function_1,  mpn_addmul_1)
-	define(M4_function_1c, mpn_addmul_1c)
-	define(M4_description, add it to)
-	define(M4_desc_retval, carry)
-',`ifdef(`OPERATION_submul_1',`
-	define(M4_inst,        subl)
-	define(M4_function_1,  mpn_submul_1)
-	define(M4_function_1c, mpn_submul_1c)
-	define(M4_description, subtract it from)
-	define(M4_desc_retval, borrow)
-',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
-')')')
-
-MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c)
-
-
-C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
-C                            mp_limb_t mult);
-C mp_limb_t M4_function_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
-C                             mp_limb_t mult, mp_limb_t carry);
-C
-C Calculate src,size multiplied by mult and M4_description dst,size.
-C Return the M4_desc_retval limb from the top of the result.
-
-ifdef(`PIC',`
-deflit(UNROLL_THRESHOLD, 9)
-',`
-deflit(UNROLL_THRESHOLD, 6)
-')
-
-defframe(PARAM_CARRY,     20)
-defframe(PARAM_MULTIPLIER,16)
-defframe(PARAM_SIZE,      12)
-defframe(PARAM_SRC,       8)
-defframe(PARAM_DST,       4)
-deflit(`FRAME',0)
-
-defframe(SAVE_EBX, -4)
-defframe(SAVE_ESI, -8)
-defframe(SAVE_EDI, -12)
-defframe(SAVE_EBP, -16)
-deflit(SAVE_SIZE, 16)
-
-ASM_START()
-	TEXT
-	ALIGN(32)
-PROLOGUE(M4_function_1)
-	movl	PARAM_SIZE, %edx
-	xorl	%ecx, %ecx
-
-	decl	%edx
-	movl	PARAM_SRC, %eax
-	jnz	L(start_1)
-
-	movl	(%eax), %eax
-	movl	PARAM_DST, %ecx
-
-	mull	PARAM_MULTIPLIER
-
-	M4_inst	%eax, (%ecx)
-	movl	%edx, %eax
-	adcl	$0, %eax
-
-	ret
-EPILOGUE()
-
-	ALIGN(16)
-PROLOGUE(M4_function_1c)
-	movl	PARAM_SIZE, %edx
-
-	decl	%edx
-	movl	PARAM_SRC, %eax
-	jnz	L(more_than_one_limb)
-
-	movl	(%eax), %eax
-	movl	PARAM_DST, %ecx
-
-	mull	PARAM_MULTIPLIER
-
-	addl	PARAM_CARRY, %eax
-
-	adcl	$0, %edx
-	M4_inst	%eax, (%ecx)
-
-	movl	%edx, %eax
-	adcl	$0, %eax
-
-	ret
-
-
-	C offset 0x44 so close enough to aligned
-L(more_than_one_limb):
-	movl	PARAM_CARRY, %ecx
-L(start_1):
-	C eax	src
-	C ecx	initial carry
-	C edx	size-1
-	subl	$SAVE_SIZE, %esp
-deflit(`FRAME',16)
-
-	movl	%ebx, SAVE_EBX
-	movl	%edx, %ebx	C size-1
-	movl	%esi, SAVE_ESI
-
-	movl	%eax, %esi
-	movl	%ebp, SAVE_EBP
-	cmpl	$UNROLL_THRESHOLD, %edx
-
-	movl	PARAM_MULTIPLIER, %ebp
-	movl	%edi, SAVE_EDI
-
-	movl	(%esi), %eax	C src low limb
-	movl	PARAM_DST, %edi
-	ja	L(unroll)
-
-
-	C simple loop
-
-	C The movl to load the next source limb is done well ahead of the
-	C mul.  This is necessary for full speed, and leads to one limb
-	C handled separately at the end.
-
-L(simple):
-	C eax	src limb
-	C ebx	loop counter
-	C ecx	carry limb
-	C edx	scratch
-	C esi	src
-	C edi	dst
-	C ebp	multiplier
-
-	mull	%ebp
-	leal	4(%esi), %esi
-
-	addl	%eax, %ecx
-	adcl	$0, %edx
-	movl	(%esi), %eax
-
-	M4_inst	%ecx, (%edi)
-	leal	4(%edi), %edi
-	adcl	$0, %edx
-
-	decl	%ebx
-	movl	%edx, %ecx
-	jnz	L(simple)
-
-
-	mull	%ebp
-
-	movl	SAVE_EBX, %ebx
-	movl	SAVE_ESI, %esi
-	movl	SAVE_EBP, %ebp
-
-	addl	%eax, %ecx
-	adcl	$0, %edx
-
-	M4_inst	%ecx, (%edi)
-	adcl	$0, %edx
-	movl	SAVE_EDI, %edi
-
-	addl	$SAVE_SIZE, %esp
-	movl	%edx, %eax
-	ret
-
-
-
-C -----------------------------------------------------------------------------
-	ALIGN(16)
-L(unroll):
-	C eax	src low limb
-	C ebx	size-1
-	C ecx	carry
-	C edx	size-1
-	C esi	src
-	C edi	dst
-	C ebp	multiplier
-
-dnl  overlapping with parameters no longer needed
-define(VAR_COUNTER,`PARAM_SIZE')
-define(VAR_JUMP,   `PARAM_MULTIPLIER')
-
-	subl	$2, %ebx	C (size-2)-1
-	decl	%edx		C size-2
-
-	shrl	$UNROLL_LOG2, %ebx
-	negl	%edx
-
-	movl	%ebx, VAR_COUNTER
-	andl	$UNROLL_MASK, %edx
-
-	movl	%edx, %ebx
-	shll	$4, %edx
-
-ifdef(`PIC',`
-	call	L(pic_calc)
-L(here):
-',`
-	leal	L(entry) (%edx,%ebx,1), %edx
-')
-	negl	%ebx
-	movl	%edx, VAR_JUMP
-
-	mull	%ebp
-
-	addl	%eax, %ecx	C initial carry, becomes low carry
-	adcl	$0, %edx
-	testb	$1, %bl
-
-	movl	4(%esi), %eax	C src second limb
-	leal	ifelse(UNROLL_BYTES,256,128+) 8(%esi,%ebx,4), %esi
-	leal	ifelse(UNROLL_BYTES,256,128)   (%edi,%ebx,4), %edi
-
-	movl	%edx, %ebx	C high carry
-	cmovnz(	%ecx, %ebx)	C high,low carry other way around
-	cmovnz(	%edx, %ecx)
-
-	jmp	*VAR_JUMP
-
-
-ifdef(`PIC',`
-L(pic_calc):
-	C See mpn/x86/README about old gas bugs
-	leal	(%edx,%ebx,1), %edx
-	addl	$L(entry)-L(here), %edx
-	addl	(%esp), %edx
-	ret_internal
-')
-
-
-C -----------------------------------------------------------------------------
-C This code uses a "two carry limbs" scheme.  At the top of the loop the
-C carries are ebx=lo, ecx=hi, then they swap for each limb processed.  For
-C the computed jump an odd size means they start one way around, an even


More information about the gmp-commit mailing list