[Gmp-commit] /var/hg/gmp: Rewrite atom/32 addmul_1 and submul_1.
mercurial at gmplib.org
mercurial at gmplib.org
Fri Feb 25 21:49:50 CET 2011
details: /var/hg/gmp/rev/3ea54f250ae5
changeset: 13907:3ea54f250ae5
user: Torbjorn Granlund <tege at gmplib.org>
date: Fri Feb 25 21:42:43 2011 +0100
description:
Rewrite atom/32 addmul_1 and submul_1.
diffstat:
ChangeLog | 5 +
mpn/x86/atom/aorsmul_1.asm | 371 ----------------------------------------
mpn/x86/atom/sse2/aorsmul_1.asm | 178 +++++++++++++++++++
3 files changed, 183 insertions(+), 371 deletions(-)
diffs (truncated from 569 to 300 lines):
diff -r c1f2195121bd -r 3ea54f250ae5 ChangeLog
--- a/ChangeLog Fri Feb 25 10:25:53 2011 +0100
+++ b/ChangeLog Fri Feb 25 21:42:43 2011 +0100
@@ -1,3 +1,8 @@
+2011-02-25 Torbjorn Granlund <tege at gmplib.org>
+
+ * mpn/x86/atom/sse2/aorsmul_1.asm: New file.
+ * mpn/x86/atom/aorsmul_1.asm: File removed.
+
2011-02-25 Marco Bodrato <bodrato at mail.dm.unipi.it>

* mpn/x86/atom/sse2/divrem_1.asm: New file (was in x86/atom).
diff -r c1f2195121bd -r 3ea54f250ae5 mpn/x86/atom/aorsmul_1.asm
--- a/mpn/x86/atom/aorsmul_1.asm Fri Feb 25 10:25:53 2011 +0100
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,371 +0,0 @@
-dnl Intel Atom mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple.
-
-dnl Copyright 1999, 2000, 2001, 2002, 2005, 2011 Free Software Foundation, Inc.
-dnl
-dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or
-dnl modify it under the terms of the GNU Lesser General Public License as
-dnl published by the Free Software Foundation; either version 3 of the
-dnl License, or (at your option) any later version.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful,
-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-dnl Lesser General Public License for more details.
-dnl
-dnl You should have received a copy of the GNU Lesser General Public License
-dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C cycles/limb
-C P5
-C P6 model 0-8,10-12 6.35
-C P6 model 9 (Banias)
-C P6 model 13 (Dothan) 6.25
-C P4 model 0 (Willamette)
-C P4 model 1 (?)
-C P4 model 2 (Northwood)
-C P4 model 3 (Prescott)
-C P4 model 4 (Nocona)
-C Intel Atom 11
-C AMD K6
-C AMD K7 3.9
-C AMD K8
-C AMD K10
-
-dnl K7: UNROLL_COUNT cycles/limb
-dnl 4 4.42
-dnl 8 4.16
-dnl 16 3.9
-dnl 32 3.9
-dnl 64 3.87
-dnl Maximum possible with the current code is 64.
-
-deflit(UNROLL_COUNT, 16)
-
-
-ifdef(`OPERATION_addmul_1',`
- define(M4_inst, addl)
- define(M4_function_1, mpn_addmul_1)
- define(M4_function_1c, mpn_addmul_1c)
- define(M4_description, add it to)
- define(M4_desc_retval, carry)
-',`ifdef(`OPERATION_submul_1',`
- define(M4_inst, subl)
- define(M4_function_1, mpn_submul_1)
- define(M4_function_1c, mpn_submul_1c)
- define(M4_description, subtract it from)
- define(M4_desc_retval, borrow)
-',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
-')')')
-
-MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c)
-
-
-C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
-C mp_limb_t mult);
-C mp_limb_t M4_function_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
-C mp_limb_t mult, mp_limb_t carry);
-C
-C Calculate src,size multiplied by mult and M4_description dst,size.
-C Return the M4_desc_retval limb from the top of the result.
-
-ifdef(`PIC',`
-deflit(UNROLL_THRESHOLD, 9)
-',`
-deflit(UNROLL_THRESHOLD, 6)
-')
-
-defframe(PARAM_CARRY, 20)
-defframe(PARAM_MULTIPLIER,16)
-defframe(PARAM_SIZE, 12)
-defframe(PARAM_SRC, 8)
-defframe(PARAM_DST, 4)
-deflit(`FRAME',0)
-
-defframe(SAVE_EBX, -4)
-defframe(SAVE_ESI, -8)
-defframe(SAVE_EDI, -12)
-defframe(SAVE_EBP, -16)
-deflit(SAVE_SIZE, 16)
-
-ASM_START()
- TEXT
- ALIGN(32)
-PROLOGUE(M4_function_1)
- movl PARAM_SIZE, %edx
- xorl %ecx, %ecx
-
- decl %edx
- movl PARAM_SRC, %eax
- jnz L(start_1)
-
- movl (%eax), %eax
- movl PARAM_DST, %ecx
-
- mull PARAM_MULTIPLIER
-
- M4_inst %eax, (%ecx)
- movl %edx, %eax
- adcl $0, %eax
-
- ret
-EPILOGUE()
-
- ALIGN(16)
-PROLOGUE(M4_function_1c)
- movl PARAM_SIZE, %edx
-
- decl %edx
- movl PARAM_SRC, %eax
- jnz L(more_than_one_limb)
-
- movl (%eax), %eax
- movl PARAM_DST, %ecx
-
- mull PARAM_MULTIPLIER
-
- addl PARAM_CARRY, %eax
-
- adcl $0, %edx
- M4_inst %eax, (%ecx)
-
- movl %edx, %eax
- adcl $0, %eax
-
- ret
-
-
- C offset 0x44 so close enough to aligned
-L(more_than_one_limb):
- movl PARAM_CARRY, %ecx
-L(start_1):
- C eax src
- C ecx initial carry
- C edx size-1
- subl $SAVE_SIZE, %esp
-deflit(`FRAME',16)
-
- movl %ebx, SAVE_EBX
- movl %edx, %ebx C size-1
- movl %esi, SAVE_ESI
-
- movl %eax, %esi
- movl %ebp, SAVE_EBP
- cmpl $UNROLL_THRESHOLD, %edx
-
- movl PARAM_MULTIPLIER, %ebp
- movl %edi, SAVE_EDI
-
- movl (%esi), %eax C src low limb
- movl PARAM_DST, %edi
- ja L(unroll)
-
-
- C simple loop
-
- C The movl to load the next source limb is done well ahead of the
- C mul. This is necessary for full speed, and leads to one limb
- C handled separately at the end.
-
-L(simple):
- C eax src limb
- C ebx loop counter
- C ecx carry limb
- C edx scratch
- C esi src
- C edi dst
- C ebp multiplier
-
- mull %ebp
- leal 4(%esi), %esi
-
- addl %eax, %ecx
- adcl $0, %edx
- movl (%esi), %eax
-
- M4_inst %ecx, (%edi)
- leal 4(%edi), %edi
- adcl $0, %edx
-
- decl %ebx
- movl %edx, %ecx
- jnz L(simple)
-
-
- mull %ebp
-
- movl SAVE_EBX, %ebx
- movl SAVE_ESI, %esi
- movl SAVE_EBP, %ebp
-
- addl %eax, %ecx
- adcl $0, %edx
-
- M4_inst %ecx, (%edi)
- adcl $0, %edx
- movl SAVE_EDI, %edi
-
- addl $SAVE_SIZE, %esp
- movl %edx, %eax
- ret
-
-
-
-C -----------------------------------------------------------------------------
- ALIGN(16)
-L(unroll):
- C eax src low limb
- C ebx size-1
- C ecx carry
- C edx size-1
- C esi src
- C edi dst
- C ebp multiplier
-
-dnl overlapping with parameters no longer needed
-define(VAR_COUNTER,`PARAM_SIZE')
-define(VAR_JUMP, `PARAM_MULTIPLIER')
-
- subl $2, %ebx C (size-2)-1
- decl %edx C size-2
-
- shrl $UNROLL_LOG2, %ebx
- negl %edx
-
- movl %ebx, VAR_COUNTER
- andl $UNROLL_MASK, %edx
-
- movl %edx, %ebx
- shll $4, %edx
-
-ifdef(`PIC',`
- call L(pic_calc)
-L(here):
-',`
- leal L(entry) (%edx,%ebx,1), %edx
-')
- negl %ebx
- movl %edx, VAR_JUMP
-
- mull %ebp
-
- addl %eax, %ecx C initial carry, becomes low carry
- adcl $0, %edx
- testb $1, %bl
-
- movl 4(%esi), %eax C src second limb
- leal ifelse(UNROLL_BYTES,256,128+) 8(%esi,%ebx,4), %esi
- leal ifelse(UNROLL_BYTES,256,128) (%edi,%ebx,4), %edi
-
- movl %edx, %ebx C high carry
- cmovnz( %ecx, %ebx) C high,low carry other way around
- cmovnz( %edx, %ecx)
-
- jmp *VAR_JUMP
-
-
-ifdef(`PIC',`
-L(pic_calc):
- C See mpn/x86/README about old gas bugs
- leal (%edx,%ebx,1), %edx
- addl $L(entry)-L(here), %edx
- addl (%esp), %edx
- ret_internal
-')
-
-
-C -----------------------------------------------------------------------------
-C This code uses a "two carry limbs" scheme. At the top of the loop the
-C carries are ebx=lo, ecx=hi, then they swap for each limb processed. For
-C the computed jump an odd size means they start one way around, an even
More information about the gmp-commit
mailing list