[Gmp-commit] /var/hg/gmp: 2 new changesets
mercurial at gmplib.org
mercurial at gmplib.org
Wed Feb 16 21:53:27 CET 2011
details: /var/hg/gmp/rev/b96860d90a50
changeset: 13857:b96860d90a50
user: Torbjorn Granlund <tege at gmplib.org>
date: Tue Feb 15 13:46:49 2011 +0100
description:
Retune.
details: /var/hg/gmp/rev/d6a50e0f3d8d
changeset: 13858:d6a50e0f3d8d
user: Torbjorn Granlund <tege at gmplib.org>
date: Wed Feb 16 21:53:15 2011 +0100
description:
Trivial merge.
diffstat:
ChangeLog | 4 +
mpn/x86/aorsmul_1.asm | 2 +-
mpn/x86/atom/aorsmul_1.asm | 353 ++++++++++++++++++++++++++++++++++++-
mpn/x86/k6/aorsmul_1.asm | 2 +-
mpn/x86/k7/aorsmul_1.asm | 2 +-
mpn/x86/k7/mmx/popham.asm | 2 +-
mpn/x86/p6/aors_n.asm | 2 +-
mpn/x86/p6/aorsmul_1.asm | 2 +-
mpn/x86/pentium4/sse2/addmul_1.asm | 2 +-
mpn/x86/pentium4/sse2/mul_1.asm | 2 +-
mpn/x86/pentium4/sse2/submul_1.asm | 2 +-
mpn/x86_64/atom/gmp-mparam.h | 218 +++++++++------------
12 files changed, 457 insertions(+), 136 deletions(-)
diffs (truncated from 739 to 300 lines):
diff -r 3bb523b90f7c -r d6a50e0f3d8d ChangeLog
--- a/ChangeLog Mon Feb 14 06:50:32 2011 +0100
+++ b/ChangeLog Wed Feb 16 21:53:15 2011 +0100
@@ -1,3 +1,7 @@
+2011-02-16 Marco Bodrato <bodrato at mail.dm.unipi.it>
+
+ * mpn/x86/atom/aorsmul_1.asm: Revive an old k7/aorsmul.
+
2011-02-14 Marco Bodrato <bodrato at mail.dm.unipi.it>
* gmp-impl.h (mpn_sublsh_n): Declare.
diff -r 3bb523b90f7c -r d6a50e0f3d8d mpn/x86/aorsmul_1.asm
--- a/mpn/x86/aorsmul_1.asm Mon Feb 14 06:50:32 2011 +0100
+++ b/mpn/x86/aorsmul_1.asm Wed Feb 16 21:53:15 2011 +0100
@@ -25,7 +25,7 @@
C cycles/limb
C P5: 14.75
C P6 model 0-8,10-12) 7.5
-C P6 model 9 (Banias)
+C P6 model 9 (Banias) 6.7
C P6 model 13 (Dothan) 6.75
C P4 model 0 (Willamette) 24.0
C P4 model 1 (?) 24.0
diff -r 3bb523b90f7c -r d6a50e0f3d8d mpn/x86/atom/aorsmul_1.asm
--- a/mpn/x86/atom/aorsmul_1.asm Mon Feb 14 06:50:32 2011 +0100
+++ b/mpn/x86/atom/aorsmul_1.asm Wed Feb 16 21:53:15 2011 +0100
@@ -1,6 +1,6 @@
-dnl Intel Atom mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple.
+dnl AMD K7 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple.
-dnl Copyright 2011 Free Software Foundation, Inc.
+dnl Copyright 1999, 2000, 2001, 2002, 2005 Free Software Foundation, Inc.
dnl
dnl This file is part of the GNU MP Library.
dnl
@@ -19,5 +19,352 @@
include(`../config.m4')
+
+C cycles/limb
+C P5:
+C P6 model 0-8,10-12) 6.35
+C P6 model 9 (Banias)
+C P6 model 13 (Dothan) 6.25
+C P4 model 0 (Willamette)
+C P4 model 1 (?)
+C P4 model 2 (Northwood)
+C P4 model 3 (Prescott)
+C P4 model 4 (Nocona)
+C K6:
+C K7: 3.9
+C K8:
+
+
+dnl K7: UNROLL_COUNT cycles/limb
+dnl 4 4.42
+dnl 8 4.16
+dnl 16 3.9
+dnl 32 3.9
+dnl 64 3.87
+dnl Maximum possible with the current code is 64.
+
+deflit(UNROLL_COUNT, 16)
+
+
+ifdef(`OPERATION_addmul_1',`
+ define(M4_inst, addl)
+ define(M4_function_1, mpn_addmul_1)
+ define(M4_function_1c, mpn_addmul_1c)
+ define(M4_description, add it to)
+ define(M4_desc_retval, carry)
+',`ifdef(`OPERATION_submul_1',`
+ define(M4_inst, subl)
+ define(M4_function_1, mpn_submul_1)
+ define(M4_function_1c, mpn_submul_1c)
+ define(M4_description, subtract it from)
+ define(M4_desc_retval, borrow)
+',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
+')')')
+
MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c)
-include_mpn(`x86/k6/aorsmul_1.asm')
+
+
+C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t mult);
+C mp_limb_t M4_function_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t mult, mp_limb_t carry);
+C
+C Calculate src,size multiplied by mult and M4_description dst,size.
+C Return the M4_desc_retval limb from the top of the result.
+
+ifdef(`PIC',`
+deflit(UNROLL_THRESHOLD, 9)
+',`
+deflit(UNROLL_THRESHOLD, 6)
+')
+
+defframe(PARAM_CARRY, 20)
+defframe(PARAM_MULTIPLIER,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+deflit(`FRAME',0)
+
+defframe(SAVE_EBX, -4)
+defframe(SAVE_ESI, -8)
+defframe(SAVE_EDI, -12)
+defframe(SAVE_EBP, -16)
+deflit(SAVE_SIZE, 16)
+
+ TEXT
+ ALIGN(32)
+PROLOGUE(M4_function_1)
+ movl PARAM_SIZE, %edx
+ movl PARAM_SRC, %eax
+ xorl %ecx, %ecx
+
+ decl %edx
+ jnz L(start_1)
+
+ movl (%eax), %eax
+ movl PARAM_DST, %ecx
+
+ mull PARAM_MULTIPLIER
+
+ M4_inst %eax, (%ecx)
+ adcl $0, %edx
+ movl %edx, %eax
+
+ ret
+EPILOGUE()
+
+ ALIGN(16)
+PROLOGUE(M4_function_1c)
+ movl PARAM_SIZE, %edx
+ movl PARAM_SRC, %eax
+
+ decl %edx
+ jnz L(more_than_one_limb)
+
+ movl (%eax), %eax
+ movl PARAM_DST, %ecx
+
+ mull PARAM_MULTIPLIER
+
+ addl PARAM_CARRY, %eax
+
+ adcl $0, %edx
+ M4_inst %eax, (%ecx)
+
+ adcl $0, %edx
+ movl %edx, %eax
+
+ ret
+
+
+ C offset 0x44 so close enough to aligned
+L(more_than_one_limb):
+ movl PARAM_CARRY, %ecx
+L(start_1):
+ C eax src
+ C ecx initial carry
+ C edx size-1
+ subl $SAVE_SIZE, %esp
+deflit(`FRAME',16)
+
+ movl %ebx, SAVE_EBX
+ movl %esi, SAVE_ESI
+ movl %edx, %ebx C size-1
+
+ movl PARAM_SRC, %esi
+ movl %ebp, SAVE_EBP
+ cmpl $UNROLL_THRESHOLD, %edx
+
+ movl PARAM_MULTIPLIER, %ebp
+ movl %edi, SAVE_EDI
+
+ movl (%esi), %eax C src low limb
+ movl PARAM_DST, %edi
+ ja L(unroll)
+
+
+ C simple loop
+
+ leal 4(%esi,%ebx,4), %esi C point one limb past last
+ leal (%edi,%ebx,4), %edi C point at last limb
+ negl %ebx
+
+ C The movl to load the next source limb is done well ahead of the
+ C mul. This is necessary for full speed, and leads to one limb
+ C handled separately at the end.
+
+L(simple):
+ C eax src limb
+ C ebx loop counter
+ C ecx carry limb
+ C edx scratch
+ C esi src
+ C edi dst
+ C ebp multiplier
+
+ mull %ebp
+
+ addl %eax, %ecx
+ adcl $0, %edx
+
+ M4_inst %ecx, (%edi,%ebx,4)
+ movl (%esi,%ebx,4), %eax
+ adcl $0, %edx
+
+ incl %ebx
+ movl %edx, %ecx
+ jnz L(simple)
+
+
+ mull %ebp
+
+ movl SAVE_EBX, %ebx
+ movl SAVE_ESI, %esi
+ movl SAVE_EBP, %ebp
+
+ addl %eax, %ecx
+ adcl $0, %edx
+
+ M4_inst %ecx, (%edi)
+ adcl $0, %edx
+ movl SAVE_EDI, %edi
+
+ addl $SAVE_SIZE, %esp
+ movl %edx, %eax
+ ret
+
+
+
+C -----------------------------------------------------------------------------
+ ALIGN(16)
+L(unroll):
+ C eax src low limb
+ C ebx size-1
+ C ecx carry
+ C edx size-1
+ C esi src
+ C edi dst
+ C ebp multiplier
+
+dnl overlapping with parameters no longer needed
+define(VAR_COUNTER,`PARAM_SIZE')
+define(VAR_JUMP, `PARAM_MULTIPLIER')
+
+ subl $2, %ebx C (size-2)-1
+ decl %edx C size-2
+
+ shrl $UNROLL_LOG2, %ebx
+ negl %edx
+
+ movl %ebx, VAR_COUNTER
+ andl $UNROLL_MASK, %edx
+
+ movl %edx, %ebx
+ shll $4, %edx
+
+ifdef(`PIC',`
+ call L(pic_calc)
+L(here):
+',`
+ leal L(entry) (%edx,%ebx,1), %edx
+')
+ negl %ebx
+ movl %edx, VAR_JUMP
+
+ mull %ebp
+
+ addl %eax, %ecx C initial carry, becomes low carry
+ adcl $0, %edx
+ testb $1, %bl
+
+ movl 4(%esi), %eax C src second limb
+ leal ifelse(UNROLL_BYTES,256,128+) 8(%esi,%ebx,4), %esi
+ leal ifelse(UNROLL_BYTES,256,128) (%edi,%ebx,4), %edi
+
+ movl %edx, %ebx C high carry
+ cmovnz( %ecx, %ebx) C high,low carry other way around
+ cmovnz( %edx, %ecx)
+
+ jmp *VAR_JUMP
+
+
+ifdef(`PIC',`
+L(pic_calc):
+ C See mpn/x86/README about old gas bugs
+ leal (%edx,%ebx,1), %edx
+ addl $L(entry)-L(here), %edx
+ addl (%esp), %edx
+ ret_internal
+')
+
+
+C -----------------------------------------------------------------------------
More information about the gmp-commit
mailing list