[Gmp-commit] /home/hgfiles/gmp: 2 new changesets
mercurial at gmplib.org
mercurial at gmplib.org
Sat Jan 22 14:03:23 CET 2011
details: /home/hgfiles/gmp/rev/e26516bcb3d7
changeset: 13755:e26516bcb3d7
user: Torbjorn Granlund <tege at gmplib.org>
date: Thu Jan 20 19:36:02 2011 +0100
description:
Update to current gnuplot syntax.
details: /home/hgfiles/gmp/rev/33ef11b9a199
changeset: 13756:33ef11b9a199
user: Torbjorn Granlund <tege at gmplib.org>
date: Sat Jan 22 14:03:18 2011 +0100
description:
Trivial merge.
diffstat:
ChangeLog | 8 +
mpn/x86/p6/bdiv_q_1.asm | 275 +++++++++++++++++++++++++++++++++++++++++++
mpn/x86/pentium/bdiv_q_1.asm | 249 ++++++++++++++++++++++++++++++++++++++
tests/devel/try.c | 45 ++++++-
tune/speed.c | 2 +-
5 files changed, 574 insertions(+), 5 deletions(-)
diffs (truncated from 651 to 300 lines):
diff -r 37ac1001d611 -r 33ef11b9a199 ChangeLog
--- a/ChangeLog Thu Jan 20 11:33:06 2011 +0100
+++ b/ChangeLog Sat Jan 22 14:03:18 2011 +0100
@@ -1,5 +1,13 @@
+2011-01-21 Marco Bodrato <bodrato at mail.dm.unipi.it>
+
+ * tests/devel/try.c: Initial support for mpn_bdiv_q_1.
+ * mpn/x86/pentium/bdiv_q_1.asm: New file.
+ * mpn/x86/p6/bdiv_q_1.asm: New file.
+
2011-01-20 Torbjorn Granlund <tege at gmplib.org>
+ * tune/speed.c (run_gnuplot): Update to current gnuplot syntax.
+
* mpn/powerpc64/mode64/aorsmul_1.asm: Trim away 0.5 c/l for submul_1
for POWER5.
diff -r 37ac1001d611 -r 33ef11b9a199 mpn/x86/p6/bdiv_q_1.asm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/x86/p6/bdiv_q_1.asm Sat Jan 22 14:03:18 2011 +0100
@@ -0,0 +1,275 @@
+dnl Intel P6 mpn_modexact_1_odd -- exact division style remainder.
+
+dnl Copyright 2001, 2002, 2007, 2011 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl Rearranged from mpn/x86/p6/dive_1.asm by Marco Bodrato.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C odd even divisor
+C P6: 10.0 12.0 cycles/limb
+
+C MULFUNC_PROLOGUE(mpn_bdiv_q_1 mpn_pi1_bdiv_q_1)
+
+C The odd case is basically the same as mpn_modexact_1_odd, just with an
+C extra store, and it runs at the same 10 cycles which is the dependent
+C chain.
+C
+C The shifts for the even case aren't on the dependent chain so in principle
+C it could run the same too, but nothing running at 10 has been found.
+C Perhaps there's too many uops (an extra 4 over the odd case).
+
+defframe(PARAM_SHIFT, 24)
+defframe(PARAM_INVERSE,20)
+defframe(PARAM_DIVISOR,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+defframe(SAVE_EBX, -4)
+defframe(SAVE_ESI, -8)
+defframe(SAVE_EDI, -12)
+defframe(SAVE_EBP, -16)
+deflit(STACK_SPACE, 16)
+
+dnl re-use parameter space
+define(VAR_INVERSE,`PARAM_SRC')
+
+ TEXT
+
+C mp_limb_t
+C mpn_pi1_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor,
+C mp_limb_t inverse, int shift)
+
+ ALIGN(16)
+PROLOGUE(mpn_pi1_bdiv_q_1)
+deflit(`FRAME',0)
+
+ subl $STACK_SPACE, %esp FRAME_subl_esp(STACK_SPACE)
+
+ movl %esi, SAVE_ESI
+ movl PARAM_SRC, %esi
+
+ movl %ebx, SAVE_EBX
+ movl PARAM_SIZE, %ebx
+
+ movl %ebp, SAVE_EBP
+ movl PARAM_INVERSE, %ebp
+
+ movl PARAM_SHIFT, %ecx C trailing twos
+
+L(common):
+ movl %edi, SAVE_EDI
+ movl PARAM_DST, %edi
+
+ leal (%esi,%ebx,4), %esi C src end
+
+ leal (%edi,%ebx,4), %edi C dst end
+ negl %ebx C -size
+
+ movl (%esi,%ebx,4), %eax C src[0]
+
+ orl %ecx, %ecx
+ jz L(odd_entry)
+
+ movl %edi, PARAM_DST
+ movl %ebp, VAR_INVERSE
+
+L(even):
+ C eax src[0]
+ C ebx counter, limbs, negative
+ C ecx shift
+ C edx
+ C esi
+ C edi
+ C ebp
+
+ xorl %ebp, %ebp C initial carry bit
+ xorl %edx, %edx C initial carry limb (for size==1)
+
+ incl %ebx
+ jz L(even_one)
+
+ movl (%esi,%ebx,4), %edi C src[1]
+
+ shrdl( %cl, %edi, %eax)
+
+ jmp L(even_entry)
+
+
+L(even_top):
+ C eax scratch
+ C ebx counter, limbs, negative
+ C ecx shift
+ C edx scratch
+ C esi &src[size]
+ C edi &dst[size] and scratch
+ C ebp carry bit
+
+ movl (%esi,%ebx,4), %edi
+
+ mull PARAM_DIVISOR
+
+ movl -4(%esi,%ebx,4), %eax
+ shrdl( %cl, %edi, %eax)
+
+ subl %ebp, %eax
+
+ sbbl %ebp, %ebp
+ subl %edx, %eax
+
+ sbbl $0, %ebp
+
+L(even_entry):
+ imull VAR_INVERSE, %eax
+
+ movl PARAM_DST, %edi
+ negl %ebp
+
+ movl %eax, -4(%edi,%ebx,4)
+ incl %ebx
+ jnz L(even_top)
+
+ mull PARAM_DIVISOR
+
+ movl -4(%esi), %eax
+
+L(even_one):
+ shrl %cl, %eax
+ movl SAVE_ESI, %esi
+
+ subl %ebp, %eax
+ movl SAVE_EBP, %ebp
+
+ subl %edx, %eax
+ movl SAVE_EBX, %ebx
+
+ imull VAR_INVERSE, %eax
+
+ movl %eax, -4(%edi)
+ movl SAVE_EDI, %edi
+ addl $STACK_SPACE, %esp
+
+ ret
+
+C The dependent chain here is
+C
+C subl %edx, %eax 1
+C imull %ebp, %eax 4
+C mull PARAM_DIVISOR 5
+C ----
+C total 10
+C
+C and this is the measured speed. No special scheduling is necessary, out
+C of order execution hides the load latency.
+
+L(odd_top):
+ C eax scratch (src limb)
+ C ebx counter, limbs, negative
+ C ecx carry bit
+ C edx carry limb, high of last product
+ C esi &src[size]
+ C edi &dst[size]
+ C ebp inverse
+
+ mull PARAM_DIVISOR
+
+ movl (%esi,%ebx,4), %eax
+ subl %ecx, %eax
+
+ sbbl %ecx, %ecx
+ subl %edx, %eax
+
+ sbbl $0, %ecx
+
+L(odd_entry):
+ imull %ebp, %eax
+
+ movl %eax, (%edi,%ebx,4)
+ negl %ecx
+
+ incl %ebx
+ jnz L(odd_top)
+
+
+ movl SAVE_ESI, %esi
+
+ movl SAVE_EDI, %edi
+
+ movl SAVE_EBP, %ebp
+
+ movl SAVE_EBX, %ebx
+ addl $STACK_SPACE, %esp
+
+ ret
+
+EPILOGUE()
+
+C mp_limb_t mpn_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t divisor);
+C
+
+ ALIGN(16)
+PROLOGUE(mpn_bdiv_q_1)
+deflit(`FRAME',0)
+
+ movl PARAM_DIVISOR, %eax
+ subl $STACK_SPACE, %esp FRAME_subl_esp(STACK_SPACE)
+
+ movl %esi, SAVE_ESI
+ movl PARAM_SRC, %esi
+
+ movl %ebx, SAVE_EBX
+ movl PARAM_SIZE, %ebx
+
+ bsfl %eax, %ecx C trailing twos
+
+ movl %ebp, SAVE_EBP
+
+ shrl %cl, %eax C d without twos
+
+ movl %eax, %edx
+ shrl %eax C d/2 without twos
+
+ movl %edx, PARAM_DIVISOR
+ andl $127, %eax
+
+ifdef(`PIC',`
+ LEA( binvert_limb_table, %ebp)
+ movzbl (%eax,%ebp), %ebp C inv 8 bits
+',`
+ movzbl binvert_limb_table(%eax), %ebp C inv 8 bits
+')
+
+ leal (%ebp,%ebp), %eax C 2*inv
+
+ imull %ebp, %ebp C inv*inv
+ imull %edx, %ebp C inv*inv*d
+
+ subl %ebp, %eax C inv = 2*inv - inv*inv*d
+ leal (%eax,%eax), %ebp C 2*inv
+
+ imull %eax, %eax C inv*inv
+ imull %edx, %eax C inv*inv*d
+
+ subl %eax, %ebp C inv = 2*inv - inv*inv*d
+
+ jmp L(common)
+
+EPILOGUE()
diff -r 37ac1001d611 -r 33ef11b9a199 mpn/x86/pentium/bdiv_q_1.asm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/x86/pentium/bdiv_q_1.asm Sat Jan 22 14:03:18 2011 +0100
@@ -0,0 +1,249 @@
More information about the gmp-commit
mailing list