[Gmp-commit] /home/hgfiles/gmp: 2 new changesets
mercurial at gmplib.org
mercurial at gmplib.org
Tue Jan 25 17:36:12 CET 2011
details: /home/hgfiles/gmp/rev/155ea5b007a6
changeset: 13771:155ea5b007a6
user: Marco Bodrato <bodrato at mail.dm.unipi.it>
date: Tue Jan 25 17:32:18 2011 +0100
description:
Native mpn{_pi1,}_bdiv_q_1 for x86/k7.
details: /home/hgfiles/gmp/rev/df87adb1597b
changeset: 13772:df87adb1597b
user: Marco Bodrato <bodrato at mail.dm.unipi.it>
date: Tue Jan 25 17:33:42 2011 +0100
description:
Native mpn{_pi1,}_bdiv_q_1 for x86/pentium4.
diffstat:
ChangeLog | 5 +
mpn/x86/k7/bdiv_q_1.asm | 233 +++++++++++++++++++++++++++++++++++++
mpn/x86/pentium4/sse2/bdiv_q_1.asm | 222 +++++++++++++++++++++++++++++++++++
3 files changed, 460 insertions(+), 0 deletions(-)
diffs (truncated from 475 to 300 lines):
diff -r 55daed086815 -r df87adb1597b ChangeLog
--- a/ChangeLog Tue Jan 25 00:00:06 2011 +0100
+++ b/ChangeLog Tue Jan 25 17:33:42 2011 +0100
@@ -1,3 +1,8 @@
+2011-01-25 Marco Bodrato <bodrato at mail.dm.unipi.it>
+
+ * mpn/x86/pentium4/sse2/bdiv_q_1.asm: New file.
+ * mpn/x86/k7/bdiv_q_1.asm: New file.
+
2011-01-24 Torbjorn Granlund <tege at gmplib.org>
* tune/tuneup.c (tune_mul_n, tune_sqr): Loop, re-measuring thresholds
diff -r 55daed086815 -r df87adb1597b mpn/x86/k7/bdiv_q_1.asm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/x86/k7/bdiv_q_1.asm Tue Jan 25 17:33:42 2011 +0100
@@ -0,0 +1,233 @@
+dnl AMD K7 mpn_bdiv_q_1 -- mpn by limb exact division.
+
+dnl Copyright 2001, 2002, 2004, 2007, 2011 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl Rearranged from mpn/x86/k7/dive_1.asm by Marco Bodrato.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C Athlon: 11.0
+C Hammer: 9.0
+
+
+C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t divisor);
+C
+C The dependent chain is mul+imul+sub for 11 cycles and that speed is
+C achieved with no special effort. The load and shrld latencies are hidden
+C by out of order execution.
+C
+C It's a touch faster on size==1 to use the mul-by-inverse than divl.
+
+defframe(PARAM_SHIFT, 24)
+defframe(PARAM_INVERSE,20)
+defframe(PARAM_DIVISOR,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+defframe(SAVE_EBX, -4)
+defframe(SAVE_ESI, -8)
+defframe(SAVE_EDI, -12)
+defframe(SAVE_EBP, -16)
+defframe(VAR_INVERSE, -20)
+defframe(VAR_DST_END, -24)
+
+deflit(STACK_SPACE, 24)
+
+ TEXT
+
+C mp_limb_t
+C mpn_pi1_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor,
+C mp_limb_t inverse, int shift)
+ ALIGN(16)
+PROLOGUE(mpn_pi1_bdiv_q_1)
+deflit(`FRAME',0)
+
+ subl $STACK_SPACE, %esp deflit(`FRAME',STACK_SPACE)
+ movl PARAM_SHIFT, %ecx C shift count
+
+ movl %ebp, SAVE_EBP
+ movl PARAM_SIZE, %ebp
+
+ movl %esi, SAVE_ESI
+ movl PARAM_SRC, %esi
+
+ movl %edi, SAVE_EDI
+ movl PARAM_DST, %edi
+
+ movl %ebx, SAVE_EBX
+
+ leal (%esi,%ebp,4), %esi C src end
+ leal (%edi,%ebp,4), %edi C dst end
+ negl %ebp C -size
+
+ movl PARAM_INVERSE, %eax C inv
+
+L(common):
+ movl %eax, VAR_INVERSE
+ movl (%esi,%ebp,4), %eax C src[0]
+
+ incl %ebp
+ jz L(one)
+
+ movl (%esi,%ebp,4), %edx C src[1]
+
+ shrdl( %cl, %edx, %eax)
+
+ movl %edi, VAR_DST_END
+ xorl %ebx, %ebx
+ jmp L(entry)
+
+ ALIGN(8)
+L(top):
+ C eax q
+ C ebx carry bit, 0 or 1
+ C ecx shift
+ C edx
+ C esi src end
+ C edi dst end
+ C ebp counter, limbs, negative
+
+ mull PARAM_DIVISOR C carry limb in edx
+
+ movl -4(%esi,%ebp,4), %eax
+ movl (%esi,%ebp,4), %edi
+
+ shrdl( %cl, %edi, %eax)
+
+ subl %ebx, %eax C apply carry bit
+ setc %bl
+ movl VAR_DST_END, %edi
+
+ subl %edx, %eax C apply carry limb
+ adcl $0, %ebx
+
+L(entry):
+ imull VAR_INVERSE, %eax
+
+ movl %eax, -4(%edi,%ebp,4)
+ incl %ebp
+ jnz L(top)
+
+
+ mull PARAM_DIVISOR C carry limb in edx
+
+ movl -4(%esi), %eax C src high limb
+ shrl %cl, %eax
+ movl SAVE_ESI, %esi
+
+ subl %ebx, %eax C apply carry bit
+ movl SAVE_EBX, %ebx
+ movl SAVE_EBP, %ebp
+
+ subl %edx, %eax C apply carry limb
+
+ imull VAR_INVERSE, %eax
+
+ movl %eax, -4(%edi)
+ movl SAVE_EDI, %edi
+ addl $STACK_SPACE, %esp
+
+ ret
+
+L(one):
+ shrl %cl, %eax
+ movl SAVE_ESI, %esi
+ movl SAVE_EBX, %ebx
+
+ imull VAR_INVERSE, %eax
+
+ movl SAVE_EBP, %ebp
+
+ movl %eax, -4(%edi)
+ movl SAVE_EDI, %edi
+ addl $STACK_SPACE, %esp
+
+ ret
+EPILOGUE()
+
+C mp_limb_t mpn_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t divisor);
+C
+
+ ALIGN(16)
+PROLOGUE(mpn_bdiv_q_1)
+deflit(`FRAME',0)
+
+ movl PARAM_DIVISOR, %eax
+ subl $STACK_SPACE, %esp deflit(`FRAME',STACK_SPACE)
+ movl $-1, %ecx C shift count
+
+ movl %ebp, SAVE_EBP
+ movl PARAM_SIZE, %ebp
+
+ movl %esi, SAVE_ESI
+ movl %edi, SAVE_EDI
+
+ C If there's usually only one or two trailing zero bits then this
+ C should be faster than bsfl.
+L(strip_twos):
+ incl %ecx
+ shrl %eax
+ jnc L(strip_twos)
+
+ movl %ebx, SAVE_EBX
+ leal 1(%eax,%eax), %ebx C d without twos
+ andl $127, %eax C d/2, 7 bits
+
+ifdef(`PIC',`
+ LEA( binvert_limb_table, %edx)
+ movzbl (%eax,%edx), %eax C inv 8 bits
+',`
+ movzbl binvert_limb_table(%eax), %eax C inv 8 bits
+')
+
+ leal (%eax,%eax), %edx C 2*inv
+ movl %ebx, PARAM_DIVISOR C d without twos
+
+ imull %eax, %eax C inv*inv
+
+ movl PARAM_SRC, %esi
+ movl PARAM_DST, %edi
+
+ imull %ebx, %eax C inv*inv*d
+
+ subl %eax, %edx C inv = 2*inv - inv*inv*d
+ leal (%edx,%edx), %eax C 2*inv
+
+ imull %edx, %edx C inv*inv
+
+ leal (%esi,%ebp,4), %esi C src end
+ leal (%edi,%ebp,4), %edi C dst end
+ negl %ebp C -size
+
+ imull %ebx, %edx C inv*inv*d
+
+ subl %edx, %eax C inv = 2*inv - inv*inv*d
+
+ ASSERT(e,` C expect d*inv == 1 mod 2^GMP_LIMB_BITS
+ pushl %eax FRAME_pushl()
+ imull PARAM_DIVISOR, %eax
+ cmpl $1, %eax
+ popl %eax FRAME_popl()')
+
+ jmp L(common)
+EPILOGUE()
diff -r 55daed086815 -r df87adb1597b mpn/x86/pentium4/sse2/bdiv_q_1.asm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/x86/pentium4/sse2/bdiv_q_1.asm Tue Jan 25 17:33:42 2011 +0100
@@ -0,0 +1,222 @@
+dnl Intel Pentium-4 mpn_divexact_1 -- mpn by limb exact division.
+
+dnl Copyright 2001, 2002, 2007, 2011 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl Rearranged from mpn/x86/pentium4/sse2/dive_1.asm by Marco Bodrato.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P4: 19.0 cycles/limb
+
+C Pairs of movd's are used to avoid unaligned loads. Despite the loads not
+C being on the dependent chain and there being plenty of cycles available,
+C using an unaligned movq on every second iteration measured about 23 c/l.
+C
+
+defframe(PARAM_SHIFT, 24)
+defframe(PARAM_INVERSE,20)
+defframe(PARAM_DIVISOR,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+ TEXT
+
+C mp_limb_t
+C mpn_pi1_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor,
+C mp_limb_t inverse, int shift)
+ ALIGN(32)
+PROLOGUE(mpn_pi1_bdiv_q_1)
+deflit(`FRAME',0)
+
More information about the gmp-commit
mailing list