[Gmp-commit] /home/hgfiles/gmp: Native mpn{_pi1, }_bdiv_q_1 for x86.
mercurial at gmplib.org
mercurial at gmplib.org
Tue Jan 18 08:15:47 CET 2011
details: /home/hgfiles/gmp/rev/0bf61597048d
changeset: 13746:0bf61597048d
user: Marco Bodrato <bodrato at mail.dm.unipi.it>
date: Tue Jan 18 07:59:02 2011 +0100
description:
Native mpn{_pi1,}_bdiv_q_1 for x86.
diffstat:
ChangeLog | 4 +
mpn/x86/bdiv_q_1.asm | 197 +++++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 201 insertions(+), 0 deletions(-)
diffs (212 lines):
diff -r 2eb47368ef96 -r 0bf61597048d ChangeLog
--- a/ChangeLog Sat Jan 15 08:25:10 2011 +0100
+++ b/ChangeLog Tue Jan 18 07:59:02 2011 +0100
@@ -1,3 +1,7 @@
+2011-01-18 Marco Bodrato <bodrato at mail.dm.unipi.it>
+
+ * mpn/x86/bdiv_q_1.asm: New file (same core alg. as dive_1).
+
2011-01-15 Marco Bodrato <bodrato at mail.dm.unipi.it>
* mpn/generic/divexact.c: Avoid COPY if not needed.
diff -r 2eb47368ef96 -r 0bf61597048d mpn/x86/bdiv_q_1.asm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/x86/bdiv_q_1.asm Tue Jan 18 07:59:02 2011 +0100
@@ -0,0 +1,197 @@
+dnl x86 mpn_bdiv_q_1 -- mpn by limb exact division.
+
+dnl Copyright 2001, 2002, 2007, 2011 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl Rearranged from mpn/x86/dive_1.asm by Marco Bodrato.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C P54 30.0
+C P55 29.0
+C P6 13.0 odd divisor, 12.0 even (strangely)
+C K6 14.0
+C K7 12.0
+C P4 42.0
+
+MULFUNC_PROLOGUE(mpn_bdiv_q_1 mpn_pi1_bdiv_q_1)
+
+defframe(PARAM_SHIFT, 24)
+defframe(PARAM_INVERSE,20)
+defframe(PARAM_DIVISOR,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+dnl re-use parameter space
+define(VAR_INVERSE,`PARAM_SRC')
+
+ TEXT
+
+C mp_limb_t
+C mpn_pi1_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor,
+C mp_limb_t inverse, int shift)
+
+ ALIGN(16)
+PROLOGUE(mpn_pi1_bdiv_q_1)
+deflit(`FRAME',0)
+
+ movl PARAM_SHIFT, %ecx
+ pushl %ebp FRAME_pushl()
+
+ movl PARAM_INVERSE, %eax
+ movl PARAM_SIZE, %ebp
+ pushl %ebx FRAME_pushl()
+L(common):
+ pushl %edi FRAME_pushl()
+ pushl %esi FRAME_pushl()
+
+ movl PARAM_SRC, %esi
+ movl PARAM_DST, %edi
+
+ leal (%esi,%ebp,4), %esi C src end
+ leal (%edi,%ebp,4), %edi C dst end
+ negl %ebp C -size
+
+ movl %eax, VAR_INVERSE
+ movl (%esi,%ebp,4), %eax C src[0]
+
+ xorl %ebx, %ebx
+ xorl %edx, %edx
+
+ incl %ebp
+ jz L(one)
+
+ movl (%esi,%ebp,4), %edx C src[1]
+
+ shrdl( %cl, %edx, %eax)
+
+ movl VAR_INVERSE, %edx
+ jmp L(entry)
+
+
+ ALIGN(8)
+ nop C k6 code alignment
+ nop
+L(top):
+ C eax q
+ C ebx carry bit, 0 or -1
+ C ecx shift
+ C edx carry limb
+ C esi src end
+ C edi dst end
+ C ebp counter, limbs, negative
+
+ movl -4(%esi,%ebp,4), %eax
+ subl %ebx, %edx C accumulate carry bit
+
+ movl (%esi,%ebp,4), %ebx
+
+ shrdl( %cl, %ebx, %eax)
+
+ subl %edx, %eax C apply carry limb
+ movl VAR_INVERSE, %edx
+
+ sbbl %ebx, %ebx
+
+L(entry):
+ imull %edx, %eax
+
+ movl %eax, -4(%edi,%ebp,4)
+ movl PARAM_DIVISOR, %edx
+
+ mull %edx
+
+ incl %ebp
+ jnz L(top)
+
+
+ movl -4(%esi), %eax C src high limb
+L(one):
+ shrl %cl, %eax
+ popl %esi FRAME_popl()
+
+ addl %ebx, %eax C apply carry bit
+
+ subl %edx, %eax C apply carry limb
+
+ imull VAR_INVERSE, %eax
+
+ movl %eax, -4(%edi)
+
+ popl %edi
+ popl %ebx
+ popl %ebp
+
+ ret
+
+EPILOGUE()
+
+C mp_limb_t mpn_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C mp_limb_t divisor);
+C
+
+ ALIGN(16)
+PROLOGUE(mpn_bdiv_q_1)
+deflit(`FRAME',0)
+
+ movl PARAM_DIVISOR, %eax
+ pushl %ebp FRAME_pushl()
+
+ movl $-1, %ecx C shift count
+ movl PARAM_SIZE, %ebp
+
+ pushl %ebx FRAME_pushl()
+
+L(strip_twos):
+ incl %ecx
+
+ shrl %eax
+ jnc L(strip_twos)
+
+ leal 1(%eax,%eax), %ebx C d without twos
+ andl $127, %eax C d/2, 7 bits
+
+ifdef(`PIC',`
+ LEA( binvert_limb_table, %edx)
+ movzbl (%eax,%edx), %eax C inv 8 bits
+',`
+ movzbl binvert_limb_table(%eax), %eax C inv 8 bits
+')
+
+ leal (%eax,%eax), %edx C 2*inv
+ movl %ebx, PARAM_DIVISOR C d without twos
+ imull %eax, %eax C inv*inv
+ imull %ebx, %eax C inv*inv*d
+ subl %eax, %edx C inv = 2*inv - inv*inv*d
+
+ leal (%edx,%edx), %eax C 2*inv
+ imull %edx, %edx C inv*inv
+ imull %ebx, %edx C inv*inv*d
+ subl %edx, %eax C inv = 2*inv - inv*inv*d
+
+ ASSERT(e,` C expect d*inv == 1 mod 2^GMP_LIMB_BITS
+ pushl %eax FRAME_pushl()
+ imull PARAM_DIVISOR, %eax
+ cmpl $1, %eax
+ popl %eax FRAME_popl()')
+
+ jmp L(common)
+EPILOGUE()
+
More information about the gmp-commit
mailing list