[Gmp-commit] /var/hg/gmp: 3 new changesets
mercurial at gmplib.org
mercurial at gmplib.org
Thu Feb 24 22:24:52 CET 2011
details: /var/hg/gmp/rev/56b84d514294
changeset: 13893:56b84d514294
user: Torbjorn Granlund <tege at gmplib.org>
date: Thu Feb 24 22:19:20 2011 +0100
description:
Fix typo in MULFUNC_PROLOGUE.
details: /var/hg/gmp/rev/44b583f583d4
changeset: 13894:44b583f583d4
user: Torbjorn Granlund <tege at gmplib.org>
date: Thu Feb 24 22:20:25 2011 +0100
description:
New file.
details: /var/hg/gmp/rev/71bcec497b55
changeset: 13895:71bcec497b55
user: Torbjorn Granlund <tege at gmplib.org>
date: Thu Feb 24 22:24:46 2011 +0100
description:
Trivial merge.
diffstat:
ChangeLog | 9 ++
mpn/x86/atom/aors_n.asm | 6 +-
mpn/x86/atom/logops_n.asm | 142 ++++++++++++++++++++++++++++--
mpn/x86/k7/mod_1_1.asm | 197 +++++++++++++++++++++++++++++--------------
mpn/x86/p6/sse2/mod_1_1.asm | 23 +++++
mpn/x86/p6/sse2/mod_1_4.asm | 23 ++--
6 files changed, 308 insertions(+), 92 deletions(-)
diffs (truncated from 510 to 300 lines):
diff -r 980fee0af6d5 -r 71bcec497b55 ChangeLog
--- a/ChangeLog Wed Feb 23 11:28:44 2011 +0100
+++ b/ChangeLog Thu Feb 24 22:24:46 2011 +0100
@@ -1,3 +1,12 @@
+2011-02-24 Niels Möller <nisse at lysator.liu.se>
+
+ * mpn/x86/k7/mod_1_1.asm (mpn_mod_1_1p): Rewrite using the same
+ algorithm as the x86_64 version.
+
+2011-02-23 Marco Bodrato <bodrato at mail.dm.unipi.it>
+
+ * mpn/x86/atom/logops_n.asm: New file (same loop as aors_n).
+
2011-02-23 Niels Möller <nisse at lysator.liu.se>
* mpn/x86_64/mod_1_1.asm (mpn_mod_1_1p): Shaved off one
diff -r 980fee0af6d5 -r 71bcec497b55 mpn/x86/atom/aors_n.asm
--- a/mpn/x86/atom/aors_n.asm Wed Feb 23 11:28:44 2011 +0100
+++ b/mpn/x86/atom/aors_n.asm Thu Feb 24 22:24:46 2011 +0100
@@ -78,7 +78,7 @@
define(`rp', `%edi')
define(`up', `%esi')
-define(`vp', `%ebp')
+define(`vp', `%ebx')
define(`cy', `%ecx')
define(`r1', `%ecx')
define(`r2', `%edx')
@@ -95,8 +95,8 @@
mov rp, SAVE_RP
mov PARAM_DST, rp
mov up, SAVE_UP
+ mov PARAM_SRC1, up
shr %eax C size >> 1
- mov PARAM_SRC1, up
mov vp, SAVE_VP
mov PARAM_SRC2, vp
jz L(one) C size == 1
@@ -125,8 +125,8 @@
mov r1, (rp)
L(entry):
M4_inst -4(vp), r2
+ lea 8(rp), rp
dec %eax
- lea 8(rp), rp
mov (up), r1
mov r2, -4(rp)
jnz L(oop)
diff -r 980fee0af6d5 -r 71bcec497b55 mpn/x86/atom/logops_n.asm
--- a/mpn/x86/atom/logops_n.asm Wed Feb 23 11:28:44 2011 +0100
+++ b/mpn/x86/atom/logops_n.asm Thu Feb 24 22:24:46 2011 +0100
@@ -1,24 +1,140 @@
dnl Intel Atom mpn_and_n,...,mpn_xnor_n -- bitwise logical operations.
dnl Copyright 2011 Free Software Foundation, Inc.
-dnl
+
+dnl Contributed to the GNU project by Marco Bodrato.
+
dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or
-dnl modify it under the terms of the GNU Lesser General Public License as
-dnl published by the Free Software Foundation; either version 3 of the
-dnl License, or (at your option) any later version.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful,
-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-dnl Lesser General Public License for more details.
-dnl
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
dnl You should have received a copy of the GNU Lesser General Public License
dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
+C cycles/limb
+C op nop opn
+C P5
+C P6 model 0-8,10-12
+C P6 model 9 (Banias)
+C P6 model 13 (Dothan)
+C P4 model 0 (Willamette)
+C P4 model 1 (?)
+C P4 model 2 (Northwood)
+C P4 model 3 (Prescott)
+C P4 model 4 (Nocona)
+C Intel Atom 3 3.5 3.5
+C AMD K6
+C AMD K7
+C AMD K8
+C AMD K10
+
+define(M4_choose_op,
+`ifdef(`OPERATION_$1',`
+define(`M4_function', `mpn_$1')
+define(`M4_want_pre', `$4')
+define(`M4_inst', `$3')
+define(`M4_want_post',`$2')
+')')
+define(M4pre, `ifelse(M4_want_pre, yes,`$1')')
+define(M4post,`ifelse(M4_want_post,yes,`$1')')
+
+M4_choose_op( and_n, , andl, )
+M4_choose_op( andn_n, , andl, yes)
+M4_choose_op( nand_n, yes, andl, )
+M4_choose_op( ior_n, , orl, )
+M4_choose_op( iorn_n, , orl, yes)
+M4_choose_op( nior_n, yes, orl, )
+M4_choose_op( xor_n, , xorl, )
+M4_choose_op( xnor_n, yes, xorl, )
+
+ifdef(`M4_function',,
+`m4_error(`Unrecognised or undefined OPERATION symbol
+')')
MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)
-include_mpn(`x86/pentium/logops_n.asm')
+
+C void M4_function (mp_ptr dst, mp_srcptr src2, mp_srcptr src1, mp_size_t size);
+C
+
+defframe(PARAM_SIZE, 16)
+defframe(PARAM_SRC1, 12)
+defframe(PARAM_SRC2, 8)
+defframe(PARAM_DST, 4)
+
+dnl re-use parameter space
+define(SAVE_RP,`PARAM_SIZE')
+define(SAVE_VP,`PARAM_SRC1')
+define(SAVE_UP,`PARAM_DST')
+
+define(`rp', `%edi')
+define(`up', `%esi')
+define(`vp', `%ebx')
+define(`cnt', `%eax')
+define(`r1', `%ecx')
+define(`r2', `%edx')
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+deflit(`FRAME',0)
+
+PROLOGUE(M4_function)
+ mov PARAM_SIZE, cnt C size
+ mov rp, SAVE_RP
+ mov PARAM_DST, rp
+ mov up, SAVE_UP
+ mov PARAM_SRC1, up
+ shr cnt C size >> 1
+ mov vp, SAVE_VP
+ mov PARAM_SRC2, vp
+ mov (up), r1
+ jz L(end) C size == 1
+ jnc L(even) C size % 2 == 0
+
+ ALIGN(16)
+L(oop):
+M4pre(` notl_or_xorl_GMP_NUMB_MASK(r1)')
+ M4_inst (vp), r1
+ lea 8(up), up
+ mov -4(up), r2
+M4post(` notl_or_xorl_GMP_NUMB_MASK(r1)')
+ lea 8(vp), vp
+ mov r1, (rp)
+L(entry):
+M4pre(` notl_or_xorl_GMP_NUMB_MASK(r2)')
+ M4_inst -4(vp), r2
+ lea 8(rp), rp
+M4post(` notl_or_xorl_GMP_NUMB_MASK(r2)')
+ dec cnt
+ mov (up), r1
+ mov r2, -4(rp)
+ jnz L(oop)
+
+L(end):
+M4pre(` notl_or_xorl_GMP_NUMB_MASK(r1)')
+ mov SAVE_UP, up
+ M4_inst (vp), r1
+M4post(`notl_or_xorl_GMP_NUMB_MASK(r1)')
+ mov SAVE_VP, vp
+ mov r1, (rp)
+ mov SAVE_RP, rp
+ ret
+
+L(even):
+ mov r1, r2
+ lea 4(up), up
+ lea 4(vp), vp
+ lea -4(rp), rp
+ jmp L(entry)
+EPILOGUE()
+ASM_END()
diff -r 980fee0af6d5 -r 71bcec497b55 mpn/x86/k7/mod_1_1.asm
--- a/mpn/x86/k7/mod_1_1.asm Wed Feb 23 11:28:44 2011 +0100
+++ b/mpn/x86/k7/mod_1_1.asm Thu Feb 24 22:24:46 2011 +0100
@@ -1,8 +1,8 @@
dnl x86-32 mpn_mod_1_1p, requiring cmov.
-dnl Contributed to the GNU project by Torbjorn Granlund.
+dnl Contributed to the GNU project by Niels Möller and Torbjorn Granlund.
dnl
-dnl Copyright 2010 Free Software Foundation, Inc.
+dnl Copyright 2010, 2011 Free Software Foundation, Inc.
dnl
dnl This file is part of the GNU MP Library.
dnl
@@ -25,16 +25,45 @@
C P5 ?
C P6 model 0-8,10-12 ?
C P6 model 9 (Banias) ?
-C P6 model 13 (Dothan) 11.75
+C P6 model 13 (Dothan) ?
C P4 model 0 (Willamette) ?
C P4 model 1 (?) ?
C P4 model 2 (Northwood) ?
C P4 model 3 (Prescott) ?
C P4 model 4 (Nocona) ?
C AMD K6 ?
-C AMD K7 8
+C AMD K7 7
C AMD K8 ?
+define(`B2mb', `%ebx')
+define(`r0', `%esi')
+define(`r2', `%ebp')
+define(`t0', `%edi')
+define(`ap', `%ecx') C Also shift count
+
+C Stack frame
+C pre 36(%esp)
+C b 32(%esp)
+C n 28(%esp)
+C ap 24(%esp)
+C return 20(%esp)
+C %ebp 16(%esp)
+C %edi 12(%esp)
+C %esi 8(%esp)
+C %ebx 4(%esp)
+C B2modb (%esp)
+
+define(`B2modb', `(%esp)')
+define(`n', `28(%esp)')
+define(`b', `32(%esp)')
+define(`pre', `36(%esp)')
+
+C mp_limb_t
+C mpn_mod_1_1p (mp_srcptr ap, mp_size_t n, mp_limb_t b, mp_limb_t pre[4])
+C
+C The pre array contains bi, cnt, B1modb, B2modb
+C Note: This implementation needs B1modb only when cnt > 0
+
ASM_START()
TEXT
ALIGN(8)
@@ -43,74 +72,116 @@
push %edi
push %esi
push %ebx
- mov 24(%esp), %ebx
- mov 20(%esp), %esi
- mov 32(%esp), %ebp C cps[]
- lea (%esi,%ebx,4), %esi
+ mov 32(%esp), %ebp C pre[]
- mov 8(%ebp), %edi C B1modb
- mov 12(%ebp), %ebp C B2modb
- mov -4(%esi), %eax
- mul %edi
- xor %ecx, %ecx
- add -8(%esi), %eax
- adc %edx, %ecx
- sub $2, 24(%esp)
- jle L(end)
+ mov 12(%ebp), %eax C B2modb
+ push %eax C Put it on stack
+
+ mov 4(%ebp), %cl
+ shrl %cl, b
+
+ mov n, %edx
+ mov 24(%esp), ap
+
+ lea (ap, %edx, 4), ap
+ mov -4(ap), %eax
+ cmp $3, %edx
+ jnc L(first)
+ mov -8(ap), r0
More information about the gmp-commit
mailing list