[Gmp-commit] /var/hg/gmp: 2 new changesets
mercurial at gmplib.org
mercurial at gmplib.org
Sat Mar 17 23:53:57 CET 2012
details: /var/hg/gmp/rev/acaa1452a82e
changeset: 14762:acaa1452a82e
user: Torbjorn Granlund <tege at gmplib.org>
date: Sat Mar 17 23:52:21 2012 +0100
description:
Restore DOS64 support.
details: /var/hg/gmp/rev/775523081472
changeset: 14763:775523081472
user: Torbjorn Granlund <tege at gmplib.org>
date: Sat Mar 17 23:53:55 2012 +0100
description:
Rewrite x86-32 gcd_1 support.
diffstat:
ChangeLog | 3 +
mpn/x86/k7/gcd_1.asm | 405 +++++++++++++-------------------------------
mpn/x86/p6/gcd_1.asm | 142 +++++++++++++++
mpn/x86_64/core2/gcd_1.asm | 16 +-
mpn/x86_64/gcd_1.asm | 16 +-
5 files changed, 285 insertions(+), 297 deletions(-)
diffs (truncated from 727 to 300 lines):
diff -r b319b5e7816c -r 775523081472 ChangeLog
--- a/ChangeLog Sat Mar 17 15:19:11 2012 +0100
+++ b/ChangeLog Sat Mar 17 23:53:55 2012 +0100
@@ -1,5 +1,8 @@
2012-03-17 Torbjorn Granlund <tege at gmplib.org>
+ * mpn/x86/k7/gcd_1.asm: Rewrite.
+ * mpn/x86/p6/gcd_1.asm: New file.
+
* mpn/x86_64/core2/gcd_1.asm: Conditionally suppress reduction calls.
* mpn/x86_64/gcd_1.asm: Rewrite.
diff -r b319b5e7816c -r 775523081472 mpn/x86/k7/gcd_1.asm
--- a/mpn/x86/k7/gcd_1.asm Sat Mar 17 15:19:11 2012 +0100
+++ b/mpn/x86/k7/gcd_1.asm Sat Mar 17 23:53:55 2012 +0100
@@ -1,328 +1,169 @@
-dnl AMD K7 mpn_gcd_1 -- mpn by 1 gcd.
+dnl x86 mpn_gcd_1 optimised for AMD K7.
-dnl Copyright 2000, 2001, 2002, 2009, 2010 Free Software Foundation, Inc.
-dnl
+dnl Contributed to the GNU project by by Kevin Ryde. Rehacked by Torbjorn
+dnl Granlund.
+
+dnl Copyright 2000, 2001, 2002, 2005, 2009, 2011, 2012 Free Software
+dnl Foundation, Inc.
+
dnl This file is part of the GNU MP Library.
-dnl
-dnl The GNU MP Library is free software; you can redistribute it and/or
-dnl modify it under the terms of the GNU Lesser General Public License as
-dnl published by the Free Software Foundation; either version 3 of the
-dnl License, or (at your option) any later version.
-dnl
-dnl The GNU MP Library is distributed in the hope that it will be useful,
-dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
-dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-dnl Lesser General Public License for more details.
-dnl
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
dnl You should have received a copy of the GNU Lesser General Public License
dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
-C K7: 6.75 cycles/bit (approx) 1x1 gcd
-C 11.0 cycles/limb Nx1 reduction (modexact_1_odd)
+C cycles/bit (approx)
+C AMD K7 5.31
+C AMD K8,K9 5.33
+C AMD K10 5.30
+C AMD bd1 ?
+C AMD bobcat 7.02
+C Intel P4-2 10.1
+C Intel P4-3/4 10.0
+C Intel P6/13 5.88
+C Intel core2 6.26
+C Intel NHM 6.83
+C Intel SBR 8.50
+C Intel atom 8.90
+C VIA nano ?
+C Numbers measured with: speed -CD -s16-32 -t16 mpn_gcd_1
-C This code was modernised in 2010 to avoid most use of 'div', but not
-C completely cleaned up. Presumably, we should remove last 'div' too,
-C and simplify the structure to save many 'mov' insns.
+C TODO
+C * Tune overhead, this takes 2-3 cycles more than old code when v0 is tiny.
+C * Stream things better through registers, avoiding some copying.
-C Reduce using x%y if x is more than DIV_THRESHOLD bits bigger than y,
-C where x is the larger of the two. See tune/README for more.
-C
-C divl at 40 cycles compared to the gcd at about 7 cycles/bitpair
-C suggests 40/7*2=11.4 but 7 seems to be about right.
-
-deflit(DIV_THRESHOLD, 7)
-
-
-C table[n] is the number of trailing zeros on n, or MAXSHIFT if n==0.
-C
-C This is mixed in with the code, but as per the k7 optimization manual it's
-C a full cache line and suitably aligned so it won't get swapped between
-C code and data. Having it in TEXT rather than RODATA saves needing a GOT
-C entry when PIC.
-C
-C Actually, there doesn't seem to be a measurable difference between this in
-C it's own cache line or plonked in the middle of the code. Presumably
-C since TEXT is read-only there's no worries about coherency.
+C ctz_table[n] is the number of trailing zeros on n, or MAXSHIFT if n==0.
deflit(MAXSHIFT, 6)
deflit(MASK, eval((m4_lshift(1,MAXSHIFT))-1))
- TEXT
- ALIGN(64)
-L(table):
+DEF_OBJECT(ctz_table,64)
.byte MAXSHIFT
forloop(i,1,MASK,
` .byte m4_count_trailing_zeros(i)
')
+END_OBJECT(ctz_table)
+C Threshold of when to call bmod when U is one limb. Should be about
+C (time_in_cycles(bmod_1,1) + call_overhead) / (cycles/bit).
+define(`DIV_THRES_LOG2', 7)
-C mp_limb_t mpn_gcd_1 (mp_srcptr src, mp_size_t size, mp_limb_t limb);
-C
-defframe(PARAM_LIMB, 12)
-defframe(PARAM_SIZE, 8)
-defframe(PARAM_SRC, 4)
+define(`up', `%edi')
+define(`n', `%esi')
+define(`v0', `%edx')
-defframe(SAVE_EBX, -4)
-defframe(SAVE_ESI, -8)
-defframe(SAVE_EDI, -12)
-defframe(SAVE_EBP, -16)
-defframe(CALL_DIVISOR,-20)
-defframe(CALL_SIZE, -24)
-defframe(CALL_SRC, -28)
-deflit(STACK_SPACE, 28)
-
+ASM_START()
TEXT
ALIGN(16)
+PROLOGUE(mpn_gcd_1)
+ push %edi
+ push %esi
-PROLOGUE(mpn_gcd_1)
-deflit(`FRAME',0)
+ mov 12(%esp), up
+ mov 16(%esp), n
+ mov 20(%esp), v0
- ASSERT(ne, `cmpl $0, PARAM_LIMB') C y!=0
- ASSERT(ae, `cmpl $1, PARAM_SIZE') C size>=1
-
- mov PARAM_SRC, %eax
- mov PARAM_LIMB, %edx
- sub $STACK_SPACE, %esp deflit(`FRAME',STACK_SPACE)
-
- mov %esi, SAVE_ESI
- mov %ebx, SAVE_EBX
-
- mov (%eax), %esi C src low limb
-
-ifdef(`PIC',`
- mov %edi, SAVE_EDI
- call L(movl_eip_to_edi)
-L(here):
- add $L(table)-L(here), %edi
-')
-
- mov %esi, %ebx
- or %edx, %esi C x|y
+ mov (up), %eax C U low limb
+ or v0, %eax C x | y
mov $-1, %ecx
L(twos):
inc %ecx
- shr %esi
- jnc L(twos) C 3/4 chance of x or y odd already
+ shr %eax
+ jnc L(twos)
- shr %cl, %ebx
- shr %cl, %edx
- mov %ecx, %esi C common twos
+ shr %cl, v0
+ mov %ecx, %eax C common twos
- mov PARAM_SIZE, %ecx
- cmp $1, %ecx
- ja L(divide)
+L(divide_strip_y):
+ shr v0
+ jnc L(divide_strip_y)
+ adc v0, v0
+ push %eax
+ push v0
- C eax
- C ebx x
- C ecx
- C edx y
- C esi common twos
- C edi [PIC] L(table)
- C ebp
+ cmp $1, n
+ jnz L(reduce_nby1)
+C Both U and V are single limbs, reduce with bmod if u0 >> v0.
+ mov (up), %ecx
+ mov %ecx, %eax
+ shr $DIV_THRES_LOG2, %ecx
+ cmp %ecx, v0
+ ja L(reduced)
+
+ mov v0, %esi
+ xor %edx, %edx
+ div %esi
mov %edx, %eax
- cmp %ebx, %edx
+ jmp L(reduced)
- cmovc( %ebx, %eax) C swap to make x bigger than y
- cmovc( %edx, %ebx)
+L(reduce_nby1):
+ push v0 C param 3
+ push n C param 2
+ push up C param 1
+ cmp $BMOD_1_TO_MOD_1_THRESHOLD, n
+ jl L(bmod)
+ifdef(`PIC',`
+ call GSYM_PREFIX`'mpn_mod_1 at PLT
+',`
+ call GSYM_PREFIX`'mpn_mod_1
+')
+ jmp L(called)
+L(bmod):
+ifdef(`PIC',`
+ call GSYM_PREFIX`'mpn_modexact_1_odd at PLT
+',`
+ call GSYM_PREFIX`'mpn_modexact_1_odd
+')
+L(called):
+ add $12, %esp C deallocate params
+L(reduced):
+ pop %edx
+ LEA( ctz_table, %esi)
+ test %eax, %eax
+ mov %eax, %ecx
+ jnz L(mid)
+ jmp L(end)
-L(strip_y):
- C eax x
- C ebx y
- C ecx
- C edx
- C esi common twos
- C edi [PIC] L(table)
- C ebp
+ ALIGN(16) C K8 BC P4 NHM SBR
+L(top): cmovc( %ecx, %eax) C if x-y < 0 0
+ cmovc( %edi, %edx) C use x,y-x 0
+L(mid): and $MASK, %ecx C 0
+ movzbl (%esi,%ecx), %ecx C 1
+ jz L(shift_alot) C 1
+ shr %cl, %eax C 3
+ mov %eax, %edi C 4
+ mov %edx, %ecx C 3
+ sub %eax, %ecx C 4
+ sub %edx, %eax C 4
+ jnz L(top) C 5
- ASSERT(nz,`orl %ebx,%ebx')
- shr %ebx
- jnc L(strip_y)
- rcl %ebx
-
-
- C eax x
- C ebx y (odd)
- C ecx
- C edx
- C esi common twos
- C edi [PIC] L(table)
- C ebp
-
- mov %eax, %ecx
- mov %ebx, %edx
- shr $DIV_THRESHOLD, %eax
-
- cmp %eax, %ebx
- mov %ecx, %eax
- ja L(strip_x_entry) C do x%y if x much bigger than y
-
-
More information about the gmp-commit
mailing list