[Gmp-commit] /var/hg/gmp: (mpn_mod_1_1p): Rewrite using the same algorithm as...
mercurial at gmplib.org
mercurial at gmplib.org
Thu Feb 24 21:08:52 CET 2011
details: /var/hg/gmp/rev/e2be96b70ba6
changeset: 13892:e2be96b70ba6
user: Niels Möller <nisse at lysator.liu.se>
date: Thu Feb 24 21:08:30 2011 +0100
description:
(mpn_mod_1_1p): Rewrite using the same algorithm as the x86_64 version.
diffstat:
ChangeLog | 5 +
mpn/x86/k7/mod_1_1.asm | 197 +++++++++++++++++++++++++++++++++---------------
2 files changed, 139 insertions(+), 63 deletions(-)
diffs (250 lines):
diff -r 6c79ae2572a4 -r e2be96b70ba6 ChangeLog
--- a/ChangeLog Wed Feb 23 22:12:01 2011 +0100
+++ b/ChangeLog Thu Feb 24 21:08:30 2011 +0100
@@ -1,3 +1,8 @@
+2011-02-24 Niels Möller <nisse at lysator.liu.se>
+
+ * mpn/x86/k7/mod_1_1.asm (mpn_mod_1_1p): Rewrite using the same
+ algorithm as the x86_64 version.
+
2011-02-23 Marco Bodrato <bodrato at mail.dm.unipi.it>
* mpn/x86/atom/logops_n.asm: New file (same loop as aors_n).
diff -r 6c79ae2572a4 -r e2be96b70ba6 mpn/x86/k7/mod_1_1.asm
--- a/mpn/x86/k7/mod_1_1.asm Wed Feb 23 22:12:01 2011 +0100
+++ b/mpn/x86/k7/mod_1_1.asm Thu Feb 24 21:08:30 2011 +0100
@@ -1,8 +1,8 @@
dnl x86-32 mpn_mod_1_1p, requiring cmov.
-dnl Contributed to the GNU project by Torbjorn Granlund.
+dnl Contributed to the GNU project by Niels Möller and Torbjorn Granlund.
dnl
-dnl Copyright 2010 Free Software Foundation, Inc.
+dnl Copyright 2010, 2011 Free Software Foundation, Inc.
dnl
dnl This file is part of the GNU MP Library.
dnl
@@ -25,16 +25,45 @@
C P5 ?
C P6 model 0-8,10-12 ?
C P6 model 9 (Banias) ?
-C P6 model 13 (Dothan) 11.75
+C P6 model 13 (Dothan) ?
C P4 model 0 (Willamette) ?
C P4 model 1 (?) ?
C P4 model 2 (Northwood) ?
C P4 model 3 (Prescott) ?
C P4 model 4 (Nocona) ?
C AMD K6 ?
-C AMD K7 8
+C AMD K7 7
C AMD K8 ?
+define(`B2mb', `%ebx')
+define(`r0', `%esi')
+define(`r2', `%ebp')
+define(`t0', `%edi')
+define(`ap', `%ecx') C Also shift count
+
+C Stack frame
+C pre 36(%esp)
+C b 32(%esp)
+C n 28(%esp)
+C ap 24(%esp)
+C return 20(%esp)
+C %ebp 16(%esp)
+C %edi 12(%esp)
+C %esi 8(%esp)
+C %ebx 4(%esp)
+C B2modb (%esp)
+
+define(`B2modb', `(%esp)')
+define(`n', `28(%esp)')
+define(`b', `32(%esp)')
+define(`pre', `36(%esp)')
+
+C mp_limb_t
+C mpn_mod_1_1p (mp_srcptr ap, mp_size_t n, mp_limb_t b, mp_limb_t pre[4])
+C
+C The pre array contains bi, cnt, B1modb, B2modb
+C Note: This implementation needs B1modb only when cnt > 0
+
ASM_START()
TEXT
ALIGN(8)
@@ -43,74 +72,116 @@
push %edi
push %esi
push %ebx
- mov 24(%esp), %ebx
- mov 20(%esp), %esi
- mov 32(%esp), %ebp C cps[]
- lea (%esi,%ebx,4), %esi
+ mov 32(%esp), %ebp C pre[]
- mov 8(%ebp), %edi C B1modb
- mov 12(%ebp), %ebp C B2modb
- mov -4(%esi), %eax
- mul %edi
- xor %ecx, %ecx
- add -8(%esi), %eax
- adc %edx, %ecx
- sub $2, 24(%esp)
- jle L(end)
+ mov 12(%ebp), %eax C B2modb
+ push %eax C Put it on stack
+
+ mov 4(%ebp), %cl
+ shrl %cl, b
+
+ mov n, %edx
+ mov 24(%esp), ap
+
+ lea (ap, %edx, 4), ap
+ mov -4(ap), %eax
+ cmp $3, %edx
+ jnc L(first)
+ mov -8(ap), r0
+ jmp L(reduce_two)
+
+L(first):
+ C First iteration, no r2
+ mull B2modb
+ mov -12(ap), r0
+ add %eax, r0
+ mov -8(ap), %eax
+ adc %edx, %eax
+ sbb r2, r2
+ sub $3, n
+ lea -16(ap), ap
+ jz L(reduce_three)
+
+ mov B2modb, B2mb
+ sub b, B2mb
+ lea (B2mb, r0), t0
+ jmp L(mid)
ALIGN(16)
-L(top): mul %edi C 0
- mov -12(%esi), %ebx C
- add %eax, %ebx C 4
- mov %ecx, %eax C 2
- mov $0, %ecx C
- adc %edx, %ecx C 6
- mul %ebp C 3
- add %ebx, %eax C 7
- adc %edx, %ecx C 9
- decl 24(%esp)
- lea -4(%esi), %esi
- jg L(top)
+L(top): C Loopmixed to 7 c/l on k7
+ add %eax, r0
+ lea (B2mb, r0), t0
+ mov r2, %eax
+ adc %edx, %eax
+ sbb r2, r2
+L(mid): mull B2modb
+ and B2modb, r2
+ add r0, r2
+ decl n
+ mov (ap), r0
+ cmovc( t0, r2)
+ lea -4(ap), ap
+ jnz L(top)
-L(end): mov %eax, %ebp
- mov %ecx, %eax
- mul %edi
- mov 32(%esp), %edi
- add %eax, %ebp
- adc $0, %edx
- mov 4(%edi), %ecx
- mov %edx, %eax C rh
- mov %ebp, %esi C rl
- sal %cl, %eax
- mov %ecx, %ebx
- test %ecx, %ecx
- je L(nrm)
- neg %ecx
- shr %cl, %esi
- or %esi, %eax
- neg %ecx
-L(nrm): lea 1(%eax), %esi
- mull (%edi)
- mov %eax, %ebx
- mov %ebp, %eax
- mov 28(%esp), %ebp
- sal %cl, %eax
- add %eax, %ebx
- adc %esi, %edx
- imul %ebp, %edx
- sub %edx, %eax
- lea (%eax,%ebp), %edx
- cmp %eax, %ebx
- cmovc( %edx, %eax)
- mov %eax, %edx
- sub %ebp, %eax
- cmovc( %edx, %eax)
+ add %eax, r0
+ mov r2, %eax
+ adc %edx, %eax
+ sbb r2, r2
+
+L(reduce_three):
+ C Eliminate r2
+ and b, r2
+ sub r2, %eax
+
+L(reduce_two):
+ mov pre, %ebp
+ movb 4(%ebp), %cl
+ test %cl, %cl
+ jz L(normalized)
+
+ C Unnormalized, use B1modb to reduce to size < B b
+ mull 8(%ebp)
+ xor t0, t0
+ add %eax, r0
+ adc %edx, t0
+ mov t0, %eax
+
+ C Left-shift to normalize
+ shll %cl, b
+ shld %cl, r0, %eax C Always use shld?
+
+ shl %cl, r0
+ jmp L(udiv)
+
+L(normalized):
+ mov %eax, t0
+ sub b, t0
+ cmovnc( t0, %eax)
+
+L(udiv):
+ lea 1(%eax), t0
+ mull (%ebp)
+ mov b, %ebx C Needed in register for lea
+ add r0, %eax
+ adc t0, %edx
+ imul %ebx, %edx
+ sub %edx, r0
+ cmp r0, %eax
+ lea (%ebx, r0), %eax
+ cmovnc( r0, %eax)
+ cmp %ebx, %eax
+ jnc L(fix)
+L(ok): shr %cl, %eax
+
+ add $4, %esp
pop %ebx
pop %esi
pop %edi
pop %ebp
- shr %cl, %eax
+
ret
+L(fix): sub %ebx, %eax
+ jmp L(ok)
EPILOGUE()
PROLOGUE(mpn_mod_1_1p_cps)
More information about the gmp-commit
mailing list