Alternative div_qr_1
Niels Möller
nisse at lysator.liu.se
Tue Jun 15 15:13:07 CEST 2010
nisse at lysator.liu.se (Niels Möller) writes:
> The algorithm is really simple, the iteration is
>
> <r2, r1, r0> <-- <r0, u[j]> + r1 b2 + r2 b3
I've now implemented a slightly different variant,
<r2, r1, r0> <-- <r0, u[j]> + (r2 b2 - c d) B + r1 b2
where c is the carry from r0 + r2 b2 (here, r2 is a single bit, and in
fact <r2, r1> < B + d - 1). Note that the b3 constant is no longer
needed.
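
In portable C, one step of this variant looks roughly as follows. This is a sketch only, not the attached code: the limb type, the 128-bit intermediate (a GCC/Clang extension), and the function name are all just for illustration; d is the divisor and b2 = B^2 mod d, with r2 kept as a mask like in the assembly.

#include <stdint.h>

typedef uint64_t limb;          /* assume a 64-bit limb, so B = 2^64 */

/* One iteration:  <r2, r1, r0> <-- <r0, u[j]> + (r2 b2 - c d) B + r1 b2,
   with r2 a single bit kept as the mask -(r2), and c the carry from
   r0 + r2 b2.  All limb arithmetic wraps mod 2^64.  */
static void
mod_1_1_step (limb *r2mask, limb *r1, limb *r0,
              limb uj, limb d, limb b2)
{
  limb t1 = *r0 + b2 - d;       /* value to use if the add below carries */
  limb lo = *r0 + (*r2mask & b2);
  if (lo < *r0)                 /* carry c from r0 + r2 b2? */
    lo = t1;                    /* then use r0 + r2 b2 - c d instead */

  unsigned __int128 t = (unsigned __int128) *r1 * b2 + uj;
  unsigned __int128 s = (unsigned __int128) (limb) (t >> 64) + lo;

  *r0 = (limb) t;               /* new r0 = u[j] + low (r1 b2) */
  *r1 = (limb) s;               /* new r1 = lo + high (r1 b2) + carry */
  *r2mask = -(limb) (s >> 64);  /* new r2, kept as a mask */
}
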
I'm attaching a new x86_64/mod_1_1.asm. The function mpn_mod_1_1p is
rewritten from scratch, while mpn_mod_1_1p_cps is unchanged.
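
For orientation, the two functions are used as a pair: the cps function does the per-divisor precomputation into the four-limb pre block, and mpn_mod_1_1p then does the reduction. A rough C sketch of the call sequence, with prototypes inferred from the comments in the attached file and from the register use in mpn_mod_1_1p_cps (these are internal GMP functions, declared in gmp-impl.h rather than the public gmp.h):

#include <gmp.h>                /* mp_limb_t, mp_srcptr, mp_size_t */

/* Prototypes as inferred from the attached file (internal interface). */
void      mpn_mod_1_1p_cps (mp_limb_t pre[4], mp_limb_t b);
mp_limb_t mpn_mod_1_1p (mp_srcptr ap, mp_size_t n, mp_limb_t b,
                        mp_limb_t pre[4]);

/* Compute {ap, n} mod b; the code reads at least two limbs, so n >= 2. */
mp_limb_t
mod_1_1_example (mp_srcptr ap, mp_size_t n, mp_limb_t b)
{
  mp_limb_t pre[4];             /* bi, cnt, B1modb, B2modb */
  mpn_mod_1_1p_cps (pre, b);    /* one-time setup for this divisor */
  return mpn_mod_1_1p (ap, n, b, pre);
}
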
This is the inner loop:
L(loop):
and B2modb, r2
lea (B2mb, r0), t1
mul B2modb
add r2, r0
mov (ap, n, 8), t0
cmovc t1, r0
add %rax, t0
mov r0, %rax
adc %rdx, %rax
sbb r2, r2
sub $1, n
mov t0, r0
jnc L(loop)
Runs at 6 cycles per limb on AMD (panther.gmplib.org).
On Intel corei (shell.gmplib.org) the results are a little odd; it runs
at 11 cycles per limb when the divisor is unnormalized, and 11.5 when
the divisor is normalized. That is strange, since there's no instruction
in the loop that I'd expect to have data-dependent timing.
Comments appreciated.
I'd like to extend this to also compute the quotient; then in each
iteration one also needs to compute (in the normalized case; I haven't
thought about the unnormalized case yet):
<q2, q1, q0> <-- <r2, r1> <1, v> + c B
             = r2 <1, v> B + r1 v + (r1 + c) B
where c is the same carry out as in the r update. Those limbs then have
to be accumulated. Unfortunately, the simple way of doing those
additions takes an awful lot of instructions. The critical path is still
the dependency on r1, which will be 7 cycles (one additional move on the
path, to free up %rax for the r1 v multiply in the q computation), so
we're going to be limited by decoder bandwidth rather than latency. I
hope one should be able to get a loop running at 10 or 11 cycles.
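Written out in C, the per-iteration quotient contribution would be something like the following. This is only an illustration of the formula above, not code from any patch; v is the fraction limb of the reciprocal, so <1, v> = B + v, and the limb type is as in the earlier sketch.

/* <q2, q1, q0> = r2 <1, v> B + r1 v + (r1 + c) B, with r2 a single bit
   and c the carry out of the r update.  */
static void
quotient_step (limb *q2, limb *q1, limb *q0,
               limb r2bit, limb r1, limb v, limb c)
{
  unsigned __int128 t = (unsigned __int128) r1 * v;     /* r1 v */
  *q0 = (limb) t;

  t = (unsigned __int128) (limb) (t >> 64) + r1 + c;    /* + (r1 + c) B */
  if (r2bit)
    t += ((unsigned __int128) 1 << 64) + v;             /* + r2 <1, v> B */

  *q1 = (limb) t;
  *q2 = (limb) (t >> 64);                               /* q2 <= 3 */
}

These three limbs then still have to be added into the quotient accumulator, which is where the extra instruction count comes from.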
-------------- next part --------------
dnl AMD64 mpn_mod_1_1p
dnl Contributed to the GNU project by Torbjörn Granlund and Niels Möller.
dnl Copyright 2009, 2010 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
dnl The GNU MP Library is free software; you can redistribute it and/or modify
dnl it under the terms of the GNU Lesser General Public License as published
dnl by the Free Software Foundation; either version 3 of the License, or (at
dnl your option) any later version.
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
dnl License for more details.
dnl You should have received a copy of the GNU Lesser General Public License
dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C cycles/limb
C AMD K8,K9 ( 7 )
C AMD K10 ( 7 )
C Intel P4 (27 )
C Intel core2 (14 )
C Intel corei (12.5)
C Intel atom (37 )
C VIA nano (15 )
define(`B2mb', `%r10')
define(`B2modb', `%r11')
define(`ap', `%rdi')
define(`n', `%rsi')
define(`pre', `%r12')
define(`b', `%rbx')
define(`r0', `%rbp') C r1 kept in %rax
define(`r2', `%r8') C kept negated
define(`t0', `%r9')
define(`t1', `%rcx') C Also used as shift count
C mp_limb_t
C mpn_mod_1_1p (mp_srcptr ap, mp_size_t n, mp_limb_t b, mp_limb_t bmodb[4])
C               %rdi          %rsi         %rdx         %rcx
C The pre array contains bi, cnt, B1modb, B2modb
C Note: This implementation needs B1modb only when cnt > 0
ASM_START()
TEXT
ALIGN(16)
PROLOGUE(mpn_mod_1_1p)
push %rbp
push %rbx
push %r12
mov %rdx, b
mov %rcx, pre
C Currently needs b to not be preshifted, so undo shift done
C by caller. Ideally, b shouldn't be passed at all, it
C should be in the pre block where the cps function is free to
C store whatever is needed.
mov 8(pre), R32(%rcx)
shr R8(%rcx), b
mov -8(ap, n, 8), %rax
cmp $3, n
jnc L(first)
mov -16(ap, n, 8), r0
jmp L(reduce_two)
L(first):
C First iteration, no r2
mov 24(pre), B2modb
mul B2modb
mov -24(ap, n, 8), r0
add %rax, r0
mov -16(ap, n, 8), %rax
adc %rdx, %rax
sbb r2, r2
sub $4, n
jc L(reduce_three)
mov B2modb, B2mb
sub b, B2mb
ALIGN(16)
L(loop):
and B2modb, r2		C r2 := r2 ? B2modb : 0  (r2 is kept as a mask)
lea (B2mb, r0), t1	C t1 = r0 + B2modb - b, used if the add below carries
mul B2modb		C <%rdx, %rax> = r1 * B2modb  (r1 lives in %rax)
add r2, r0		C r0 += r2 * B2modb
mov (ap, n, 8), t0	C t0 = u[j]
cmovc t1, r0		C on carry, subtract b instead of carrying out
add %rax, t0		C new r0 = u[j] + low (r1 * B2modb)
mov r0, %rax
adc %rdx, %rax		C new r1 = r0 + high (r1 * B2modb) + carry
sbb r2, r2		C new r2 = -(carry out)
sub $1, n
mov t0, r0
jnc L(loop)
L(reduce_three):
C Eliminate r2
and b, r2
sub r2, %rax
L(reduce_two):
mov 8(pre), R32(%rcx)
test R32(%rcx), R32(%rcx)
jz L(normalized)
C Unnormalized, use B1modb to reduce to size < B b
mulq 16(pre)
xor t0, t0
add %rax, r0
adc %rdx, t0
mov t0, %rax
C Then left-shift to normalize. FIXME: Use shld
shl R8(%rcx), b
shl R8(%rcx), %rax
mov r0, t0
shl R8(%rcx), r0
neg R32(%rcx)
shr R8(%rcx), t0
or t0, %rax
neg R32(%rcx)
jmp L(udiv)
L(normalized):
mov %rax, t0
sub b, t0
cmovnc t0, %rax
L(udiv):
C 2/1 division of <%rax, r0> by b, using the precomputed inverse bi
lea 1(%rax), t0
mulq (pre)		C <%rdx, %rax> = high limb * bi
add r0, %rax
adc t0, %rdx		C %rdx = candidate quotient + 1
imul b, %rdx
sub %rdx, r0		C r0 = candidate remainder
cmp r0, %rax		C quotient estimate one too large?
lea (r0, b), %rax	C if so, the remainder is r0 + b
cmovnc r0, %rax		C otherwise plain r0
cmp b, %rax
jc L(done)
sub b, %rax		C at most one further subtraction needed
L(done):
shr R8(%rcx), %rax
pop %r12
pop %rbx
pop %rbp
ret
EPILOGUE()
ALIGN(16)
PROLOGUE(mpn_mod_1_1p_cps)
mov %rbx, -24(%rsp)
mov %rbp, -16(%rsp)
mov %rsi, %rbp
bsr %rsi,%rax
mov %eax, %ebx
mov %r12, -8(%rsp)
sub $24, %rsp
xor $63, %ebx
mov %rdi, %r12
mov %ebx, %ecx
sal %cl, %rbp
mov %rbp, %rdi
CALL( mpn_invert_limb)
mov %rbp, %r8
mov %rax, %r9
mov %rax, (%r12)
neg %r8
test %ebx, %ebx
je L(c0)
xor %ecx, %ecx
mov %rax, %rdx
mov $1, %eax
sub %ebx, %ecx
shr %cl, %rdx
mov %ebx, %ecx
sal %cl, %rax
or %rax, %rdx
imul %rdx, %r8
L(c0): mov %r8, %rax
mul %r9
lea 1(%rdx,%r8), %rdx
mov %rax, %rcx
neg %rdx
imul %rbp, %rdx
lea (%rdx,%rbp), %rax
cmp %rdx, %rcx
mov %ebx, %ecx
mov 8(%rsp), %rbp
cmovb %rax, %rdx
shr %cl, %r8
shr %cl, %rdx
mov %rbx, 8(%r12)
mov %r8, 16(%r12)
mov %rdx, 24(%r12)
mov (%rsp), %rbx
mov 16(%rsp), %r12
add $24, %rsp
ret
EPILOGUE()
ASM_END()
-------------- next part --------------
Regards,
/Niels
--
Niels Möller. PGP-encrypted email is preferred. Keyid C0B98E26.
Internet email is subject to wholesale government surveillance.