Sandybridge addmul_N challenge
Torbjorn Granlund
tg at gmplib.org
Mon Feb 27 16:21:20 CET 2012
Torbjorn Granlund <tg at gmplib.org> writes:
carry-in lo in r14
carry-in hi in rcx
C One column step of the addmul_2 inner block: adds the two products
C (0(up) * v1) and (8(up) * v0), plus the incoming two-word carry, into the
C limb at 8(rp).  up, rp, v0, v1 and R32() are macros defined outside this
C excerpt (R32 presumably selects the 32-bit name of a register -- confirm).
mov 0(up), %rax
mul v1 C rdx:rax = qword at 0(up) times v1
mov 8(rp), %r8
add %rax, %r8 C r8 = limb at 8(rp) + low half of product
mov %rdx, %r9
adc $0, %r9 C r9 = high half of product + carry from the add
mov 8(up), %rax
mul v0 C rdx:rax = qword at 8(up) times v0
add %rax, %r8
adc %rdx, %r9
C mov (unlike xor) leaves CF untouched, so the adc below still sees the
C carry produced by "adc %rdx, %r9".
mov $0, R32(%rbx)
adc $0, R32(%rbx)
C Fold in the incoming carry: low word in r14, high word in rcx.
add %r14, %r8 C 0
adc %rcx, %r9 C 1
adc $0, R32(%rbx) C might be removed
mov %r8, 8(rp)
carry-out lo in r9
carry-out hi in rbx
I committed code using that block, see mpn/x86_64/coreisbr/addmul_2.asm.
In the end, the code runs at about 3.2 c/l. I have not reached 3.0 with
complete code. I have no understanding of what limits things.
I played with convolution-style code, i.e., code that multiplies and
accumulates column-wise. It runs at 2.5 c/l, not counting the final
summation code:
# Throughput micro-benchmark for column-wise multiply-accumulate
# (GNU as, AT&T syntax, x86-64).
#
# Each loop iteration performs four 64x64->128 bit multiplies and adds the
# products alternately into two independent three-register accumulators:
#     accumulator A = r10d : r9  : r8
#     accumulator B = r14d : r13 : r12
# Only the timing matters: the accumulators are never initialised or read
# back, and the multiply operands are whatever happens to sit in the two
# stack slots at 8(%rsp) and 16(%rsp) (the r13 and r12 values pushed below).
#
# Out:   eax = 0 (exit status)
# Clobb: rax, rcx, rdx, r8, r9, r10, flags
	.text
	.globl	main
main:
	push	%r12			# r12-r14 are callee-saved and are
	push	%r13			# used as accumulator B below
	push	%r14
	mov	$3300000000/4, %ecx	# iterations; 4 multiplies per iteration
	.align	16
1:
	# product 1 -> accumulator A
	mov	8(%rsp), %rax
	mulq	16(%rsp)
	add	%rax, %r8
	adc	%rdx, %r9
	adc	$0, %r10d

	# product 2 -> accumulator B
	mov	8(%rsp), %rax
	mulq	16(%rsp)
	add	%rax, %r12
	adc	%rdx, %r13
	adc	$0, %r14d

	# product 3 -> accumulator A
	mov	8(%rsp), %rax
	mulq	16(%rsp)
	add	%rax, %r8
	adc	%rdx, %r9
	adc	$0, %r10d

	# product 4 -> accumulator B.  The carry goes into r14d, the top limb
	# of accumulator B, matching product 2 (the posted code used r10d here).
	mov	8(%rsp), %rax
	mulq	16(%rsp)
	add	%rax, %r12
	adc	%rdx, %r13
	adc	$0, %r14d

	dec	%ecx
	jnz	1b

	pop	%r14
	pop	%r13
	pop	%r12
	xor	%eax, %eax		# return 0 instead of leftover product bits
	ret
--
Torbjörn
More information about the gmp-devel
mailing list