core2 add_n_sub_n
David Harvey
dmharvey at cims.nyu.edu
Tue Jun 1 14:04:16 CEST 2010
I have been playing around with core2 add_n_sub_n.
Main loop currently looks like this:
# Register aliases used by the main loop (AT&T operand order, x86-64).
#
# Pointer and count aliases. In the loop, r1p receives the adc results
# (sums) and r2p receives the sbb results (differences); s1p and s2p are
# the two source operands. rdi, rsi, rdx, rcx, r8 are the first five
# integer argument registers of the System V AMD64 ABI, so these look like
# the incoming arguments in order -- NOTE(review): confirm against the
# elided entry code.
define(`r1p', `%rdi')
define(`r2p', `%rsi')
define(`s1p', `%rdx')
define(`s2p_param', `%rcx')
define(`n_param', `%r8')
# The loop index shares rcx with s2p_param, which lets the loop use jrcxz
# on it. The second source pointer therefore lives in r9 inside the loop;
# presumably the elided setup copies it there before the index is written.
define(`n', `%rcx')
define(`s2p', `%r9')
# Limb temporaries. x0 shares r8 with n_param, so the count has to be
# consumed before the first limb is loaded. x0/x1 and y0/y1 each hold a
# copy of the same s1p limb; one copy feeds the add chain and the other
# the subtract chain.
define(`x0', `%r8')
define(`y0', `%r10')
define(`x1', `%r11')
define(`y1', `%rbp')
# Byte registers that park the carry of the add chain (acy) and the borrow
# of the subtract chain (scy) while the other chain owns the carry flag.
define(`acy', `%al')
define(`scy', `%bl')
# Holds the current s2p limb so it is loaded once and used by both chains.
# NOTE(review): rbp, rbx (via bl) and r12 are callee-saved under the
# System V AMD64 ABI; the save/restore is not visible in this fragment.
define(`t', `%r12')
[...]
# Main loop: four limbs per iteration, producing both the sum (stored via
# r1p) and the difference (stored via r2p) of the s1p and s2p operands.
#
# Carry protocol, as established by the instruction order below:
#   - on arrival at the loop top, CF holds the borrow of the subtract
#     chain and bit 0 of acy holds the carry of the add chain;
#   - the first half runs subtract-then-add on limbs 0 and 1;
#   - the second half runs add-then-subtract on limbs 2 and 3, so the
#     iteration ends with the borrow in CF again and the carry in acy.
# Only one setc/shr pair is needed per two limbs this way.
# NOTE(review): the state for the very first iteration comes from setup
# code that is not shown here.
ALIGN(16)
L(main):
# --- limbs 0 and 1: subtract chain first (CF = borrow on entry) ---
mov (s1p,n,8), x0
mov x0, y0
mov (s2p,n,8), t
sbb t, x0
mov 8(s1p,n,8), x1
mov x1, y1
sbb 8(s2p,n,8), x1
# Park the borrow in scy, then shift bit 0 of acy into CF to resume the
# add chain. The mov instructions in between do not modify flags.
setc scy
shr acy
adc t, y0
mov x0, (r2p,n,8)
mov y0, (r1p,n,8)
adc 8(s2p,n,8), y1
mov x1, 8(r2p,n,8)
mov y1, 8(r1p,n,8)
# --- limbs 2 and 3: roles swapped, add chain first (CF = carry here) ---
# Here the x registers take the sums and the y registers the differences,
# so the store destinations are swapped relative to the first half.
mov 16(s1p,n,8), x0
mov x0, y0
mov 16(s2p,n,8), t
adc t, x0
mov 24(s1p,n,8), x1
mov x1, y1
adc 24(s2p,n,8), x1
# Park the carry in acy, then shift bit 0 of scy into CF to resume the
# subtract chain.
setc acy
shr scy
sbb t, y0
mov x0, 16(r1p,n,8)
mov y0, 16(r2p,n,8)
sbb 24(s2p,n,8), y1
mov x1, 24(r1p,n,8)
mov y1, 24(r2p,n,8)
# Advance the index by four limbs with lea, and test it with jrcxz: neither
# modifies flags, so the borrow left in CF by the last sbb reaches the next
# iteration intact. The loop leaves only when the index is exactly zero, so
# it presumably starts as a negative multiple of four with the pointers
# biased accordingly -- NOTE(review): confirm in the elided setup.
lea 4(n), n
jrcxz L(end)
jmp L(main)
This runs at about 3.5 c/l measured via
./speed -p 100000 -C -D -s 100-700 -t 100 mpn_add_n_sub_n
This should be better than mpn_add_n + mpn_sub_n, which would
currently be about 4 c/l.
I do not know much about optimising for this chip. I'm wondering if
anyone has any thoughts about what the maximum theoretical speed of
mpn_add_n_sub_n should be on core2.
david
More information about the gmp-devel
mailing list