David Harvey dmharvey at cims.nyu.edu
Tue Jun 1 14:04:16 CEST 2010

```I have been playing around with core2 add_n_sub_n.

Main loop currently looks like this:

# Register assignments for the loop below, written as m4 defines.
#
# Incoming parameters. These are the first five System V AMD64 integer
# argument registers in order, which suggests a C-level argument list of
# the form (r1p, r2p, s1p, s2p, n) -- TODO confirm against the elided
# function entry.
define(`r1p',       `%rdi')
define(`r2p',       `%rsi')
define(`s1p',       `%rdx')
define(`s2p_param', `%rcx')
define(`n_param',   `%r8')

# Names used inside the loop. n lives in %rcx, the same register that
# s2p_param arrives in, so s2p has to be copied to %r9 before n is written;
# that copy is not part of the visible listing. Keeping n in %rcx is what
# allows the loop to test it with jrcxz.
define(`n',     `%rcx')
define(`s2p',   `%r9')

# Limb temporaries. x0 shares %r8 with n_param, so the count must have been
# moved out of %r8 before the loop starts (not shown here).
# NOTE(review): %rbp (y1) is callee-saved under the System V ABI; the
# save/restore is presumably in the elided part -- confirm.
define(`x0',    `%r8')
define(`y0',    `%r10')
define(`x1',    `%r11')
define(`y1',    `%rbp')

# acy and scy are byte registers that park a carry flag between the setc
# that writes them and the shr that shifts bit 0 back into CF.
# NOTE(review): the names suggest "add carry" and "subtract carry" -- confirm.
# t holds a limb loaded from s2p.
# NOTE(review): %bl is part of %rbx and %r12 is callee-saved under the
# System V ABI; both presumably need saving in the elided prologue -- confirm.
define(`acy',   `%al')
define(`scy',   `%bl')
define(`t',     `%r12')

[...]

# Main loop as posted: four 8-byte limbs per pass, handled as two groups of
# two. n (%rcx) indexes all four arrays with scale 8 and is advanced by 4 per
# pass; the loop leaves through L(end) when n reaches zero, so n presumably
# starts as a negative multiple of 4 with the pointers biased towards the end
# of the operands -- TODO confirm against the elided setup.
#
# CF on entry to each pass is consumed as the incoming borrow by the first
# sbb. Between the last sbb of one pass and the first sbb of the next there
# are only mov, lea, jrcxz and jmp, none of which write CF.
#
# NOTE(review): this listing contains four sbb and no adc. As written, r2p
# receives s1p - s2p while r1p receives unmodified copies of the s1p limbs.
# If r1p is meant to receive s1p + s2p, the adc instructions that would sit
# after "shr acy" in the first group and before "setc acy" in the second
# group are absent from this listing -- confirm against the real source.
ALIGN(16)
L(main):
# Group 1, limbs 0 and 1: x0/x1 become s1p - s2p - borrow, y0/y1 keep copies
# of the s1p limbs.
mov     (s1p,n,8), x0
mov     x0, y0
mov     (s2p,n,8), t
sbb     t, x0
mov     8(s1p,n,8), x1
mov     x1, y1
sbb     8(s2p,n,8), x1
# Park the borrow in scy, then shift bit 0 of acy back into CF.
setc    scy
shr     acy
# Stores do not touch flags: differences go to r2p, the copies go to r1p.
mov     x0, (r2p,n,8)
mov     y0, (r1p,n,8)
mov     x1, 8(r2p,n,8)
mov     y1, 8(r1p,n,8)

# Group 2, limbs 2 and 3: the roles are swapped, here y0/y1 take the
# subtraction and x0/x1 stay as copies of the s1p limbs.
mov     16(s1p,n,8), x0
mov     x0, y0
mov     16(s2p,n,8), t
mov     24(s1p,n,8), x1
mov     x1, y1
# Only mov instructions have run since "shr acy", so this writes back the
# same CF value that was shifted out of acy above.
setc    acy
# Restore the borrow parked in scy so the sbb chain continues from group 1.
shr     scy
sbb     t, y0
mov     x0, 16(r1p,n,8)
mov     y0, 16(r2p,n,8)
sbb     24(s2p,n,8), y1
mov     x1, 24(r1p,n,8)
mov     y1, 24(r2p,n,8)

# lea advances the index without writing flags, and jrcxz tests %rcx (n)
# directly, so the borrow in CF survives into the next pass.
lea     4(n), n
jrcxz   L(end)
jmp     L(main)

This runs at about 3.5 c/l (cycles per limb), measured via

./speed -p 100000 -C -D -s 100-700 -t 100 mpn_add_n_sub_n

This should be better than mpn_add_n + mpn_sub_n which would be