Sandybridge addmul_N challenge

Mon Feb 27 16:21:20 CET 2012

Torbjorn Granlund <tg at gmplib.org> writes:

  carry-in lo in r14
  carry-in hi in rcx
          C One column step: 8(rp) += 0(up)*v1 + 8(up)*v0 + carry-in, where the
          C carry-in arrives in %rcx:%r14 and the carry-out is left in %rbx:%r9.
          C up, rp, v0, v1 and R32() are not defined in this snippet; presumably
          C they are the usual GMP m4 names (source pointer, destination pointer,
          C the two multiplier limbs, 32-bit register view) -- TODO confirm
          C against mpn/x86_64/coreisbr/addmul_2.asm.
          mov     0(up), %rax
          mul     v1                      C rdx:rax = 0(up) * v1
          mov     8(rp), %r8
          add     %rax, %r8               C r8 = 8(rp) + low half of product
          mov     %rdx, %r9
          adc     $0, %r9                 C r9 = high half of product + carry
          mov     8(up), %rax
          mul     v0                      C rdx:rax = 8(up) * v0
          add     %rax, %r8
          adc     %rdx, %r9               C r9:r8 += second product
          mov     $0, R32(%rbx)           C mov leaves CF intact (xor would clear it)
          adc     $0, R32(%rbx)           C capture the carry out of r9
          add     %r14, %r8               C 0  (add carry-in lo)
          adc     %rcx, %r9               C 1  (add carry-in hi)
          adc     $0, R32(%rbx)           C might be removed
          mov     %r8, 8(rp)              C write the finished limb back
  carry-out lo in r9
  carry-out hi in rbx

I committed code using that block, see mpn/x86_64/coreisbr/addmul_2.asm.

In the end, the code runs at about 3.2 c/l.  I have not reached 3.0 with
complete code.  I have no understanding of what limits things.

I played with convolution-style code, i.e., code that multiplies and
accumulates column-wise.  It runs at 2.5 c/l, not counting the final
summation code:

	.text
	.globl	main
#-----------------------------------------------------------------------
# int main(void)
# Syntax: GAS, AT&T operand order.  Target: x86-64.
#
# Timing harness for column-wise multiply-accumulate.  Each loop
# iteration performs four 64x64->128 bit multiplies and adds the
# products alternately into two independent three-register accumulators:
#     chain A: %r10d:%r9:%r8
#     chain B: %r14d:%r13:%r12
# The accumulators are never initialised and the multiplier operands
# are whatever happens to sit on the stack; only the run time matters.
#
# Clobbers: rax, rcx, rdx, r8, r9, r10, flags.
# r12-r14 are saved on entry and restored before returning.
# Returns 0.
#-----------------------------------------------------------------------
main:
	push	%r12
	push	%r13
	push	%r14
	# After the pushes: 0(%rsp) = saved r14, 8(%rsp) = saved r13,
	# 16(%rsp) = saved r12.  The loop reads the latter two as operands.

	# Four products per iteration, hence the division by 4.
	# NOTE(review): 3300000000 is presumably the clock rate (Hz) of the
	# test machine, which would make the run time in seconds roughly
	# equal to cycles per product -- confirm.
	mov	$3300000000/4, %ecx
	.align	16
1:
	# product 1 -> chain A
	mov	8(%rsp), %rax
	mulq	16(%rsp)		# rdx:rax = 8(%rsp) * 16(%rsp)
	add	%rax, %r8
	adc	%rdx, %r9
	adc	$0, %r10d

	# product 2 -> chain B
	mov	8(%rsp), %rax
	mulq	16(%rsp)
	add	%rax, %r12
	adc	%rdx, %r13
	adc	$0, %r14d

	# product 3 -> chain A
	mov	8(%rsp), %rax
	mulq	16(%rsp)
	add	%rax, %r8
	adc	%rdx, %r9
	adc	$0, %r10d

	# product 4 -> chain B
	mov	8(%rsp), %rax
	mulq	16(%rsp)
	add	%rax, %r12
	adc	%rdx, %r13
	adc	$0, %r14d		# top limb of chain B (was %r10d, which
					# coupled the two chains)

	dec	%ecx
	jnz	1b

	pop	%r14
	pop	%r13
	pop	%r12
	xor	%eax, %eax		# defined exit status instead of leftover rax
	ret

-- 
Torbjörn