Alternative div_qr_1

Niels Möller nisse at
Tue Jun 15 15:13:07 CEST 2010

nisse at (Niels Möller) writes:

> The algorithm is really simple, the iteration is
>   <r2, r1, r0>  <-- <r0, u[j]> + r1 b2 + r2 b3

I've now implemented a slightly different variant,

  <r2, r1, r0>  <-- <r0, u[j]> + (r2 b2 - c d) B + r1 b2

where c is the carry from r0 + r2 b2 (here, r2 is a single bit, and in
fact <r2, r1> < B + d - 1). Note that the b3 constant is no longer
needed.
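
To make the data flow concrete, here is a minimal C sketch of one
iteration of this variant (my own illustration, not GMP code; it
assumes B = 2^64, uint64_t limbs and a compiler with unsigned
__int128, and it keeps the single-bit r2 as a 0/~0 mask, as the
assembly below does):

  #include <stdint.h>

  /* One iteration: <r2,r1,r0> <-- <r0, u[j]> + (r2 b2 - c d) B + r1 b2,
     with d the divisor and b2 = B^2 mod d. */
  static void
  iteration (uint64_t *r2, uint64_t *r1, uint64_t *r0,
             uint64_t uj, uint64_t b2, uint64_t d)
  {
    /* s = r0 + r2 b2 - c d, where c is the carry from r0 + r2 b2;
       the conditional subtract is exact in wrapping 64-bit arithmetic. */
    uint64_t s = *r0 + (b2 & *r2);
    if (s < *r0)
      s -= d;

    /* <r2, r1, r0> <-- <s, u[j]> + r1 b2 */
    unsigned __int128 lo = ((unsigned __int128) s << 64) | uj;
    unsigned __int128 sum = lo + (unsigned __int128) *r1 * b2;
    *r2 = -(uint64_t) (sum < lo);	/* carry out, kept as a mask */
    *r1 = (uint64_t) (sum >> 64);
    *r0 = (uint64_t) sum;
  }
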
I'm attaching a new x86_64/mod_1_1.asm. The function mpn_mod_1_1p is
rewritten from scratch, while mpn_mod_1_1p_cps is unchanged.

This is the inner loop:

	and	B2modb, r2		C r2 is a mask: 0 or B2modb
	lea	(B2mb, r0), t1		C t1 = r0 + B2modb - b
	mul	B2modb			C r1 * B2modb (r1 lives in %rax)
	add	r2, r0			C r0 += r2 ? B2modb : 0
	mov	(ap, n, 8), t0		C fetch u[j]
	cmovc	t1, r0			C on carry, use r0 + B2modb - b instead
	add	%rax, t0		C t0 = u[j] + low(r1 * B2modb)
	mov	r0, %rax
	adc	%rdx, %rax		C new r1 = r0 + high(r1 * B2modb) + c
	sbb	r2, r2			C new r2 = -(carry out)
	sub	$1, n
	mov	t0, r0			C new r0 = t0
	jnc	L(loop)

Runs at 6 cycles per limb on AMD.

On Intel corei, results are a little odd; it runs at
11 cycles per limb when the divisor is unnormalized, and 11.5 when the
divisor is normalized. Which is strange, since there's no instruction in
the loop which I'd expect to have data-dependent timing.

Comments appreciated.

I'd like to extend this to also compute the quotient; then in each
iteration one needs to also compute (in the normalized case; I haven't
thought about the unnormalized case yet):

  <q2, q1, q0>  <--  <r2, r1> <1, v> + c B
                  =  r2 <1, v> B + r1 v + (r1 + c) B

where c is the same carry out as in the r update. And then accumulate
those limbs. Unfortunately, the simple way of doing those additions
takes an awful lot of instructions. The critical path is still the
dependency on r1, which will be 7 cycles (one additional move on the
path, to free up %rax for the r1 v multiply in the q computation). So
we're going to be limited by decoder bandwidth rather than latency. I
hope that one should be able to get a loop running in 11 or 10 cycles.
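
For what it's worth, here is a C sketch of those additions (again my
own illustration, normalized case only; the helper name is mine, v is
the low limb of the reciprocal <1, v>, and c is the carry from the r
update):

  #include <stdint.h>

  /* <q2, q1, q0> <-- <r2, r1> <1, v> + c B
                    = r2 <1, v> B + r1 v + (r1 + c) B */
  static void
  quotient_limbs (uint64_t *q2, uint64_t *q1, uint64_t *q0,
                  uint64_t r2, uint64_t r1, uint64_t c, uint64_t v)
  {
    unsigned __int128 p = (unsigned __int128) r1 * v;	/* r1 v */
    uint64_t lo = (uint64_t) p;
    uint64_t mid = (uint64_t) (p >> 64);
    uint64_t hi = 0, t;

    t = mid + r1;  hi += t < mid;	/* + r1 B */
    mid = t + c;   hi += mid < t;	/* + c B */
    if (r2)				/* + r2 <1, v> B */
      {
        t = mid + v;  hi += t < mid;
        mid = t;
        hi += 1;
      }
    *q0 = lo;  *q1 = mid;  *q2 = hi;
  }

Already in portable C this is a pile of adds and carry compares, which
matches the instruction-count concern above.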

-------------- next part --------------
dnl  AMD64 mpn_mod_1_1p

dnl  Contributed to the GNU project by Torbjörn Granlund and Niels Möller.

dnl  Copyright 2009, 2010 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.


C	     cycles/limb
C AMD K8,K9	( 7  )
C AMD K10	( 7  )
C Intel P4	(27  )
C Intel core2	(14  )
C Intel corei	(12.5)
C Intel atom	(37  )
C VIA nano	(15  )

define(`B2mb',   `%r10')
define(`B2modb', `%r11')
define(`ap',     `%rdi')
define(`n',      `%rsi')
define(`pre',    `%r12')
define(`b',      `%rbx')

define(`r0',     `%rbp') C r1 kept in %rax
define(`r2',	 `%r8')  C kept negated
define(`t0',     `%r9')
define(`t1',     `%rcx') C Also used as shift count

C mp_limb_t
C mpn_mod_1_1p (mp_srcptr ap, mp_size_t n, mp_limb_t b, mp_limb_t bmodb[4])
C                       %rdi         %rsi         %rdx                %rcx
C The pre array contains bi, cnt, B1modb, B2modb
C Note: This implementation needs B1modb only when cnt > 0
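C
C As a sketch, the cps function below stores roughly (my reading of the
C generic mod_1_1.c, not verbatim):
C   count_leading_zeros (cnt, b);  b <<= cnt;
C   invert_limb (bi, b);
C   pre[0] = bi;  pre[1] = cnt;
C   pre[2] = B1modb >> cnt;  pre[3] = B2modb >> cnt;
C with B1modb = B mod b and B2modb = B^2 mod b, for the shifted b.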

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_mod_1_1p)
	push	%rbp
	push	%rbx
	push	%r12
	mov	%rdx, b
	mov	%rcx, pre

	C Currently needs b to not be preshifted, so undo shift done
	C by caller. Ideally, b shouldn't be passed at all, it
	C should be in the pre block where the cps function is free to
	C store whatever is needed.
	mov	8(pre), R32(%rcx)
	shr	R8(%rcx), b

	mov	-8(ap, n, 8), %rax
	cmp	$3, n
	jnc	L(first)
	mov	-16(ap, n, 8), r0
	jmp	L(reduce_two)

L(first):
	C First iteration, no r2
	mov	24(pre), B2modb
	mul	B2modb
	mov	-24(ap, n, 8), r0
	add	%rax, r0
	mov	-16(ap, n, 8), %rax
	adc	%rdx, %rax
	sbb	r2, r2
	sub	$4, n
	jc	L(reduce_three)

	mov	B2modb, B2mb
	sub	b, B2mb

	ALIGN(16)
L(loop):
	and	B2modb, r2
	lea	(B2mb, r0), t1
	mul	B2modb
	add	r2, r0
	mov	(ap, n, 8), t0
	cmovc	t1, r0
	add	%rax, t0
	mov	r0, %rax
	adc	%rdx, %rax
	sbb 	r2, r2
	sub 	$1, n
	mov	t0, r0
	jnc	L(loop)

L(reduce_three):
	C Eliminate r2
	and	b, r2
	sub	r2, %rax

L(reduce_two):
	mov	8(pre), R32(%rcx)
	test	R32(%rcx), R32(%rcx)
	jz	L(normalized)

	C Unnormalized, use B1modb to reduce to size < B b
	mulq	16(pre)
	xor	t0, t0
	add	%rax, r0
	adc	%rdx, t0
	mov	t0, %rax

	C Then left-shift to normalize. FIXME: Use shld
	shl	R8(%rcx), b
	shl	R8(%rcx), %rax
	mov	r0, t0
	shl	R8(%rcx), r0
	neg	R32(%rcx)
	shr	R8(%rcx), t0
	or	t0, %rax
	neg	R32(%rcx)
	jmp	L(udiv)

L(normalized):
	mov	%rax, t0
	sub	b, t0
	cmovnc	t0, %rax

L(udiv):
	lea	1(%rax), t0
	mulq	(pre)
	add	r0, %rax
	adc	t0, %rdx
	imul	b, %rdx
	sub	%rdx, r0
	cmp	r0, %rax
	lea	(r0, b), %rax
	cmovnc	r0, %rax
	cmp	b, %rax
	jc	L(done)
	sub	b, %rax

L(done):
	shr	R8(%rcx), %rax

	pop	%r12
	pop	%rbx
	pop	%rbp
	ret
EPILOGUE()

	ALIGN(16)
PROLOGUE(mpn_mod_1_1p_cps)
	mov	%rbx, -24(%rsp)		C save callee-saved regs in the red zone
	mov	%rbp, -16(%rsp)
	mov	%rsi, %rbp
	bsr	%rsi, %rax
	mov	%eax, %ebx
	mov	%r12, -8(%rsp)
	sub	$24, %rsp
	xor	$63, %ebx		C ebx = cnt, leading zeros of b
	mov	%rdi, %r12
	mov	%ebx, %ecx
	sal	%cl, %rbp		C rbp = b << cnt
	mov	%rbp, %rdi
	CALL(	mpn_invert_limb)
	mov	%rbp, %r8
	mov	%rax, %r9
	mov	%rax, (%r12)		C pre[0] = bi
	neg	%r8			C r8 = B1modb = -(b << cnt)
	test	%ebx, %ebx
	je	L(c0)
	xor	%ecx, %ecx
	mov	%rax, %rdx
	mov	$1, %eax
	sub	%ebx, %ecx
	shr	%cl, %rdx
	mov	%ebx, %ecx
	sal	%cl, %rax
	or	%rax, %rdx		C rdx = (bi >> (64-cnt)) | (1 << cnt)
	imul	%rdx, %r8		C B1modb, unnormalized case
L(c0):	mov	%r8, %rax		C B2modb = B1modb * B mod b, by preinv division
	mul	%r9
	lea	1(%rdx,%r8), %rdx
	mov	%rax, %rcx
	neg	%rdx
	imul	%rbp, %rdx
	lea	(%rdx,%rbp), %rax
	cmp	%rdx, %rcx
	mov	%ebx, %ecx
	mov	8(%rsp), %rbp
	cmovb	%rax, %rdx
	shr	%cl, %r8
	shr	%cl, %rdx
	mov	%rbx, 8(%r12)		C pre[1] = cnt
	mov	%r8, 16(%r12)		C pre[2] = B1modb >> cnt
	mov	%rdx, 24(%r12)		C pre[3] = B2modb >> cnt
	mov	(%rsp), %rbx
	mov	16(%rsp), %r12
	add	$24, %rsp
	ret
EPILOGUE()
-------------- next part --------------


Niels Möller. PGP-encrypted email is preferred. Keyid C0B98E26.
Internet email is subject to wholesale government surveillance.
