div_qr_1n_pi1

Wed Jul 21 14:59:25 UTC 2021

Torbjörn Granlund <tg at gmplib.org> writes:

> nisse at lysator.liu.se (Niels Möller) writes:
>
>   Same as in the current (from 2013) version. Delaying the write is a bit
>   tricky, since we already use all registers. But it would be better to
>   update the quotient limbs in memory only in the unlikely
>   carry-propagation case. I figure adc to memory is no worse than explicit
>   load, adc, store (or adc from memory, store)?
>
> I did not realise that the register pressure was so bad.  Perhaps trying
> to decrease that would be most helpful.  Sometimes, when values tend to
> naturally migrate, some unrolling can reduce register pressure.

It turned out letting the Q1 register live between iterations wasn't
that difficult.

> If requiring mulx helps, I would for now forget about mul.  All relevant
> CPUs have mulx.

I now have a mulx version working. It's much neater, and slightly
faster. I think it can be sped up a bit more with better scheduling
(some of my intermediate less neat versions were faster than this), but
not really sure how. Could also switch to adox/adcx to get a little more
scheduling freedom.

L(loop):
	mulx(	B2, P0, P1)	C {p1, p0} <-- u1 B2
	mulx(	DINV, T0, T1)	C {t1, t0} <-- u1 dinv

	C {q2, q0} <-- u1 + q0
	xor	Q2, Q2
	add	%rdx, Q0
	adc	$0, Q2

	C {u2, u1, u0} <-- {u0, up[n-1]} + { p1, p0 }
	mov 	U0, %rdx
	mov	-8(UP, UN, 8), U0
	add	P0, U0
	adc	P1, %rdx		C carry represents U2

	C u1 <-- u1 - u2 d
	lea	(%rdx, D), P0
	cmovc	P0, %rdx

	C {q2, q0} <-- {q2, q0} + {q1, t1} + u2
	adc	T1, Q0
	adc	Q1, Q2
	mov	Q2, 8(QP, UN, 8)
	jc	L(q_incr)
L(q_incr_done):
	mov	Q0, Q1
	mov	T0, Q0
	dec	UN
	jnz	L(loop)

The most critical recurrency is the one via %rdx, used for
U1: That's mulx, add, adc, lea, cmov. In addition, each iteration
depends on the previous one via U0, Q0 and Q1.

Complete file attached, if you want to try it out.

Regards,
/Niels

-------------- next part --------------
dnl  x86-64 mpn_div_qr_1n_pi1
dnl  -- Divide an mpn number by a normalized single-limb number,
dnl     using a single-limb inverse.

dnl  Contributed to the GNU project by Niels Möller

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C INPUT Parameters
define(`QP', `%rdi')
define(`UP', `%rsi')
define(`UN_INPUT', `%rdx')
define(`U1_INPUT', `%rcx')
define(`D', `%r8')
define(`DINV', `%r9')

C Invariants
define(`B2', `%rbp')

C Variables. Note that U1 is register %rdx, which is special for mulx.
define(`UN', `%rbx')
define(`P0', `%r10')
define(`T1', `%r12')
define(`U0', `%r11')
define(`Q0', `%r13')
define(`Q1', `%r14')
define(`Q2', `%r15')
define(`T0', `%rax')
define(`P1', `%rcx') 		C Overlaps U1_INPUT

ABI_SUPPORT(STD64)

	ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_div_qr_1n_pi1)
	FUNC_ENTRY(4)
IFDOS(`	mov	56(%rsp), %r8	')
IFDOS(`	mov	64(%rsp), %r9	')
	dec	UN_INPUT
	jnz	L(first)

	C Just a single 2/1 division.
	C Use scratch registers only
	lea	1(U1_INPUT), %r10
	mov	U1_INPUT, %rdx
	mulx(	DINV, %rax, %r11)
	mov	(UP), %rcx
	add	%rcx, %rax
	adc	%r10, %r11
	mov	%r11, %r10
	imul	D, %r11
	sub	%r11, %rcx
	cmp	%rcx, %rax

	lea	(%rcx, D), %rax	C For return value
	cmovnc	%rcx, %rax
	sbb	$0, %r10
	cmp	D, %rax
	jc	L(single_div_done)
	sub	D, %rax
	add	$1, %r10
L(single_div_done):
	mov	%r10, (QP)
	FUNC_EXIT()
	ret
L(first):
	C FIXME: Could delay some of these until we enter the loop.
	push	%r15
	push	%r14
	push	%r13
	push	%r12
	push	%rbx
	push	%rbp

	neg	D
	mov	D, B2
	imul	DINV, B2

	mov	UN_INPUT, UN
	mov	U1_INPUT, %rdx

	mulx(	DINV, T0, P1)
	mov	T0, Q0
	mov	%rdx, Q1
	add	P1, Q1

	mulx(	B2, T0, P1)
	mov	-8(UP, UN, 8), U0
	mov	(UP, UN, 8), %rdx
	add	T0, U0
	adc	P1, %rdx
	lea	(%rdx, D), P0
	cmovc	P0, %rdx
	adc	$0, Q1
	mov	Q1, (QP, UN, 8)

	dec	UN
	jz	L(final)

	C Algorithm based on DIV_QR_1N_METHOD == 3

	C For the u update:
	C        +-------+
	C        |u1 * B2|
	C        +---+---+
	C      + |u0 |u-1|
	C        +---+---+
	C      - | d |     (conditional on carry)
	C     ---+---+---+
	C        |u1 | u0|
	C        +---+---+

	C For the q update:
	C           +-------+
	C           |u1 * v |
	C           +---+---+
	C           | u1|
	C           +---+
	C           | 1 |  (conditional on {u1, u0} carry)
	C       +---+---+
	C    +  | q1| q0|
	C   +---+---+---+---+
	C   | q3| q2| q1| q0|
	C   +---+---+---+---+

	ALIGN(16)
L(loop):
	mulx(	B2, P0, P1)	C {p1, p0} <-- u1 B2
	mulx(	DINV, T0, T1)	C {t1, t0} <-- u1 dinv

	C {q2, q0} <-- u1 + q0
	xor	Q2, Q2
	add	%rdx, Q0
	adc	$0, Q2

	C {u2, u1, u0} <-- {u0, up[n-1]} + { p1, p0 }
	mov 	U0, %rdx
	mov	-8(UP, UN, 8), U0
	add	P0, U0
	adc	P1, %rdx		C carry represents U2

	C u1 <-- u1 - u2 d
	lea	(%rdx, D), P0
	cmovc	P0, %rdx

	C {q2, q0} <-- {q2, q0} + {q1, t1} + u2
	adc	T1, Q0
	adc	Q1, Q2
	mov	Q2, 8(QP, UN, 8)
	jc	L(q_incr)
L(q_incr_done):
	mov	Q0, Q1
	mov	T0, Q0
	dec	UN
	jnz	L(loop)

L(final):
	neg	D
	xor	Q2, Q2

	mov	%rdx, P0
	sub	D, %rdx
	cmovc	P0, %rdx
	sbb	$-1, Q2

	lea	1(%rdx), P0

	mulx(	DINV, T0, P1)
	add	U0, T0
	adc	P0, P1
	mov	P1, P0
	imul	D, P1
	sub	P1, U0
	cmp	U0, T0
	lea	(U0, D), %rax	C Clobbers T0
	cmovnc	U0, %rax
	sbb	$0, P0
	cmp	D, %rax
	jc	L(div_done)
	sub	D, %rax
	add	$1, P0
L(div_done):
	add	P0, Q0
	mov	Q0, (QP)
	adc	Q2, Q1
	mov	Q1, 8(QP)
	jnc	L(done)

L(final_q_incr):
	addq	$1, 16(QP)
	lea	8(QP), QP
	jc	L(final_q_incr)
L(done):
	pop	%rbp
	pop	%rbx
	pop	%r12
	pop	%r13
	pop	%r14
	pop	%r15
	FUNC_EXIT()
	ret

L(q_incr):
	C Q1 is not live, so use it for indexing
	lea	16(QP, UN, 8), P0
L(q_incr_loop):
	addq	$1, (P0)
	jnc	L(q_incr_done)
	lea	8(P0), P0
	jmp	L(q_incr_loop)
EPILOGUE()
-------------- next part --------------

-- 
Niels Möller. PGP-encrypted email is preferred. Keyid 368C6677.
Internet email is subject to wholesale government surveillance.