div_qr_1 interface

Thu Oct 24 22:50:53 CEST 2013

nisse at lysator.liu.se (Niels Möller) writes:

> Torbjorn Granlund <tg at gmplib.org> writes:
>
>> Basically qp = up won't work, but qp = up + k for any positive k will?
>> Does the C code share that property?
[...]
>> I think it would be good to fix that, since it is surely a common usage
>> scenario.
>
> I agree.

I've checked in some bugfixes. Now it seems to work according to try,
and without the no-overlap patch which it turned out I never pushed into
the repo.

> If/when the code is reorganized to delay the q stores (to avoid the "adc
> Q2,8(QP, UN, 8)" instruction),

Here's a first crude version, which seems to work. I could eliminate
other uses of T, and then use that register to the previous Q1 limb
around

I had to change the logic

	lea	(B2md, U0), T
	...
        and	B2, U2
        add	U2, U0
        cmovnc	U0, T
	adc	U1, Q1

Here, B2md holds B^2 (mod d) - d, and U2 is initially a mask. To use U2
as temporary (it dies at the add), I had to change it to

        and	B2, U2
        add	U2, U0
	lea	(B2md, U0), U2
        cmovnc	U0, U2
	adc	U1, Q1

which gives the same result, provided that B2md is adjusted to just hold
-d. This sequence has less friendly dependencies. Maybe it would be
better to stick some of the loop-invariant constants in memory instead.

It's getting late, so I'll not do any benchmarking right now.

Regards,
/Niels

dnl  x86-64 mpn_div_qr_1n_pi1
dnl  -- Divide an mpn number by a normalized single-limb number,
dnl     using a single-limb inverse.

dnl  Contributed to the GNU project by Niels Möller

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C		c/l
C AMD K8,K9	13
C AMD K10	13
C AMD bull	16.5
C AMD pile	15
C AMD steam	 ?
C AMD bobcat	16
C AMD jaguar	 ?
C Intel P4	47	poor
C Intel core	19.25
C Intel NHM	18
C Intel SBR	15	poor
C Intel IBR	13
C Intel HWL	11.7
C Intel BWL	 ?
C Intel atom	52	very poor
C VIA nano	19

C INPUT Parameters
define(`QP', `%rdi')
define(`UP', `%rsi')
define(`UN_INPUT', `%rdx')
define(`U1', `%rcx')	C Also in %rax
define(`D', `%r8')
define(`DINV', `%r9')

C Invariants
define(`B2', `%rbp')
define(`B2md', `%rbx')

C Variables
define(`UN', `%r8')	C Overlaps D input
define(`T', `%r10')
define(`U0', `%r11')
define(`U2', `%r12')
define(`Q0', `%r13')
define(`Q1', `%r14')
define(`Q2', `%r15')

ABI_SUPPORT(STD64)

	ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_div_qr_1n_pi1)
	FUNC_ENTRY(6)
IFDOS(`	mov	56(%rsp), %r8	')
IFDOS(`	mov	64(%rsp), %r9	')
	dec	UN_INPUT
	jnz	L(first)

	C Just a single 2/1 division.
	C T, U0 are allocated in scratch registers
	lea	1(U1), T
	mov	U1, %rax
	mul	DINV
	mov	(UP), U0
	add	U0, %rax
	adc	T, %rdx
	mov	%rdx, T
	imul	D, %rdx
	sub	%rdx, U0
	cmp	U0, %rax
	lea	(U0, D), %rax
	cmovnc	U0, %rax
	sbb	$0, T
	cmp	D, %rax
	jc	L(single_div_done)
	sub	D, %rax
	add	$1, T
L(single_div_done):
	mov	T, (QP)
	FUNC_EXIT
	ret
L(first):
	C FIXME: Could delay some of these until we enter the loop.
	push	%r15
	push	%r14
	push	%r13
	push	%r12
	push	%rbx
	push	%rbp

	mov	D, B2
	imul	DINV, B2
	neg	B2
C	mov	B2, B2md
C	sub	D, B2md
	mov	D, B2md
	neg	B2md

	C D not needed until final reduction
	push	D
	mov	UN_INPUT, UN	C Clobbers D

	mov	DINV, %rax
	mul	U1
	mov	%rax, Q0
	add	U1, %rdx
	mov	%rdx, Q1

	mov	B2, %rax
	mul	U1
	mov	-8(UP, UN, 8), U0
	mov	(UP, UN, 8), U1
	C mov	Q1, (QP, UN, 8)
	add	%rax, U0
	adc	%rdx, U1
	sbb	U2, U2
	dec	UN
	mov	U1, %rax
	jz	L(final)

	ALIGN(16)

	C Loop is 28 instructions, 30 decoder slots, should run in 10 cycles.
	C At entry, %rax holds an extra copy of U1
L(loop):
	C {Q2, Q1, Q0} <-- DINV * U1 + B (Q0 + U2 DINV) + B^2 U2
	C Remains to add in B (U1 + c)
	mov	Q1, T		C Save input value
	mov	DINV, Q1
	mov	U2, Q2
	and	U2, Q1
	neg	Q2
	mul	DINV
	add	%rdx, Q1
	adc	$0, Q2
	add	Q0, Q1
	mov 	%rax, Q0
	mov	B2, %rax
	adc	$0, Q2

	C {U2, U1, U0} <-- (U0 + U2 B2 -c U) B + U1 B2 + u
	mul	U1
	and	B2, U2
	add	U2, U0
	C Could do an earlier U0 + (B2 - d),
	C if we only had a spare register
	lea	(B2md, U0), U2
	cmovnc	U0, U2

	C {QP+UN, ...} <-- {QP+UN, ...} + {Q2, Q1} + U1 + c
	adc	U1, Q1
	mov	-8(UP, UN, 8), U0
	adc	T, Q2
	mov	Q2, 8(QP, UN, 8)
	jc	L(q_incr)
L(q_incr_done):
	add	%rax, U0
	mov	U2, %rax
	adc	%rdx, %rax
	C mov	Q1, (QP, UN, 8)
	sbb 	U2, U2
	dec	UN
	mov	%rax, U1
	jnz	L(loop)

L(final):
	pop	D

	mov	Q1, T	C Save input value
	mov	U2, Q1
	and	D, U2
	sub	U2, %rax
	neg	Q1

	mov	%rax, U1
	sub	D, %rax
	cmovc	U1, %rax
	sbb	$-1, Q1

	lea	1(%rax), U2
	mul	DINV
	add	U0, %rax
	adc	U2, %rdx
	mov	%rdx, U2
	imul	D, %rdx
	sub	%rdx, U0
	cmp	U0, %rax
	lea	(U0, D), %rax
	cmovnc	U0, %rax
	sbb	$0, U2
	cmp	D, %rax
	jc	L(div_done)
	sub	D, %rax
	add	$1, U2
L(div_done):
	add	U2, Q0
	mov	Q0, (QP)
	adc	T, Q1
	mov	Q1, 8(QP)
	jnc	L(done)
L(final_q_incr):
	addq	$1, 16(QP)
	lea	8(QP), QP
	jc	L(final_q_incr)

L(done):
	pop	%rbp
	pop	%rbx
	pop	%r12
	pop	%r13
	pop	%r14
	pop	%r15
	FUNC_EXIT
	ret

L(q_incr):
	C U1 is not live, so use it for indexing
	lea	16(QP, UN, 8), U1
L(q_incr_loop):
	addq	$1, (U1)
	jnc	L(q_incr_done)
	lea	8(U1), U1
	jmp	L(q_incr_loop)
EPILOGUE()

> Regards,
> /Niels

-- 
Niels Möller. PGP-encrypted email is preferred. Keyid C0B98E26.
Internet email is subject to wholesale government surveillance.