[Gmp-commit] /var/hg/gmp: x86_64 implementation of mpn_div_qr_1n_pi1.

mercurial at gmplib.org mercurial at gmplib.org
Sun Oct 20 22:52:20 CEST 2013


details:   /var/hg/gmp/rev/8e7653cd8492
changeset: 16060:8e7653cd8492
user:      Niels Möller <nisse at lysator.liu.se>
date:      Sun Oct 20 22:43:38 2013 +0200
description:
x86_64 implementation of mpn_div_qr_1n_pi1.

diffstat:

 ChangeLog                    |    7 +
 mpn/asm-defs.m4              |    1 +
 mpn/x86_64/div_qr_1n_pi1.asm |  216 +++++++++++++++++++++++++++++++++++++++++++
 tune/div_qr_1_tune.c         |    2 +
 4 files changed, 226 insertions(+), 0 deletions(-)

diffs (261 lines):

diff -r 066e1b19ff4f -r 8e7653cd8492 ChangeLog
--- a/ChangeLog	Sun Oct 20 16:34:09 2013 +0200
+++ b/ChangeLog	Sun Oct 20 22:43:38 2013 +0200
@@ -1,5 +1,12 @@
 2013-10-20  Niels Möller  <nisse at lysator.liu.se>
 
+	* mpn/x86_64/div_qr_1n_pi1.asm: New file.
+
+	* tune/div_qr_1_tune.c (__gmpn_div_qr_1n_pi1): Check
+	div_qr_1n_pi1_method only when !HAVE_NATIVE_mpn_div_qr_1n_pi1.
+
+	* mpn/asm-defs.m4 (define_mpn): Add div_qr_1n_pi1.
+
 	* tune/common.c (speed_mpn_div_qr_1): New function, replacing...
 	(speed_mpn_div_qr_1n, speed_mpn_div_qr_1u): ... deleted functions
 	(speed_mpn_div_qr_1n_pi1, speed_mpn_div_qr_1n_pi1_1)
diff -r 066e1b19ff4f -r 8e7653cd8492 mpn/asm-defs.m4
--- a/mpn/asm-defs.m4	Sun Oct 20 16:34:09 2013 +0200
+++ b/mpn/asm-defs.m4	Sun Oct 20 22:43:38 2013 +0200
@@ -1367,6 +1367,7 @@
 define_mpn(copyi)
 define_mpn(count_leading_zeros)
 define_mpn(count_trailing_zeros)
+define_mpn(div_qr_1n_pi1)
 define_mpn(div_qr_2)
 define_mpn(div_qr_2n_pi1)
 define_mpn(div_qr_2u_pi1)
diff -r 066e1b19ff4f -r 8e7653cd8492 mpn/x86_64/div_qr_1n_pi1.asm
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/x86_64/div_qr_1n_pi1.asm	Sun Oct 20 22:43:38 2013 +0200
@@ -0,0 +1,216 @@
+dnl  x86-64 mpn_div_qr_1n_pi1
+dnl  -- Divide an mpn number by a normalized single-limb number,
+dnl     using a single-limb inverse.
+
+dnl Contributed to the GNU project by Niels Möller
+
+dnl  Copyright 2007, 2008, 2010, 2011, 2012 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of the GNU Lesser General Public License as published
+dnl  by the Free Software Foundation; either version 3 of the License, or (at
+dnl  your option) any later version.
+
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+dnl  License for more details.
+
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C		c/l
+C INPUT Parameters
+define(`QP', `%rdi')
+define(`UP', `%rsi')
+define(`UN_INPUT', `%rdx')
+define(`U1', `%rcx')	C Also in %rax
+define(`D', `%r8')
+define(`DINV', `%r9')
+
+C Invariants
+define(`B2', `%rbp')
+define(`B2md', `%rbx')
+
+C Variables
+define(`UN', `%r8')	C Overlaps D input
+define(`T', `%r10')
+define(`U0', `%r11')
+define(`U2', `%r12')
+define(`Q0', `%r13')
+define(`Q1', `%r14')
+define(`Q2', `%r15')
+
+ABI_SUPPORT(STD64)
+
+	ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_div_qr_1n_pi1)
+	FUNC_ENTRY(6)
+IFDOS(`	mov	56(%rsp), %r8	')
+IFDOS(`	mov	64(%rsp), %r9	')
+	dec	UN_INPUT
+	jnz	L(first)
+
+	C Just a single 2/1 division.
+	C T, U0 are allocated in scratch registers
+	lea	1(U1), T
+	mov	U1, %rax
+	mul	DINV
+	mov	(UP), U0
+	add	U0, %rax
+	adc	T, %rdx
+	imul	D, %rdx
+	sub	%rdx, U0
+	cmp	U0, %rax
+	lea	(U0, D), %rax
+	cmovnc	U0, %rax
+	sbb	$0, T
+	cmp	D, %rax
+	jc	L(single_div_done)
+	sub	D, %rax
+	add	$1, T
+L(single_div_done):
+	mov	T, (QP)
+	FUNC_EXIT
+	ret
+L(first):
+	C FIXME: Could delay some of these until we enter the loop.
+	push	%r15
+	push	%r14
+	push	%r13
+	push	%r12
+	push	%rbx
+	push	%rbp
+
+	mov	D, B2
+	imul	DINV, B2
+	neg	B2
+	mov	B2, B2md
+	sub	D, B2md
+	
+	C D not needed until final reduction
+	push	D
+	mov	UN_INPUT, UN	C Clobbers D
+
+	mov	DINV, %rax
+	mul	U1
+	mov	%rax, Q0
+	add	U1, %rdx
+	mov	%rdx, (QP, UN, 8)
+
+	mov	B2, %rax
+	mul	U1
+	mov	-8(UP, UN, 8), U0
+	mov	(UP, UN, 8), U1
+	add	%rax, U0
+	adc	%rdx, U1
+	sbb	U2, U2
+	dec	UN
+	mov	U1, %rax
+	jz	L(final)
+	
+	ALIGN(16)
+
+	C Loop is 28 instructions, 30 decoder slots, should run in 10 cycles.
+	C At entry, %rax holds an extra copy of U1
+L(loop):
+	C {Q2, Q1, Q0} <-- DINV * U1 + B (Q0 + U2 DINV) + B^2 U2
+	C Remains to add in B (U1 + c)
+	mov	DINV, Q1
+	mov	U2, Q2
+	and	U2, Q1
+	neg	Q2
+	mul	DINV 
+	add	%rdx, Q1
+	adc	$0, Q2
+	add	Q0, Q1
+	mov 	%rax, Q0
+	mov	U1, %rax	C 0  7
+	lea	(B2md, U0), T	C    6
+	adc	$0, Q2
+
+	C {U2, U1, U0} <-- (U0 + U2 B2 -c U) B + U1 B2 + u
+	mul	B2		C 1  8
+	and	B2, U2		C    8
+	add	U2, U0		C    9
+	cmovnc	U0, T		C   10
+
+	C {QP+UN, ...} <-- {QP+UN, ...} + {Q2, Q1} + U1 + c
+	adc	U1, Q1
+	mov	-8(UP, UN, 8), U0
+	adc	Q2,8(QP, UN, 8)
+	jc	L(q_incr)
+L(q_incr_done):
+	add	%rax, U0	C 5 11
+	mov	T, %rax
+	adc	%rdx, %rax	C 6 12
+	mov	Q1, (QP, UN, 8)
+	sbb 	U2, U2		C 7 13
+	dec	UN
+	mov	%rax, U1 
+	jnz	L(loop)
+
+L(final):
+	pop	D
+
+	mov	U2, Q1
+	and	D, U2
+	sub	U2, %rax
+	neg	Q1
+
+	mov	%rax, U1
+	sub	D, %rax
+	cmovc	U1, %rax
+	sbb	$-1, Q1
+
+	lea	1(%rax), T
+	mul	DINV
+	add	U0, %rax
+	adc	T, %rdx
+	mov	%rdx, T
+	imul	D, %rdx
+	sub	%rdx, U0
+	cmp	U0, %rax
+	lea	(U0, D), %rax
+	cmovnc	U0, %rax
+	sbb	$0, T
+	cmp	D, %rax
+	jc	L(div_done)
+	sub	D, %rax
+	add	$1, T
+L(div_done):	
+	add	T, Q0
+	mov	Q0, (QP)
+	adc	Q1, 8(QP)
+	jnc	L(done)
+L(final_q_incr):
+	addq	$1, 16(QP)
+	lea	8(QP), QP
+	jc	L(final_q_incr)
+
+L(done):
+	pop	%rbp
+	pop	%rbx
+	pop	%r12
+	pop	%r13
+	pop	%r14
+	pop	%r15
+	FUNC_EXIT
+	ret
+
+L(q_incr):
+	C U1 is not live, so use it for indexing
+	lea	16(QP, UN, 8), U1
+L(q_incr_loop):
+	addq	$1, (U1)
+	jnc	L(q_incr_done)
+	lea	8(U1), U1
+	jmp	L(q_incr_loop)
+EPILOGUE()
diff -r 066e1b19ff4f -r 8e7653cd8492 tune/div_qr_1_tune.c
--- a/tune/div_qr_1_tune.c	Sun Oct 20 16:34:09 2013 +0200
+++ b/tune/div_qr_1_tune.c	Sun Oct 20 22:43:38 2013 +0200
@@ -25,8 +25,10 @@
 mp_limb_t mpn_div_qr_1n_pi1_1 (mp_ptr, mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t, mp_limb_t);
 mp_limb_t mpn_div_qr_1n_pi1_2 (mp_ptr, mp_srcptr, mp_size_t, mp_limb_t, mp_limb_t, mp_limb_t);
 
+#if !HAVE_NATIVE_mpn_div_qr_1n_pi1
 #define __gmpn_div_qr_1n_pi1 \
   (div_qr_1n_pi1_method == 1 ? mpn_div_qr_1n_pi1_1 : mpn_div_qr_1n_pi1_2)
+#endif
 
 #undef mpn_div_qr_1
 #define mpn_div_qr_1 mpn_div_qr_1_tune


More information about the gmp-commit mailing list