[Gmp-commit] /var/hg/gmp: Moved x86_64/k10/div_qr_1n_pi1.asm to x86_64/k8. Ap...

Tue Oct 22 16:52:21 CEST 2013

details:   /var/hg/gmp/rev/60e7f3167327
changeset: 16066:60e7f3167327
user:      Niels Möller <nisse at lysator.liu.se>
date:      Tue Oct 22 16:51:52 2013 +0200
description:
Moved x86_64/k10/div_qr_1n_pi1.asm to x86_64/k8. Applied k8 tweak by Torbjörn.

diffstat:

 ChangeLog                        |    7 +-
 mpn/x86_64/k10/div_qr_1n_pi1.asm |  218 --------------------------------------
 mpn/x86_64/k8/div_qr_1n_pi1.asm  |  220 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 225 insertions(+), 220 deletions(-)

diffs (truncated from 462 to 300 lines):

diff -r e9a5ec7f4003 -r 60e7f3167327 ChangeLog

--- a/ChangeLog	Tue Oct 22 10:16:16 2013 +0200
+++ b/ChangeLog	Tue Oct 22 16:51:52 2013 +0200
@@ -1,7 +1,10 @@
 2013-10-22  Niels Möller  <nisse at lysator.liu.se>
 
-	* mpn/x86_64/k10/div_qr_1n_pi1.asm: New file. Differs from generic
-	x86_64 version by using cmov.
+	* mpn/x86_64/k8/div_qr_1n_pi1.asm: Moved the below k10 file here.
+	Applied tweak from Torbjörn to get it to run well on k8.
+
+	* mpn/x86_64/k10/div_qr_1n_pi1.asm: New file (renamed above).
+	Differs from generic x86_64 version by using cmov.
 
 	* mpn/x86_64/div_qr_1n_pi1.asm: Reordered arguments to second mul.
 	Deleted misleading cycle annotations.
diff -r e9a5ec7f4003 -r 60e7f3167327 mpn/x86_64/k10/div_qr_1n_pi1.asm
--- a/mpn/x86_64/k10/div_qr_1n_pi1.asm	Tue Oct 22 10:16:16 2013 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,218 +0,0 @@
-dnl  x86-64 mpn_div_qr_1n_pi1
-dnl  -- Divide an mpn number by a normalized single-limb number,
-dnl     using a single-limb inverse.
-
-dnl  Contributed to the GNU project by Niels Möller
-
-dnl  Copyright 2013 Free Software Foundation, Inc.
-
-dnl  This file is part of the GNU MP Library.
-
-dnl  The GNU MP Library is free software; you can redistribute it and/or modify
-dnl  it under the terms of the GNU Lesser General Public License as published
-dnl  by the Free Software Foundation; either version 3 of the License, or (at
-dnl  your option) any later version.
-
-dnl  The GNU MP Library is distributed in the hope that it will be useful, but
-dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
-dnl  License for more details.
-
-dnl  You should have received a copy of the GNU Lesser General Public License
-dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
-
-include(`../config.m4')
-
-
-C		c/l
-C AMD K10	11.25
-
-C INPUT Parameters
-define(`QP', `%rdi')
-define(`UP', `%rsi')
-define(`UN_INPUT', `%rdx')
-define(`U1', `%rcx')	C Also in %rax
-define(`D', `%r8')
-define(`DINV', `%r9')
-
-C Invariants
-define(`B2', `%rbp')
-define(`B2md', `%rbx')
-
-C Variables
-define(`UN', `%r8')	C Overlaps D input
-define(`T', `%r10')
-define(`U0', `%r11')
-define(`U2', `%r12')
-define(`Q0', `%r13')
-define(`Q1', `%r14')
-define(`Q2', `%r15')
-
-ABI_SUPPORT(STD64)
-
-	ASM_START()
-	TEXT
-	ALIGN(16)
-PROLOGUE(mpn_div_qr_1n_pi1)
-	FUNC_ENTRY(6)
-IFDOS(`	mov	56(%rsp), %r8	')
-IFDOS(`	mov	64(%rsp), %r9	')
-	dec	UN_INPUT
-	jnz	L(first)
-
-	C Just a single 2/1 division.
-	C T, U0 are allocated in scratch registers
-	lea	1(U1), T
-	mov	U1, %rax
-	mul	DINV
-	mov	(UP), U0
-	add	U0, %rax
-	adc	T, %rdx
-	imul	D, %rdx
-	sub	%rdx, U0
-	cmp	U0, %rax
-	lea	(U0, D), %rax
-	cmovnc	U0, %rax
-	sbb	$0, T
-	cmp	D, %rax
-	jc	L(single_div_done)
-	sub	D, %rax
-	add	$1, T
-L(single_div_done):
-	mov	T, (QP)
-	FUNC_EXIT
-	ret
-L(first):
-	C FIXME: Could delay some of these until we enter the loop.
-	push	%r15
-	push	%r14
-	push	%r13
-	push	%r12
-	push	%rbx
-	push	%rbp
-
-	mov	D, B2
-	imul	DINV, B2
-	neg	B2
-	mov	B2, B2md
-	sub	D, B2md
-	
-	C D not needed until final reduction
-	push	D
-	mov	UN_INPUT, UN	C Clobbers D
-
-	mov	DINV, %rax
-	mul	U1
-	mov	%rax, Q0
-	add	U1, %rdx
-	mov	%rdx, (QP, UN, 8)
-
-	mov	B2, %rax
-	mul	U1
-	mov	-8(UP, UN, 8), U0
-	mov	(UP, UN, 8), U1
-	add	%rax, U0
-	adc	%rdx, U1
-	sbb	U2, U2
-	dec	UN
-	mov	U1, %rax
-	jz	L(final)
-	
-	ALIGN(16)
-
-	C Loop is 28 instructions, 30 decoder slots, should run in 10 cycles.
-	C At entry, %rax holds an extra copy of U1, and carry holds an extra copy of U2.
-L(loop):
-	C {Q2, Q1, Q0} <-- DINV * U1 + B (Q0 + U2 DINV) + B^2 U2
-	C Remains to add in B (U1 + c)
-	mov	$0, Q1
-	cmovc	DINV, Q1
-	mov	U2, Q2
-	neg	Q2
-	mul	DINV 
-	add	%rdx, Q1
-	adc	$0, Q2
-	add	Q0, Q1
-	mov 	%rax, Q0
-	mov	B2, %rax
-	lea	(B2md, U0), T
-	adc	$0, Q2
-
-	C {U2, U1, U0} <-- (U0 + U2 B2 -c U) B + U1 B2 + u
-	mul	U1
-	and	B2, U2
-	add	U2, U0
-	cmovnc	U0, T
-
-	C {QP+UN, ...} <-- {QP+UN, ...} + {Q2, Q1} + U1 + c
-	adc	U1, Q1
-	mov	-8(UP, UN, 8), U0
-	adc	Q2,8(QP, UN, 8)
-	jc	L(q_incr)
-L(q_incr_done):
-	add	%rax, U0
-	mov	T, %rax
-	adc	%rdx, %rax
-	mov	Q1, (QP, UN, 8)
-	sbb 	U2, U2
-	dec	UN
-	mov	%rax, U1 
-	jnz	L(loop)
-
-L(final):
-	pop	D
-
-	mov	U2, Q1
-	and	D, U2
-	sub	U2, %rax
-	neg	Q1
-
-	mov	%rax, U1
-	sub	D, %rax
-	cmovc	U1, %rax
-	sbb	$-1, Q1
-
-	lea	1(%rax), T
-	mul	DINV
-	add	U0, %rax
-	adc	T, %rdx
-	mov	%rdx, T
-	imul	D, %rdx
-	sub	%rdx, U0
-	cmp	U0, %rax
-	lea	(U0, D), %rax
-	cmovnc	U0, %rax
-	sbb	$0, T
-	cmp	D, %rax
-	jc	L(div_done)
-	sub	D, %rax
-	add	$1, T
-L(div_done):	
-	add	T, Q0
-	mov	Q0, (QP)
-	adc	Q1, 8(QP)
-	jnc	L(done)
-L(final_q_incr):
-	addq	$1, 16(QP)
-	lea	8(QP), QP
-	jc	L(final_q_incr)
-
-L(done):
-	pop	%rbp
-	pop	%rbx
-	pop	%r12
-	pop	%r13
-	pop	%r14
-	pop	%r15
-	FUNC_EXIT
-	ret
-
-L(q_incr):
-	C U1 is not live, so use it for indexing
-	lea	16(QP, UN, 8), U1
-L(q_incr_loop):
-	addq	$1, (U1)
-	jnc	L(q_incr_done)
-	lea	8(U1), U1
-	jmp	L(q_incr_loop)
-EPILOGUE()
diff -r e9a5ec7f4003 -r 60e7f3167327 mpn/x86_64/k8/div_qr_1n_pi1.asm
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/x86_64/k8/div_qr_1n_pi1.asm	Tue Oct 22 16:51:52 2013 +0200
@@ -0,0 +1,220 @@
+dnl  x86-64 mpn_div_qr_1n_pi1
+dnl  -- Divide an mpn number by a normalized single-limb number,
+dnl     using a single-limb inverse.
+
+dnl  Contributed to the GNU project by Niels Möller
+
+dnl  Copyright 2013 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of the GNU Lesser General Public License as published
+dnl  by the Free Software Foundation; either version 3 of the License, or (at
+dnl  your option) any later version.
+
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+dnl  License for more details.
+
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C		c/l
+C AMD K8,K10	11
+
+C INPUT Parameters
+define(`QP', `%rdi')
+define(`UP', `%rsi')
+define(`UN_INPUT', `%rdx')
+define(`U1', `%rcx')	C Also in %rax
+define(`D', `%r8')
+define(`DINV', `%r9')
+
+C Invariants
+define(`B2', `%rbp')
+define(`B2md', `%rbx')
+
+C Variables
+define(`UN', `%r8')	C Overlaps D input
+define(`T', `%r10')
+define(`U0', `%r11')
+define(`U2', `%r12')
+define(`Q0', `%r13')
+define(`Q1', `%r14')
+define(`Q2', `%r15')
+
+ABI_SUPPORT(STD64)
+
+	ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_div_qr_1n_pi1)
+	FUNC_ENTRY(6)
+IFDOS(`	mov	56(%rsp), %r8	')