[Gmp-commit] /var/hg/gmp: 3 new changesets

mercurial at gmplib.org mercurial at gmplib.org
Sun Nov 17 21:26:35 UTC 2019


details:   /var/hg/gmp/rev/b09d3f0c080a
changeset: 17967:b09d3f0c080a
user:      Torbjorn Granlund <tg at gmplib.org>
date:      Sun Nov 17 19:46:23 2019 +0100
description:
Increase alignment; update x/l table.

details:   /var/hg/gmp/rev/c2c0aef2546c
changeset: 17968:c2c0aef2546c
user:      Torbjorn Granlund <tg at gmplib.org>
date:      Sun Nov 17 22:05:10 2019 +0100
description:
Rewrite.

details:   /var/hg/gmp/rev/5d3ad50a7156
changeset: 17969:5d3ad50a7156
user:      Torbjorn Granlund <tg at gmplib.org>
date:      Sun Nov 17 22:06:51 2019 +0100
description:
Rewrite.

diffstat:

 mpn/arm/v6t2/gcd_11.asm      |   23 +--
 mpn/x86_64/bt1/aorsmul_1.asm |  157 ++++++++++++------------
 mpn/x86_64/bt1/mul_1.asm     |  271 +++++++++++++++++++++++++-----------------
 3 files changed, 246 insertions(+), 205 deletions(-)

diffs (truncated from 568 to 300 lines):

diff -r d296f15b0eec -r 5d3ad50a7156 mpn/arm/v6t2/gcd_11.asm
--- a/mpn/arm/v6t2/gcd_11.asm	Sun Nov 17 18:34:30 2019 +0100
+++ b/mpn/arm/v6t2/gcd_11.asm	Sun Nov 17 22:06:51 2019 +0100
@@ -1,9 +1,7 @@
 dnl  ARM v6t2 mpn_gcd_11.
 
-dnl  Based on the K7 gcd_1.asm, by Kevin Ryde.  Rehacked for ARM by Torbjörn
-dnl  Granlund.
-
-dnl  Copyright 2000-2002, 2005, 2009, 2011, 2012 Free Software Foundation, Inc.
+dnl  Copyright 2000-2002, 2005, 2009, 2011, 2012, 2019 Free Software Foundation,
+dnl  Inc.
 
 dnl  This file is part of the GNU MP Library.
 dnl
@@ -36,21 +34,20 @@
 C	     cycles/bit (approx)
 C StrongARM	 -
 C XScale	 -
-C Cortex-A5	 5.75	obsolete
-C Cortex-A7	 6.38	obsolete
-C Cortex-A8	 5.0	obsolete
-C Cortex-A9	 5.3	obsolete
-C Cortex-A15	 2.92	obsolete
-C Cortex-A17	 5.63	obsolete
-C Cortex-A53	 4.25	obsolete
-C Numbers measured with: speed -CD -s8-32 -t24 mpn_gcd_1
+C Cortex-A5	 5.2
+C Cortex-A7	 5.04
+C Cortex-A8	 3.59
+C Cortex-A9	 9.5
+C Cortex-A15	 3.2
+C Cortex-A17	 5.25
+C Cortex-A53	 3.57
 
 define(`u0',    `r0')
 define(`v0',    `r1')
 
 ASM_START()
 	TEXT
-	ALIGN(16)
+	ALIGN(64)
 PROLOGUE(mpn_gcd_11)
 	subs	r3, u0, v0	C			0
 	beq	L(end)		C
diff -r d296f15b0eec -r 5d3ad50a7156 mpn/x86_64/bt1/aorsmul_1.asm
--- a/mpn/x86_64/bt1/aorsmul_1.asm	Sun Nov 17 18:34:30 2019 +0100
+++ b/mpn/x86_64/bt1/aorsmul_1.asm	Sun Nov 17 22:06:51 2019 +0100
@@ -1,6 +1,7 @@
-dnl  AMD64 mpn_addmul_1 and mpn_submul_1 optimised for AMD bobcat.
+dnl  AMD64 mpn_addmul_1 and mpn_submul_1 optimised for AMD bt1/bt2.
 
-dnl  Copyright 2003-2005, 2007, 2008, 2011, 2012 Free Software Foundation, Inc.
+dnl  Copyright 2003-2005, 2007, 2008, 2011, 2012, 2018-2019 Free Software
+dnl  Foundation, Inc.
 
 dnl  This file is part of the GNU MP Library.
 dnl
@@ -31,25 +32,29 @@
 include(`../config.m4')
 
 C	     cycles/limb
-C AMD K8,K9      4.52
-C AMD K10        4.51
-C AMD bull       4.66
-C AMD pile       4.57
-C AMD steam
-C AMD excavator
-C AMD bobcat     5.05
-C AMD jaguar     5.22
-C Intel P4      16.8    18.6
-C Intel core2    5.59
-C Intel NHM      5.39
-C Intel SBR      3.93
-C Intel IBR      3.59
-C Intel HWL      3.61
-C Intel BWL      2.76
-C Intel SKL      2.77
-C Intel atom    23
-C Intel SLM      8
-C VIA nano       5.63
+C AMD K8,K9	 4.52		old measurement
+C AMD K10	 4.51		old measurement
+C AMD bd1	 4.66		old measurement
+C AMD bd2	 4.57		old measurement
+C AMD bd3	 ?
+C AMD bd4	 ?
+C AMD zen	 ?
+C AMD bt1	 5.04
+C AMD bt2	 5.07
+C Intel P4	16.8	18.6	old measurement
+C Intel PNR	 5.59		old measurement
+C Intel NHM	 5.39		old measurement
+C Intel SBR	 3.93		old measurement
+C Intel IBR	 3.59		old measurement
+C Intel HWL	 3.61		old measurement
+C Intel BWL	 2.76		old measurement
+C Intel SKL	 2.77		old measurement
+C Intel atom	23		old measurement
+C Intel SLM	 8		old measurement
+C Intel GLM	 ?
+C VIA nano	 5.63		old measurement
+
+C The ALIGNments here might look completely ad-hoc.  They are not.
 
 ABI_SUPPORT(DOS64)
 ABI_SUPPORT(STD64)
@@ -91,7 +96,7 @@
 
 ASM_START()
 	TEXT
-	ALIGN(16)
+	ALIGN(64)
 PROLOGUE(func)
 IFDOS(`	push	%rsi		')
 IFDOS(`	push	%rdi		')
@@ -100,91 +105,85 @@
 	push	%rbx
 	mov	(up), %rax
 
-	lea	-16(rp,n_param,8), rp
-	lea	-16(up,n_param,8), up
-
+	lea	(rp,n_param,8), rp
+	lea	(up,n_param,8), up
 	mov	n_param, n
-	and	$3, R32(n_param)
-	jz	L(b0)
-	cmp	$2, R32(n_param)
-	ja	L(b3)
-	jz	L(b2)
+
+	test	$1, R8(n_param)
+	jne	L(bx1)
 
-L(b1):	mul	v0
-	cmp	$1, n
-	jz	L(n1)
-	mov	%rax, w2
-	mov	%rdx, w3
+L(bx0):	mul	v0
 	neg	n
-	add	$3, n
-	jmp	L(L1)
-L(n1):	ADDSUB	%rax, 8(rp)
-	adc	$0, %rdx
-	mov	%rdx, %rax
-	pop	%rbx
-IFDOS(`	pop	%rdi		')
-IFDOS(`	pop	%rsi		')
-	ret
+	mov	%rax, w0
+	mov	%rdx, w1
+	test	$2, R8(n)
+	jne	L(L2)
 
-L(b3):	mul	v0
-	mov	%rax, w2
+L(b00):	add	$2, n
+	jmp	L(L0)
+
+	ALIGN(16)
+L(bx1):	mul	v0
+	test	$2, R8(n)
+	je	L(b01)
+
+L(b11):	mov	%rax, w2
 	mov	%rdx, w3
 	neg	n
 	inc	n
 	jmp	L(L3)
 
-L(b0):	mul	v0
+	ALIGN(16)
+L(b01):	sub	$3, n
+	jc	L(n1)
+	mov	%rax, w2
+	mov	%rdx, w3
+	neg	n
+
+	ALIGN(16)
+L(top):	mov	-16(up,n,8), %rax
+	mul	v0
 	mov	%rax, w0
 	mov	%rdx, w1
-	neg	n
-	add	$2, n
-	jmp	L(L0)
-
-L(b2):	mul	v0
-	mov	%rax, w0
-	mov	%rdx, w1
-	neg	n
-	jmp	L(L2)
-
-	ALIGN(16)
-L(top):	ADDSUB	w0, -16(rp,n,8)
+	ADDSUB	w2, -24(rp,n,8)
+	adc	w3, w0
+	adc	$0, w1
+L(L0):	mov	-8(up,n,8), %rax
+	mul	v0
+	mov	%rax, w2
+	mov	%rdx, w3
+	ADDSUB	w0, -16(rp,n,8)
 	adc	w1, w2
 	adc	$0, w3
-L(L1):	mov	0(up,n,8), %rax
+L(L3):	mov	(up,n,8), %rax
 	mul	v0
 	mov	%rax, w0
 	mov	%rdx, w1
 	ADDSUB	w2, -8(rp,n,8)
 	adc	w3, w0
 	adc	$0, w1
-L(L0):	mov	8(up,n,8), %rax
+L(L2):	mov	8(up,n,8), %rax
 	mul	v0
 	mov	%rax, w2
 	mov	%rdx, w3
-	ADDSUB	w0, 0(rp,n,8)
+	ADDSUB	w0, (rp,n,8)
 	adc	w1, w2
 	adc	$0, w3
-L(L3):	mov	16(up,n,8), %rax
-	mul	v0
-	mov	%rax, w0
-	mov	%rdx, w1
-	ADDSUB	w2, 8(rp,n,8)
-	adc	w3, w0
-	adc	$0, w1
-L(L2):	mov	24(up,n,8), %rax
-	mul	v0
-	mov	%rax, w2
-	mov	%rdx, w3
 	add	$4, n
 	js	L(top)
 
-L(end):	ADDSUB	w0, (rp)
-	adc	w1, w2
-	adc	$0, w3
-	ADDSUB	w2, 8(rp)
-	adc	$0, w3
-	mov	w3, %rax
+L(end):	xor	R32(%rax), R32(%rax)
+	ADDSUB	w2, -8(rp)
+	adc	w3, %rax
+	pop	%rbx
+IFDOS(`	pop	%rdi		')
+IFDOS(`	pop	%rsi		')
+	ret
 
+	ALIGN(32)
+L(n1):	ADDSUB	%rax, -8(rp)
+	mov	$0, R32(%rax)
+	adc	%rdx, %rax
 	pop	%rbx
 IFDOS(`	pop	%rdi		')
 IFDOS(`	pop	%rsi		')
diff -r d296f15b0eec -r 5d3ad50a7156 mpn/x86_64/bt1/mul_1.asm
--- a/mpn/x86_64/bt1/mul_1.asm	Sun Nov 17 18:34:30 2019 +0100
+++ b/mpn/x86_64/bt1/mul_1.asm	Sun Nov 17 22:06:51 2019 +0100
@@ -1,6 +1,7 @@
-dnl  AMD64 mpn_mul_1 optimised for AMD bobcat.
+dnl  AMD64 mpn_mul_1 optimised for AMD bt1/bt2.
 
-dnl  Copyright 2003-2005, 2007, 2008, 2011, 2012 Free Software Foundation, Inc.
+dnl  Copyright 2003-2005, 2007, 2008, 2011, 2012, 2019 Free Software
+dnl  Foundation, Inc.
 
 dnl  This file is part of the GNU MP Library.
 dnl
@@ -31,25 +32,27 @@
 include(`../config.m4')
 
 C	     cycles/limb
-C AMD K8,K9      4.53
-C AMD K10        4.53
-C AMD bull       4.56
-C AMD pile       4.47
-C AMD steam
-C AMD excavator
-C AMD bobcat     5.07
-C AMD jaguar     5.23    5.82
-C Intel P4      12.6
-C Intel core2    4.53
-C Intel NHM      4.36
-C Intel SBR      3.0
-C Intel IBR      2.55
-C Intel HWL      2.28
-C Intel BWL      2.36
-C Intel SKL      2.39
-C Intel atom    21.0
-C Intel SLM      9
-C VIA nano
+C AMD K8,K9	 4.53		old measurement
+C AMD K10	 4.53		old measurement
+C AMD bd1	 4.56		old measurement
+C AMD bd2	 4.47		old measurement
+C AMD bd3	 ?


More information about the gmp-commit mailing list