[Gmp-commit] /var/hg/gmp: 4 new changesets

Sun May 21 22:24:18 UTC 2017

details:   /var/hg/gmp/rev/765815cc0f79
changeset: 17394:765815cc0f79
user:      Torbjorn Granlund <tg at gmplib.org>
date:      Mon May 22 00:19:54 2017 +0200
description:
Rewrite.

details:   /var/hg/gmp/rev/7c843e23d272
changeset: 17395:7c843e23d272
user:      Torbjorn Granlund <tg at gmplib.org>
date:      Mon May 22 00:23:00 2017 +0200
description:
Tweak header comment.

details:   /var/hg/gmp/rev/069ff6facc35
changeset: 17396:069ff6facc35
user:      Torbjorn Granlund <tg at gmplib.org>
date:      Mon May 22 00:23:32 2017 +0200
description:
New grabber file.

details:   /var/hg/gmp/rev/020ab7920eab
changeset: 17397:020ab7920eab
user:      Torbjorn Granlund <tg at gmplib.org>
date:      Mon May 22 00:24:15 2017 +0200
description:
ChangeLog

diffstat:

 ChangeLog                    |    8 ++
 mpn/x86_64/core2/com.asm     |   37 ++++++++++
 mpn/x86_64/core2/copyd.asm   |    2 +-
 mpn/x86_64/core2/copyi.asm   |    2 +-
 mpn/x86_64/core2/lshift.asm  |  134 ++++++++++++++++++-------------------
 mpn/x86_64/core2/lshiftc.asm |  152 +++++++++++++++++++++---------------------
 mpn/x86_64/core2/rshift.asm  |  136 ++++++++++++++++++-------------------
 7 files changed, 254 insertions(+), 217 deletions(-)

diffs (truncated from 649 to 300 lines):

diff -r cd7b647bdabe -r 020ab7920eab ChangeLog

--- a/ChangeLog	Sat May 20 16:03:50 2017 +0200
+++ b/ChangeLog	Mon May 22 00:24:15 2017 +0200
@@ -1,3 +1,11 @@
+2017-05-22  Torbjörn Granlund  <tg at gmplib.org>
+
+	* mpn/x86_64/core2/com.asm: New grabber file.
+
+	* mpn/x86_64/core2/lshift.asm: Rewrite.
+	* mpn/x86_64/core2/rshift.asm: Rewrite.
+	* mpn/x86_64/core2/lshiftc.asm: Rewrite.
+
 2017-05-16  Niels Möller  <nisse at lysator.liu.se>
 
 	* mpn/generic/divis.c (mpn_divisible_p): Updated the divisibility
diff -r cd7b647bdabe -r 020ab7920eab mpn/x86_64/core2/com.asm
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/x86_64/core2/com.asm	Mon May 22 00:24:15 2017 +0200
@@ -0,0 +1,37 @@
+dnl  X86-64 mpn_com.
+
+dnl  Copyright 2017 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of either:
+dnl
+dnl    * the GNU Lesser General Public License as published by the Free
+dnl      Software Foundation; either version 3 of the License, or (at your
+dnl      option) any later version.
+dnl
+dnl  or
+dnl
+dnl    * the GNU General Public License as published by the Free Software
+dnl      Foundation; either version 2 of the License, or (at your option) any
+dnl      later version.
+dnl
+dnl  or both in parallel, as here.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+dnl  for more details.
+dnl
+dnl  You should have received copies of the GNU General Public License and the
+dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
+dnl  see https://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+ABI_SUPPORT(DOS64)
+ABI_SUPPORT(STD64)
+
+MULFUNC_PROLOGUE(mpn_com)
+include_mpn(`x86_64/fastsse/com-palignr.asm')
diff -r cd7b647bdabe -r 020ab7920eab mpn/x86_64/core2/copyd.asm
--- a/mpn/x86_64/core2/copyd.asm	Sat May 20 16:03:50 2017 +0200
+++ b/mpn/x86_64/core2/copyd.asm	Mon May 22 00:24:15 2017 +0200
@@ -1,4 +1,4 @@
-dnl  X86-64 mpn_copyd optimised for Intel Sandy Bridge.
+dnl  X86-64 mpn_copyd.
 
 dnl  Copyright 2012 Free Software Foundation, Inc.
 
diff -r cd7b647bdabe -r 020ab7920eab mpn/x86_64/core2/copyi.asm
--- a/mpn/x86_64/core2/copyi.asm	Sat May 20 16:03:50 2017 +0200
+++ b/mpn/x86_64/core2/copyi.asm	Mon May 22 00:24:15 2017 +0200
@@ -1,4 +1,4 @@
-dnl  X86-64 mpn_copyi optimised for Intel Sandy Bridge.
+dnl  X86-64 mpn_copyi.
 
 dnl  Copyright 2012 Free Software Foundation, Inc.
 
diff -r cd7b647bdabe -r 020ab7920eab mpn/x86_64/core2/lshift.asm
--- a/mpn/x86_64/core2/lshift.asm	Sat May 20 16:03:50 2017 +0200
+++ b/mpn/x86_64/core2/lshift.asm	Mon May 22 00:24:15 2017 +0200
@@ -1,6 +1,6 @@
-dnl  x86-64 mpn_lshift optimized for "Core 2".
+dnl  x86-64 mpn_lshift optimised for Conroe/Penryn and Nehalem.
 
-dnl  Copyright 2007, 2009, 2011, 2012 Free Software Foundation, Inc.
+dnl  Copyright 2007, 2009, 2011, 2012, 2017 Free Software Foundation, Inc.
 
 dnl  This file is part of the GNU MP Library.
 dnl
@@ -30,17 +30,27 @@
 
 include(`../config.m4')
 
-
 C	     cycles/limb
-C AMD K8,K9	 4.25
-C AMD K10	 4.25
-C Intel P4	14.7
-C Intel core2	 1.27
-C Intel NHM	 1.375	(up to about n = 260, then 1.5)
-C Intel SBR	 1.87
-C Intel atom	 ?
-C VIA nano	 ?
-
+C AMD K8,K9	 
+C AMD K10	 
+C AMD bd1	 
+C AMD bd2	 
+C AMD bd3
+C AMD bd4
+C AMD zen
+C AMD bobcat	 
+C AMD jaguar	 
+C Intel P4	 
+C Intel core2	 1.32
+C Intel NHM	 1.30	(drops to 2.5 for n > 256)
+C Intel SBR	 
+C Intel IBR	 
+C Intel HWL	 
+C Intel BWL	 
+C Intel SKL	 
+C Intel atom	 
+C Intel SLM	 
+C VIA nano	 
 
 C INPUT PARAMETERS
 define(`rp',	`%rdi')
@@ -56,69 +66,55 @@
 	ALIGN(16)
 PROLOGUE(mpn_lshift)
 	FUNC_ENTRY(4)
-	lea	-8(rp,n,8), rp
-	lea	-8(up,n,8), up
 
-	mov	R32(%rdx), R32(%rax)
-	and	$3, R32(%rax)
-	jne	L(nb00)
-L(b00):	C n = 4, 8, 12, ...
+	xor	R32(%rax), R32(%rax)
+
+	test	$1, R8(n)
+	jnz	L(bx1)
+L(bx0):	test	$2, R8(n)
+	jnz	L(b10)
+
+L(b00):	lea	-8(up,n,8), up
+	lea	16(rp,n,8), rp
 	mov	(up), %r10
 	mov	-8(up), %r11
-	xor	R32(%rax), R32(%rax)
 	shld	R8(cnt), %r10, %rax
 	mov	-16(up), %r8
-	lea	24(rp), rp
-	sub	$4, n
+	shr	$2, n
 	jmp	L(00)
 
-L(nb00):C n = 1, 5, 9, ...
-	cmp	$2, R32(%rax)
-	jae	L(nb01)
-L(b01):	mov	(up), %r9
-	xor	R32(%rax), R32(%rax)
-	shld	R8(cnt), %r9, %rax
-	sub	$2, n
-	jb	L(le1)
-	mov	-8(up), %r10
-	mov	-16(up), %r11
-	lea	-8(up), up
-	lea	16(rp), rp
-	jmp	L(01)
-L(le1):	shl	R8(cnt), %r9
-	mov	%r9, (rp)
-	FUNC_EXIT()
-	ret
+L(bx1):	test	$2, R8(n)
+	jnz	L(b11)
 
-L(nb01):C n = 2, 6, 10, ...
-	jne	L(b11)
-L(b10):	mov	(up), %r8
-	mov	-8(up), %r9
-	xor	R32(%rax), R32(%rax)
+L(b01):	lea	-16(up,n,8), up
+	lea	8(rp,n,8), rp
+	mov	8(up), %r9
+	shld	R8(cnt), %r9, %rax
+	shr	$2, n
+	jz	L(1)
+	mov	(up), %r10
+	mov	-8(up), %r11
+	jmp	L(01)
+
+L(b10):	lea	-24(up,n,8), up
+	lea	(rp,n,8), rp
+	mov	16(up), %r8
+	mov	8(up), %r9
 	shld	R8(cnt), %r8, %rax
-	sub	$3, n
-	jb	L(le2)
-	mov	-16(up), %r10
-	lea	-16(up), up
-	lea	8(rp), rp
+	shr	$2, n
+	jz	L(2)
+	mov	(up), %r10
 	jmp	L(10)
-L(le2):	shld	R8(cnt), %r9, %r8
-	mov	%r8, (rp)
-	shl	R8(cnt), %r9
-	mov	%r9, -8(rp)
-	FUNC_EXIT()
-	ret
 
-	ALIGN(16)			C performance critical!
-L(b11):	C n = 3, 7, 11, ...
-	mov	(up), %r11
-	mov	-8(up), %r8
-	xor	R32(%rax), R32(%rax)
+	ALIGN(16)
+L(b11):	lea	-32(up,n,8), up
+	lea	-8(rp,n,8), rp
+	mov	24(up), %r11
+	mov	16(up), %r8
+	mov	8(up), %r9
 	shld	R8(cnt), %r11, %rax
-	mov	-16(up), %r9
-	lea	-24(up), up
-	sub	$4, n
-	jb	L(end)
+	shr	$2, n
+	jz	L(end)
 
 	ALIGN(16)
 L(top):	shld	R8(cnt), %r8, %r11
@@ -132,17 +128,17 @@
 	mov	%r9, -16(rp)
 L(00):	shld	R8(cnt), %r11, %r10
 	mov	-24(up), %r9
-	mov	%r10, -24(rp)
 	add	$-32, up
-	lea	-32(rp), rp
-	sub	$4, n
-	jnc	L(top)
+	mov	%r10, -24(rp)
+	add	$-32, rp
+	dec	n
+	jnz	L(top)
 
 L(end):	shld	R8(cnt), %r8, %r11
 	mov	%r11, (rp)
-	shld	R8(cnt), %r9, %r8
+L(2):	shld	R8(cnt), %r9, %r8
 	mov	%r8, -8(rp)
-	shl	R8(cnt), %r9
+L(1):	shl	R8(cnt), %r9
 	mov	%r9, -16(rp)
 	FUNC_EXIT()
 	ret
diff -r cd7b647bdabe -r 020ab7920eab mpn/x86_64/core2/lshiftc.asm
--- a/mpn/x86_64/core2/lshiftc.asm	Sat May 20 16:03:50 2017 +0200
+++ b/mpn/x86_64/core2/lshiftc.asm	Mon May 22 00:24:15 2017 +0200
@@ -1,6 +1,6 @@
-dnl  x86-64 mpn_lshiftc optimized for "Core 2".
+dnl  x86-64 mpn_lshiftc optimised for Conroe/Penryn and Nehalem.
 
-dnl  Copyright 2007, 2009, 2011, 2012 Free Software Foundation, Inc.
+dnl  Copyright 2007, 2009, 2011, 2012, 2017 Free Software Foundation, Inc.
 
 dnl  This file is part of the GNU MP Library.
 dnl
@@ -30,17 +30,27 @@
 
 include(`../config.m4')
 
-
 C	     cycles/limb
-C AMD K8,K9	 ?
-C AMD K10	 ?
-C Intel P4	 ?
-C Intel core2	 1.5
-C Intel NHM	 2.25	(up to about n = 260, then 1.875)
-C Intel SBR	 2.25
-C Intel atom	 ?
-C VIA nano	 ?
-
+C AMD K8,K9	 
+C AMD K10	 
+C AMD bd1	 
+C AMD bd2	 
+C AMD bd3
+C AMD bd4
+C AMD zen
+C AMD bobcat	 
+C AMD jaguar	 
+C Intel P4	 
+C Intel core2	 1.52
+C Intel NHM	 1.78	(just 2.15 for n < 256)
+C Intel SBR	 
+C Intel IBR	 
+C Intel HWL	 
+C Intel BWL	 
+C Intel SKL	 
+C Intel atom