[Gmp-commit] /home/hgfiles/gmp: Various mpn/alpha and mpn/powerpc64 cleanups.

Sun Dec 6 19:58:08 CET 2009

details:   /home/hgfiles/gmp/rev/7f7985d6b8af
changeset: 12998:7f7985d6b8af
user:      Torbjorn Granlund <tege at gmplib.org>
date:      Sun Dec 06 19:58:04 2009 +0100
description:
Various mpn/alpha and mpn/powerpc64 cleanups.

diffstat:

 ChangeLog                        |   12 +
 configure.in                     |   11 +-
 mpn/alpha/add_n.asm              |  203 ++++++++++++++----------
 mpn/alpha/com_n.asm              |  165 ++++++++++++++++++++
 mpn/alpha/diveby3.asm            |  322 ----------------------------------------
 mpn/alpha/ev5/add_n.asm          |  146 ------------------
 mpn/alpha/ev5/com_n.asm          |  165 --------------------
 mpn/alpha/ev5/diveby3.asm        |  321 ++++++++++++++++++++++++++++++++++++++++
 mpn/alpha/ev5/lshift.asm         |  171 ---------------------
 mpn/alpha/ev5/rshift.asm         |  169 ---------------------
 mpn/alpha/ev5/sub_n.asm          |  146 ------------------
 mpn/alpha/lshift.asm             |  160 ++++++++++++++-----
 mpn/alpha/rshift.asm             |  160 ++++++++++++++-----
 mpn/alpha/sub_n.asm              |  207 ++++++++++++++-----------
 mpn/powerpc64/mode64/diveby3.asm |   83 ----------
 15 files changed, 973 insertions(+), 1468 deletions(-)

diffs (truncated from 2620 to 300 lines):

diff -r ebb297b863a9 -r 7f7985d6b8af ChangeLog

--- a/ChangeLog	Sun Dec 06 18:16:58 2009 +0100
+++ b/ChangeLog	Sun Dec 06 19:58:04 2009 +0100
@@ -1,5 +1,17 @@
 2009-12-06  Torbjorn Granlund  <tege at gmplib.org>
 
+	* configure.in: Don't include ev5 directory for ev6* and ev7.  Misc
+	alpha path cleanups.
+	* mpn/alpha/add_n.asm: Replaced by mpn/alpha/ev5/add_n.asm.
+	* mpn/alpha/sub_n.asm: Replaced by mpn/alpha/ev5/sub_n.asm.
+	* mpn/alpha/lshift.asm: Replaced by mpn/alpha/ev5/lshift.asm.
+	* mpn/alpha/rshift.asm: Replaced by mpn/alpha/ev5/rshift.asm.
+	* mpn/alpha/com_n.asm: New, moved from mpn/alpha/ev5/com_n.asm.
+	* mpn/alpha/ev5/diveby3.asm: New, moved from mpn/alpha/diveby3.asm.
+
+	* mpn/powerpc64/mode64/diveby3.asm: Remove, slower than mpn_bdiv_dbm1c
+	on all hardware.
+
 	* mpn/generic/powm_sec.c: Rework logic for mpn_sqr_basecase size limit.
 
 	* gmp-impl.h (mpn_redc_1_sec): Declare.
diff -r ebb297b863a9 -r 7f7985d6b8af configure.in
--- a/configure.in	Sun Dec 06 18:16:58 2009 +0100
+++ b/configure.in	Sun Dec 06 19:58:04 2009 +0100
@@ -407,11 +407,14 @@
   alpha*-*-*)
     AC_DEFINE(HAVE_HOST_CPU_FAMILY_alpha)
     case $host_cpu in
-      alphaev5* | alphapca5*) path="alpha/ev5 alpha" ;;
+      alphaev5* | alphapca5*)
+      	path="alpha/ev5 alpha" ;;
       alphaev67 | alphaev68 | alphaev7*)
-        path="alpha/ev67 alpha/ev6 alpha/ev5 alpha" ;;
-      alphaev6* | alphaev7*)  path="alpha/ev6 alpha/ev5 alpha" ;;
-      *)                      path="alpha" ;;
+        path="alpha/ev67 alpha/ev6 alpha" ;;
+      alphaev6)
+	path="alpha/ev6 alpha" ;;
+      *)
+        path="alpha" ;;
     esac
     extra_functions="cntlz"
     gcc_cflags_optlist="asm cpu oldas" # need asm ahead of cpu, see below
diff -r ebb297b863a9 -r 7f7985d6b8af mpn/alpha/add_n.asm
--- a/mpn/alpha/add_n.asm	Sun Dec 06 18:16:58 2009 +0100
+++ b/mpn/alpha/add_n.asm	Sun Dec 06 19:58:04 2009 +0100
@@ -1,7 +1,7 @@
 dnl  Alpha mpn_add_n -- Add two limb vectors of the same length > 0 and
 dnl  store sum in a third limb vector.
 
-dnl  Copyright 1995, 2000, 2002, 2005 Free Software Foundation, Inc.
+dnl  Copyright 1995, 1999, 2000, 2005 Free Software Foundation, Inc.
 
 dnl  This file is part of the GNU MP Library.
 
@@ -21,97 +21,126 @@
 include(`../config.m4')
 
 C      cycles/limb
-C EV4:     7.75
-C EV5:     5.75
-C EV6:     4
+C EV4:     ?
+C EV5:     4.75
+C EV6:     3
 
-C  INPUT PARAMETERS
-C  rp	r16
-C  up	r17
-C  vp	r18
-C  n	r19
+dnl  INPUT PARAMETERS
+dnl  res_ptr	r16
+dnl  s1_ptr	r17
+dnl  s2_ptr	r18
+dnl  size	r19
 
 ASM_START()
 PROLOGUE(mpn_add_n)
-	ldq	r3,0(r17)
-	ldq	r4,0(r18)
+	bis	r31,r31,r25		C clear cy
+	subq	r19,4,r19		C decr loop cnt
+	blt	r19,$Lend2		C if less than 4 limbs, goto 2nd loop
+C Start software pipeline for 1st loop
+	ldq	r0,0(r18)
+	ldq	r4,0(r17)
+	ldq	r1,8(r18)
+	ldq	r5,8(r17)
+	addq	r17,32,r17		C update s1_ptr
+	ldq	r2,16(r18)
+	addq	r0,r4,r20		C 1st main add
+	ldq	r3,24(r18)
+	subq	r19,4,r19		C decr loop cnt
+	ldq	r6,-16(r17)
+	cmpult	r20,r0,r25		C compute cy from last add
+	ldq	r7,-8(r17)
+	addq	r1,r5,r28		C 2nd main add
+	addq	r18,32,r18		C update s2_ptr
+	addq	r28,r25,r21		C 2nd carry add
+	cmpult	r28,r5,r8		C compute cy from last add
+	blt	r19,$Lend1		C if less than 4 limbs remain, jump
+C 1st loop handles groups of 4 limbs in a software pipeline
+	ALIGN(16)
+$Loop:	cmpult	r21,r28,r25		C compute cy from last add
+	ldq	r0,0(r18)
+	bis	r8,r25,r25		C combine cy from the two adds
+	ldq	r1,8(r18)
+	addq	r2,r6,r28		C 3rd main add
+	ldq	r4,0(r17)
+	addq	r28,r25,r22		C 3rd carry add
+	ldq	r5,8(r17)
+	cmpult	r28,r6,r8		C compute cy from last add
+	cmpult	r22,r28,r25		C compute cy from last add
+	stq	r20,0(r16)
+	bis	r8,r25,r25		C combine cy from the two adds
+	stq	r21,8(r16)
+	addq	r3,r7,r28		C 4th main add
+	addq	r28,r25,r23		C 4th carry add
+	cmpult	r28,r7,r8		C compute cy from last add
+	cmpult	r23,r28,r25		C compute cy from last add
+		addq	r17,32,r17		C update s1_ptr
+	bis	r8,r25,r25		C combine cy from the two adds
+		addq	r16,32,r16		C update res_ptr
+	addq	r0,r4,r28		C 1st main add
+	ldq	r2,16(r18)
+	addq	r25,r28,r20		C 1st carry add
+	ldq	r3,24(r18)
+	cmpult	r28,r4,r8		C compute cy from last add
+	ldq	r6,-16(r17)
+	cmpult	r20,r28,r25		C compute cy from last add
+	ldq	r7,-8(r17)
+	bis	r8,r25,r25		C combine cy from the two adds
+	subq	r19,4,r19		C decr loop cnt
+	stq	r22,-16(r16)
+	addq	r1,r5,r28		C 2nd main add
+	stq	r23,-8(r16)
+	addq	r25,r28,r21		C 2nd carry add
+		addq	r18,32,r18		C update s2_ptr
+	cmpult	r28,r5,r8		C compute cy from last add
+	bge	r19,$Loop
+C Finish software pipeline for 1st loop
+$Lend1:	cmpult	r21,r28,r25		C compute cy from last add
+	bis	r8,r25,r25		C combine cy from the two adds
+	addq	r2,r6,r28		C 3rd main add
+	addq	r28,r25,r22		C 3rd carry add
+	cmpult	r28,r6,r8		C compute cy from last add
+	cmpult	r22,r28,r25		C compute cy from last add
+	stq	r20,0(r16)
+	bis	r8,r25,r25		C combine cy from the two adds
+	stq	r21,8(r16)
+	addq	r3,r7,r28		C 4th main add
+	addq	r28,r25,r23		C 4th carry add
+	cmpult	r28,r7,r8		C compute cy from last add
+	cmpult	r23,r28,r25		C compute cy from last add
+	bis	r8,r25,r25		C combine cy from the two adds
+	addq	r16,32,r16		C update res_ptr
+	stq	r22,-16(r16)
+	stq	r23,-8(r16)
+$Lend2:	addq	r19,4,r19		C restore loop cnt
+	beq	r19,$Lret
+C Start software pipeline for 2nd loop
+	ldq	r0,0(r18)
+	ldq	r4,0(r17)
+	subq	r19,1,r19
+	beq	r19,$Lend0
+C 2nd loop handles remaining 1-3 limbs
+	ALIGN(16)
+$Loop0:	addq	r0,r4,r28		C main add
+	ldq	r0,8(r18)
+	cmpult	r28,r4,r8		C compute cy from last add
+	ldq	r4,8(r17)
+	addq	r28,r25,r20		C carry add
+	addq	r18,8,r18
+	addq	r17,8,r17
+	stq	r20,0(r16)
+	cmpult	r20,r28,r25		C compute cy from last add
+	subq	r19,1,r19		C decr loop cnt
+	bis	r8,r25,r25		C combine cy from the two adds
+	addq	r16,8,r16
+	bne	r19,$Loop0
+$Lend0:	addq	r0,r4,r28		C main add
+	addq	r28,r25,r20		C carry add
+	cmpult	r28,r4,r8		C compute cy from last add
+	cmpult	r20,r28,r25		C compute cy from last add
+	stq	r20,0(r16)
+	bis	r8,r25,r25		C combine cy from the two adds
 
-	subq	r19,1,r19
-	and	r19,4-1,r2	C number of limbs in first loop
-	bis	r31,r31,r0
-	beq	r2,$L0		C if multiple of 4 limbs, skip first loop
-
-	subq	r19,r2,r19
-
-$Loop0:	subq	r2,1,r2
-	ldq	r5,8(r17)
-	addq	r4,r0,r4
-	ldq	r6,8(r18)
-	cmpult	r4,r0,r1
-	addq	r3,r4,r4
-	cmpult	r4,r3,r0
-	stq	r4,0(r16)
-	bis	r0,r1,r0
-
-	addq	r17,8,r17
-	addq	r18,8,r18
-	bis	r5,r5,r3
-	bis	r6,r6,r4
-	addq	r16,8,r16
-	bne	r2,$Loop0
-
-$L0:	beq	r19,$Lend
-
-	ALIGN(8)
-$Loop:	subq	r19,4,r19
-
-	ldq	r5,8(r17)
-	addq	r4,r0,r4
-	ldq	r6,8(r18)
-	cmpult	r4,r0,r1
-	addq	r3,r4,r4
-	cmpult	r4,r3,r0
-	stq	r4,0(r16)
-	bis	r0,r1,r0
-
-	ldq	r3,16(r17)
-	addq	r6,r0,r6
-	ldq	r4,16(r18)
-	cmpult	r6,r0,r1
-	addq	r5,r6,r6
-	cmpult	r6,r5,r0
-	stq	r6,8(r16)
-	bis	r0,r1,r0
-
-	ldq	r5,24(r17)
-	addq	r4,r0,r4
-	ldq	r6,24(r18)
-	cmpult	r4,r0,r1
-	addq	r3,r4,r4
-	cmpult	r4,r3,r0
-	stq	r4,16(r16)
-	bis	r0,r1,r0
-
-	ldq	r3,32(r17)
-	addq	r6,r0,r6
-	ldq	r4,32(r18)
-	cmpult	r6,r0,r1
-	addq	r5,r6,r6
-	cmpult	r6,r5,r0
-	stq	r6,24(r16)
-	bis	r0,r1,r0
-
-	addq	r17,32,r17
-	addq	r18,32,r18
-	addq	r16,32,r16
-	bne	r19,$Loop
-
-$Lend:	addq	r4,r0,r4
-	cmpult	r4,r0,r1
-	addq	r3,r4,r4
-	cmpult	r4,r3,r0
-	stq	r4,0(r16)
-	bis	r0,r1,r0
+$Lret:	bis	r25,r31,r0		C return cy
 	ret	r31,(r26),1
 EPILOGUE(mpn_add_n)
 ASM_END()
diff -r ebb297b863a9 -r 7f7985d6b8af mpn/alpha/com_n.asm
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mpn/alpha/com_n.asm	Sun Dec 06 19:58:04 2009 +0100
@@ -0,0 +1,165 @@
+dnl  Alpha mpn_com_n -- mpn one's complement.
+
+dnl  Copyright 2003 Free Software Foundation, Inc.
+dnl
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or
+dnl  modify it under the terms of the GNU Lesser General Public License as
+dnl  published by the Free Software Foundation; either version 3 of the
+dnl  License, or (at your option) any later version.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful,
+dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+dnl  Lesser General Public License for more details.
+dnl
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C      cycles/limb
+C EV4:    4.75
+C EV5:    2.0
+C EV6:    1.5
+
+