[PATCH] Optimize 32-bit sparc T1 multiply routines.

David Miller davem at davemloft.net
Thu Jan 3 11:11:45 CET 2013


2013-01-03  David S. Miller  <davem at davemloft.net>

	* mpn/sparc32/ultrasparct1/mul_1.asm (mpn_mul_1): Unroll main loop
	one time, align code on 32-byte boundary, add T2/T3/T4 timings.
	* mpn/sparc32/ultrasparct1/addmul_1.asm (mpn_addmul_1): Likewise.
	Also allocate a register window (save/restore) for scratch registers.
	* mpn/sparc32/ultrasparct1/submul_1.asm (mpn_submul_1): Likewise.

diff --git a/mpn/sparc32/ultrasparct1/addmul_1.asm b/mpn/sparc32/ultrasparct1/addmul_1.asm
index 5001726..98df2bb 100644
--- a/mpn/sparc32/ultrasparct1/addmul_1.asm
+++ b/mpn/sparc32/ultrasparct1/addmul_1.asm
@@ -1,6 +1,6 @@
 dnl  SPARC T1 32-bit mpn_addmul_1.
 
-dnl  Copyright 2010 Free Software Foundation, Inc.
+dnl  Copyright 2010, 2013 Free Software Foundation, Inc.
 
 dnl  This file is part of the GNU MP Library.
 
@@ -20,33 +20,60 @@ dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
 include(`../config.m4')
 
 C		   cycles/limb
-C UltraSPARC T1:       27
+C UltraSPARC T1:       24
+C UltraSPARC T2:       19
+C UltraSPARC T3:       19
+C UltraSPARC T4:       5
 
 C INPUT PARAMETERS
-define(`rp',	`%o0')
-define(`up',	`%o1')
-define(`n',	`%o2')
-define(`v0',	`%o3')
+define(`rp',	`%i0')		C arguments are named as in registers
+define(`up',	`%i1')		C because the routine executes a save
+define(`n',	`%i2')
+define(`v0',	`%i3')
 
 ASM_START()
+	ALIGN(32)			C start the routine on a 32-byte boundary
 PROLOGUE(mpn_addmul_1)
-	mov	0, %g4
+	save	%sp, -96, %sp		C new register window: %l and %o regs become scratch
+	srl	n, 0, %o4		C o4 = n zero-extended to 64 bits
 	srl	v0, 0, v0
-	srl	n, 0, n
-	dec	n			C n--
-
-L(top):	lduw	[up+0], %g1
-	add	up, 4, up		C up++
-	mulx	%g1, v0, %g3
-	lduw	[rp+0], %g2
-	add	%g2, %g3, %g3
-	add	%g4, %g3, %g3
+	subcc	%o4, 1, %o4		C o4 = limbs remaining - 1
+	be	L(final_one)		C n was 1: only the single-limb tail runs
+	 clr	%o5			C delay slot: carry limb = 0
+
+L(top):				C two limbs per pass; here o4 >= 1
+	lduw	[up+0], %l0		C l0 = up[0]
+	lduw	[rp+0], %l2		C l2 = rp[0]
+	lduw	[up+4], %l1		C l1 = up[1]
+	lduw	[rp+4], %l3		C l3 = rp[1]
+	mulx	%l0, v0, %g3		C g3 = up[0] * v0 as a 64-bit product
+	add	up, 8, up		C up += 2 limbs
+	mulx	%l1, v0, %o3		C o3 = up[1] * v0
+	sub	%o4, 2, %o4		C two limbs consumed
+	add	rp, 8, rp		C rp += 2 limbs; stores below use negative offsets
+	add	%l2, %g3, %g3		C g3 += rp[0]
+	add	%o5, %g3, %g3		C g3 += carry limb
+	stw	%g3, [rp-8]		C low 32 bits become the new rp[0]
+	srlx	%g3, 32, %o5		C high 32 bits carry into the next limb
+	add	%l3, %o3, %o3		C o3 += rp[1]
+	add	%o5, %o3, %o3		C o3 += carry limb
+	stw	%o3, [rp-4]		C low 32 bits become the new rp[1]
+	brgz	%o4, L(top)		C at least two limbs left: loop again
+	 srlx	%o3, 32, %o5		C delay slot: carry limb out of this pair
+
+	brlz,pt	%o4, L(done)		C o4 < 0: no limbs left
+	 nop
+
+L(final_one):				C o4 == 0: exactly one limb left
+	lduw	[up+0], %l0
+	lduw	[rp+0], %l2
+	mulx	%l0, v0, %g3
+	add	%l2, %g3, %g3
+	add	%o5, %g3, %g3		C product + rp[0] + carry limb
 	stw	%g3, [rp+0]
-	add	rp, 4, rp		C rp++
-	srlx	%g3, 32, %g4
-	brnz	n, L(top)
-	dec	n			C n--
+	srlx	%g3, 32, %o5		C final carry limb
 
-	retl
-	mov	%g4, %o0		C return value
+L(done):
+	ret
+	 restore %o5, 0, %o0		C delay slot: pop window and return the carry limb
 EPILOGUE()
diff --git a/mpn/sparc32/ultrasparct1/mul_1.asm b/mpn/sparc32/ultrasparct1/mul_1.asm
index fcde0c7..a002292 100644
--- a/mpn/sparc32/ultrasparct1/mul_1.asm
+++ b/mpn/sparc32/ultrasparct1/mul_1.asm
@@ -1,6 +1,6 @@
 dnl  SPARC T1 32-bit mpn_mul_1.
 
-dnl  Copyright 2010 Free Software Foundation, Inc.
+dnl  Copyright 2010, 2013 Free Software Foundation, Inc.
 
 dnl  This file is part of the GNU MP Library.
 
@@ -20,7 +20,10 @@ dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
 include(`../config.m4')
 
 C		   cycles/limb
-C UltraSPARC T1:       23
+C UltraSPARC T1:       20
+C UltraSPARC T2:       18
+C UltraSPARC T3:       18
+C UltraSPARC T4:       4
 
 C INPUT PARAMETERS
 define(`rp',	`%o0')
@@ -29,22 +32,41 @@ define(`n',	`%o2')
 define(`v0',	`%o3')
 
 ASM_START()
+	ALIGN(32)			C start the routine on a 32-byte boundary
 PROLOGUE(mpn_mul_1)
-	mov	0, %g4
-	srl	v0, 0, v0
 	srl	n, 0, n
-	dec	n			C n--
+	srl	v0, 0, v0		C zero-extend v0 to 64 bits for mulx
+	subcc	n, 1, n			C n = limbs remaining - 1
+	be	L(final_one)		C count was 1: only the single-limb tail runs
+	 clr	%o5			C delay slot: carry limb = 0
+
+L(top):				C two limbs per pass; here n >= 1
+	lduw	[up+0], %g1		C g1 = up[0]
+	lduw	[up+4], %g2		C g2 = up[1]
+	mulx	%g1, v0, %g3		C g3 = up[0] * v0 as a 64-bit product
+	add	up, 8, up		C up += 2 limbs
+	mulx	%g2, v0, %o4		C o4 = up[1] * v0
+	sub	n, 2, n			C two limbs consumed
+	add	rp, 8, rp		C rp += 2 limbs; stores below use negative offsets
+	add	%o5, %g3, %g3		C g3 += carry limb
+	stw	%g3, [rp-8]		C low 32 bits are the first result limb
+	srlx	%g3, 32, %o5		C high 32 bits carry into the next limb
+	add	%o5, %o4, %o4		C o4 += carry limb
+	stw	%o4, [rp-4]		C low 32 bits are the second result limb
+	brgz	n, L(top)		C at least two limbs left: loop again
+	 srlx	%o4, 32, %o5		C delay slot: carry limb out of this pair
+
+	brlz,pt	n, L(done)		C n < 0: no limbs left
+	 nop
 
-L(top):	lduw	[up+0], %g1
-	add	up, 4, up		C up++
+L(final_one):				C n == 0: exactly one limb left
+	lduw	[up+0], %g1
 	mulx	%g1, v0, %g3
-	add	%g4, %g3, %g3
+	add	%o5, %g3, %g3		C product + carry limb
 	stw	%g3, [rp+0]
-	add	rp, 4, rp		C rp++
-	srlx	%g3, 32, %g4
-	brnz	n, L(top)
-	dec	n			C n--
+	srlx	%g3, 32, %o5		C final carry limb
 
+L(done):
 	retl
-	mov	%g4, %o0		C return value
+	 mov	%o5, %o0		C delay slot: return the carry limb
 EPILOGUE()
diff --git a/mpn/sparc32/ultrasparct1/submul_1.asm b/mpn/sparc32/ultrasparct1/submul_1.asm
index 605a882..084c61c 100644
--- a/mpn/sparc32/ultrasparct1/submul_1.asm
+++ b/mpn/sparc32/ultrasparct1/submul_1.asm
@@ -1,6 +1,6 @@
 dnl  SPARC T1 32-bit mpn_submul_1.
 
-dnl  Copyright 2010 Free Software Foundation, Inc.
+dnl  Copyright 2010, 2013 Free Software Foundation, Inc.
 
 dnl  This file is part of the GNU MP Library.
 
@@ -20,33 +20,61 @@ dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
 include(`../config.m4')
 
 C		   cycles/limb
-C UltraSPARC T1:       27
+C UltraSPARC T1:       24
+C UltraSPARC T2:       19
+C UltraSPARC T3:       19
+C UltraSPARC T4:       5
 
 C INPUT PARAMETERS
-define(`rp',	`%o0')
-define(`up',	`%o1')
-define(`n',	`%o2')
-define(`v0',	`%o3')
+define(`rp',	`%i0')		C arguments are named as in registers
+define(`up',	`%i1')		C because the routine executes a save
+define(`n',	`%i2')
+define(`v0',	`%i3')
 
 ASM_START()
+	ALIGN(32)			C start the routine on a 32-byte boundary
 PROLOGUE(mpn_submul_1)
-	subcc	%g0, %g0, %g4		C clear CF and g4
+	save	%sp, -96, %sp		C new register window: %l and %o regs become scratch
+	srl	n, 0, %o4		C o4 = n zero-extended to 64 bits
 	srl	v0, 0, v0
-	srl	n, 0, n
-	dec	n			C n--
-
-L(top):	lduw	[up+0], %g1
-	add	up, 4, up		C up++
-	mulx	%g1, v0, %g3
-	lduw	[rp+0], %g2
-	addx	%g4, %g3, %g3
-	srlx	%g3, 32, %g4
-	subcc	%g2, %g3, %g3
+	subcc	%o4, 1, %o4		C o4 = limbs remaining - 1
+	be	L(final_one)		C n was 1: only the single-limb tail runs
+	 subcc	%g0, 0, %o5		C delay slot: o5 = 0 and carry flag cleared
+
+L(top):				C two limbs per pass; here o4 >= 1
+	lduw	[up+0], %l0		C l0 = up[0]
+	lduw	[rp+0], %l2		C l2 = rp[0]
+	lduw	[up+4], %l1		C l1 = up[1]
+	lduw	[rp+4], %l3		C l3 = rp[1]
+	mulx	%l0, v0, %g3		C g3 = up[0] * v0 as a 64-bit product
+	add	up, 8, up		C up += 2 limbs; plain add keeps the carry flag
+	mulx	%l1, v0, %o3		C o3 = up[1] * v0
+	sub	%o4, 2, %o4		C two limbs consumed; plain sub keeps the carry flag
+	add	rp, 8, rp		C rp += 2 limbs; stores below use negative offsets
+	addx	%o5, %g3, %g3		C g3 = product + carry limb + borrow flag
+	srlx	%g3, 32, %o5		C o5 = high 32 bits, the next carry limb
+	subcc	%l2, %g3, %g3		C g3 = rp[0] - g3; borrow goes to the carry flag
+	stw	%g3, [rp-8]		C low 32 bits become the new rp[0]
+	addx	%o5, %o3, %o3		C o3 = product + carry limb + borrow flag
+	srlx	%o3, 32, %o5		C o5 = high 32 bits, the next carry limb
+	subcc	%l3, %o3, %o3		C o3 = rp[1] - o3; borrow goes to the carry flag
+	brgz	%o4, L(top)		C loop again; carry flag is left as set by subcc
+	 stw	%o3, [rp-4]		C delay slot: low 32 bits become the new rp[1]
+
+	brlz,pt	%o4, L(done)		C o4 < 0: no limbs left
+	 nop
+
+L(final_one):				C o4 == 0: exactly one limb left
+	lduw	[up+0], %l0
+	lduw	[rp+0], %l2
+	mulx	%l0, v0, %g3
+	addx	%o5, %g3, %g3		C product + carry limb + borrow flag
+	srlx	%g3, 32, %o5		C high 32 bits are the carry limb
+	subcc	%l2, %g3, %g3		C rp[0] - g3; borrow goes to the carry flag
 	stw	%g3, [rp+0]
-	add	rp, 4, rp		C rp++
-	brnz	n, L(top)
-	dec	n			C n--
 
-	retl
-	addx	%g4, 0, %o0		C return value
+L(done):
+	addx	%o5, 0, %o5		C add the last borrow to the carry limb
+	ret
+	 restore %o5, 0, %o0		C delay slot: pop window and return the carry limb
 EPILOGUE()
-- 
1.7.10.4



More information about the gmp-devel mailing list