[PATCH] Optimize 32-bit sparc T1 multiply routines.
David Miller
davem at davemloft.net
Thu Jan 3 11:11:45 CET 2013
2013-01-03 David S. Miller <davem at davemloft.net>
* mpn/sparc32/ultrasparct1/mul_1.asm (mpn_mul_1): Unroll main loop
one time, align code on 32-byte boundary, add T2/T3/T4 timings.
* mpn/sparc32/ultrasparct1/addmul_1.asm (mpn_addmul_1): Likewise.
* mpn/sparc32/ultrasparct1/submul_1.asm (mpn_submul_1): Likewise.
diff --git a/mpn/sparc32/ultrasparct1/addmul_1.asm b/mpn/sparc32/ultrasparct1/addmul_1.asm
index 5001726..98df2bb 100644
--- a/mpn/sparc32/ultrasparct1/addmul_1.asm
+++ b/mpn/sparc32/ultrasparct1/addmul_1.asm
@@ -1,6 +1,6 @@
dnl SPARC T1 32-bit mpn_addmul_1.
-dnl Copyright 2010 Free Software Foundation, Inc.
+dnl Copyright 2010, 2013 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
@@ -20,33 +20,60 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C cycles/limb
-C UltraSPARC T1: 27
+C UltraSPARC T1: 24
+C UltraSPARC T2: 19
+C UltraSPARC T3: 19
+C UltraSPARC T4: 5
C INPUT PARAMETERS
-define(`rp', `%o0')
-define(`up', `%o1')
-define(`n', `%o2')
-define(`v0', `%o3')
+define(`rp', `%i0')            C destination pointer, loaded and stored; arguments sit in the in registers once save has run
+define(`up', `%i1')            C source pointer, only loaded from
+define(`n', `%i2')             C word count, copied zero-extended into %o4
+define(`v0', `%i3')            C multiplier applied to every source word
ASM_START()
+ ALIGN(32)                     C start the routine on a 32-byte boundary
PROLOGUE(mpn_addmul_1)
- mov 0, %g4
+ save %sp, -96, %sp            C new register window with a 96-byte frame, giving %l0-%l3 as scratch
+ srl n, 0, %o4                 C %o4 = n zero-extended; used as the loop counter
srl v0, 0, v0
- srl n, 0, n
- dec n C n--
-
-L(top): lduw [up+0], %g1
- add up, 4, up C up++
- mulx %g1, v0, %g3
- lduw [rp+0], %g2
- add %g2, %g3, %g3
- add %g4, %g3, %g3
+ subcc %o4, 1, %o4             C counter = n - 1, condition codes set for the branch below
+ be L(final_one)               C n was 1: skip the two-word loop
+ clr %o5                       C delay slot: carry word starts at 0
+
+L(top):                        C two words per pass
+ lduw [up+0], %l0              C first source word
+ lduw [rp+0], %l2              C first destination word
+ lduw [up+4], %l1              C second source word
+ lduw [rp+4], %l3              C second destination word
+ mulx %l0, v0, %g3             C 64-bit product for the first word
+ add up, 8, up                 C advance source by two words
+ mulx %l1, v0, %o3             C 64-bit product for the second word
+ sub %o4, 2, %o4               C two words consumed
+ add rp, 8, rp                 C advance destination; the stores below use negative offsets
+ add %l2, %g3, %g3             C add the old destination word
+ add %o5, %g3, %g3             C add the incoming carry word
+ stw %g3, [rp-8]               C low 32 bits go to the first word
+ srlx %g3, 32, %o5             C high 32 bits are the carry into the second word
+ add %l3, %o3, %o3             C same sequence for the second word
+ add %o5, %o3, %o3
+ stw %o3, [rp-4]
+ brgz %o4, L(top)              C counter > 0: at least two words remain
+ srlx %o3, 32, %o5             C delay slot: carry out of the second word
+
+ brlz,pt %o4, L(done)          C counter < 0: nothing left; counter == 0 falls through with one word left
+ nop
+
+L(final_one):                  C single remaining word
+ lduw [up+0], %l0
+ lduw [rp+0], %l2
+ mulx %l0, v0, %g3
+ add %l2, %g3, %g3             C add the old destination word
+ add %o5, %g3, %g3             C add the incoming carry word
stw %g3, [rp+0]
- add rp, 4, rp C rp++
- srlx %g3, 32, %g4
- brnz n, L(top)
- dec n C n--
+ srlx %g3, 32, %o5             C final carry word
- retl
- mov %g4, %o0 C return value
+L(done):
+ ret
+ restore %o5, 0, %o0           C delay slot: pop the window and hand the carry back in the caller %o0
EPILOGUE()
diff --git a/mpn/sparc32/ultrasparct1/mul_1.asm b/mpn/sparc32/ultrasparct1/mul_1.asm
index fcde0c7..a002292 100644
--- a/mpn/sparc32/ultrasparct1/mul_1.asm
+++ b/mpn/sparc32/ultrasparct1/mul_1.asm
@@ -1,6 +1,6 @@
dnl SPARC T1 32-bit mpn_mul_1.
-dnl Copyright 2010 Free Software Foundation, Inc.
+dnl Copyright 2010, 2013 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
@@ -20,7 +20,10 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C cycles/limb
-C UltraSPARC T1: 23
+C UltraSPARC T1: 20
+C UltraSPARC T2: 18
+C UltraSPARC T3: 18
+C UltraSPARC T4: 4
C INPUT PARAMETERS
define(`rp', `%o0')
@@ -29,22 +32,41 @@ define(`n', `%o2')
define(`v0', `%o3')
ASM_START()
+ ALIGN(32)                     C start the routine on a 32-byte boundary
PROLOGUE(mpn_mul_1)
- mov 0, %g4
- srl v0, 0, v0
srl n, 0, n
- dec n C n--
+ srl v0, 0, v0                 C zero-extend the multiplier to 64 bits
+ subcc n, 1, n                 C counter = n - 1, condition codes set for the branch below
+ be L(final_one)               C n was 1: skip the two-word loop
+ clr %o5                       C delay slot: carry word starts at 0
+
+L(top):                        C two words per pass
+ lduw [up+0], %g1              C first source word
+ lduw [up+4], %g2              C second source word
+ mulx %g1, v0, %g3             C 64-bit product for the first word
+ add up, 8, up                 C advance source by two words
+ mulx %g2, v0, %o4             C 64-bit product for the second word
+ sub n, 2, n                   C two words consumed
+ add rp, 8, rp                 C advance destination; the stores below use negative offsets
+ add %o5, %g3, %g3             C add the incoming carry word
+ stw %g3, [rp-8]               C low 32 bits go to the first word
+ srlx %g3, 32, %o5             C high 32 bits are the carry into the second word
+ add %o5, %o4, %o4             C same sequence for the second word
+ stw %o4, [rp-4]
+ brgz n, L(top)                C counter > 0: at least two words remain
+ srlx %o4, 32, %o5             C delay slot: carry out of the second word
+
+ brlz,pt n, L(done)            C counter < 0: nothing left; counter == 0 falls through with one word left
+ nop
-L(top): lduw [up+0], %g1
- add up, 4, up C up++
+L(final_one):                  C single remaining word
+ lduw [up+0], %g1
mulx %g1, v0, %g3
- add %g4, %g3, %g3
+ add %o5, %g3, %g3             C add the incoming carry word
stw %g3, [rp+0]
- add rp, 4, rp C rp++
- srlx %g3, 32, %g4
- brnz n, L(top)
- dec n C n--
+ srlx %g3, 32, %o5             C final carry word
+L(done):
retl
- mov %g4, %o0 C return value
+ mov %o5, %o0                  C delay slot: return the carry word
EPILOGUE()
diff --git a/mpn/sparc32/ultrasparct1/submul_1.asm b/mpn/sparc32/ultrasparct1/submul_1.asm
index 605a882..084c61c 100644
--- a/mpn/sparc32/ultrasparct1/submul_1.asm
+++ b/mpn/sparc32/ultrasparct1/submul_1.asm
@@ -1,6 +1,6 @@
dnl SPARC T1 32-bit mpn_submul_1.
-dnl Copyright 2010 Free Software Foundation, Inc.
+dnl Copyright 2010, 2013 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
@@ -20,33 +20,61 @@ dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C cycles/limb
-C UltraSPARC T1: 27
+C UltraSPARC T1: 24
+C UltraSPARC T2: 19
+C UltraSPARC T3: 19
+C UltraSPARC T4: 5
C INPUT PARAMETERS
-define(`rp', `%o0')
-define(`up', `%o1')
-define(`n', `%o2')
-define(`v0', `%o3')
+define(`rp', `%i0')            C destination pointer, loaded and stored; arguments sit in the in registers once save has run
+define(`up', `%i1')            C source pointer, only loaded from
+define(`n', `%i2')             C word count, copied zero-extended into %o4
+define(`v0', `%i3')            C multiplier applied to every source word
ASM_START()
+ ALIGN(32)                     C start the routine on a 32-byte boundary
PROLOGUE(mpn_submul_1)
- subcc %g0, %g0, %g4 C clear CF and g4
+ save %sp, -96, %sp            C new register window with a 96-byte frame, giving %l0-%l3 as scratch
+ srl n, 0, %o4                 C %o4 = n zero-extended; used as the loop counter
srl v0, 0, v0
- srl n, 0, n
- dec n C n--
-
-L(top): lduw [up+0], %g1
- add up, 4, up C up++
- mulx %g1, v0, %g3
- lduw [rp+0], %g2
- addx %g4, %g3, %g3
- srlx %g3, 32, %g4
- subcc %g2, %g3, %g3
+ subcc %o4, 1, %o4             C counter = n - 1, condition codes set for the branch below
+ be L(final_one)               C n was 1: skip the two-word loop
+ subcc %g0, 0, %o5             C delay slot: carry word = 0 and carry flag cleared
+
+L(top):                        C two words per pass; the borrow travels in the carry flag
+ lduw [up+0], %l0              C first source word
+ lduw [rp+0], %l2              C first destination word
+ lduw [up+4], %l1              C second source word
+ lduw [rp+4], %l3              C second destination word
+ mulx %l0, v0, %g3             C 64-bit product for the first word
+ add up, 8, up                 C advance source; non-cc form keeps the carry flag intact
+ mulx %l1, v0, %o3             C 64-bit product for the second word
+ sub %o4, 2, %o4               C two words consumed; non-cc form keeps the carry flag intact
+ add rp, 8, rp                 C advance destination; the stores below use negative offsets
+ addx %o5, %g3, %g3            C product + carry word + pending borrow from the carry flag
+ srlx %g3, 32, %o5             C high 32 bits become the next carry word
+ subcc %l2, %g3, %g3           C destination word minus low half; borrow lands in the carry flag
+ stw %g3, [rp-8]
+ addx %o5, %o3, %o3            C same sequence for the second word
+ srlx %o3, 32, %o5
+ subcc %l3, %o3, %o3
+ brgz %o4, L(top)              C tests the register value, so the carry flag is untouched
+ stw %o3, [rp-4]               C delay slot: store the second result word
+
+ brlz,pt %o4, L(done)          C counter < 0: nothing left; counter == 0 falls through with one word left
+ nop
+
+L(final_one):                  C single remaining word
+ lduw [up+0], %l0
+ lduw [rp+0], %l2
+ mulx %l0, v0, %g3
+ addx %o5, %g3, %g3            C product + carry word + pending borrow
+ srlx %g3, 32, %o5             C high 32 bits become the final carry word
+ subcc %l2, %g3, %g3           C last subtract; its borrow is folded in at L(done)
stw %g3, [rp+0]
- add rp, 4, rp C rp++
- brnz n, L(top)
- dec n C n--
- retl
- addx %g4, 0, %o0 C return value
+L(done):
+ addx %o5, 0, %o5              C fold the last borrow into the carry word
+ ret
+ restore %o5, 0, %o0           C delay slot: pop the window and hand the result back in the caller %o0
EPILOGUE()
--
1.7.10.4
More information about the gmp-devel
mailing list